diff --git a/server/voice_changer/Local/AudioDeviceList.py b/server/voice_changer/Local/AudioDeviceList.py index 38d2186d..d8a4ef5a 100644 --- a/server/voice_changer/Local/AudioDeviceList.py +++ b/server/voice_changer/Local/AudioDeviceList.py @@ -15,7 +15,11 @@ class ServerAudioDevice: def list_audio_device(): - audioDeviceList = sd.query_devices() + try: + audioDeviceList = sd.query_devices() + except Exception as e: + print("[Voice Changer] ex: query_devices", e) + return [], [] inputAudioDeviceList = [d for d in audioDeviceList if d["max_input_channels"] > 0] outputAudioDeviceList = [d for d in audioDeviceList if d["max_output_channels"] > 0] diff --git a/server/voice_changer/Local/ServerDevice.py b/server/voice_changer/Local/ServerDevice.py index 396f9d8c..7505ded6 100644 --- a/server/voice_changer/Local/ServerDevice.py +++ b/server/voice_changer/Local/ServerDevice.py @@ -4,34 +4,66 @@ import numpy as np import librosa import sounddevice as sd +from dataclasses import dataclass, asdict, field + from voice_changer.Local.AudioDeviceList import ServerAudioDevice -from voice_changer.VoiceChanger import VoiceChanger +from voice_changer.VoiceChangerManager import VoiceChangerManager from voice_changer.utils.Timer import Timer +@dataclass() +class ServerDeviceSettings: + enableServerAudio: int = 0 # 0:off, 1:on + serverAudioStated: int = 0 # 0:off, 1:on + serverInputAudioSampleRate: int = 44100 + serverOutputAudioSampleRate: int = 44100 + serverInputDeviceId: int = -1 + serverOutputDeviceId: int = -1 + serverReadChunkSize: int = 256 + serverInputAudioGain: float = 1.0 + serverOutputAudioGain: float = 1.0 + + +EditableServerDeviceSettings = { + "intData": [ + "enableServerAudio", + "serverAudioStated", + "serverInputAudioSampleRate", + "serverOutputAudioSampleRate", + "serverInputDeviceId", + "serverOutputDeviceId", + "serverReadChunkSize", + ], + "floatData": [ + "serverInputAudioGain", + "serverOutputAudioGain", + ], +} + + class ServerDevice: - def __init__(self): - self.voiceChanger: VoiceChanger | None = None - pass + def __init__(self, voiceChangerManager: VoiceChangerManager): + self.settings = ServerDeviceSettings() + self.voiceChangerManager: VoiceChangerManager = voiceChangerManager def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status): - if self.voiceChanger is None: + if self.voiceChangerManager.voiceChanger is None: print("[Voice Changer] voiceChanger is None") return try: - indata = indata * self.voiceChanger.settings.serverInputAudioGain + indata = indata * self.settings.serverInputAudioGain with Timer("all_inference_time") as t: unpackedData = librosa.to_mono(indata.T) * 32768.0 - out_wav, times = self.voiceChanger.on_request(unpackedData) + out_wav, times = self.voiceChangerManager.voiceChanger.on_request(unpackedData) outputChunnels = outdata.shape[1] outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0 - outdata[:] = outdata * self.voiceChanger.settings.serverOutputAudioGain + outdata[:] = outdata * self.settings.serverOutputAudioGain all_inference_time = t.secs performance = [all_inference_time] + times - if self.voiceChanger.emitTo is not None: - self.voiceChanger.emitTo(performance) - self.voiceChanger.settings.performance = [round(x * 1000) for x in performance] + if self.voiceChangerManager.voiceChanger.emitTo is not None: + self.voiceChangerManager.voiceChanger.emitTo(performance) + self.voiceChangerManager.voiceChanger.settings.performance = [round(x * 1000) for x in performance] except Exception as e: print("[Voice Changer] ex:", e) @@ -42,31 +74,26 @@ class ServerDevice: else: return None - def serverLocal(self, _vc: VoiceChanger): - self.voiceChanger = _vc - vc = self.voiceChanger - + def serverLocal(self): currentInputDeviceId = -1 currentModelSamplingRate = -1 currentOutputDeviceId = -1 currentInputChunkNum = -1 while True: - if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc is None: - vc.settings.inputSampleRate = 48000 + if self.settings.serverAudioStated == 0 or self.settings.serverInputDeviceId == -1 or self.voiceChangerManager is None: + self.voiceChangerManager.voiceChanger.settings.inputSampleRate = 48000 time.sleep(2) else: sd._terminate() sd._initialize() - sd.default.device[0] = vc.settings.serverInputDeviceId - currentInputDeviceId = vc.settings.serverInputDeviceId - sd.default.device[1] = vc.settings.serverOutputDeviceId - currentOutputDeviceId = vc.settings.serverOutputDeviceId + sd.default.device[0] = self.settings.serverInputDeviceId + currentInputDeviceId = self.settings.serverInputDeviceId + sd.default.device[1] = self.settings.serverOutputDeviceId + currentOutputDeviceId = self.settings.serverOutputDeviceId - currentInputChannelNum = vc.settings.serverAudioInputDevices - - serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId) - serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId) + serverInputAudioDevice = self.getServerAudioDevice(self.voiceChangerManager.serverAudioInputDevices, currentInputDeviceId) + serverOutputAudioDevice = self.getServerAudioDevice(self.voiceChangerManager.serverAudioOutputDevices, currentOutputDeviceId) print(serverInputAudioDevice, serverOutputAudioDevice) if serverInputAudioDevice is None or serverOutputAudioDevice is None: time.sleep(2) @@ -76,12 +103,12 @@ class ServerDevice: currentInputChannelNum = serverInputAudioDevice.maxInputChannels currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels - currentInputChunkNum = vc.settings.serverReadChunkSize + currentInputChunkNum = self.settings.serverReadChunkSize block_frame = currentInputChunkNum * 128 # sample rate precheck(alsa cannot use 40000?) try: - currentModelSamplingRate = self.voiceChanger.voiceChangerModel.get_processing_sampling_rate() + currentModelSamplingRate = self.voiceChangerManager.voiceChanger.voiceChangerModel.get_processing_sampling_rate() except Exception as e: print("[Voice Changer] ex: get_processing_sampling_rate", e) continue @@ -94,38 +121,38 @@ class ServerDevice: channels=[currentInputChannelNum, currentOutputChannelNum], ): pass - vc.settings.serverInputAudioSampleRate = currentModelSamplingRate - vc.settings.inputSampleRate = currentModelSamplingRate - print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}") + self.settings.serverInputAudioSampleRate = currentModelSamplingRate + self.voiceChangerManager.voiceChanger.settings.inputSampleRate = currentModelSamplingRate + print(f"[Voice Changer] sample rate {self.settings.serverInputAudioSampleRate}") except Exception as e: print( "[Voice Changer] ex: fallback to device default samplerate", e, ) - vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate - vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate + self.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate + self.voiceChangerManager.voiceChanger.settings.inputSampleRate = self.settings.serverInputAudioSampleRate # main loop try: with sd.Stream( callback=self.audio_callback, blocksize=block_frame, - samplerate=vc.settings.serverInputAudioSampleRate, + samplerate=self.settings.serverInputAudioSampleRate, dtype="float32", channels=[currentInputChannelNum, currentOutputChannelNum], ): - while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.voiceChangerModel.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize: + while self.settings.serverAudioStated == 1 and currentInputDeviceId == self.settings.serverInputDeviceId and currentOutputDeviceId == self.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChangerManager.voiceChanger.voiceChangerModel.get_processing_sampling_rate() and currentInputChunkNum == self.settings.serverReadChunkSize: time.sleep(2) print( "[Voice Changer] server audio", - vc.settings.performance, + self.voiceChangerManager.settings.performance, ) print( "[Voice Changer] info:", - vc.settings.serverAudioStated, + self.settings.serverAudioStated, currentInputDeviceId, currentOutputDeviceId, - vc.settings.serverInputAudioSampleRate, + self.settings.serverInputAudioSampleRate, currentInputChunkNum, ) diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py index 7d500d6d..5b64a342 100755 --- a/server/voice_changer/VoiceChanger.py +++ b/server/voice_changer/VoiceChanger.py @@ -51,22 +51,7 @@ class VoiceChangerSettings: crossFadeOverlapSize: int = 4096 recordIO: int = 0 # 0:off, 1:on - serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=lambda: []) - serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=lambda: []) - enableServerAudio: int = 0 # 0:off, 1:on - serverAudioStated: int = 0 # 0:off, 1:on - # serverInputAudioSampleRate: int = 48000 - # serverOutputAudioSampleRate: int = 48000 - serverInputAudioSampleRate: int = 44100 - serverOutputAudioSampleRate: int = 44100 - # serverInputAudioBufferSize: int = 1024 * 24 - # serverOutputAudioBufferSize: int = 1024 * 24 - serverInputDeviceId: int = -1 - serverOutputDeviceId: int = -1 - serverReadChunkSize: int = 256 - serverInputAudioGain: float = 1.0 - serverOutputAudioGain: float = 1.0 performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0]) # ↓mutableな物だけ列挙 @@ -75,23 +60,12 @@ class VoiceChangerSettings: "inputSampleRate", "crossFadeOverlapSize", "recordIO", - "enableServerAudio", - "serverAudioStated", - "serverInputAudioSampleRate", - "serverOutputAudioSampleRate", - # "serverInputAudioBufferSize", - # "serverOutputAudioBufferSize", - "serverInputDeviceId", - "serverOutputDeviceId", - "serverReadChunkSize", ] ) floatData: list[str] = field( default_factory=lambda: [ "crossFadeOffsetRate", "crossFadeEndRate", - "serverInputAudioGain", - "serverOutputAudioGain", ] ) strData: list[str] = field(default_factory=lambda: []) @@ -108,120 +82,6 @@ class VoiceChanger: # emitTo = None - # def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status): - # try: - # indata = indata * self.settings.serverInputAudioGain - # with Timer("all_inference_time") as t: - # unpackedData = librosa.to_mono(indata.T) * 32768.0 - # out_wav, times = self.on_request(unpackedData) - # outputChunnels = outdata.shape[1] - # outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0 - # outdata[:] = outdata * self.settings.serverOutputAudioGain - # all_inference_time = t.secs - # performance = [all_inference_time] + times - # if self.emitTo is not None: - # self.emitTo(performance) - # self.settings.performance = [round(x * 1000) for x in performance] - # except Exception as e: - # print("[Voice Changer] ex:", e) - - # def getServerAudioDevice(self, audioDeviceList: list[ServerAudioDevice], index: int): - # serverAudioDevice = [x for x in audioDeviceList if x.index == index] - # if len(serverAudioDevice) > 0: - # return serverAudioDevice[0] - # else: - # return None - - # def serverLocal(self, _vc): - # vc: VoiceChanger = _vc - - # currentInputDeviceId = -1 - # currentModelSamplingRate = -1 - # currentOutputDeviceId = -1 - # currentInputChunkNum = -1 - # while True: - # if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc.voiceChanger is None: - # vc.settings.inputSampleRate = 48000 - # time.sleep(2) - # else: - # sd._terminate() - # sd._initialize() - - # sd.default.device[0] = vc.settings.serverInputDeviceId - # currentInputDeviceId = vc.settings.serverInputDeviceId - # sd.default.device[1] = vc.settings.serverOutputDeviceId - # currentOutputDeviceId = vc.settings.serverOutputDeviceId - - # currentInputChannelNum = vc.settings.serverAudioInputDevices - - # serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId) - # serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId) - # print(serverInputAudioDevice, serverOutputAudioDevice) - # if serverInputAudioDevice is None or serverOutputAudioDevice is None: - # time.sleep(2) - # print("serverInputAudioDevice or serverOutputAudioDevice is None") - # continue - - # currentInputChannelNum = serverInputAudioDevice.maxInputChannels - # currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels - - # currentInputChunkNum = vc.settings.serverReadChunkSize - # block_frame = currentInputChunkNum * 128 - - # # sample rate precheck(alsa cannot use 40000?) - # try: - # currentModelSamplingRate = self.voiceChanger.get_processing_sampling_rate() - # except Exception as e: - # print("[Voice Changer] ex: get_processing_sampling_rate", e) - # continue - # try: - # with sd.Stream( - # callback=self.audio_callback, - # blocksize=block_frame, - # samplerate=currentModelSamplingRate, - # dtype="float32", - # channels=[currentInputChannelNum, currentOutputChannelNum], - # ): - # pass - # vc.settings.serverInputAudioSampleRate = currentModelSamplingRate - # vc.settings.inputSampleRate = currentModelSamplingRate - # print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}") - # except Exception as e: - # print( - # "[Voice Changer] ex: fallback to device default samplerate", - # e, - # ) - # vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate - # vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate - - # # main loop - # try: - # with sd.Stream( - # callback=self.audio_callback, - # blocksize=block_frame, - # samplerate=vc.settings.serverInputAudioSampleRate, - # dtype="float32", - # channels=[currentInputChannelNum, currentOutputChannelNum], - # ): - # while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize: - # time.sleep(2) - # print( - # "[Voice Changer] server audio", - # self.settings.performance, - # ) - # print( - # "[Voice Changer] info:", - # vc.settings.serverAudioStated, - # currentInputDeviceId, - # currentOutputDeviceId, - # vc.settings.serverInputAudioSampleRate, - # currentInputChunkNum, - # ) - - # except Exception as e: - # print("[Voice Changer] ex:", e) - # time.sleep(2) - def __init__(self, params: VoiceChangerParams, slotIndex: int): # 初期化 self.settings = VoiceChangerSettings() @@ -238,9 +98,9 @@ class VoiceChanger: self.ioRecorder: IORecorder | None = None self.sola_buffer: AudioInOut | None = None - audioinput, audiooutput = list_audio_device() - self.settings.serverAudioInputDevices = audioinput - self.settings.serverAudioOutputDevices = audiooutput + # audioinput, audiooutput = list_audio_device() + # self.settings.serverAudioInputDevices = audioinput + # self.settings.serverAudioOutputDevices = audiooutput self.slotIndex = slotIndex self.slotInfo = loadSlotInfo(params.model_dir, self.slotIndex) diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index 93eeb7e0..a15b3202 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -3,6 +3,7 @@ import threading from data.ModelSample import ModelSamples from data.ModelSlot import ModelSlots, loadSlotInfo from utils.downloader.SampleDownloader import downloadSample, getSampleInfos +from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device from voice_changer.Local.ServerDevice import ServerDevice from voice_changer.RVC.ModelSlotGenerator import setSlotAsRVC @@ -26,6 +27,7 @@ class GPUInfo: @dataclass() class VoiceChangerManagerSettings: slotIndex: int + intData: list[str] = field(default_factory=lambda: ["slotIndex"]) @@ -43,6 +45,14 @@ class VoiceChangerManager(object): self.sampleModels: list[ModelSamples] = getSampleInfos(self.params.sample_mode) self.gpus: list[GPUInfo] = self._get_gpuInfos() + audioinput, audiooutput = list_audio_device() + self.serverAudioInputDevices: list[ServerAudioDevice] = audioinput + self.serverAudioOutputDevices: list[ServerAudioDevice] = audiooutput + + # ServerDevice + thread = threading.Thread(target=self.serverDevice.serverLocal, args=(self,)) + thread.start() + def _get_gpuInfos(self): devCount = torch.cuda.device_count() gpus = [] @@ -63,8 +73,6 @@ class VoiceChangerManager(object): print(f"VoiceChanger Initialized (GPU_NUM:{gpu_num}, mps_enabled:{mps_enabled})") cls._instance.voiceChanger = VoiceChanger(params, cls._instance.settings.slotIndex) - thread = threading.Thread(target=cls._instance.serverDevice.serverLocal, args=(cls._instance.voiceChanger,)) - thread.start() cls._instance.voiceChanger.prepareModel() return cls._instance @@ -94,6 +102,8 @@ class VoiceChangerManager(object): data["slotInfos"] = slotInfos data["gpus"] = self.gpus data["sampleModels"] = self.sampleModels + data["serverAudioInputDevices"] = self.serverAudioInputDevices + data["serverAudioOutputDevices"] = self.serverAudioOutputDevices data["status"] = "OK" if hasattr(self, "voiceChanger"):