diff --git a/server/voice_changer/Local/AudioDeviceList.py b/server/voice_changer/Local/AudioDeviceList.py
index ccfe2da5..9e2fda5b 100644
--- a/server/voice_changer/Local/AudioDeviceList.py
+++ b/server/voice_changer/Local/AudioDeviceList.py
@@ -1,10 +1,5 @@
-import pyaudio
-
-# import json
-
-
+import sounddevice as sd
 from dataclasses import dataclass
-
 from const import ServerAudioDeviceTypes
 
 
@@ -14,44 +9,43 @@ class ServerAudioDevice:
     index: int = 0
     name: str = ""
     hostAPI: str = ""
+    maxInputChannels: int = 0
+    maxOutputChannels: int = 0
 
 
 def list_audio_device():
-    audio = pyaudio.PyAudio()
-    audio_input_devices: list[ServerAudioDevice] = []
-    audio_output_devices: list[ServerAudioDevice] = []
-    host_apis = []
+    """Return (input, output) ServerAudioDevice lists built from sd.query_devices()."""
+    audioDeviceList = sd.query_devices()
 
-    for api_index in range(audio.get_host_api_count()):
-        host_apis.append(audio.get_host_api_info_by_index(api_index)["name"])
+    inputAudioDeviceList = [d for d in audioDeviceList if d["max_input_channels"] > 0]
+    outputDeviceList = [d for d in audioDeviceList if d["max_output_channels"] > 0]
+    hostapis = sd.query_hostapis()
 
-    for x in range(0, audio.get_device_count()):
-        device = audio.get_device_info_by_index(x)
-        try:
-            deviceName = device["name"].encode("shift-jis").decode("utf-8")
-        except (UnicodeDecodeError, UnicodeEncodeError):
-            deviceName = device["name"]
+    print("input:", inputAudioDeviceList)
+    print("output:", outputDeviceList)
+    print("hostapis", hostapis)
 
-        deviceIndex = device["index"]
-        hostAPI = host_apis[device["hostApi"]]
+    serverAudioInputDevices = []
+    serverAudioOutputDevices = []
+    for d in inputAudioDeviceList:
+        serverInputAudioDevice: ServerAudioDevice = ServerAudioDevice(
+            kind=ServerAudioDeviceTypes.audioinput,
+            index=d["index"],
+            name=d["name"],
+            hostAPI=hostapis[d["hostapi"]]["name"],
+            maxInputChannels=d["max_input_channels"],
+            maxOutputChannels=d["max_output_channels"],
+        )
+        serverAudioInputDevices.append(serverInputAudioDevice)
+    for d in outputDeviceList:
+        serverOutputAudioDevice: ServerAudioDevice = ServerAudioDevice(
+            kind=ServerAudioDeviceTypes.audiooutput,
+            index=d["index"],
+            name=d["name"],
+            hostAPI=hostapis[d["hostapi"]]["name"],
+            maxInputChannels=d["max_input_channels"],
+            maxOutputChannels=d["max_output_channels"],
+        )
+        serverAudioOutputDevices.append(serverOutputAudioDevice)
 
-        if device["maxInputChannels"] > 0:
-            audio_input_devices.append(
-                ServerAudioDevice(
-                    kind=ServerAudioDeviceTypes.audioinput,
-                    index=deviceIndex,
-                    name=deviceName,
-                    hostAPI=hostAPI,
-                )
-            )
-        if device["maxOutputChannels"] > 0:
-            audio_output_devices.append(
-                ServerAudioDevice(
-                    kind=ServerAudioDeviceTypes.audiooutput,
-                    index=deviceIndex,
-                    name=deviceName,
-                    hostAPI=hostAPI,
-                )
-            )
-
-    return audio_input_devices, audio_output_devices
+    return serverAudioInputDevices, serverAudioOutputDevices
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index fed8590f..95419e87 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -102,17 +102,30 @@ class VoiceChanger:
     def audio_callback(
         self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
     ):
-        # print(indata)
         try:
             with Timer("all_inference_time") as t:
                 unpackedData = librosa.to_mono(indata.T) * 32768.0
                 out_wav, times = self.on_request(unpackedData)
-                outdata[:] = np.repeat(out_wav, 2).reshape(-1, 2) / 32768.0
+                outputChannels = outdata.shape[1]
+                outdata[:] = (
+                    np.repeat(out_wav, outputChannels).reshape(-1, outputChannels)
+                    / 32768.0
+                )
             all_inference_time = t.secs
             performance = [all_inference_time] + times
             self.settings.performance = [round(x * 1000) for x in performance]
         except Exception as e:
-            print(e)
+            print("[Voice Changer] ex:", e)
+
+    def getServerAudioDevice(
+        self, audioDeviceList: list[ServerAudioDevice], index: int
+    ):
+        """Return the first device in audioDeviceList whose index matches, or None."""
+        serverAudioDevice = [x for x in audioDeviceList if x.index == index]
+        if len(serverAudioDevice) > 0:
+            return serverAudioDevice[0]
+        else:
+            return None
 
     def serverLocal(self, _vc):
         vc: VoiceChanger = _vc
@@ -131,12 +144,27 @@ class VoiceChanger:
             else:
                 sd._terminate()
                 sd._initialize()
-                if currentInputDeviceId != vc.settings.serverInputDeviceId:
-                    sd.default.device[0] = vc.settings.serverInputDeviceId
-                    currentInputDeviceId = vc.settings.serverInputDeviceId
-                if currentOutputDeviceId != vc.settings.serverOutputDeviceId:
-                    sd.default.device[1] = vc.settings.serverOutputDeviceId
-                    currentOutputDeviceId = vc.settings.serverOutputDeviceId
+
+                sd.default.device[0] = vc.settings.serverInputDeviceId
+                currentInputDeviceId = vc.settings.serverInputDeviceId
+                sd.default.device[1] = vc.settings.serverOutputDeviceId
+                currentOutputDeviceId = vc.settings.serverOutputDeviceId
+
+                serverInputAudioDevice = self.getServerAudioDevice(
+                    vc.settings.serverAudioInputDevices, currentInputDeviceId
+                )
+                serverOutputAudioDevice = self.getServerAudioDevice(
+                    vc.settings.serverAudioOutputDevices, currentOutputDeviceId
+                )
+                print(serverInputAudioDevice, serverOutputAudioDevice)
+                if serverInputAudioDevice is None or serverOutputAudioDevice is None:
+                    time.sleep(2)
+                    print("serverInputAudioDevice or serverOutputAudioDevice is None")
+                    continue
+
+                # Each side of the stream uses its own device's channel count.
+                currentInputChannelNum = serverInputAudioDevice.maxInputChannels
+                currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels
 
                 vc.settings.serverInputAudioSampleRate = (
                     self.voiceChanger.get_processing_sampling_rate()
@@ -144,14 +172,16 @@ class VoiceChanger:
                 currentInputSampleRate = vc.settings.serverInputAudioSampleRate
                 currentInputChunkNum = vc.settings.serverReadChunkSize
                 block_frame = currentInputChunkNum * 128
+
                 try:
                     with sd.Stream(
                         callback=self.audio_callback,
                         blocksize=block_frame,
                         samplerate=currentInputSampleRate,
                         dtype="float32",
-                        channels=1,
+                        channels=[currentInputChannelNum, currentOutputChannelNum],
                     ):
+                        print()
                         while (
                             vc.settings.serverAudioStated == 1
                             and currentInputDeviceId == vc.settings.serverInputDeviceId
@@ -182,8 +212,7 @@ class VoiceChanger:
                             )
 
                 except Exception as e:
-                    print(e)
-                    print()
+                    print("[Voice Changer] ex:", e)
                     time.sleep(2)
 
     def __init__(self, params: VoiceChangerParams):