fix: server device mode channel num

2025-01-23 13:35:12 +03:00 · 2023-05-13 14:30:15 +09:00 · 2023-05-13 14:30:15 +09:00 · 6fecb5f908
commit 6fecb5f908
parent ea3c5b5740
2 changed files with 74 additions and 52 deletions
--- a/server/voice_changer/Local/AudioDeviceList.py
+++ b/server/voice_changer/Local/AudioDeviceList.py
@ -1,10 +1,5 @@
-import pyaudio
-
-# import json
-
-
+import sounddevice as sd
 from dataclasses import dataclass
-
 from const import ServerAudioDeviceTypes


@ -14,44 +9,42 @@ class ServerAudioDevice:
    index: int = 0
    name: str = ""
    hostAPI: str = ""
+    maxInputChannels: int = 0
+    maxOutputChannels: int = 0


 def list_audio_device():
-    audio = pyaudio.PyAudio()
-    audio_input_devices: list[ServerAudioDevice] = []
-    audio_output_devices: list[ServerAudioDevice] = []
-    host_apis = []
+    audioDeviceList = sd.query_devices()

-    for api_index in range(audio.get_host_api_count()):
-        host_apis.append(audio.get_host_api_info_by_index(api_index)["name"])
+    inputAudioDeviceList = [d for d in audioDeviceList if d["max_input_channels"] > 0]
+    outputDeviceList = [d for d in audioDeviceList if d["max_output_channels"] > 0]
+    hostapis = sd.query_hostapis()

-    for x in range(0, audio.get_device_count()):
-        device = audio.get_device_info_by_index(x)
-        try:
-            deviceName = device["name"].encode("shift-jis").decode("utf-8")
-        except (UnicodeDecodeError, UnicodeEncodeError):
-            deviceName = device["name"]
+    print("input:", inputAudioDeviceList)
+    print("output:", outputDeviceList)
+    print("hostapis", hostapis)

-        deviceIndex = device["index"]
-        hostAPI = host_apis[device["hostApi"]]
+    serverAudioInputDevices = []
+    serverAudioOutputDevices = []
+    for d in inputAudioDeviceList:
+        serverInputAudioDevice: ServerAudioDevice = ServerAudioDevice(
+            kind=ServerAudioDeviceTypes.audioinput,
+            index=d["index"],
+            name=d["name"],
+            hostAPI=hostapis[d["hostapi"]]["name"],
+            maxInputChannels=d["max_input_channels"],
+            maxOutputChannels=d["max_output_channels"],
+        )
+        serverAudioInputDevices.append(serverInputAudioDevice)
+    for d in outputDeviceList:
+        serverOutputAudioDevice: ServerAudioDevice = ServerAudioDevice(
+            kind=ServerAudioDeviceTypes.audiooutput,
+            index=d["index"],
+            name=d["name"],
+            hostAPI=hostapis[d["hostapi"]]["name"],
+            maxInputChannels=d["max_input_channels"],
+            maxOutputChannels=d["max_output_channels"],
+        )
+        serverAudioOutputDevices.append(serverOutputAudioDevice)

-        if device["maxInputChannels"] > 0:
-            audio_input_devices.append(
-                ServerAudioDevice(
-                    kind=ServerAudioDeviceTypes.audioinput,
-                    index=deviceIndex,
-                    name=deviceName,
-                    hostAPI=hostAPI,
-                )
-            )
-        if device["maxOutputChannels"] > 0:
-            audio_output_devices.append(
-                ServerAudioDevice(
-                    kind=ServerAudioDeviceTypes.audiooutput,
-                    index=deviceIndex,
-                    name=deviceName,
-                    hostAPI=hostAPI,
-                )
-            )
-
-    return audio_input_devices, audio_output_devices
+    return serverAudioInputDevices, serverAudioOutputDevices
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@ -102,17 +102,29 @@ class VoiceChanger:
    def audio_callback(
        self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
    ):
-        # print(indata)
        try:
            with Timer("all_inference_time") as t:
                unpackedData = librosa.to_mono(indata.T) * 32768.0
                out_wav, times = self.on_request(unpackedData)
-                outdata[:] = np.repeat(out_wav, 2).reshape(-1, 2) / 32768.0
+                outputChannels = outdata.shape[1]
+                outdata[:] = (
+                    np.repeat(out_wav, outputChannels).reshape(-1, outputChannels)
+                    / 32768.0
+                )
            all_inference_time = t.secs
            performance = [all_inference_time] + times
            self.settings.performance = [round(x * 1000) for x in performance]
        except Exception as e:
-            print(e)
+            print("[Voice Changer] ex:", e)
+
+    def getServerAudioDevice(
+        self, audioDeviceList: list[ServerAudioDevice], index: int
+    ):
+        serverAudioDevice = [x for x in audioDeviceList if x.index == index]
+        if len(serverAudioDevice) > 0:
+            return serverAudioDevice[0]
+        else:
+            return None

    def serverLocal(self, _vc):
        vc: VoiceChanger = _vc
@ -131,12 +143,28 @ class VoiceChanger:
            else:
                sd._terminate()
                sd._initialize()
-                if currentInputDeviceId != vc.settings.serverInputDeviceId:
-                    sd.default.device[0] = vc.settings.serverInputDeviceId
-                    currentInputDeviceId = vc.settings.serverInputDeviceId
-                if currentOutputDeviceId != vc.settings.serverOutputDeviceId:
-                    sd.default.device[1] = vc.settings.serverOutputDeviceId
-                    currentOutputDeviceId = vc.settings.serverOutputDeviceId
+
+                sd.default.device[0] = vc.settings.serverInputDeviceId
+                currentInputDeviceId = vc.settings.serverInputDeviceId
+                sd.default.device[1] = vc.settings.serverOutputDeviceId
+                currentOutputDeviceId = vc.settings.serverOutputDeviceId
+
+                currentInputChannelNum = vc.settings.serverAudioInputDevices
+
+                serverInputAudioDevice = self.getServerAudioDevice(
+                    vc.settings.serverAudioInputDevices, currentInputDeviceId
+                )
+                serverOutputAudioDevice = self.getServerAudioDevice(
+                    vc.settings.serverAudioOutputDevices, currentOutputDeviceId
+                )
+                print(serverInputAudioDevice, serverOutputAudioDevice)
+                if serverInputAudioDevice is None or serverOutputAudioDevice is None:
+                    time.sleep(2)
+                    print("serverInputAudioDevice or serverOutputAudioDevice is None")
+                    continue
+                # Each stream direction takes the channel count of its own device.
+                currentInputChannelNum = serverInputAudioDevice.maxInputChannels
+                currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels

                vc.settings.serverInputAudioSampleRate = (
                    self.voiceChanger.get_processing_sampling_rate()
@ -144,14 +172,16 @@ class VoiceChanger:
                currentInputSampleRate = vc.settings.serverInputAudioSampleRate
                currentInputChunkNum = vc.settings.serverReadChunkSize
                block_frame = currentInputChunkNum * 128
+
                try:
                    with sd.Stream(
                        callback=self.audio_callback,
                        blocksize=block_frame,
                        samplerate=currentInputSampleRate,
                        dtype="float32",
-                        channels=1,
+                        channels=[currentInputChannelNum, currentOutputChannelNum],
                    ):
+                        print()
                        while (
                            vc.settings.serverAudioStated == 1
                            and currentInputDeviceId == vc.settings.serverInputDeviceId
@ -182,8 +212,7 @@ class VoiceChanger:
                            )

                except Exception as e:
-                    print(e)
-                    print()
+                    print("[Voice Changer] ex:", e)
                    time.sleep(2)

    def __init__(self, params: VoiceChangerParams):