From 9806ce2f3dcf5fba7919b9034f8aeb60444ae70d Mon Sep 17 00:00:00 2001
From: wataru <wataru@fdev.local.com>
Date: Fri, 16 Jun 2023 01:49:49 +0900
Subject: [PATCH] WIP:refactoring

---
 server/voice_changer/Local/AudioDeviceList.py |   6 +-
 server/voice_changer/Local/ServerDevice.py    | 101 +++++++-----
 server/voice_changer/VoiceChanger.py          | 146 +-----------------
 server/voice_changer/VoiceChangerManager.py   |  14 +-
 4 files changed, 84 insertions(+), 183 deletions(-)

diff --git a/server/voice_changer/Local/AudioDeviceList.py b/server/voice_changer/Local/AudioDeviceList.py
index 38d2186d..d8a4ef5a 100644
--- a/server/voice_changer/Local/AudioDeviceList.py
+++ b/server/voice_changer/Local/AudioDeviceList.py
@@ -15,7 +15,11 @@ class ServerAudioDevice:
 
 
 def list_audio_device():
-    audioDeviceList = sd.query_devices()
+    try:
+        audioDeviceList = sd.query_devices()
+    except Exception as e:
+        print("[Voice Changer] ex: query_devices", e)
+        return [], []
 
     inputAudioDeviceList = [d for d in audioDeviceList if d["max_input_channels"] > 0]
     outputAudioDeviceList = [d for d in audioDeviceList if d["max_output_channels"] > 0]
diff --git a/server/voice_changer/Local/ServerDevice.py b/server/voice_changer/Local/ServerDevice.py
index 396f9d8c..7505ded6 100644
--- a/server/voice_changer/Local/ServerDevice.py
+++ b/server/voice_changer/Local/ServerDevice.py
@@ -4,34 +4,66 @@ import numpy as np
 import librosa
 import sounddevice as sd
 
+from dataclasses import dataclass, asdict, field
+from typing import TYPE_CHECKING
 from voice_changer.Local.AudioDeviceList import ServerAudioDevice
-from voice_changer.VoiceChanger import VoiceChanger
 from voice_changer.utils.Timer import Timer
+
+if TYPE_CHECKING:  # type-only import: VoiceChangerManager imports this module at load time, so a runtime import here would be circular
+    from voice_changer.VoiceChangerManager import VoiceChangerManager
 
 
+@dataclass()
+class ServerDeviceSettings:
+    """Server-side audio device settings read by ServerDevice's polling loop and audio callback."""
+    enableServerAudio: int = 0  # 0:off, 1:on
+    serverAudioStated: int = 0  # 0:off, 1:on; serverLocal() only keeps a stream open while this is 1
+    serverInputAudioSampleRate: int = 44100  # overwritten by serverLocal() with the model rate, or the input device default on fallback
+    serverOutputAudioSampleRate: int = 44100
+    serverInputDeviceId: int = -1  # -1: no device selected, serverLocal() idles
+    serverOutputDeviceId: int = -1  # assigned to sd.default.device[1] by serverLocal()
+    serverReadChunkSize: int = 256  # stream blocksize is this value * 128 frames
+    serverInputAudioGain: float = 1.0  # multiplier applied to captured input in audio_callback
+    serverOutputAudioGain: float = 1.0  # multiplier applied to the converted output in audio_callback
+
+
+EditableServerDeviceSettings = {  # ServerDeviceSettings field names grouped by value type (consumer not shown in this patch)
+    "intData": [
+        "enableServerAudio",
+        "serverAudioStated",
+        "serverInputAudioSampleRate",
+        "serverOutputAudioSampleRate",
+        "serverInputDeviceId",
+        "serverOutputDeviceId",
+        "serverReadChunkSize",
+    ],
+    "floatData": ["serverInputAudioGain", "serverOutputAudioGain"],
+}
+
+
 class ServerDevice:
-    def __init__(self):
-        self.voiceChanger: VoiceChanger | None = None
-        pass
+    def __init__(self, voiceChangerManager: "VoiceChangerManager"):  # quoted: the name is only imported for type checking
+        self.settings = ServerDeviceSettings()
+        self.voiceChangerManager: "VoiceChangerManager" = voiceChangerManager
 
     def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
-        if self.voiceChanger is None:
+        if self.voiceChangerManager.voiceChanger is None:
             print("[Voice Changer] voiceChanger is None")
             return
 
         try:
-            indata = indata * self.voiceChanger.settings.serverInputAudioGain
+            indata = indata * self.settings.serverInputAudioGain
             with Timer("all_inference_time") as t:
                 unpackedData = librosa.to_mono(indata.T) * 32768.0
-                out_wav, times = self.voiceChanger.on_request(unpackedData)
+                out_wav, times = self.voiceChangerManager.voiceChanger.on_request(unpackedData)
                 outputChunnels = outdata.shape[1]
                 outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
-                outdata[:] = outdata * self.voiceChanger.settings.serverOutputAudioGain
+                outdata[:] = outdata * self.settings.serverOutputAudioGain
             all_inference_time = t.secs
             performance = [all_inference_time] + times
-            if self.voiceChanger.emitTo is not None:
-                self.voiceChanger.emitTo(performance)
-            self.voiceChanger.settings.performance = [round(x * 1000) for x in performance]
+            if self.voiceChangerManager.voiceChanger.emitTo is not None:
+                self.voiceChangerManager.voiceChanger.emitTo(performance)
+            self.voiceChangerManager.voiceChanger.settings.performance = [round(x * 1000) for x in performance]
         except Exception as e:
             print("[Voice Changer] ex:", e)
 
@@ -42,31 +74,26 @@ class ServerDevice:
         else:
             return None
 
-    def serverLocal(self, _vc: VoiceChanger):
-        self.voiceChanger = _vc
-        vc = self.voiceChanger
-
+    def serverLocal(self):
         currentInputDeviceId = -1
         currentModelSamplingRate = -1
         currentOutputDeviceId = -1
         currentInputChunkNum = -1
         while True:
-            if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc is None:
-                vc.settings.inputSampleRate = 48000
+            voiceChanger = getattr(self.voiceChangerManager, "voiceChanger", None)  # may be unset: this thread starts before the manager assigns voiceChanger
+            if self.settings.serverAudioStated == 0 or self.settings.serverInputDeviceId == -1 or voiceChanger is None:
+                if voiceChanger is not None:
+                    voiceChanger.settings.inputSampleRate = 48000
                 time.sleep(2)
             else:
                 sd._terminate()
                 sd._initialize()
 
-                sd.default.device[0] = vc.settings.serverInputDeviceId
-                currentInputDeviceId = vc.settings.serverInputDeviceId
-                sd.default.device[1] = vc.settings.serverOutputDeviceId
-                currentOutputDeviceId = vc.settings.serverOutputDeviceId
+                currentInputDeviceId = sd.default.device[0] = self.settings.serverInputDeviceId
+                currentOutputDeviceId = sd.default.device[1] = self.settings.serverOutputDeviceId
 
-                currentInputChannelNum = vc.settings.serverAudioInputDevices
-
-                serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId)
-                serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId)
+                serverInputAudioDevice = self.getServerAudioDevice(self.voiceChangerManager.serverAudioInputDevices, currentInputDeviceId)
+                serverOutputAudioDevice = self.getServerAudioDevice(self.voiceChangerManager.serverAudioOutputDevices, currentOutputDeviceId)
                 print(serverInputAudioDevice, serverOutputAudioDevice)
                 if serverInputAudioDevice is None or serverOutputAudioDevice is None:
                     time.sleep(2)
@@ -76,12 +103,12 @@ class ServerDevice:
                 currentInputChannelNum = serverInputAudioDevice.maxInputChannels
                 currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels
 
-                currentInputChunkNum = vc.settings.serverReadChunkSize
+                currentInputChunkNum = self.settings.serverReadChunkSize
                 block_frame = currentInputChunkNum * 128
 
                 # sample rate precheck(alsa cannot use 40000?)
                 try:
-                    currentModelSamplingRate = self.voiceChanger.voiceChangerModel.get_processing_sampling_rate()
+                    currentModelSamplingRate = self.voiceChangerManager.voiceChanger.voiceChangerModel.get_processing_sampling_rate()
                 except Exception as e:
                     print("[Voice Changer] ex: get_processing_sampling_rate", e)
                     continue
@@ -94,38 +121,38 @@ class ServerDevice:
                         channels=[currentInputChannelNum, currentOutputChannelNum],
                     ):
                         pass
-                    vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
-                    vc.settings.inputSampleRate = currentModelSamplingRate
-                    print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}")
+                    self.settings.serverInputAudioSampleRate = currentModelSamplingRate
+                    self.voiceChangerManager.voiceChanger.settings.inputSampleRate = currentModelSamplingRate
+                    print(f"[Voice Changer] sample rate {self.settings.serverInputAudioSampleRate}")
                 except Exception as e:
                     print(
                         "[Voice Changer] ex: fallback to device default samplerate",
                         e,
                     )
-                    vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
-                    vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate
+                    self.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
+                    self.voiceChangerManager.voiceChanger.settings.inputSampleRate = self.settings.serverInputAudioSampleRate
 
                 # main loop
                 try:
                     with sd.Stream(
                         callback=self.audio_callback,
                         blocksize=block_frame,
-                        samplerate=vc.settings.serverInputAudioSampleRate,
+                        samplerate=self.settings.serverInputAudioSampleRate,
                         dtype="float32",
                         channels=[currentInputChannelNum, currentOutputChannelNum],
                     ):
-                        while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.voiceChangerModel.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize:
+                        while self.settings.serverAudioStated == 1 and currentInputDeviceId == self.settings.serverInputDeviceId and currentOutputDeviceId == self.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChangerManager.voiceChanger.voiceChangerModel.get_processing_sampling_rate() and currentInputChunkNum == self.settings.serverReadChunkSize:
                             time.sleep(2)
                             print(
                                 "[Voice Changer] server audio",
-                                vc.settings.performance,
+                                self.voiceChangerManager.voiceChanger.settings.performance,  # written by audio_callback
                             )
                             print(
                                 "[Voice Changer] info:",
-                                vc.settings.serverAudioStated,
+                                self.settings.serverAudioStated,
                                 currentInputDeviceId,
                                 currentOutputDeviceId,
-                                vc.settings.serverInputAudioSampleRate,
+                                self.settings.serverInputAudioSampleRate,
                                 currentInputChunkNum,
                             )
 
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index 7d500d6d..5b64a342 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -51,22 +51,7 @@ class VoiceChangerSettings:
     crossFadeOverlapSize: int = 4096
 
     recordIO: int = 0  # 0:off, 1:on
-    serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=lambda: [])
-    serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=lambda: [])
 
-    enableServerAudio: int = 0  # 0:off, 1:on
-    serverAudioStated: int = 0  # 0:off, 1:on
-    # serverInputAudioSampleRate: int = 48000
-    # serverOutputAudioSampleRate: int = 48000
-    serverInputAudioSampleRate: int = 44100
-    serverOutputAudioSampleRate: int = 44100
-    # serverInputAudioBufferSize: int = 1024 * 24
-    # serverOutputAudioBufferSize: int = 1024 * 24
-    serverInputDeviceId: int = -1
-    serverOutputDeviceId: int = -1
-    serverReadChunkSize: int = 256
-    serverInputAudioGain: float = 1.0
-    serverOutputAudioGain: float = 1.0
     performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])
 
     # ↓mutableな物だけ列挙
@@ -75,23 +60,12 @@ class VoiceChangerSettings:
             "inputSampleRate",
             "crossFadeOverlapSize",
             "recordIO",
-            "enableServerAudio",
-            "serverAudioStated",
-            "serverInputAudioSampleRate",
-            "serverOutputAudioSampleRate",
-            # "serverInputAudioBufferSize",
-            # "serverOutputAudioBufferSize",
-            "serverInputDeviceId",
-            "serverOutputDeviceId",
-            "serverReadChunkSize",
         ]
     )
     floatData: list[str] = field(
         default_factory=lambda: [
             "crossFadeOffsetRate",
             "crossFadeEndRate",
-            "serverInputAudioGain",
-            "serverOutputAudioGain",
         ]
     )
     strData: list[str] = field(default_factory=lambda: [])
@@ -108,120 +82,6 @@ class VoiceChanger:
 
     # emitTo = None
 
-    # def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
-    #     try:
-    #         indata = indata * self.settings.serverInputAudioGain
-    #         with Timer("all_inference_time") as t:
-    #             unpackedData = librosa.to_mono(indata.T) * 32768.0
-    #             out_wav, times = self.on_request(unpackedData)
-    #             outputChunnels = outdata.shape[1]
-    #             outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
-    #             outdata[:] = outdata * self.settings.serverOutputAudioGain
-    #         all_inference_time = t.secs
-    #         performance = [all_inference_time] + times
-    #         if self.emitTo is not None:
-    #             self.emitTo(performance)
-    #         self.settings.performance = [round(x * 1000) for x in performance]
-    #     except Exception as e:
-    #         print("[Voice Changer] ex:", e)
-
-    # def getServerAudioDevice(self, audioDeviceList: list[ServerAudioDevice], index: int):
-    #     serverAudioDevice = [x for x in audioDeviceList if x.index == index]
-    #     if len(serverAudioDevice) > 0:
-    #         return serverAudioDevice[0]
-    #     else:
-    #         return None
-
-    # def serverLocal(self, _vc):
-    #     vc: VoiceChanger = _vc
-
-    #     currentInputDeviceId = -1
-    #     currentModelSamplingRate = -1
-    #     currentOutputDeviceId = -1
-    #     currentInputChunkNum = -1
-    #     while True:
-    #         if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc.voiceChanger is None:
-    #             vc.settings.inputSampleRate = 48000
-    #             time.sleep(2)
-    #         else:
-    #             sd._terminate()
-    #             sd._initialize()
-
-    #             sd.default.device[0] = vc.settings.serverInputDeviceId
-    #             currentInputDeviceId = vc.settings.serverInputDeviceId
-    #             sd.default.device[1] = vc.settings.serverOutputDeviceId
-    #             currentOutputDeviceId = vc.settings.serverOutputDeviceId
-
-    #             currentInputChannelNum = vc.settings.serverAudioInputDevices
-
-    #             serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId)
-    #             serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId)
-    #             print(serverInputAudioDevice, serverOutputAudioDevice)
-    #             if serverInputAudioDevice is None or serverOutputAudioDevice is None:
-    #                 time.sleep(2)
-    #                 print("serverInputAudioDevice or serverOutputAudioDevice is None")
-    #                 continue
-
-    #             currentInputChannelNum = serverInputAudioDevice.maxInputChannels
-    #             currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels
-
-    #             currentInputChunkNum = vc.settings.serverReadChunkSize
-    #             block_frame = currentInputChunkNum * 128
-
-    #             # sample rate precheck(alsa cannot use 40000?)
-    #             try:
-    #                 currentModelSamplingRate = self.voiceChanger.get_processing_sampling_rate()
-    #             except Exception as e:
-    #                 print("[Voice Changer] ex: get_processing_sampling_rate", e)
-    #                 continue
-    #             try:
-    #                 with sd.Stream(
-    #                     callback=self.audio_callback,
-    #                     blocksize=block_frame,
-    #                     samplerate=currentModelSamplingRate,
-    #                     dtype="float32",
-    #                     channels=[currentInputChannelNum, currentOutputChannelNum],
-    #                 ):
-    #                     pass
-    #                 vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
-    #                 vc.settings.inputSampleRate = currentModelSamplingRate
-    #                 print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}")
-    #             except Exception as e:
-    #                 print(
-    #                     "[Voice Changer] ex: fallback to device default samplerate",
-    #                     e,
-    #                 )
-    #                 vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
-    #                 vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate
-
-    #             # main loop
-    #             try:
-    #                 with sd.Stream(
-    #                     callback=self.audio_callback,
-    #                     blocksize=block_frame,
-    #                     samplerate=vc.settings.serverInputAudioSampleRate,
-    #                     dtype="float32",
-    #                     channels=[currentInputChannelNum, currentOutputChannelNum],
-    #                 ):
-    #                     while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize:
-    #                         time.sleep(2)
-    #                         print(
-    #                             "[Voice Changer] server audio",
-    #                             self.settings.performance,
-    #                         )
-    #                         print(
-    #                             "[Voice Changer] info:",
-    #                             vc.settings.serverAudioStated,
-    #                             currentInputDeviceId,
-    #                             currentOutputDeviceId,
-    #                             vc.settings.serverInputAudioSampleRate,
-    #                             currentInputChunkNum,
-    #                         )
-
-    #             except Exception as e:
-    #                 print("[Voice Changer] ex:", e)
-    #                 time.sleep(2)
-
     def __init__(self, params: VoiceChangerParams, slotIndex: int):
         # 初期化
         self.settings = VoiceChangerSettings()
@@ -238,9 +98,9 @@ class VoiceChanger:
         self.ioRecorder: IORecorder | None = None
         self.sola_buffer: AudioInOut | None = None
 
-        audioinput, audiooutput = list_audio_device()
-        self.settings.serverAudioInputDevices = audioinput
-        self.settings.serverAudioOutputDevices = audiooutput
+        # audioinput, audiooutput = list_audio_device()
+        # self.settings.serverAudioInputDevices = audioinput
+        # self.settings.serverAudioOutputDevices = audiooutput
 
         self.slotIndex = slotIndex
         self.slotInfo = loadSlotInfo(params.model_dir, self.slotIndex)
diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py
index 93eeb7e0..a15b3202 100644
--- a/server/voice_changer/VoiceChangerManager.py
+++ b/server/voice_changer/VoiceChangerManager.py
@@ -3,6 +3,7 @@ import threading
 from data.ModelSample import ModelSamples
 from data.ModelSlot import ModelSlots, loadSlotInfo
 from utils.downloader.SampleDownloader import downloadSample, getSampleInfos
+from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device
 from voice_changer.Local.ServerDevice import ServerDevice
 from voice_changer.RVC.ModelSlotGenerator import setSlotAsRVC
 
@@ -26,6 +27,7 @@ class GPUInfo:
 @dataclass()
 class VoiceChangerManagerSettings:
     slotIndex: int
+
     intData: list[str] = field(default_factory=lambda: ["slotIndex"])
 
 
@@ -43,6 +45,14 @@ class VoiceChangerManager(object):
         self.sampleModels: list[ModelSamples] = getSampleInfos(self.params.sample_mode)
         self.gpus: list[GPUInfo] = self._get_gpuInfos()
 
+        audioinput, audiooutput = list_audio_device()
+        self.serverAudioInputDevices: list[ServerAudioDevice] = audioinput
+        self.serverAudioOutputDevices: list[ServerAudioDevice] = audiooutput
+
+        # Run the ServerDevice polling loop in a background thread; serverLocal() takes no arguments besides self.
+        thread = threading.Thread(target=self.serverDevice.serverLocal)
+        thread.start()
+
     def _get_gpuInfos(self):
         devCount = torch.cuda.device_count()
         gpus = []
@@ -63,8 +73,6 @@ class VoiceChangerManager(object):
             print(f"VoiceChanger Initialized (GPU_NUM:{gpu_num}, mps_enabled:{mps_enabled})")
 
             cls._instance.voiceChanger = VoiceChanger(params, cls._instance.settings.slotIndex)
-            thread = threading.Thread(target=cls._instance.serverDevice.serverLocal, args=(cls._instance.voiceChanger,))
-            thread.start()
             cls._instance.voiceChanger.prepareModel()
         return cls._instance
 
@@ -94,6 +102,8 @@ class VoiceChangerManager(object):
         data["slotInfos"] = slotInfos
         data["gpus"] = self.gpus
         data["sampleModels"] = self.sampleModels
+        data["serverAudioInputDevices"] = self.serverAudioInputDevices
+        data["serverAudioOutputDevices"] = self.serverAudioOutputDevices
 
         data["status"] = "OK"
         if hasattr(self, "voiceChanger"):