WIP:refactoring

wataru 2023-06-16 01:49:49 +09:00
parent be42bb682d
commit 9806ce2f3d
4 changed files with 84 additions and 183 deletions

View File

@@ -15,7 +15,11 @@ class ServerAudioDevice:
 def list_audio_device():
-    audioDeviceList = sd.query_devices()
+    try:
+        audioDeviceList = sd.query_devices()
+    except Exception as e:
+        print("[Voice Changer] ex: query_devices", e)
+        return [], []
     inputAudioDeviceList = [d for d in audioDeviceList if d["max_input_channels"] > 0]
     outputAudioDeviceList = [d for d in audioDeviceList if d["max_output_channels"] > 0]

View File

@@ -4,34 +4,66 @@ import numpy as np
 import librosa
 import sounddevice as sd
+from dataclasses import dataclass, asdict, field
 from voice_changer.Local.AudioDeviceList import ServerAudioDevice
-from voice_changer.VoiceChanger import VoiceChanger
+from voice_changer.VoiceChangerManager import VoiceChangerManager
 from voice_changer.utils.Timer import Timer


+@dataclass()
+class ServerDeviceSettings:
+    enableServerAudio: int = 0  # 0:off, 1:on
+    serverAudioStated: int = 0  # 0:off, 1:on
+    serverInputAudioSampleRate: int = 44100
+    serverOutputAudioSampleRate: int = 44100
+    serverInputDeviceId: int = -1
+    serverOutputDeviceId: int = -1
+    serverReadChunkSize: int = 256
+    serverInputAudioGain: float = 1.0
+    serverOutputAudioGain: float = 1.0
+
+
+EditableServerDeviceSettings = {
+    "intData": [
+        "enableServerAudio",
+        "serverAudioStated",
+        "serverInputAudioSampleRate",
+        "serverOutputAudioSampleRate",
+        "serverInputDeviceId",
+        "serverOutputDeviceId",
+        "serverReadChunkSize",
+    ],
+    "floatData": [
+        "serverInputAudioGain",
+        "serverOutputAudioGain",
+    ],
+}
+
+
 class ServerDevice:
-    def __init__(self):
-        self.voiceChanger: VoiceChanger | None = None
-        pass
+    def __init__(self, voiceChangerManager: VoiceChangerManager):
+        self.settings = ServerDeviceSettings()
+        self.voiceChangerManager: VoiceChangerManager = voiceChangerManager

     def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
-        if self.voiceChanger is None:
+        if self.voiceChangerManager.voiceChanger is None:
             print("[Voice Changer] voiceChanger is None")
             return

         try:
-            indata = indata * self.voiceChanger.settings.serverInputAudioGain
+            indata = indata * self.settings.serverInputAudioGain
             with Timer("all_inference_time") as t:
                 unpackedData = librosa.to_mono(indata.T) * 32768.0
-                out_wav, times = self.voiceChanger.on_request(unpackedData)
+                out_wav, times = self.voiceChangerManager.voiceChanger.on_request(unpackedData)
                 outputChunnels = outdata.shape[1]
                 outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
-                outdata[:] = outdata * self.voiceChanger.settings.serverOutputAudioGain
+                outdata[:] = outdata * self.settings.serverOutputAudioGain

             all_inference_time = t.secs
             performance = [all_inference_time] + times
-            if self.voiceChanger.emitTo is not None:
-                self.voiceChanger.emitTo(performance)
-            self.voiceChanger.settings.performance = [round(x * 1000) for x in performance]
+            if self.voiceChangerManager.voiceChanger.emitTo is not None:
+                self.voiceChangerManager.voiceChanger.emitTo(performance)
+            self.voiceChangerManager.voiceChanger.settings.performance = [round(x * 1000) for x in performance]
         except Exception as e:
             print("[Voice Changer] ex:", e)
@@ -42,31 +74,26 @@ class ServerDevice:
         else:
             return None

-    def serverLocal(self, _vc: VoiceChanger):
-        self.voiceChanger = _vc
-        vc = self.voiceChanger
+    def serverLocal(self):
         currentInputDeviceId = -1
         currentModelSamplingRate = -1
         currentOutputDeviceId = -1
         currentInputChunkNum = -1
         while True:
-            if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc is None:
-                vc.settings.inputSampleRate = 48000
+            if self.settings.serverAudioStated == 0 or self.settings.serverInputDeviceId == -1 or self.voiceChangerManager is None:
+                self.voiceChangerManager.voiceChanger.settings.inputSampleRate = 48000
                 time.sleep(2)
             else:
                 sd._terminate()
                 sd._initialize()
-                sd.default.device[0] = vc.settings.serverInputDeviceId
-                currentInputDeviceId = vc.settings.serverInputDeviceId
-                sd.default.device[1] = vc.settings.serverOutputDeviceId
-                currentOutputDeviceId = vc.settings.serverOutputDeviceId
-                currentInputChannelNum = vc.settings.serverAudioInputDevices
-                serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId)
-                serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId)
+                sd.default.device[0] = self.settings.serverInputDeviceId
+                currentInputDeviceId = self.settings.serverInputDeviceId
+                sd.default.device[1] = self.settings.serverOutputDeviceId
+                currentOutputDeviceId = self.settings.serverOutputDeviceId
+                serverInputAudioDevice = self.getServerAudioDevice(self.voiceChangerManager.serverAudioInputDevices, currentInputDeviceId)
+                serverOutputAudioDevice = self.getServerAudioDevice(self.voiceChangerManager.serverAudioOutputDevices, currentOutputDeviceId)
                 print(serverInputAudioDevice, serverOutputAudioDevice)
                 if serverInputAudioDevice is None or serverOutputAudioDevice is None:
                     time.sleep(2)
@@ -76,12 +103,12 @@ class ServerDevice:
                 currentInputChannelNum = serverInputAudioDevice.maxInputChannels
                 currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels

-                currentInputChunkNum = vc.settings.serverReadChunkSize
+                currentInputChunkNum = self.settings.serverReadChunkSize
                 block_frame = currentInputChunkNum * 128

                 # sample rate precheck(alsa cannot use 40000?)
                 try:
-                    currentModelSamplingRate = self.voiceChanger.voiceChangerModel.get_processing_sampling_rate()
+                    currentModelSamplingRate = self.voiceChangerManager.voiceChanger.voiceChangerModel.get_processing_sampling_rate()
                 except Exception as e:
                     print("[Voice Changer] ex: get_processing_sampling_rate", e)
                     continue
@@ -94,38 +121,38 @@ class ServerDevice:
                         channels=[currentInputChannelNum, currentOutputChannelNum],
                     ):
                         pass
-                    vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
-                    vc.settings.inputSampleRate = currentModelSamplingRate
-                    print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}")
+                    self.settings.serverInputAudioSampleRate = currentModelSamplingRate
+                    self.voiceChangerManager.voiceChanger.settings.inputSampleRate = currentModelSamplingRate
+                    print(f"[Voice Changer] sample rate {self.settings.serverInputAudioSampleRate}")
                 except Exception as e:
                     print(
                         "[Voice Changer] ex: fallback to device default samplerate",
                         e,
                     )
-                    vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
-                    vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate
+                    self.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
+                    self.voiceChangerManager.voiceChanger.settings.inputSampleRate = self.settings.serverInputAudioSampleRate

                 # main loop
                 try:
                     with sd.Stream(
                         callback=self.audio_callback,
                         blocksize=block_frame,
-                        samplerate=vc.settings.serverInputAudioSampleRate,
+                        samplerate=self.settings.serverInputAudioSampleRate,
                         dtype="float32",
                         channels=[currentInputChannelNum, currentOutputChannelNum],
                     ):
-                        while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.voiceChangerModel.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize:
+                        while self.settings.serverAudioStated == 1 and currentInputDeviceId == self.settings.serverInputDeviceId and currentOutputDeviceId == self.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChangerManager.voiceChanger.voiceChangerModel.get_processing_sampling_rate() and currentInputChunkNum == self.settings.serverReadChunkSize:
                             time.sleep(2)
                             print(
                                 "[Voice Changer] server audio",
-                                vc.settings.performance,
+                                self.voiceChangerManager.settings.performance,
                             )
                             print(
                                 "[Voice Changer] info:",
-                                vc.settings.serverAudioStated,
+                                self.settings.serverAudioStated,
                                 currentInputDeviceId,
                                 currentOutputDeviceId,
-                                vc.settings.serverInputAudioSampleRate,
+                                self.settings.serverInputAudioSampleRate,
                                 currentInputChunkNum,
                             )

View File

@@ -51,22 +51,7 @@ class VoiceChangerSettings:
     crossFadeOverlapSize: int = 4096
     recordIO: int = 0  # 0:off, 1:on

-    serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=lambda: [])
-    serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=lambda: [])
-    enableServerAudio: int = 0  # 0:off, 1:on
-    serverAudioStated: int = 0  # 0:off, 1:on
-    # serverInputAudioSampleRate: int = 48000
-    # serverOutputAudioSampleRate: int = 48000
-    serverInputAudioSampleRate: int = 44100
-    serverOutputAudioSampleRate: int = 44100
-    # serverInputAudioBufferSize: int = 1024 * 24
-    # serverOutputAudioBufferSize: int = 1024 * 24
-    serverInputDeviceId: int = -1
-    serverOutputDeviceId: int = -1
-    serverReadChunkSize: int = 256
-    serverInputAudioGain: float = 1.0
-    serverOutputAudioGain: float = 1.0
     performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])

     # ↓ enumerate only the mutable fields
@@ -75,23 +60,12 @@ class VoiceChangerSettings:
             "inputSampleRate",
             "crossFadeOverlapSize",
             "recordIO",
-            "enableServerAudio",
-            "serverAudioStated",
-            "serverInputAudioSampleRate",
-            "serverOutputAudioSampleRate",
-            # "serverInputAudioBufferSize",
-            # "serverOutputAudioBufferSize",
-            "serverInputDeviceId",
-            "serverOutputDeviceId",
-            "serverReadChunkSize",
         ]
     )
     floatData: list[str] = field(
         default_factory=lambda: [
             "crossFadeOffsetRate",
             "crossFadeEndRate",
-            "serverInputAudioGain",
-            "serverOutputAudioGain",
         ]
     )
     strData: list[str] = field(default_factory=lambda: [])
@@ -108,120 +82,6 @@ class VoiceChanger:
     # emitTo = None

-    # def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
-    #     try:
-    #         indata = indata * self.settings.serverInputAudioGain
-    #         with Timer("all_inference_time") as t:
-    #             unpackedData = librosa.to_mono(indata.T) * 32768.0
-    #             out_wav, times = self.on_request(unpackedData)
-    #             outputChunnels = outdata.shape[1]
-    #             outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
-    #             outdata[:] = outdata * self.settings.serverOutputAudioGain
-    #         all_inference_time = t.secs
-    #         performance = [all_inference_time] + times
-    #         if self.emitTo is not None:
-    #             self.emitTo(performance)
-    #         self.settings.performance = [round(x * 1000) for x in performance]
-    #     except Exception as e:
-    #         print("[Voice Changer] ex:", e)
-
-    # def getServerAudioDevice(self, audioDeviceList: list[ServerAudioDevice], index: int):
-    #     serverAudioDevice = [x for x in audioDeviceList if x.index == index]
-    #     if len(serverAudioDevice) > 0:
-    #         return serverAudioDevice[0]
-    #     else:
-    #         return None
-
-    # def serverLocal(self, _vc):
-    #     vc: VoiceChanger = _vc
-    #     currentInputDeviceId = -1
-    #     currentModelSamplingRate = -1
-    #     currentOutputDeviceId = -1
-    #     currentInputChunkNum = -1
-    #     while True:
-    #         if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc.voiceChanger is None:
-    #             vc.settings.inputSampleRate = 48000
-    #             time.sleep(2)
-    #         else:
-    #             sd._terminate()
-    #             sd._initialize()
-    #             sd.default.device[0] = vc.settings.serverInputDeviceId
-    #             currentInputDeviceId = vc.settings.serverInputDeviceId
-    #             sd.default.device[1] = vc.settings.serverOutputDeviceId
-    #             currentOutputDeviceId = vc.settings.serverOutputDeviceId
-    #             currentInputChannelNum = vc.settings.serverAudioInputDevices
-    #             serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId)
-    #             serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId)
-    #             print(serverInputAudioDevice, serverOutputAudioDevice)
-    #             if serverInputAudioDevice is None or serverOutputAudioDevice is None:
-    #                 time.sleep(2)
-    #                 print("serverInputAudioDevice or serverOutputAudioDevice is None")
-    #                 continue
-    #             currentInputChannelNum = serverInputAudioDevice.maxInputChannels
-    #             currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels
-    #             currentInputChunkNum = vc.settings.serverReadChunkSize
-    #             block_frame = currentInputChunkNum * 128
-    #             # sample rate precheck(alsa cannot use 40000?)
-    #             try:
-    #                 currentModelSamplingRate = self.voiceChanger.get_processing_sampling_rate()
-    #             except Exception as e:
-    #                 print("[Voice Changer] ex: get_processing_sampling_rate", e)
-    #                 continue
-    #             try:
-    #                 with sd.Stream(
-    #                     callback=self.audio_callback,
-    #                     blocksize=block_frame,
-    #                     samplerate=currentModelSamplingRate,
-    #                     dtype="float32",
-    #                     channels=[currentInputChannelNum, currentOutputChannelNum],
-    #                 ):
-    #                     pass
-    #                 vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
-    #                 vc.settings.inputSampleRate = currentModelSamplingRate
-    #                 print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}")
-    #             except Exception as e:
-    #                 print(
-    #                     "[Voice Changer] ex: fallback to device default samplerate",
-    #                     e,
-    #                 )
-    #                 vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
-    #                 vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate
-    #             # main loop
-    #             try:
-    #                 with sd.Stream(
-    #                     callback=self.audio_callback,
-    #                     blocksize=block_frame,
-    #                     samplerate=vc.settings.serverInputAudioSampleRate,
-    #                     dtype="float32",
-    #                     channels=[currentInputChannelNum, currentOutputChannelNum],
-    #                 ):
-    #                     while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize:
-    #                         time.sleep(2)
-    #                         print(
-    #                             "[Voice Changer] server audio",
-    #                             self.settings.performance,
-    #                         )
-    #                         print(
-    #                             "[Voice Changer] info:",
-    #                             vc.settings.serverAudioStated,
-    #                             currentInputDeviceId,
-    #                             currentOutputDeviceId,
-    #                             vc.settings.serverInputAudioSampleRate,
-    #                             currentInputChunkNum,
-    #                         )
-    #             except Exception as e:
-    #                 print("[Voice Changer] ex:", e)
-    #                 time.sleep(2)

     def __init__(self, params: VoiceChangerParams, slotIndex: int):
         # Initialization
         self.settings = VoiceChangerSettings()
@@ -238,9 +98,9 @@ class VoiceChanger:
         self.ioRecorder: IORecorder | None = None
         self.sola_buffer: AudioInOut | None = None

-        audioinput, audiooutput = list_audio_device()
-        self.settings.serverAudioInputDevices = audioinput
-        self.settings.serverAudioOutputDevices = audiooutput
+        # audioinput, audiooutput = list_audio_device()
+        # self.settings.serverAudioInputDevices = audioinput
+        # self.settings.serverAudioOutputDevices = audiooutput

         self.slotIndex = slotIndex
         self.slotInfo = loadSlotInfo(params.model_dir, self.slotIndex)

View File

@@ -3,6 +3,7 @@ import threading
 from data.ModelSample import ModelSamples
 from data.ModelSlot import ModelSlots, loadSlotInfo
 from utils.downloader.SampleDownloader import downloadSample, getSampleInfos
+from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device
 from voice_changer.Local.ServerDevice import ServerDevice
 from voice_changer.RVC.ModelSlotGenerator import setSlotAsRVC
@@ -26,6 +27,7 @@ class GPUInfo:
 @dataclass()
 class VoiceChangerManagerSettings:
     slotIndex: int
     intData: list[str] = field(default_factory=lambda: ["slotIndex"])
@@ -43,6 +45,14 @@ class VoiceChangerManager(object):
         self.sampleModels: list[ModelSamples] = getSampleInfos(self.params.sample_mode)
         self.gpus: list[GPUInfo] = self._get_gpuInfos()

+        audioinput, audiooutput = list_audio_device()
+        self.serverAudioInputDevices: list[ServerAudioDevice] = audioinput
+        self.serverAudioOutputDevices: list[ServerAudioDevice] = audiooutput
+
+        # ServerDevice
+        thread = threading.Thread(target=self.serverDevice.serverLocal, args=(self,))
+        thread.start()
+
     def _get_gpuInfos(self):
         devCount = torch.cuda.device_count()
         gpus = []
@@ -63,8 +73,6 @@ class VoiceChangerManager(object):
             print(f"VoiceChanger Initialized (GPU_NUM:{gpu_num}, mps_enabled:{mps_enabled})")
             cls._instance.voiceChanger = VoiceChanger(params, cls._instance.settings.slotIndex)

-            thread = threading.Thread(target=cls._instance.serverDevice.serverLocal, args=(cls._instance.voiceChanger,))
-            thread.start()
             cls._instance.voiceChanger.prepareModel()
             return cls._instance
@@ -94,6 +102,8 @@ class VoiceChangerManager(object):
         data["slotInfos"] = slotInfos
         data["gpus"] = self.gpus
         data["sampleModels"] = self.sampleModels
+        data["serverAudioInputDevices"] = self.serverAudioInputDevices
+        data["serverAudioOutputDevices"] = self.serverAudioOutputDevices
         data["status"] = "OK"

         if hasattr(self, "voiceChanger"):