improve error handling 1

2025-01-23 21:45:00 +03:00 · 2023-04-17 09:45:12 +09:00 · 2023-04-17 09:45:12 +09:00 · e4ac5e74db
commit e4ac5e74db
parent 390a39fa64
9 changed files with 90 additions and 57 deletions
--- a/README.md
+++ b/README.md
@ -67,6 +67,7 @@ Windows 版と Mac 版を提供しています。
 - so-vits-svc 4.0/so-vits-svc 4.0v2、RVC(Retrieval-based-Voice-Conversion)の動作には hubert のモデルが必要になります。[このリポジトリ](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main)から`hubert_base.pt`をダウンロードして、バッチファイルがあるフォルダに格納してください。

 - DDSP-SVC の動作には、hubert-soft と enhancer のモデルが必要です。hubert-soft は[このリンク](https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt)からダウンロードして、バッチファイルがあるフォルダに格納してください。enhancer は[このサイト](https://github.com/openvpi/vocoders/releases/tag/nsf-hifigan-v1)から`nsf_hifigan_20221211.zip`ダウンロードして下さい。解凍すると出てくる`nsf_hifigan`というフォルダをバッチファイルがあるフォルダに格納してください。
+- DDSP-SVC の encoder は hubert-soft のみ対応です。

 | Version   | OS  | フレームワーク                    | link                                                                                     | サポート VC                                                         | サイズ |
 | --------- | --- | --------------------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------ |
--- a/server/Exceptions.py
+++ b/server/Exceptions.py
@ -0,0 +1,7 @@
+
+class NoModeLoadedException(Exception):  # Raised when processing is requested before the required model/config has been loaded.
+    def __init__(self, framework):
+        self.framework = framework  # Label of what is missing; callers in this commit pass "ONNX", "pytorch" or "config".
+
+    def __str__(self):
+        return f"No model for {self.framework} loaded. Please confirm the model uploaded."  # Plain text: wrapping in repr() added stray quotes to the printed message.
--- a/server/voice_changer/DDSP_SVC/DDSP_SVC.py
+++ b/server/voice_changer/DDSP_SVC/DDSP_SVC.py
@ -20,6 +20,9 @@ import pyworld as pw
 import ddsp.vocoder as vo
 from ddsp.core import upsample
 from enhancer import Enhancer
+
+from Exceptions import NoModeLoadedException
+
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]


@ -224,7 +227,7 @@ class DDSP_SVC:
    def _onnx_inference(self, data):
        if hasattr(self, "onnx_session") == False or self.onnx_session == None:
            print("[Voice Changer] No onnx session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("ONNX")

        seg_units = data[0]
        # f0 = data[1]
@ -258,7 +261,7 @@ class DDSP_SVC:

        if hasattr(self, "model") == False or self.model == None:
            print("[Voice Changer] No pyTorch session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("pytorch")

        c = data[0].to(self.useDevice())
        f0 = data[1].to(self.useDevice())
--- a/server/voice_changer/MMVCv13/MMVCv13.py
+++ b/server/voice_changer/MMVCv13/MMVCv13.py
@ -22,6 +22,8 @@ from symbols import symbols
 from models import SynthesizerTrn
 from voice_changer.MMVCv13.TrainerFunctions import TextAudioSpeakerCollate, spectrogram_torch, load_checkpoint, get_hparams_from_file

+from Exceptions import NoModeLoadedException
+
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]


@ -119,6 +121,8 @@ class MMVCv13:
        return data

    def get_processing_sampling_rate(self):
+        if hasattr(self, "hps") == False:
+            raise NoModeLoadedException("config")
        return self.hps.data.sampling_rate

    def _get_spec(self, audio: any):
@ -158,7 +162,7 @@ class MMVCv13:
    def _onnx_inference(self, data):
        if hasattr(self, "onnx_session") == False or self.onnx_session == None:
            print("[Voice Changer] No ONNX session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("ONNX")

        x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x for x in data]
        sid_tgt1 = torch.LongTensor([self.settings.dstId])
@ -176,7 +180,7 @@ class MMVCv13:
    def _pyTorch_inference(self, data):
        if hasattr(self, "net_g") == False or self.net_g == None:
            print("[Voice Changer] No pyTorch session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("pytorch")

        if self.settings.gpu < 0 or self.gpu_num == 0:
            dev = torch.device("cpu")
--- a/server/voice_changer/MMVCv15/MMVCv15.py
+++ b/server/voice_changer/MMVCv15/MMVCv15.py
@ -20,6 +20,8 @@ import pyworld as pw
 from models import SynthesizerTrn
 from voice_changer.MMVCv15.client_modules import convert_continuos_f0, spectrogram_torch, get_hparams_from_file, load_checkpoint

+from Exceptions import NoModeLoadedException
+
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]


@ -138,6 +140,8 @@ class MMVCv15:
        return data

    def get_processing_sampling_rate(self):
+        if hasattr(self, "hps") == False:
+            raise NoModeLoadedException("config")
        return self.hps.data.sampling_rate

    def _get_f0(self, detector: str, newData: any):
@ -191,7 +195,7 @@ class MMVCv15:
    def _onnx_inference(self, data):
        if self.settings.onnxModelFile == "":
            print("[Voice Changer] No ONNX session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("ONNX")

        spec, f0, sid_src = data
        spec = spec.unsqueeze(0)
@ -217,7 +221,7 @@ class MMVCv15:
    def _pyTorch_inference(self, data):
        if self.settings.pyTorchModelFile == "":
            print("[Voice Changer] No pyTorch session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("pytorch")

        if self.settings.gpu < 0 or self.gpu_num == 0:
            dev = torch.device("cpu")
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@ -2,7 +2,7 @@ import sys
 import os
 import resampy
 from voice_changer.RVC.ModelWrapper import ModelWrapper
-
+from Exceptions import NoModeLoadedException

 # avoiding parse arg error in RVC
 sys.argv = ["MMVCServerSIO.py"]
@ -198,7 +198,7 @@ class RVC:
    def _onnx_inference(self, data):
        if hasattr(self, "onnx_session") == False or self.onnx_session == None:
            print("[Voice Changer] No onnx session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("ONNX")

        if self.settings.gpu < 0 or self.gpu_num == 0:
            dev = torch.device("cpu")
@ -239,7 +239,7 @@ class RVC:
    def _pyTorch_inference(self, data):
        if hasattr(self, "net_g") == False or self.net_g == None:
            print("[Voice Changer] No pyTorch session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("pytorch")

        if self.settings.gpu < 0 or self.gpu_num == 0:
            dev = torch.device("cpu")
--- a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py
+++ b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py
@ -26,6 +26,10 @@ import cluster
 import utils
 from fairseq import checkpoint_utils
 import librosa
+
+from Exceptions import NoModeLoadedException
+
+
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]


@ -185,6 +189,8 @@ class SoVitsSvc40:
        return data

    def get_processing_sampling_rate(self):
+        if hasattr(self, "hps") == False:
+            raise NoModeLoadedException("config")
        return self.hps.data.sampling_rate

    def get_unit_f0(self, audio_buffer, tran):
@ -278,7 +284,7 @@ class SoVitsSvc40:
    def _onnx_inference(self, data):
        if hasattr(self, "onnx_session") == False or self.onnx_session == None:
            print("[Voice Changer] No onnx session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("ONNX")

        convertSize = data[3]
        vol = data[4]
@ -309,7 +315,7 @@ class SoVitsSvc40:
    def _pyTorch_inference(self, data):
        if hasattr(self, "net_g") == False or self.net_g == None:
            print("[Voice Changer] No pyTorch session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("pytorch")

        if self.settings.gpu < 0 or self.gpu_num == 0:
            dev = torch.device("cpu")
--- a/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
+++ b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
@ -23,6 +23,9 @@ import cluster
 import utils
 from fairseq import checkpoint_utils
 import librosa
+
+from Exceptions import NoModeLoadedException
+
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]


@ -161,6 +164,8 @@ class SoVitsSvc40v2:
        return data

    def get_processing_sampling_rate(self):
+        if hasattr(self, "hps") == False:
+            raise NoModeLoadedException("config")
        return self.hps.data.sampling_rate

    def get_unit_f0(self, audio_buffer, tran):
@ -240,7 +245,7 @@ class SoVitsSvc40v2:
    def _onnx_inference(self, data):
        if hasattr(self, "onnx_session") == False or self.onnx_session == None:
            print("[Voice Changer] No onnx session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("ONNX")

        convertSize = data[3]
        vol = data[4]
@ -272,7 +277,7 @@ class SoVitsSvc40v2:
    def _pyTorch_inference(self, data):
        if hasattr(self, "net_g") == False or self.net_g == None:
            print("[Voice Changer] No pyTorch session.")
-            return np.zeros(1).astype(np.int16)
+            raise NoModeLoadedException("pytorch")

        if self.settings.gpu < 0 or self.gpu_num == 0:
            dev = torch.device("cpu")
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@ -14,7 +14,7 @@ from voice_changer.IORecorder import IORecorder
 from voice_changer.utils.Timer import Timer
 from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut
 import time
-
+from Exceptions import NoModeLoadedException

 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]

@ -211,27 +211,27 @@ class VoiceChanger():
        return self.on_request_sola(receivedData)

    def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
-        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
+        try:
+            processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()

-        # 前処理
-        with Timer("pre-process") as t:
-            if self.settings.inputSampleRate != processing_sampling_rate:
-                newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate))
-            else:
-                newData = receivedData
+            # 前処理
+            with Timer("pre-process") as t:
+                if self.settings.inputSampleRate != processing_sampling_rate:
+                    newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate))
+                else:
+                    newData = receivedData

-            sola_search_frame = int(0.012 * processing_sampling_rate)
-            # sola_search_frame = 0
-            block_frame = newData.shape[0]
-            crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
-            self._generate_strength(crossfade_frame)
+                sola_search_frame = int(0.012 * processing_sampling_rate)
+                # sola_search_frame = 0
+                block_frame = newData.shape[0]
+                crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
+                self._generate_strength(crossfade_frame)

-            data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
-        preprocess_time = t.secs
+                data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
+            preprocess_time = t.secs

-        # 変換処理
-        with Timer("main-process") as t:
-            try:
+            # 変換処理
+            with Timer("main-process") as t:
                # Inference
                audio = self.voiceChanger.inference(data)

@ -258,38 +258,41 @@ class VoiceChanger():
                else:
                    self.sola_buffer = audio[- crossfade_frame:] * self.np_prev_strength
                    # self.sola_buffer = audio[- crossfade_frame:]
+            mainprocess_time = t.secs

-            except Exception as e:
-                print("VC PROCESSING!!!! EXCEPTION!!!", e)
-                print(traceback.format_exc())
-                return np.zeros(1).astype(np.int16), [0, 0, 0]
-        mainprocess_time = t.secs
+            # 後処理
+            with Timer("post-process") as t:
+                result = result.astype(np.int16)
+                if self.settings.inputSampleRate != processing_sampling_rate:
+                    outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16))
+                else:
+                    outputData = result

-        # 後処理
-        with Timer("post-process") as t:
-            result = result.astype(np.int16)
-            if self.settings.inputSampleRate != processing_sampling_rate:
-                outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16))
-            else:
-                outputData = result
+                print_convert_processing(
+                    f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

-            print_convert_processing(
-                f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
+                if self.settings.recordIO == 1:
+                    self.ioRecorder.writeInput(receivedData)
+                    self.ioRecorder.writeOutput(outputData.tobytes())

-            if self.settings.recordIO == 1:
-                self.ioRecorder.writeInput(receivedData)
-                self.ioRecorder.writeOutput(outputData.tobytes())
+                # if receivedData.shape[0] != outputData.shape[0]:
+                #     print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}")
+                #     outputData = pad_array(outputData, receivedData.shape[0])
+                #     # print_convert_processing(
+                #     #     f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
+            postprocess_time = t.secs

-            # if receivedData.shape[0] != outputData.shape[0]:
-            #     print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}")
-            #     outputData = pad_array(outputData, receivedData.shape[0])
-            #     # print_convert_processing(
-            #     #     f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
-        postprocess_time = t.secs
+            print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
+            perf = [preprocess_time, mainprocess_time, postprocess_time]
+            return outputData, perf

-        print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
-        perf = [preprocess_time, mainprocess_time, postprocess_time]
-        return outputData, perf
+        except NoModeLoadedException as e:
+            print("[Voice Changer] [Exception]", e)
+            return np.zeros(1).astype(np.int16), [0, 0, 0]
+        except Exception as e:
+            print("VC PROCESSING!!!! EXCEPTION!!!", e)
+            print(traceback.format_exc())
+            return np.zeros(1).astype(np.int16), [0, 0, 0]

    def export2onnx(self):
        return self.voiceChanger.export2onnx()