diff --git a/README.md b/README.md index 8184d915..8cb18419 100644 --- a/README.md +++ b/README.md @@ -67,6 +67,7 @@ Windows 版と Mac 版を提供しています。 - so-vits-svc 4.0/so-vits-svc 4.0v2、RVC(Retrieval-based-Voice-Conversion)の動作には hubert のモデルが必要になります。[このリポジトリ](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main)から`hubert_base.pt`をダウンロードして、バッチファイルがあるフォルダに格納してください。 - DDSP-SVC の動作には、hubert-soft と enhancer のモデルが必要です。hubert-soft は[このリンク](https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt)からダウンロードして、バッチファイルがあるフォルダに格納してください。enhancer は[このサイト](https://github.com/openvpi/vocoders/releases/tag/nsf-hifigan-v1)から`nsf_hifigan_20221211.zip`ダウンロードして下さい。解凍すると出てくる`nsf_hifigan`というフォルダをバッチファイルがあるフォルダに格納してください。 +- DDSP-SVC の encoder は hubert-soft のみ対応です。 | Version | OS | フレームワーク | link | サポート VC | サイズ | | --------- | --- | --------------------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------ | diff --git a/server/Exceptions.py b/server/Exceptions.py new file mode 100644 index 00000000..6b89d065 --- /dev/null +++ b/server/Exceptions.py @@ -0,0 +1,7 @@ + +class NoModeLoadedException(Exception): + def __init__(self, framework): + self.framework = framework + + def __str__(self): + return repr(f"No model for {self.framework} loaded. 
Please confirm the model uploaded.") diff --git a/server/voice_changer/DDSP_SVC/DDSP_SVC.py b/server/voice_changer/DDSP_SVC/DDSP_SVC.py index 14db177d..4d196aa1 100644 --- a/server/voice_changer/DDSP_SVC/DDSP_SVC.py +++ b/server/voice_changer/DDSP_SVC/DDSP_SVC.py @@ -20,6 +20,9 @@ import pyworld as pw import ddsp.vocoder as vo from ddsp.core import upsample from enhancer import Enhancer + +from Exceptions import NoModeLoadedException + providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"] @@ -224,7 +227,7 @@ class DDSP_SVC: def _onnx_inference(self, data): if hasattr(self, "onnx_session") == False or self.onnx_session == None: print("[Voice Changer] No onnx session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("ONNX") seg_units = data[0] # f0 = data[1] @@ -258,7 +261,7 @@ class DDSP_SVC: if hasattr(self, "model") == False or self.model == None: print("[Voice Changer] No pyTorch session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("pytorch") c = data[0].to(self.useDevice()) f0 = data[1].to(self.useDevice()) diff --git a/server/voice_changer/MMVCv13/MMVCv13.py b/server/voice_changer/MMVCv13/MMVCv13.py index 7193772a..8794f54b 100644 --- a/server/voice_changer/MMVCv13/MMVCv13.py +++ b/server/voice_changer/MMVCv13/MMVCv13.py @@ -22,6 +22,8 @@ from symbols import symbols from models import SynthesizerTrn from voice_changer.MMVCv13.TrainerFunctions import TextAudioSpeakerCollate, spectrogram_torch, load_checkpoint, get_hparams_from_file +from Exceptions import NoModeLoadedException + providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"] @@ -119,6 +121,8 @@ class MMVCv13: return data def get_processing_sampling_rate(self): + if hasattr(self, "hps") == False: + raise NoModeLoadedException("config") return self.hps.data.sampling_rate def _get_spec(self, audio: any): @@ -158,7 +162,7 @@ class MMVCv13: def 
_onnx_inference(self, data): if hasattr(self, "onnx_session") == False or self.onnx_session == None: print("[Voice Changer] No ONNX session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("ONNX") x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x for x in data] sid_tgt1 = torch.LongTensor([self.settings.dstId]) @@ -176,7 +180,7 @@ class MMVCv13: def _pyTorch_inference(self, data): if hasattr(self, "net_g") == False or self.net_g == None: print("[Voice Changer] No pyTorch session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("pytorch") if self.settings.gpu < 0 or self.gpu_num == 0: dev = torch.device("cpu") diff --git a/server/voice_changer/MMVCv15/MMVCv15.py b/server/voice_changer/MMVCv15/MMVCv15.py index 57cfa38e..2081be45 100644 --- a/server/voice_changer/MMVCv15/MMVCv15.py +++ b/server/voice_changer/MMVCv15/MMVCv15.py @@ -20,6 +20,8 @@ import pyworld as pw from models import SynthesizerTrn from voice_changer.MMVCv15.client_modules import convert_continuos_f0, spectrogram_torch, get_hparams_from_file, load_checkpoint +from Exceptions import NoModeLoadedException + providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"] @@ -138,6 +140,8 @@ class MMVCv15: return data def get_processing_sampling_rate(self): + if hasattr(self, "hps") == False: + raise NoModeLoadedException("config") return self.hps.data.sampling_rate def _get_f0(self, detector: str, newData: any): @@ -191,7 +195,7 @@ class MMVCv15: def _onnx_inference(self, data): if self.settings.onnxModelFile == "": print("[Voice Changer] No ONNX session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("ONNX") spec, f0, sid_src = data spec = spec.unsqueeze(0) @@ -217,7 +221,7 @@ class MMVCv15: def _pyTorch_inference(self, data): if self.settings.pyTorchModelFile == "": print("[Voice Changer] No pyTorch session.") - return np.zeros(1).astype(np.int16) + raise 
NoModeLoadedException("pytorch") if self.settings.gpu < 0 or self.gpu_num == 0: dev = torch.device("cpu") diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index 88fd7639..5819cb2a 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -2,7 +2,7 @@ import sys import os import resampy from voice_changer.RVC.ModelWrapper import ModelWrapper - +from Exceptions import NoModeLoadedException # avoiding parse arg error in RVC sys.argv = ["MMVCServerSIO.py"] @@ -198,7 +198,7 @@ class RVC: def _onnx_inference(self, data): if hasattr(self, "onnx_session") == False or self.onnx_session == None: print("[Voice Changer] No onnx session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("ONNX") if self.settings.gpu < 0 or self.gpu_num == 0: dev = torch.device("cpu") @@ -239,7 +239,7 @@ class RVC: def _pyTorch_inference(self, data): if hasattr(self, "net_g") == False or self.net_g == None: print("[Voice Changer] No pyTorch session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("pytorch") if self.settings.gpu < 0 or self.gpu_num == 0: dev = torch.device("cpu") diff --git a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py index 0aa907d4..8d274cdf 100644 --- a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py +++ b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py @@ -26,6 +26,10 @@ import cluster import utils from fairseq import checkpoint_utils import librosa + +from Exceptions import NoModeLoadedException + + providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"] @@ -185,6 +189,8 @@ class SoVitsSvc40: return data def get_processing_sampling_rate(self): + if hasattr(self, "hps") == False: + raise NoModeLoadedException("config") return self.hps.data.sampling_rate def get_unit_f0(self, audio_buffer, tran): @@ -278,7 +284,7 @@ class SoVitsSvc40: def _onnx_inference(self, data): 
if hasattr(self, "onnx_session") == False or self.onnx_session == None: print("[Voice Changer] No onnx session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("ONNX") convertSize = data[3] vol = data[4] @@ -309,7 +315,7 @@ class SoVitsSvc40: def _pyTorch_inference(self, data): if hasattr(self, "net_g") == False or self.net_g == None: print("[Voice Changer] No pyTorch session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("pytorch") if self.settings.gpu < 0 or self.gpu_num == 0: dev = torch.device("cpu") diff --git a/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py index b7c40454..0120bce2 100644 --- a/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py +++ b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py @@ -23,6 +23,9 @@ import cluster import utils from fairseq import checkpoint_utils import librosa + +from Exceptions import NoModeLoadedException + providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"] @@ -161,6 +164,8 @@ class SoVitsSvc40v2: return data def get_processing_sampling_rate(self): + if hasattr(self, "hps") == False: + raise NoModeLoadedException("config") return self.hps.data.sampling_rate def get_unit_f0(self, audio_buffer, tran): @@ -240,7 +245,7 @@ class SoVitsSvc40v2: def _onnx_inference(self, data): if hasattr(self, "onnx_session") == False or self.onnx_session == None: print("[Voice Changer] No onnx session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("ONNX") convertSize = data[3] vol = data[4] @@ -272,7 +277,7 @@ class SoVitsSvc40v2: def _pyTorch_inference(self, data): if hasattr(self, "net_g") == False or self.net_g == None: print("[Voice Changer] No pyTorch session.") - return np.zeros(1).astype(np.int16) + raise NoModeLoadedException("pytorch") if self.settings.gpu < 0 or self.gpu_num == 0: dev = torch.device("cpu") diff --git 
a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py index 988610d1..516dd3bf 100755 --- a/server/voice_changer/VoiceChanger.py +++ b/server/voice_changer/VoiceChanger.py @@ -14,7 +14,7 @@ from voice_changer.IORecorder import IORecorder from voice_changer.utils.Timer import Timer from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut import time - +from Exceptions import NoModeLoadedException providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"] @@ -211,27 +211,27 @@ class VoiceChanger(): return self.on_request_sola(receivedData) def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]: - processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate() + try: + processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate() - # 前処理 - with Timer("pre-process") as t: - if self.settings.inputSampleRate != processing_sampling_rate: - newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate)) - else: - newData = receivedData + # 前処理 + with Timer("pre-process") as t: + if self.settings.inputSampleRate != processing_sampling_rate: + newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate)) + else: + newData = receivedData - sola_search_frame = int(0.012 * processing_sampling_rate) - # sola_search_frame = 0 - block_frame = newData.shape[0] - crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame) - self._generate_strength(crossfade_frame) + sola_search_frame = int(0.012 * processing_sampling_rate) + # sola_search_frame = 0 + block_frame = newData.shape[0] + crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame) + self._generate_strength(crossfade_frame) - data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame) - 
preprocess_time = t.secs + data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame) + preprocess_time = t.secs - # 変換処理 - with Timer("main-process") as t: - try: + # 変換処理 + with Timer("main-process") as t: # Inference audio = self.voiceChanger.inference(data) @@ -258,38 +258,41 @@ class VoiceChanger(): else: self.sola_buffer = audio[- crossfade_frame:] * self.np_prev_strength # self.sola_buffer = audio[- crossfade_frame:] + mainprocess_time = t.secs - except Exception as e: - print("VC PROCESSING!!!! EXCEPTION!!!", e) - print(traceback.format_exc()) - return np.zeros(1).astype(np.int16), [0, 0, 0] - mainprocess_time = t.secs + # 後処理 + with Timer("post-process") as t: + result = result.astype(np.int16) + if self.settings.inputSampleRate != processing_sampling_rate: + outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16)) + else: + outputData = result - # 後処理 - with Timer("post-process") as t: - result = result.astype(np.int16) - if self.settings.inputSampleRate != processing_sampling_rate: - outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16)) - else: - outputData = result + print_convert_processing( + f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz") - print_convert_processing( - f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz") + if self.settings.recordIO == 1: + self.ioRecorder.writeInput(receivedData) + self.ioRecorder.writeOutput(outputData.tobytes()) - if self.settings.recordIO == 1: - self.ioRecorder.writeInput(receivedData) - self.ioRecorder.writeOutput(outputData.tobytes()) + # if receivedData.shape[0] != outputData.shape[0]: + # print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}") + # outputData 
= pad_array(outputData, receivedData.shape[0]) + # # print_convert_processing( + # # f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz") + postprocess_time = t.secs - # if receivedData.shape[0] != outputData.shape[0]: - # print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}") - # outputData = pad_array(outputData, receivedData.shape[0]) - # # print_convert_processing( - # # f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz") - postprocess_time = t.secs + print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}") + perf = [preprocess_time, mainprocess_time, postprocess_time] + return outputData, perf - print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}") - perf = [preprocess_time, mainprocess_time, postprocess_time] - return outputData, perf + except NoModeLoadedException as e: + print("[Voice Changer] [Exception]", e) + return np.zeros(1).astype(np.int16), [0, 0, 0] + except Exception as e: + print("VC PROCESSING!!!! EXCEPTION!!!", e) + print(traceback.format_exc()) + return np.zeros(1).astype(np.int16), [0, 0, 0] def export2onnx(self): return self.voiceChanger.export2onnx()