From d85bbae4784406c6ca31b72687f8b7b57aab6963 Mon Sep 17 00:00:00 2001
From: wataru
Date: Sat, 15 Apr 2023 04:58:56 +0900
Subject: [PATCH] apply sola for all vc forcibly

---
 .../dist/assets/gui_settings/MMVCv13.json     |   4 -
 .../dist/assets/gui_settings/MMVCv15.json     |   4 -
 client/demo/dist/assets/gui_settings/RVC.json |   4 -
 .../assets/gui_settings/so-vits-svc-40.json   |   4 -
 .../assets/gui_settings/so-vits-svc-40v2.json |   4 -
 .../public/assets/gui_settings/MMVCv13.json   |   4 -
 .../public/assets/gui_settings/MMVCv15.json   |   4 -
 .../demo/public/assets/gui_settings/RVC.json  |   4 -
 .../assets/gui_settings/so-vits-svc-40.json   |   4 -
 .../assets/gui_settings/so-vits-svc-40v2.json |   4 -
 server/voice_changer/MMVCv13/MMVCv13.py       |  33 +-----
 server/voice_changer/MMVCv15/MMVCv15.py       |   7 +-
 server/voice_changer/RVC/RVC.py               |  38 +-----
 .../voice_changer/SoVitsSvc40/SoVitsSvc40.py  |   7 +-
 .../SoVitsSvc40v2/SoVitsSvc40v2.py            |   9 +-
 server/voice_changer/VoiceChanger.py          | 110 +-----------------
 16 files changed, 14 insertions(+), 230 deletions(-)

diff --git a/client/demo/dist/assets/gui_settings/MMVCv13.json b/client/demo/dist/assets/gui_settings/MMVCv13.json
index 9781660a..d00b6258 100644
--- a/client/demo/dist/assets/gui_settings/MMVCv13.json
+++ b/client/demo/dist/assets/gui_settings/MMVCv13.json
@@ -145,10 +145,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/dist/assets/gui_settings/MMVCv15.json b/client/demo/dist/assets/gui_settings/MMVCv15.json
index 98e1fa3d..503c30c5 100644
--- a/client/demo/dist/assets/gui_settings/MMVCv15.json
+++ b/client/demo/dist/assets/gui_settings/MMVCv15.json
@@ -147,10 +147,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/dist/assets/gui_settings/RVC.json b/client/demo/dist/assets/gui_settings/RVC.json
index 78015a64..6bb5c6e3 100644
--- a/client/demo/dist/assets/gui_settings/RVC.json
+++ b/client/demo/dist/assets/gui_settings/RVC.json
@@ -166,10 +166,6 @@
             {
                 "name": "rvcQuality",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/dist/assets/gui_settings/so-vits-svc-40.json b/client/demo/dist/assets/gui_settings/so-vits-svc-40.json
index 16807a37..2d77e628 100644
--- a/client/demo/dist/assets/gui_settings/so-vits-svc-40.json
+++ b/client/demo/dist/assets/gui_settings/so-vits-svc-40.json
@@ -157,10 +157,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/dist/assets/gui_settings/so-vits-svc-40v2.json b/client/demo/dist/assets/gui_settings/so-vits-svc-40v2.json
index 9ed03c30..bc2a5bcf 100644
--- a/client/demo/dist/assets/gui_settings/so-vits-svc-40v2.json
+++ b/client/demo/dist/assets/gui_settings/so-vits-svc-40v2.json
@@ -157,10 +157,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/MMVCv13.json b/client/demo/public/assets/gui_settings/MMVCv13.json
index 9781660a..d00b6258 100644
--- a/client/demo/public/assets/gui_settings/MMVCv13.json
+++ b/client/demo/public/assets/gui_settings/MMVCv13.json
@@ -145,10 +145,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/MMVCv15.json b/client/demo/public/assets/gui_settings/MMVCv15.json
index 98e1fa3d..503c30c5 100644
--- a/client/demo/public/assets/gui_settings/MMVCv15.json
+++ b/client/demo/public/assets/gui_settings/MMVCv15.json
@@ -147,10 +147,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/RVC.json b/client/demo/public/assets/gui_settings/RVC.json
index 78015a64..6bb5c6e3 100644
--- a/client/demo/public/assets/gui_settings/RVC.json
+++ b/client/demo/public/assets/gui_settings/RVC.json
@@ -166,10 +166,6 @@
             {
                 "name": "rvcQuality",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/so-vits-svc-40.json b/client/demo/public/assets/gui_settings/so-vits-svc-40.json
index 16807a37..2d77e628 100644
--- a/client/demo/public/assets/gui_settings/so-vits-svc-40.json
+++ b/client/demo/public/assets/gui_settings/so-vits-svc-40.json
@@ -157,10 +157,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/so-vits-svc-40v2.json b/client/demo/public/assets/gui_settings/so-vits-svc-40v2.json
index 9ed03c30..bc2a5bcf 100644
--- a/client/demo/public/assets/gui_settings/so-vits-svc-40v2.json
+++ b/client/demo/public/assets/gui_settings/so-vits-svc-40v2.json
@@ -157,10 +157,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
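Note: these gui_settings JSON files enumerate the controls the demo client renders for each model, so dropping the "solaEnable" entry removes the SOLA on/off toggle from every GUI, matching the server-side change below that runs the SOLA path unconditionally. A small illustrative sketch for inspecting one of these files; the full nesting of the JSON is not shown in this patch, so the walker is deliberately generic, and the path assumes the repository root as working directory:

import json

# Illustrative only: list the control names declared in one of the touched files.
with open("client/demo/dist/assets/gui_settings/RVC.json") as f:
    gui = json.load(f)

def control_names(node):
    # The patch only shows {"name": ..., "options": {}} entries, so match on that shape.
    if isinstance(node, dict):
        if "name" in node and "options" in node:
            yield node["name"]
        for value in node.values():
            yield from control_names(value)
    elif isinstance(node, list):
        for item in node:
            yield from control_names(item)

print(sorted(set(control_names(gui))))  # "solaEnable" no longer appears after this patch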
diff --git a/server/voice_changer/MMVCv13/MMVCv13.py b/server/voice_changer/MMVCv13/MMVCv13.py
index cd5a788b..e5710de2 100644
--- a/server/voice_changer/MMVCv13/MMVCv13.py
+++ b/server/voice_changer/MMVCv13/MMVCv13.py
@@ -130,7 +130,7 @@ class MMVCv13:
         spec = torch.squeeze(spec, 0)
         return spec

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if hasattr(self, "audio_buffer"):
@@ -138,10 +138,7 @@ class MMVCv13:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame
-        else:
-            convertSize = inputSize + crossfadeSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame

         if convertSize < 8192:
             convertSize = 8192
@@ -160,32 +157,6 @@ class MMVCv13:

         return data

-    def generate_input_old(self, newData: any, inputSize: int, crossfadeSize: int):
-        newData = newData.astype(np.float32) / self.hps.data.max_wav_value
-
-        if hasattr(self, "audio_buffer"):
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
-        else:
-            self.audio_buffer = newData
-
-        convertSize = inputSize + crossfadeSize
-        if convertSize < 8192:
-            convertSize = 8192
-        if convertSize % self.hps.data.hop_length != 0:  # pad so that no truncation occurs at the model's output hop size
-            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
-
-        self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the segment to be converted
-
-        audio = torch.FloatTensor(self.audio_buffer)
-        audio_norm = audio.unsqueeze(0)  # unsqueeze
-        spec = self._get_spec(audio_norm)
-        sid = torch.LongTensor([int(self.settings.srcId)])
-
-        data = (self.text_norm, spec, audio_norm, sid)
-        data = TextAudioSpeakerCollate()([data])
-
-        return data
-
     def _onnx_inference(self, data):
         if hasattr(self, "onnx_session") == False or self.onnx_session == None:
             print("[Voice Changer] No ONNX session.")
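Note: with the solaEnabled flag gone, generate_input always reserves room for the SOLA search window in addition to the crossfade overlap. A minimal sketch of the buffer-size arithmetic kept by the hunk above, using illustrative numbers (the 8192 floor appears in the MMVCv13 hunk above; the hop length depends on the loaded model):

# Illustrative only: mirrors the convertSize computation in generate_input above.
inputSize = 4096          # samples delivered by the client for this block
crossfadeSize = 2048      # overlap kept for crossfading with the previous block
solaSearchFrame = 512     # extra samples searched for the best SOLA alignment
hop_length = 256          # hypothetical self.hps.data.hop_length

convertSize = inputSize + crossfadeSize + solaSearchFrame
if convertSize < 8192:             # minimum conversion window used by MMVCv13/v15
    convertSize = 8192
if convertSize % hop_length != 0:  # round up to a multiple of the model's hop size
    convertSize = convertSize + (hop_length - (convertSize % hop_length))

print(convertSize)  # 8192 here, because 4096 + 2048 + 512 = 6656 is below the floor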
diff --git a/server/voice_changer/MMVCv15/MMVCv15.py b/server/voice_changer/MMVCv15/MMVCv15.py
index cffa6f32..7a90c550 100644
--- a/server/voice_changer/MMVCv15/MMVCv15.py
+++ b/server/voice_changer/MMVCv15/MMVCv15.py
@@ -166,7 +166,7 @@ class MMVCv15:
         spec = torch.squeeze(spec, 0)
         return spec

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if hasattr(self, "audio_buffer"):
@@ -174,10 +174,7 @@ class MMVCv15:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame
-        else:
-            convertSize = inputSize + crossfadeSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame

         if convertSize < 8192:
             convertSize = 8192
diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index da1ddc0a..5f0a38b2 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -165,7 +165,7 @@ class RVC:
     def get_processing_sampling_rate(self):
         return self.settings.modelSamplingRate

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / 32768.0

         if hasattr(self, "audio_buffer"):
@@ -173,10 +173,7 @@ class RVC:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-        else:
-            convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         if convertSize % 128 != 0:  # pad so that no truncation occurs at the model's output hop size
             convertSize = convertSize + (128 - (convertSize % 128))
@@ -188,30 +185,6 @@ class RVC:
         vol = max(rms, self.prevVol * 0.0)
         self.prevVol = vol

-        return (self.audio_buffer, convertSize, vol, solaEnabled)
-
-    def generate_input_old(self, newData: any, inputSize: int, crossfadeSize: int):
-        newData = newData.astype(np.float32) / 32768.0
-
-        if hasattr(self, "audio_buffer"):
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
-        else:
-            self.audio_buffer = newData
-
-        convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
-
-        # if convertSize % self.hps.data.hop_length != 0:  # pad so that no truncation occurs at the model's output hop size
-        if convertSize % 128 != 0:  # pad so that no truncation occurs at the model's output hop size
-            # convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
-            convertSize = convertSize + (128 - (convertSize % 128))
-
-        self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the segment to be converted
-
-        crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]
-        rms = np.sqrt(np.square(crop).mean(axis=0))
-        vol = max(rms, self.prevVol * 0.0)
-        self.prevVol = vol
-
         return (self.audio_buffer, convertSize, vol)

     def _onnx_inference(self, data):
@@ -302,12 +275,7 @@ class RVC:
         else:
             audio = self._pyTorch_inference(data)

-        sola_enabled = data[3]
-        if sola_enabled:
-            return audio
-            # return audio[self.settings.extraConvertSize:]
-        else:
-            return audio
+        return audio

     def __del__(self):
         del self.net_g
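Note: RVC.generate_input now always returns a 3-tuple (audio_buffer, convertSize, vol), and inference no longer branches on a fourth solaEnabled element. The vol value is the RMS of the newest input block, excluding the crossfade tail; as written, max(rms, self.prevVol * 0.0) reduces to rms. A small self-contained sketch with made-up sizes:

import numpy as np

# Hypothetical sizes; the real values come from the client block size and settings.
inputSize, crossfadeSize = 4096, 1024
audio_buffer = (np.random.default_rng(0).standard_normal(8192) * 0.1).astype(np.float32)

# Same slicing as generate_input above: the newest inputSize samples,
# excluding the crossfadeSize samples kept for the overlap region.
crop = audio_buffer[-1 * (inputSize + crossfadeSize):-1 * crossfadeSize]
rms = np.sqrt(np.square(crop).mean(axis=0))
vol = max(rms, 0.0)  # self.prevVol * 0.0 in the original, i.e. effectively just rms
print(float(vol))    # roughly 0.1 for this synthetic buffer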
diff --git a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py
index d86b44eb..087218da 100644
--- a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py
+++ b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py
@@ -252,7 +252,7 @@ class SoVitsSvc40:
         c = c.unsqueeze(0)
         return c, f0, uv

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if hasattr(self, "audio_buffer"):
@@ -260,10 +260,7 @@ class SoVitsSvc40:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-        else:
-            convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         if convertSize % self.hps.data.hop_length != 0:  # pad so that no truncation occurs at the model's output hop size
             convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
diff --git a/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
index f4fc578f..50686e82 100644
--- a/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
+++ b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
@@ -215,7 +215,7 @@ class SoVitsSvc40v2:
         c = c.unsqueeze(0)
         return c, f0, uv

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if hasattr(self, "audio_buffer"):
@@ -223,10 +223,7 @@ class SoVitsSvc40v2:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-        else:
-            convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         if convertSize % self.hps.data.hop_length != 0:  # pad so that no truncation occurs at the model's output hop size
             convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
@@ -274,8 +271,6 @@ class SoVitsSvc40v2:

         return result

-        pass
-
     def _pyTorch_inference(self, data):
         if hasattr(self, "net_g") == False or self.net_g == None:
             print("[Voice Changer] No pyTorch session.")
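Note: every generate_input above shares the same buffering pattern: append the incoming block to audio_buffer, then keep only the trailing convertSize samples as the conversion window. A minimal sketch of that pattern with illustrative sizes:

import numpy as np

convertSize = 8192  # computed as in the hunks above

def append_and_trim(audio_buffer, new_block, convert_size):
    # Append the incoming block and keep only the newest convert_size samples.
    if audio_buffer is None:
        audio_buffer = new_block
    else:
        audio_buffer = np.concatenate([audio_buffer, new_block], 0)
    return audio_buffer[-1 * convert_size:]

buf = None
for _ in range(4):
    buf = append_and_trim(buf, np.zeros(4096, dtype=np.float32), convertSize)
print(buf.shape)  # (8192,): the window never grows beyond convertSize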
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index 5b8adedb..e27e145c 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -208,13 +208,9 @@ class VoiceChanger():
     #  receivedData: tuple of short
     def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
-        if self.settings.solaEnabled:
-            return self.on_request_sola(receivedData)
-        else:
-            return self.on_request_legacy(receivedData)
+        return self.on_request_sola(receivedData)

     def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
-        print("processing with sola")
         processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()

         # pre-processing
         with Timer("pre-process") as t:
@@ -230,7 +226,7 @@
             crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
             self._generate_strength(crossfade_frame)
-            data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, True, sola_search_frame)
+            data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
         preprocess_time = t.secs

         # conversion
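Note: on_request_sola is what consumes the extra sola_search_frame samples reserved above: after inference it searches that window for the offset at which the newly converted audio best lines up with the tail of the previous output, and crossfades at that point so the overlapping signals are roughly in phase. The search itself is unchanged by this patch and not shown here; the following is a hedged, self-contained sketch of the general SOLA alignment step, with hypothetical names and sizes:

import numpy as np

def sola_offset(prev_tail, new_audio, crossfade_frame, sola_search_frame):
    # Return the offset in [0, sola_search_frame] where new_audio best matches
    # prev_tail (the last crossfade_frame samples of the previous output block),
    # using normalized cross-correlation.
    best_offset, best_corr = 0, -np.inf
    for offset in range(sola_search_frame + 1):
        cand = new_audio[offset:offset + crossfade_frame]
        denom = np.sqrt(np.sum(cand ** 2) * np.sum(prev_tail ** 2)) + 1e-8
        corr = np.sum(cand * prev_tail) / denom
        if corr > best_corr:
            best_corr, best_offset = corr, offset
    return best_offset

# Illustrative usage: recover a known 30-sample shift in a synthetic signal.
rng = np.random.default_rng(0)
sig = rng.standard_normal(2048).astype(np.float32)
prev_tail = sig[500:756]   # pretend this is the previous output tail (256 samples)
new_audio = sig[470:]      # the "new" block starts 30 samples earlier in the signal
print(sola_offset(prev_tail, new_audio, crossfade_frame=256, sola_search_frame=128))  # -> 30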
@@ -295,110 +291,10 @@
         perf = [preprocess_time, mainprocess_time, postprocess_time]
         return outputData, perf

-    def on_request_legacy(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
-        # print("processing with legacy")
-
-        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
-        print_convert_processing(f"------------ Convert processing.... ------------")
-        # pre-processing
-        with Timer("pre-process") as t:
-
-            with Timer("pre-process") as t1:
-
-                if self.settings.inputSampleRate != processing_sampling_rate:
-                    newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate))
-                else:
-                    newData = receivedData
-                # print("t1::::", t1.secs)
-                inputSize = newData.shape[0]
-                crossfadeSize = min(self.settings.crossFadeOverlapSize, inputSize)
-
-                print_convert_processing(
-                    f"  Input data size: {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
-                print_convert_processing(
-                    f"  Crossfade data size: crossfade:{crossfadeSize}, crossfade setting:{self.settings.crossFadeOverlapSize}, input size:{inputSize}")
-
-                print_convert_processing(f"  Convert data size of {inputSize + crossfadeSize} (+ extra size)")
-                print_convert_processing(f"  will be cropped:{-1 * (inputSize + crossfadeSize)}, {-1 * (crossfadeSize)}")
-
-                self._generate_strength(crossfadeSize)
-            with Timer("pre-process") as t2:
-                data = self.voiceChanger.generate_input(newData, inputSize, crossfadeSize)
-            # print("t2::::", t2.secs)
-        preprocess_time = t.secs
-
-        # conversion
-        with Timer("main-process") as t:
-            try:
-                # Inference
-                audio = self.voiceChanger.inference(data)
-
-                if hasattr(self, 'np_prev_audio1') == True:
-                    np.set_printoptions(threshold=10000)
-                    prev_overlap_start = -1 * crossfadeSize
-                    prev_overlap = self.np_prev_audio1[prev_overlap_start:]
-                    cur_overlap_start = -1 * (inputSize + crossfadeSize)
-                    cur_overlap_end = -1 * inputSize
-                    cur_overlap = audio[cur_overlap_start:cur_overlap_end]
-                    print_convert_processing(
-                        f"  audio:{audio.shape}, prev_overlap:{prev_overlap.shape}, self.np_prev_strength:{self.np_prev_strength.shape}")
-                    powered_prev = prev_overlap * self.np_prev_strength
-                    print_convert_processing(
-                        f"  audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
-                    print_convert_processing(f"  cur_overlap_strt:{cur_overlap_start}, cur_overlap_end{cur_overlap_end}")
-
-                    powered_cur = cur_overlap * self.np_cur_strength
-                    powered_result = powered_prev + powered_cur
-
-                    cur = audio[-1 * inputSize:-1 * crossfadeSize]
-                    result = np.concatenate([powered_result, cur], axis=0)
-                    print_convert_processing(
-                        f"  overlap:{crossfadeSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")
-                    if cur.shape[0] != result.shape[0]:
-                        print_convert_processing(f"  current and result should be same as input")
-
-                else:
-                    result = np.zeros(4096).astype(np.int16)
-                self.np_prev_audio1 = audio
-
-            except Exception as e:
-                print("VC PROCESSING!!!! EXCEPTION!!!", e)
-                print(traceback.format_exc())
-                if hasattr(self, "np_prev_audio1"):
-                    del self.np_prev_audio1
-                return np.zeros(1).astype(np.int16), [0, 0, 0]
-        mainprocess_time = t.secs
-
-        # post-processing
-        with Timer("post-process") as t:
-            result = result.astype(np.int16)
-            if self.settings.inputSampleRate != processing_sampling_rate:
-                outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16))
-            else:
-                outputData = result
-            # outputData = result
-
-            print_convert_processing(
-                f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
-
-            if self.settings.recordIO == 1:
-                self.ioRecorder.writeInput(receivedData)
-                self.ioRecorder.writeOutput(outputData.tobytes())
-
-            # if receivedData.shape[0] != outputData.shape[0]:
-            #     print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}")
-            #     outputData = pad_array(outputData, receivedData.shape[0])
-            #     # print_convert_processing(
-            #     #     f"  Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
-        postprocess_time = t.secs
-
-        print_convert_processing(f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
-        perf = [preprocess_time, mainprocess_time, postprocess_time]
-        return outputData, perf
-
     def export2onnx(self):
         return self.voiceChanger.export2onnx()

+
+##############
 PRINT_CONVERT_PROCESSING: bool = False
 # PRINT_CONVERT_PROCESSING = True
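Note: both the removed legacy path above and the surviving sola path blend the previous output tail into the new block using the strength curves prepared by _generate_strength (np_prev_strength / np_cur_strength). The body of _generate_strength is not part of this patch, so the curves below are a hypothetical complementary raised-cosine pair, purely to illustrate the overlap-add step visible in the removed code:

import numpy as np

def generate_strength(crossfade_frame):
    # Hypothetical stand-in for self._generate_strength(): any pair of curves that
    # sums to ~1 across the overlap works; a raised cosine is a common choice.
    fade_in = 0.5 * (1.0 - np.cos(np.pi * np.arange(crossfade_frame) / crossfade_frame))
    fade_out = 1.0 - fade_in
    return fade_out, fade_in  # plays the role of np_prev_strength, np_cur_strength

prev_strength, cur_strength = generate_strength(256)
prev_overlap = np.ones(256)        # tail of the previous output block
cur_overlap = np.ones(256) * 0.5   # head of the newly converted (already aligned) block

# Overlap-add as in the removed legacy code above (powered_prev + powered_cur).
powered_result = prev_overlap * prev_strength + cur_overlap * cur_strength
print(powered_result[0], powered_result[-1])  # fades from 1.0 down toward 0.5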