From d85bbae4784406c6ca31b72687f8b7b57aab6963 Mon Sep 17 00:00:00 2001
From: wataru
Date: Sat, 15 Apr 2023 04:58:56 +0900
Subject: [PATCH] apply sola for all vc forcibly

---
 .../dist/assets/gui_settings/MMVCv13.json     |   4 -
 .../dist/assets/gui_settings/MMVCv15.json     |   4 -
 client/demo/dist/assets/gui_settings/RVC.json |   4 -
 .../assets/gui_settings/so-vits-svc-40.json   |   4 -
 .../assets/gui_settings/so-vits-svc-40v2.json |   4 -
 .../public/assets/gui_settings/MMVCv13.json   |   4 -
 .../public/assets/gui_settings/MMVCv15.json   |   4 -
 .../demo/public/assets/gui_settings/RVC.json  |   4 -
 .../assets/gui_settings/so-vits-svc-40.json   |   4 -
 .../assets/gui_settings/so-vits-svc-40v2.json |   4 -
 server/voice_changer/MMVCv13/MMVCv13.py       |  33 +-----
 server/voice_changer/MMVCv15/MMVCv15.py       |   7 +-
 server/voice_changer/RVC/RVC.py               |  38 +-----
 .../voice_changer/SoVitsSvc40/SoVitsSvc40.py  |   7 +-
 .../SoVitsSvc40v2/SoVitsSvc40v2.py            |   9 +-
 server/voice_changer/VoiceChanger.py          | 110 +-----------------
 16 files changed, 14 insertions(+), 230 deletions(-)

diff --git a/client/demo/dist/assets/gui_settings/MMVCv13.json b/client/demo/dist/assets/gui_settings/MMVCv13.json
index 9781660a..d00b6258 100644
--- a/client/demo/dist/assets/gui_settings/MMVCv13.json
+++ b/client/demo/dist/assets/gui_settings/MMVCv13.json
@@ -145,10 +145,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/dist/assets/gui_settings/MMVCv15.json b/client/demo/dist/assets/gui_settings/MMVCv15.json
index 98e1fa3d..503c30c5 100644
--- a/client/demo/dist/assets/gui_settings/MMVCv15.json
+++ b/client/demo/dist/assets/gui_settings/MMVCv15.json
@@ -147,10 +147,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/dist/assets/gui_settings/RVC.json b/client/demo/dist/assets/gui_settings/RVC.json
index 78015a64..6bb5c6e3 100644
--- a/client/demo/dist/assets/gui_settings/RVC.json
+++ b/client/demo/dist/assets/gui_settings/RVC.json
@@ -166,10 +166,6 @@
             {
                 "name": "rvcQuality",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/dist/assets/gui_settings/so-vits-svc-40.json b/client/demo/dist/assets/gui_settings/so-vits-svc-40.json
index 16807a37..2d77e628 100644
--- a/client/demo/dist/assets/gui_settings/so-vits-svc-40.json
+++ b/client/demo/dist/assets/gui_settings/so-vits-svc-40.json
@@ -157,10 +157,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/dist/assets/gui_settings/so-vits-svc-40v2.json b/client/demo/dist/assets/gui_settings/so-vits-svc-40v2.json
index 9ed03c30..bc2a5bcf 100644
--- a/client/demo/dist/assets/gui_settings/so-vits-svc-40v2.json
+++ b/client/demo/dist/assets/gui_settings/so-vits-svc-40v2.json
@@ -157,10 +157,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/MMVCv13.json b/client/demo/public/assets/gui_settings/MMVCv13.json
index 9781660a..d00b6258 100644
--- a/client/demo/public/assets/gui_settings/MMVCv13.json
+++ b/client/demo/public/assets/gui_settings/MMVCv13.json
@@ -145,10 +145,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/MMVCv15.json b/client/demo/public/assets/gui_settings/MMVCv15.json
index 98e1fa3d..503c30c5 100644
--- a/client/demo/public/assets/gui_settings/MMVCv15.json
+++ b/client/demo/public/assets/gui_settings/MMVCv15.json
@@ -147,10 +147,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/RVC.json b/client/demo/public/assets/gui_settings/RVC.json
index 78015a64..6bb5c6e3 100644
--- a/client/demo/public/assets/gui_settings/RVC.json
+++ b/client/demo/public/assets/gui_settings/RVC.json
@@ -166,10 +166,6 @@
             {
                 "name": "rvcQuality",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/so-vits-svc-40.json b/client/demo/public/assets/gui_settings/so-vits-svc-40.json
index 16807a37..2d77e628 100644
--- a/client/demo/public/assets/gui_settings/so-vits-svc-40.json
+++ b/client/demo/public/assets/gui_settings/so-vits-svc-40.json
@@ -157,10 +157,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
diff --git a/client/demo/public/assets/gui_settings/so-vits-svc-40v2.json b/client/demo/public/assets/gui_settings/so-vits-svc-40v2.json
index 9ed03c30..bc2a5bcf 100644
--- a/client/demo/public/assets/gui_settings/so-vits-svc-40v2.json
+++ b/client/demo/public/assets/gui_settings/so-vits-svc-40v2.json
@@ -157,10 +157,6 @@
             {
                 "name": "trancateNumThreshold",
                 "options": {}
-            },
-            {
-                "name": "solaEnable",
-                "options": {}
             }
         ]
     },
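Note: these gui_settings JSON files enumerate the controls the demo client renders for each model, so dropping the "solaEnable" entry removes the SOLA on/off toggle from every GUI, matching the server-side change below that runs the SOLA path unconditionally. A small illustrative sketch for inspecting one of these files; the full nesting of the JSON is not shown in this patch, so the walker is deliberately generic, and the path assumes the repository root as working directory:

import json

# Illustrative only: list the control names declared in one of the touched files.
with open("client/demo/dist/assets/gui_settings/RVC.json") as f:
    gui = json.load(f)

def control_names(node):
    # The patch only shows {"name": ..., "options": {}} entries, so match on that shape.
    if isinstance(node, dict):
        if "name" in node and "options" in node:
            yield node["name"]
        for value in node.values():
            yield from control_names(value)
    elif isinstance(node, list):
        for item in node:
            yield from control_names(item)

print(sorted(set(control_names(gui))))  # "solaEnable" no longer appears after this patch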
diff --git a/server/voice_changer/MMVCv13/MMVCv13.py b/server/voice_changer/MMVCv13/MMVCv13.py
index cd5a788b..e5710de2 100644
--- a/server/voice_changer/MMVCv13/MMVCv13.py
+++ b/server/voice_changer/MMVCv13/MMVCv13.py
@@ -130,7 +130,7 @@ class MMVCv13:
         spec = torch.squeeze(spec, 0)
         return spec

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if hasattr(self, "audio_buffer"):
@@ -138,10 +138,7 @@ class MMVCv13:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame
-        else:
-            convertSize = inputSize + crossfadeSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame

         if convertSize < 8192:
             convertSize = 8192
@@ -160,32 +157,6 @@ class MMVCv13:

         return data

-    def generate_input_old(self, newData: any, inputSize: int, crossfadeSize: int):
-        newData = newData.astype(np.float32) / self.hps.data.max_wav_value
-
-        if hasattr(self, "audio_buffer"):
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
-        else:
-            self.audio_buffer = newData
-
-        convertSize = inputSize + crossfadeSize
-        if convertSize < 8192:
-            convertSize = 8192
-        if convertSize % self.hps.data.hop_length != 0:  # pad so that no truncation occurs at the model's output hop size
-            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
-
-        self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the segment to be converted
-
-        audio = torch.FloatTensor(self.audio_buffer)
-        audio_norm = audio.unsqueeze(0)  # unsqueeze
-        spec = self._get_spec(audio_norm)
-        sid = torch.LongTensor([int(self.settings.srcId)])
-
-        data = (self.text_norm, spec, audio_norm, sid)
-        data = TextAudioSpeakerCollate()([data])
-
-        return data
-
     def _onnx_inference(self, data):
         if hasattr(self, "onnx_session") == False or self.onnx_session == None:
             print("[Voice Changer] No ONNX session.")
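Note: with the solaEnabled flag gone, generate_input always reserves room for the SOLA search window in addition to the crossfade overlap. A minimal sketch of the buffer-size arithmetic kept by the hunk above, using illustrative numbers (the 8192 floor appears in the MMVCv13 hunk above; the hop length depends on the loaded model):

# Illustrative only: mirrors the convertSize computation in generate_input above.
inputSize = 4096          # samples delivered by the client for this block
crossfadeSize = 2048      # overlap kept for crossfading with the previous block
solaSearchFrame = 512     # extra samples searched for the best SOLA alignment
hop_length = 256          # hypothetical self.hps.data.hop_length

convertSize = inputSize + crossfadeSize + solaSearchFrame
if convertSize < 8192:             # minimum conversion window used by MMVCv13/v15
    convertSize = 8192
if convertSize % hop_length != 0:  # round up to a multiple of the model's hop size
    convertSize = convertSize + (hop_length - (convertSize % hop_length))

print(convertSize)  # 8192 here, because 4096 + 2048 + 512 = 6656 is below the floor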
diff --git a/server/voice_changer/MMVCv15/MMVCv15.py b/server/voice_changer/MMVCv15/MMVCv15.py
index cffa6f32..7a90c550 100644
--- a/server/voice_changer/MMVCv15/MMVCv15.py
+++ b/server/voice_changer/MMVCv15/MMVCv15.py
@@ -166,7 +166,7 @@ class MMVCv15:
         spec = torch.squeeze(spec, 0)
         return spec

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if hasattr(self, "audio_buffer"):
@@ -174,10 +174,7 @@ class MMVCv15:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame
-        else:
-            convertSize = inputSize + crossfadeSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame

         if convertSize < 8192:
             convertSize = 8192
diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index da1ddc0a..5f0a38b2 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -165,7 +165,7 @@ class RVC:
     def get_processing_sampling_rate(self):
         return self.settings.modelSamplingRate

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / 32768.0

         if hasattr(self, "audio_buffer"):
@@ -173,10 +173,7 @@ class RVC:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-        else:
-            convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         if convertSize % 128 != 0:  # pad so that no truncation occurs at the model's output hop size
             convertSize = convertSize + (128 - (convertSize % 128))
@@ -188,30 +185,6 @@ class RVC:
         vol = max(rms, self.prevVol * 0.0)
         self.prevVol = vol

-        return (self.audio_buffer, convertSize, vol, solaEnabled)
-
-    def generate_input_old(self, newData: any, inputSize: int, crossfadeSize: int):
-        newData = newData.astype(np.float32) / 32768.0
-
-        if hasattr(self, "audio_buffer"):
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
-        else:
-            self.audio_buffer = newData
-
-        convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
-
-        # if convertSize % self.hps.data.hop_length != 0:  # pad so that no truncation occurs at the model's output hop size
-        if convertSize % 128 != 0:  # pad so that no truncation occurs at the model's output hop size
-            # convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
-            convertSize = convertSize + (128 - (convertSize % 128))
-
-        self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the segment to be converted
-
-        crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]
-        rms = np.sqrt(np.square(crop).mean(axis=0))
-        vol = max(rms, self.prevVol * 0.0)
-        self.prevVol = vol
-
         return (self.audio_buffer, convertSize, vol)

     def _onnx_inference(self, data):
@@ -302,12 +275,7 @@ class RVC:
         else:
             audio = self._pyTorch_inference(data)

-        sola_enabled = data[3]
-        if sola_enabled:
-            return audio
-            # return audio[self.settings.extraConvertSize:]
-        else:
-            return audio
+        return audio

     def __del__(self):
         del self.net_g
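Note: RVC.generate_input now always returns a 3-tuple (audio_buffer, convertSize, vol), and inference no longer branches on a fourth solaEnabled element. The vol value is the RMS of the newest input block, excluding the crossfade tail; as written, max(rms, self.prevVol * 0.0) reduces to rms. A small self-contained sketch with made-up sizes:

import numpy as np

# Hypothetical sizes; the real values come from the client block size and settings.
inputSize, crossfadeSize = 4096, 1024
audio_buffer = (np.random.default_rng(0).standard_normal(8192) * 0.1).astype(np.float32)

# Same slicing as generate_input above: the newest inputSize samples,
# excluding the crossfadeSize samples kept for the overlap region.
crop = audio_buffer[-1 * (inputSize + crossfadeSize):-1 * crossfadeSize]
rms = np.sqrt(np.square(crop).mean(axis=0))
vol = max(rms, 0.0)  # self.prevVol * 0.0 in the original, i.e. effectively just rms
print(float(vol))    # roughly 0.1 for this synthetic buffer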
diff --git a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py
index d86b44eb..087218da 100644
--- a/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py
+++ b/server/voice_changer/SoVitsSvc40/SoVitsSvc40.py
@@ -252,7 +252,7 @@ class SoVitsSvc40:
         c = c.unsqueeze(0)
         return c, f0, uv

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if hasattr(self, "audio_buffer"):
@@ -260,10 +260,7 @@ class SoVitsSvc40:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-        else:
-            convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         if convertSize % self.hps.data.hop_length != 0:  # pad so that no truncation occurs at the model's output hop size
             convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
diff --git a/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
index f4fc578f..50686e82 100644
--- a/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
+++ b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
@@ -215,7 +215,7 @@ class SoVitsSvc40v2:
         c = c.unsqueeze(0)
         return c, f0, uv

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value

         if hasattr(self, "audio_buffer"):
@@ -223,10 +223,7 @@ class SoVitsSvc40v2:
         else:
             self.audio_buffer = newData

-        if solaEnabled:
-            convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-        else:
-            convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
+        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize

         if convertSize % self.hps.data.hop_length != 0:  # pad so that no truncation occurs at the model's output hop size
             convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
@@ -274,8 +271,6 @@ class SoVitsSvc40v2:

         return result

-        pass
-
     def _pyTorch_inference(self, data):
         if hasattr(self, "net_g") == False or self.net_g == None:
             print("[Voice Changer] No pyTorch session.")
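Note: every generate_input above shares the same buffering pattern: append the incoming block to audio_buffer, then keep only the trailing convertSize samples as the conversion window. A minimal sketch of that pattern with illustrative sizes:

import numpy as np

convertSize = 8192  # computed as in the hunks above

def append_and_trim(audio_buffer, new_block, convert_size):
    # Append the incoming block and keep only the newest convert_size samples.
    if audio_buffer is None:
        audio_buffer = new_block
    else:
        audio_buffer = np.concatenate([audio_buffer, new_block], 0)
    return audio_buffer[-1 * convert_size:]

buf = None
for _ in range(4):
    buf = append_and_trim(buf, np.zeros(4096, dtype=np.float32), convertSize)
print(buf.shape)  # (8192,): the window never grows beyond convertSize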
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index 5b8adedb..e27e145c 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -208,13 +208,9 @@ class VoiceChanger():
     #  receivedData: tuple of short
     def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
-        if self.settings.solaEnabled:
-            return self.on_request_sola(receivedData)
-        else:
-            return self.on_request_legacy(receivedData)
+        return self.on_request_sola(receivedData)

     def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
-        print("processing with sola")
         processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()

         # pre-processing
         with Timer("pre-process") as t:
@@ -230,7 +226,7 @@
             crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
             self._generate_strength(crossfade_frame)
-            data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, True, sola_search_frame)
+            data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
         preprocess_time = t.secs

         # conversion
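Note: on_request_sola is what consumes the extra sola_search_frame samples reserved above: after inference it searches that window for the offset at which the newly converted audio best lines up with the tail of the previous output, and crossfades at that point so the overlapping signals are roughly in phase. The search itself is unchanged by this patch and not shown here; the following is a hedged, self-contained sketch of the general SOLA alignment step, with hypothetical names and sizes:

import numpy as np

def sola_offset(prev_tail, new_audio, crossfade_frame, sola_search_frame):
    # Return the offset in [0, sola_search_frame] where new_audio best matches
    # prev_tail (the last crossfade_frame samples of the previous output block),
    # using normalized cross-correlation.
    best_offset, best_corr = 0, -np.inf
    for offset in range(sola_search_frame + 1):
        cand = new_audio[offset:offset + crossfade_frame]
        denom = np.sqrt(np.sum(cand ** 2) * np.sum(prev_tail ** 2)) + 1e-8
        corr = np.sum(cand * prev_tail) / denom
        if corr > best_corr:
            best_corr, best_offset = corr, offset
    return best_offset

# Illustrative usage: recover a known 30-sample shift in a synthetic signal.
rng = np.random.default_rng(0)
sig = rng.standard_normal(2048).astype(np.float32)
prev_tail = sig[500:756]   # pretend this is the previous output tail (256 samples)
new_audio = sig[470:]      # the "new" block starts 30 samples earlier in the signal
print(sola_offset(prev_tail, new_audio, crossfade_frame=256, sola_search_frame=128))  # -> 30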
@@ -295,110 +291,10 @@
         perf = [preprocess_time, mainprocess_time, postprocess_time]
         return outputData, perf

-    def on_request_legacy(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
-        # print("processing with legacy")
-
-        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
-        print_convert_processing(f"------------ Convert processing.... ------------")
-        # pre-processing
-        with Timer("pre-process") as t:
-
-            with Timer("pre-process") as t1:
-
-                if self.settings.inputSampleRate != processing_sampling_rate:
-                    newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate))
-                else:
-                    newData = receivedData
-                # print("t1::::", t1.secs)
-                inputSize = newData.shape[0]
-                crossfadeSize = min(self.settings.crossFadeOverlapSize, inputSize)
-
-                print_convert_processing(
-                    f"  Input data size: {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
-                print_convert_processing(
-                    f"  Crossfade data size: crossfade:{crossfadeSize}, crossfade setting:{self.settings.crossFadeOverlapSize}, input size:{inputSize}")
-
-                print_convert_processing(f"  Convert data size of {inputSize + crossfadeSize} (+ extra size)")
-                print_convert_processing(f"  will be cropped:{-1 * (inputSize + crossfadeSize)}, {-1 * (crossfadeSize)}")
-
-                self._generate_strength(crossfadeSize)
-            with Timer("pre-process") as t2:
-                data = self.voiceChanger.generate_input(newData, inputSize, crossfadeSize)
-            # print("t2::::", t2.secs)
-        preprocess_time = t.secs
-
-        # conversion
-        with Timer("main-process") as t:
-            try:
-                # Inference
-                audio = self.voiceChanger.inference(data)
-
-                if hasattr(self, 'np_prev_audio1') == True:
-                    np.set_printoptions(threshold=10000)
-                    prev_overlap_start = -1 * crossfadeSize
-                    prev_overlap = self.np_prev_audio1[prev_overlap_start:]
-                    cur_overlap_start = -1 * (inputSize + crossfadeSize)
-                    cur_overlap_end = -1 * inputSize
-                    cur_overlap = audio[cur_overlap_start:cur_overlap_end]
-                    print_convert_processing(
-                        f"  audio:{audio.shape}, prev_overlap:{prev_overlap.shape}, self.np_prev_strength:{self.np_prev_strength.shape}")
-                    powered_prev = prev_overlap * self.np_prev_strength
-                    print_convert_processing(
-                        f"  audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
-                    print_convert_processing(f"  cur_overlap_strt:{cur_overlap_start}, cur_overlap_end{cur_overlap_end}")
-
-                    powered_cur = cur_overlap * self.np_cur_strength
-                    powered_result = powered_prev + powered_cur
-
-                    cur = audio[-1 * inputSize:-1 * crossfadeSize]
-                    result = np.concatenate([powered_result, cur], axis=0)
-                    print_convert_processing(
-                        f"  overlap:{crossfadeSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")
-                    if cur.shape[0] != result.shape[0]:
-                        print_convert_processing(f"  current and result should be same as input")
-
-                else:
-                    result = np.zeros(4096).astype(np.int16)
-                self.np_prev_audio1 = audio
-
-            except Exception as e:
-                print("VC PROCESSING!!!! EXCEPTION!!!", e)
-                print(traceback.format_exc())
-                if hasattr(self, "np_prev_audio1"):
-                    del self.np_prev_audio1
-                return np.zeros(1).astype(np.int16), [0, 0, 0]
-        mainprocess_time = t.secs
-
-        # post-processing
-        with Timer("post-process") as t:
-            result = result.astype(np.int16)
-            if self.settings.inputSampleRate != processing_sampling_rate:
-                outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16))
-            else:
-                outputData = result
-            # outputData = result
-
-            print_convert_processing(
-                f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
-
-            if self.settings.recordIO == 1:
-                self.ioRecorder.writeInput(receivedData)
-                self.ioRecorder.writeOutput(outputData.tobytes())
-
-            # if receivedData.shape[0] != outputData.shape[0]:
-            #     print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}")
-            #     outputData = pad_array(outputData, receivedData.shape[0])
-            #     # print_convert_processing(
-            #     #     f"  Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
-        postprocess_time = t.secs
-
-        print_convert_processing(f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
-        perf = [preprocess_time, mainprocess_time, postprocess_time]
-        return outputData, perf
-
     def export2onnx(self):
         return self.voiceChanger.export2onnx()

+
+##############
 PRINT_CONVERT_PROCESSING: bool = False
 # PRINT_CONVERT_PROCESSING = True
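Note: both the removed legacy path above and the surviving sola path blend the previous output tail into the new block using the strength curves prepared by _generate_strength (np_prev_strength / np_cur_strength). The body of _generate_strength is not part of this patch, so the curves below are a hypothetical complementary raised-cosine pair, purely to illustrate the overlap-add step visible in the removed code:

import numpy as np

def generate_strength(crossfade_frame):
    # Hypothetical stand-in for self._generate_strength(): any pair of curves that
    # sums to ~1 across the overlap works; a raised cosine is a common choice.
    fade_in = 0.5 * (1.0 - np.cos(np.pi * np.arange(crossfade_frame) / crossfade_frame))
    fade_out = 1.0 - fade_in
    return fade_out, fade_in  # plays the role of np_prev_strength, np_cur_strength

prev_strength, cur_strength = generate_strength(256)
prev_overlap = np.ones(256)        # tail of the previous output block
cur_overlap = np.ones(256) * 0.5   # head of the newly converted (already aligned) block

# Overlap-add as in the removed legacy code above (powered_prev + powered_cur).
powered_result = prev_overlap * prev_strength + cur_overlap * cur_strength
print(powered_result[0], powered_result[-1])  # fades from 1.0 down toward 0.5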