WIP: support so-vits-svc, message improvement

wataru 2023-03-11 03:59:03 +09:00
parent 010eb395ef
commit d7cff0709f
2 changed files with 29 additions and 15 deletions


@@ -141,7 +141,8 @@ class SoVitsSvc40v2:
         wav_44k = audio_buffer
         # f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
         f0 = utils.compute_f0_dio(wav_44k, sampling_rate=self.hps.data.sampling_rate, hop_length=self.hps.data.hop_length)
-        print(f"--- >>>>> ---- >>>> {wav_44k.shape[0] / self.hps.data.hop_length}")
+        if wav_44k.shape[0] % self.hps.data.hop_length != 0:
+            print(f" !!! !!! !!! wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}")
         f0, uv = utils.interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
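The new guard documents DIO's framing assumption: compute_f0_dio yields roughly one f0 value per hop, so a buffer whose length is not a multiple of hop_length gives a fractional frame count and the f0 track can drift against the model's feature frames. A standalone sketch of the check, with a hypothetical hop size (not part of the commit):

import numpy as np

hop_length = 512                               # hypothetical hps.data.hop_length
wav_44k = np.zeros(44100, dtype=np.float32)    # 44100 % 512 != 0 -> misaligned

print(wav_44k.shape[0] / hop_length)           # 86.13..., fractional frame count
if wav_44k.shape[0] % hop_length != 0:
    # one way to restore the invariant: trim to a whole number of hops
    wav_44k = wav_44k[:(wav_44k.shape[0] // hop_length) * hop_length]
assert wav_44k.shape[0] % hop_length == 0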


@@ -147,7 +147,7 @@ class VoiceChanger():
     # receivedData: tuple of short
     def on_request(self, receivedData: any):
         processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
-        print(f"------------ Convert processing.... ------------")
+        print_convert_processing(f"------------ Convert processing.... ------------")
         # pre-processing
         with Timer("pre-process") as t:
@@ -158,7 +158,8 @@ class VoiceChanger():
             inputSize = newData.shape[0]
             convertSize = inputSize + min(self.settings.crossFadeOverlapSize, inputSize)
-            print(f"  Input data size of {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
+            print_convert_processing(
+                f"  Input data size of {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
             if convertSize < 8192:
                 convertSize = 8192
@@ -170,8 +171,9 @@ class VoiceChanger():
             overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
             cropRange = (-1 * (inputSize + overlapSize), -1 * overlapSize)
-            print(f"  Convert input data size of {convertSize}")
-            print(f"  overlap:{overlapSize}, cropRange:{cropRange}")
+            print_convert_processing(f"  Convert input data size of {convertSize}")
+            print_convert_processing(f"  overlap:{overlapSize}, cropRange:{cropRange}")
             self._generate_strength(inputSize)
             data = self.voiceChanger.generate_input(newData, convertSize, cropRange)
         preprocess_time = t.secs
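The traces gated above describe the crossfade buffer arithmetic: convertSize is the incoming chunk plus the overlap carried for crossfading, floored at 8192 samples, and cropRange indexes backward from the end of the converted audio to take the newest inputSize samples while reserving overlapSize samples for the next blend. A worked example with hypothetical sizes, not commit code:

inputSize = 4096                 # hypothetical incoming chunk, in samples
crossFadeOverlapSize = 1024      # hypothetical setting

overlapSize = min(crossFadeOverlapSize, inputSize)              # 1024
convertSize = max(inputSize + overlapSize, 8192)                # 8192 floor
cropRange = (-1 * (inputSize + overlapSize), -1 * overlapSize)  # (-5120, -1024)

# audio[cropRange[0]:cropRange[1]] is exactly inputSize samples long:
assert cropRange[1] - cropRange[0] == inputSize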
@@ -190,16 +192,18 @@ class VoiceChanger():
             cur_overlap = audio[cur_overlap_start:cur_overlap_end]
             # cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
             powered_prev = prev_overlap * self.np_prev_strength
-            print(f" ---- ---- ---- audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
-            print(f" ---- ---- ---------------- {cur_overlap_start}, {cur_overlap_end}")
+            print_convert_processing(
+                f"  audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
+            print_convert_processing(f"  cur_overlap_start:{cur_overlap_start}, cur_overlap_end:{cur_overlap_end}")
             powered_cur = cur_overlap * self.np_cur_strength
             powered_result = powered_prev + powered_cur
             cur = audio[-1 * inputSize:-1 * overlapSize]
             result = np.concatenate([powered_result, cur], axis=0)
-            print(f"  overlap:{overlapSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")
-            # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
-            # print(">>>>>>>>>>>", -1 * (inputSize + overlapSize), -1 * inputSize, self.np_prev_audio1.shape, overlapSize)
+            print_convert_processing(
+                f"  overlap:{overlapSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")
+            if inputSize != result.shape[0]:
+                print_convert_processing(f"  mismatch: result:{result.shape[0]} should be same as input:{inputSize}")
         else:
             result = np.zeros(4096).astype(np.int16)
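For context, this hunk sits inside the crossfade itself: the tail of the previous chunk is weighted by np_prev_strength, the head of the current chunk by np_cur_strength, and the sum replaces the overlapping region before the rest of the chunk is appended. A minimal sketch with hypothetical linear envelopes (the class builds its own curves in _generate_strength):

import numpy as np

overlapSize = 1024
np_prev_strength = np.linspace(1.0, 0.0, overlapSize, dtype=np.float32)  # fade out
np_cur_strength = 1.0 - np_prev_strength                                 # fade in

prev_overlap = np.random.randn(overlapSize).astype(np.float32)
cur_overlap = np.random.randn(overlapSize).astype(np.float32)

# complementary weights sum to 1, so a steady signal passes through unchanged
powered_result = prev_overlap * np_prev_strength + cur_overlap * np_cur_strength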
@@ -221,7 +225,8 @@ class VoiceChanger():
             else:
                 outputData = result
-            print(f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
+            print_convert_processing(
+                f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
             if self.settings.recordIO == 1:
                 self.ioRecorder.writeInput(receivedData)
@@ -229,16 +234,26 @@ class VoiceChanger():
             if receivedData.shape[0] != outputData.shape[0]:
                 outputData = pad_array(outputData, receivedData.shape[0])
-                print(
+                print_convert_processing(
                     f"  Padded! Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
         postprocess_time = t.secs
-        print("  [fin] Input/Output size:", receivedData.shape[0], outputData.shape[0])
+        print_convert_processing(f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
         perf = [preprocess_time, mainprocess_time, postprocess_time]
         return outputData, perf


 ##############
+PRINT_CONVERT_PROCESSING = False
+# PRINT_CONVERT_PROCESSING = True
+
+
+def print_convert_processing(mess: str):
+    if PRINT_CONVERT_PROCESSING:
+        print(mess)
+
+
 def pad_array(arr, target_length):
     current_length = arr.shape[0]
     if current_length >= target_length:
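The pattern introduced here is a module-level flag read by a small wrapper, so all WIP traces can be toggled in one place. Python's standard logging module offers the same single switch through levels; a possible alternative, not what the commit uses:

import logging

logger = logging.getLogger("voice_changer.convert")   # hypothetical logger name
logging.basicConfig(level=logging.WARNING)            # debug traces off by default

logger.debug("Convert input data size of %d", 8192)   # suppressed
logger.setLevel(logging.DEBUG)                        # flip one switch to enable
logger.debug("overlap:%d", 1024)                      # now printed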
@@ -250,8 +265,6 @@ def pad_array(arr, target_length):
     padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
     return padded_arr


-##############
-
 class Timer(object):
     def __init__(self, title: str):
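pad_array, split across the last two hunks, centers a short buffer inside target_length zeros. The middle of the function is elided by the diff, so the following standalone sketch infers pad_width, pad_left, and pad_right from the np.pad call shown above:

import numpy as np

def pad_array(arr: np.ndarray, target_length: int) -> np.ndarray:
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr
    pad_width = target_length - current_length      # inferred; elided in the diff
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left
    return np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))

# pad_array(np.ones(3, dtype=np.int16), 6) -> [0, 1, 1, 1, 0, 0]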