update: disable parselmouth pitch extraction; RVC "pm" f0 method now falls back to pyworld harvest

2025-01-23 13:35:12 +03:00 · 2023-04-28 17:18:33 +09:00 · 2023-04-28 17:18:33 +09:00 · 569d8d2c0c
commit 569d8d2c0c
parent 50d1977d50
2 changed files with 17 additions and 35 deletions
--- a/server/voice_changer/DDSP_SVC/DDSP_SVC.py
+++ b/server/voice_changer/DDSP_SVC/DDSP_SVC.py
@@ -191,8 +191,8 @@ class DDSP_SVC:
            setattr(self.settings, key, str(val))
            if key == "f0Detector":
                print("f0Detector update", val)
-                if val == "dio":
-                    val = "parselmouth"
+                # if val == "dio":
+                #     val = "parselmouth"

                if hasattr(self, "sampling_rate") is False:
                    self.sampling_rate = 44100
--- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py
+++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
@@ -1,5 +1,6 @@
 import numpy as np
-import parselmouth
+
+# import parselmouth
 import torch
 import torch.nn.functional as F
 from config import x_query, x_center, x_max  # type:ignore
@@ -27,28 +28,13 @@ class VC(object):
        silence_front_offset = int(np.round(real_silence_front * self.sr))
        audio = audio[silence_front_offset:]

-        time_step = self.window / self.sr * 1000
+        # time_step = self.window / self.sr * 1000
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        if f0_method == "pm":
-            f0 = (
-                parselmouth.Sound(audio, self.sr)
-                .to_pitch_ac(
-                    time_step=time_step / 1000,
-                    voicing_threshold=0.6,
-                    pitch_floor=f0_min,
-                    pitch_ceiling=f0_max,
-                )
-                .selected_array["frequency"]
-            )
-            pad_size = (p_len - len(f0) + 1) // 2
-            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
-                f0 = np.pad(
-                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
-                )
-        elif f0_method == "harvest":
+            print("not implemented. use harvest")
            f0, t = pyworld.harvest(
                audio.astype(np.double),
                fs=self.sr,
@@ -62,22 +48,18 @@ class VC(object):
                f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)
            )
        else:
-            print("[Voice Changer] invalid f0 detector, use pm.", f0_method)
-            f0 = (
-                parselmouth.Sound(audio, self.sr)
-                .to_pitch_ac(
-                    time_step=time_step / 1000,
-                    voicing_threshold=0.6,
-                    pitch_floor=f0_min,
-                    pitch_ceiling=f0_max,
-                )
-                .selected_array["frequency"]
+            f0, t = pyworld.harvest(
+                audio.astype(np.double),
+                fs=self.sr,
+                f0_ceil=f0_max,
+                frame_period=10,
+            )
+            f0 = pyworld.stonemask(audio.astype(np.double), f0, t, self.sr)
+            f0 = signal.medfilt(f0, 3)
+
+            f0 = np.pad(
+                f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)
            )
-            pad_size = (p_len - len(f0) + 1) // 2
-            if pad_size > 0 or p_len - len(f0) - pad_size > 0:
-                f0 = np.pad(
-                    f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
-                )

        f0 *= pow(2, f0_up_key / 12)
        f0bak = f0.copy()