WIP:crepe

2025-01-23 21:45:00 +03:00 · 2023-07-15 05:57:20 +09:00 · 2023-07-15 05:57:20 +09:00 · 485a747d55
commit 485a747d55
parent 8339e72ef5
3 changed files with 26 additions and 27 deletions
--- a/server/voice_changer/DiffusionSVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/CrepePitchExtractor.py
@@ -11,30 +11,26 @@ class CrepePitchExtractor(PitchExtractor):
    def __init__(self):
        """Initialise the crepe pitch extractor with fixed defaults and pick a torch device."""
        super().__init__()
        self.pitchExtractorType: PitchExtractorType = "crepe"
        # Lower/upper pitch bounds; extract() passes them to torchcrepe.predict
        # as fmin/fmax (presumably in Hz — confirm against torchcrepe docs).
        self.f0_min = 50
        self.f0_max = 1100
        # NOTE(review): "sapmle_rate" looks like a typo of "sample_rate". It is
        # left untouched here because extract() reads the attribute under this
        # exact spelling; rename both together if it is ever corrected.
        self.sapmle_rate = 16000
        # When true, extract() replaces frames whose f0 is 0 by interpolating
        # from the surrounding non-zero frames.
        self.uv_interp = True
        # Prefer the currently selected CUDA device; fall back to the CPU.
        if torch.cuda.is_available():
            self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
        else:
            self.device = torch.device("cpu")
-    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
+    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        n_frames = int(len(audio) // window) + 1
+        start_frame = int(silence_front * self.sapmle_rate / window)
-        start_frame = int(silence_front * sr / window)
+        real_silence_front = start_frame * window / self.sapmle_rate
-        real_silence_front = start_frame * window / sr
+        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
        silence_front_offset = int(np.round(real_silence_front * sr))
        audio = audio[silence_front_offset:]
        f0_min = 50
        f0_max = 1100
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)
        f0, pd = torchcrepe.predict(
            audio.unsqueeze(0),
-            sr,
+            self.sapmle_rate,
            hop_length=window,
-            fmin=f0_min,
+            fmin=self.f0_min,
-            fmax=f0_max,
+            fmax=self.f0_max,
            # model="tiny",
            model="full",
            batch_size=256,
@@ -46,14 +42,15 @@ class CrepePitchExtractor(PitchExtractor):
        pd = torchcrepe.filter.median(pd, 3)
        f0[pd < 0.1] = 0
        f0 = f0.squeeze()
        pitch[-f0.shape[0]:] = f0.cpu()[:pitch.shape[0]]
        f0 = pitch
-        f0 *= pow(2, f0_up_key / 12)
+        if self.uv_interp:
-        pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]]
+            uv = f0 == 0
-        f0bak = pitchf.copy()
+            if len(f0[~uv]) > 0:
-        f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
+                f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
-        f0_mel = np.clip(
+            f0[f0 < self.f0_min] = self.f0_min
            (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
        )
        pitch_coarse = f0_mel.astype(int)
-        return pitch_coarse, pitchf
+        f0 = f0 * 2 ** (float(f0_up_key) / 12)
        return f0
--- a/server/voice_changer/DiffusionSVC/pitchExtractor/PitchExtractorManager.py
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/PitchExtractorManager.py
@@ -1,5 +1,6 @@
 from typing import Protocol
 from const import PitchExtractorType
 from voice_changer.DiffusionSVC.pitchExtractor.CrepePitchExtractor import CrepePitchExtractor
 from voice_changer.DiffusionSVC.pitchExtractor.DioPitchExtractor import DioPitchExtractor
 from voice_changer.DiffusionSVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
@@ -29,8 +30,8 @@ class PitchExtractorManager(Protocol):
            return HarvestPitchExtractor()
        elif pitchExtractorType == "dio":
            return DioPitchExtractor()
-        # elif pitchExtractorType == "crepe":
+        elif pitchExtractorType == "crepe":
-        #     return CrepePitchExtractor()
+            return CrepePitchExtractor()
        # elif pitchExtractorType == "crepe_tiny":
        #     return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_tiny, gpu)
        # elif pitchExtractorType == "crepe_full":
--- a/server/voice_changer/common/VolumeExtractor.py
+++ b/server/voice_changer/common/VolumeExtractor.py
@@ -2,6 +2,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 class VolumeExtractor:
    def __init__(self, hop_size: float):