protect + crepe対応

2025-01-23 21:45:00 +03:00 · 2023-05-31 23:50:43 +09:00 · 2023-05-31 23:50:43 +09:00 · 6b9777f3a2
commit 6b9777f3a2
parent dcfc1d83cf
8 changed files with 37 additions and 5 deletions
--- a/server/voice_changer/RVC/ModelSlot.py
+++ b/server/voice_changer/RVC/ModelSlot.py
@ -12,6 +12,7 @@ class ModelSlot:
    indexFile: str = ""
    defaultTune: int = 0
    defaultIndexRatio: int = 1
+    # defaultProtect: float = .5
    isONNX: bool = False
    modelType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
    samplingRate: int = -1
--- a/server/voice_changer/RVC/ModelSlotGenerator.py
+++ b/server/voice_changer/RVC/ModelSlotGenerator.py
@ -36,6 +36,9 @@ def generateModelSlot(slotDir: str):
        modelSlot.defaultIndexRatio = (
            params["defaultIndexRatio"] if "defaultIndexRatio" in params else 0
        )
+        # modelSlot.defaultProtect = (
+        #     params["defaultProtect"] if "defaultProtect" in params else 0.5
+        # )
        modelSlot.name = params["name"] if "name" in params else None
        modelSlot.description = params["description"] if "description" in params else None
        modelSlot.credit = params["credit"] if "credit" in params else None
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@ -242,6 +242,7 @@ class RVC:
        # その他の設定
        self.next_trans = modelSlot.defaultTune
        self.next_index_ratio = modelSlot.defaultIndexRatio
+        # self.next_protect = modelSlot.defaultProtect
        self.next_samplingRate = modelSlot.samplingRate
        self.next_framework = "ONNX" if modelSlot.isONNX else "PyTorch"
        # self.needSwitch = True
@ -254,6 +255,7 @@ class RVC:
        self.pipeline = self.next_pipeline
        self.settings.tran = self.next_trans
        self.settings.indexRatio = self.next_index_ratio
+        # self.settings.protect = self.next_protect
        self.settings.modelSamplingRate = self.next_samplingRate
        self.settings.framework = self.next_framework

@ -336,6 +338,7 @@ class RVC:
        sid = 0
        f0_up_key = self.settings.tran
        index_rate = self.settings.indexRatio
+        protect = .5# self.settings.protect
        if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
        embOutputLayer = self.settings.modelSlots[self.currentSlot].embOutputLayer
        useFinalProj = self.settings.modelSlots[self.currentSlot].useFinalProj
@ -350,6 +353,7 @@ class RVC:
            embOutputLayer,
            useFinalProj,
            repeat,
+            protect
        )

        result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
@ -411,6 +415,7 @@ class RVC:
        params = {
            "defaultTune": req.defaultTune,
            "defaultIndexRatio": req.defaultIndexRatio,
+            # "defaultProtect": req.defaultProtect
            "sampleId": "",
            "files": {"rvcModel": storeFile},
        }
@ -432,6 +437,7 @@ class RVC:
        )
        params["defaultTune"] = self.settings.tran
        params["defaultIndexRatio"] = self.settings.indexRatio
+        # params["defaultProtect"] = self.settings.protect

        json.dump(params, open(os.path.join(slotDir, "params.json"), "w"))
        self.loadSlots()
--- a/server/voice_changer/RVC/RVCSettings.py
+++ b/server/voice_changer/RVC/RVCSettings.py
@ -29,6 +29,7 @@ class RVCSettings:
    sampleModels: list[RVCModelSample] = field(default_factory=lambda: [])

    indexRatio: float = 0
+    # protect: float = 0.5
    rvcQuality: int = 0
    silenceFront: int = 1  # 0:off, 1:on
    modelSamplingRate: int = 48000
@ -50,5 +51,5 @@ class RVCSettings:
        "isHalf",
        "enableDirectML",
    ]
-    floatData = ["silentThreshold", "indexRatio"]
+    floatData = ["silentThreshold", "indexRatio"]  # , "protect"]
    strData = ["framework", "f0Detector"]
--- a/server/voice_changer/RVC/SampleDownloader.py
+++ b/server/voice_changer/RVC/SampleDownloader.py
@ -81,6 +81,7 @@ def downloadInitialSampleModels(sampleJsons: list[str], model_dir: str):
        sampleParams["sampleId"] = sample.id
        sampleParams["defaultTune"] = 0
        sampleParams["defaultIndexRatio"] = 1
+        # sampleParams["defaultProtect"] = 0.5
        sampleParams["credit"] = sample.credit
        sampleParams["description"] = sample.description
        sampleParams["name"] = sample.name
--- a/server/voice_changer/RVC/modelMerger/MergeModelRequest.py
+++ b/server/voice_changer/RVC/modelMerger/MergeModelRequest.py
@ -17,4 +17,5 @@ class MergeModelRequest:
    slot: int = -1
    defaultTune: int = 0
    defaultIndexRatio: int = 1
+    # defaultProtect: float = .5
    files: List[MergeFile] = field(default_factory=lambda: [])
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@ -85,7 +85,9 @@ class Pipeline(object):
        embOutputLayer,
        useFinalProj,
        repeat,
+        protect=0.5,
    ):
+        search_index = self.index is not None and self.big_npy is not None and index_rate != 0
        self.t_pad = self.sr * repeat
        self.t_pad_tgt = self.targetSR * repeat

@ -136,10 +138,12 @@ class Pipeline(object):
                raise DeviceChangingException()
            else:
                raise e
+        if protect < 0.5 and search_index:
+             feats0 = feats.clone()   

        # Index - feature抽出
        # if self.index is not None and self.feature is not None and index_rate != 0:
-        if self.index is not None and self.big_npy is not None and index_rate != 0:
+        if search_index:
            npy = feats[0].cpu().numpy()
            if self.isHalf is True:
                npy = npy.astype("float32")
@ -165,7 +169,10 @@ class Pipeline(object):
                + (1 - index_rate) * feats
            )
        feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-
+        if protect < 0.5 and search_index:
+            feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
        # ピッチサイズ調整
        p_len = audio_pad.shape[0] // self.window
        if feats.shape[1] < p_len:
@ -173,6 +180,15 @@ class Pipeline(object):
            if pitch is not None and pitchf is not None:
                pitch = pitch[:, :p_len]
                pitchf = pitchf[:, :p_len]
+
+        # pitchの推定が上手くいかない(pitchf=0)場合、検索前の特徴を混ぜる
+        if protect < 0.5 and search_index:
+            pitchff = pitchf.clone()
+            pitchff[pitchf > 0] = 1
+            pitchff[pitchf < 1] = protect
+            pitchff = pitchff.unsqueeze(-1)
+            feats = feats * pitchff + feats0 * (1 - pitchff)
+            feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()

        # 推論実行
--- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
@ -26,7 +26,7 @@ class CrepePitchExtractor(PitchExtractor):
        f0_mel_min = 1127 * np.log(1 + f0_min / 700)
        f0_mel_max = 1127 * np.log(1 + f0_max / 700)

-        f0 = torchcrepe.predict(
+        f0, pd = torchcrepe.predict(
            audio.unsqueeze(0),
            sr,
            hop_length=window,
@ -37,8 +37,11 @@ class CrepePitchExtractor(PitchExtractor):
            batch_size=256,
            decoder=torchcrepe.decode.weighted_argmax,
            device=self.device,
+            return_periodicity=True,
        )
-        f0 = torchcrepe.filter.median(f0, 3)
+        f0 = torchcrepe.filter.median(f0, 3) # 本家だとmeanですが、harvestに合わせmedianフィルタ
+        pd = torchcrepe.filter.median(pd, 3)
+        f0[pd < 0.1] = 0
        f0 = f0.squeeze()

        f0 = torch.nn.functional.pad(