bugfix: rvc quality, gui improve: index default to 0.0

wataru 2023-06-16 11:32:23 +09:00
parent d7e541ffa7
commit 40eccd653f
3 changed files with 52 additions and 113 deletions

File 1 of 3: class RVC

@ -65,9 +65,7 @@ class RVC:
needSwitch: bool = False
def __init__(self, params: VoiceChangerParams):
-self.pitchExtractor = PitchExtractorManager.getPitchExtractor(
-    self.settings.f0Detector
-)
+self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
self.params = params
EmbedderManager.initialize(params)
self.loadSlots()
@ -75,9 +73,7 @@ class RVC:
# Build the sample catalog
sampleJsons: list[str] = []
-sampleJsonUrls, _sampleModels = getRVCSampleJsonAndModelIds(
-    params.rvc_sample_mode
-)
+sampleJsonUrls, _sampleModels = getRVCSampleJsonAndModelIds(params.rvc_sample_mode)
for url in sampleJsonUrls:
filename = os.path.basename(url)
sampleJsons.append(filename)
@ -143,9 +139,7 @@ class RVC:
# slotInfo.f0 = sampleInfo.f0
else:
slotInfo.modelFile = params["files"]["rvcModel"]
-slotInfo.indexFile = (
-    params["files"]["rvcIndex"] if "rvcIndex" in params["files"] else None
-)
+slotInfo.indexFile = params["files"]["rvcIndex"] if "rvcIndex" in params["files"] else None
slotInfo.defaultTune = params["defaultTune"]
slotInfo.defaultIndexRatio = params["defaultIndexRatio"]
@ -159,9 +153,7 @@ class RVC:
# Check the metadata and move the files into the persistent model folder.
# While doing so, rewrite the file locations recorded in the metadata as well.
-slotDir = os.path.join(
-    self.params.model_dir, RVC_MODEL_DIRNAME, str(target_slot_idx)
-)
+slotDir = os.path.join(self.params.model_dir, RVC_MODEL_DIRNAME, str(target_slot_idx))
os.makedirs(slotDir, exist_ok=True)
slotInfo.modelFile = self.moveToModelDir(slotInfo.modelFile, slotDir)
if slotInfo.indexFile is not None and len(slotInfo.indexFile) > 0:
@ -189,9 +181,7 @@ class RVC:
modelSlots: list[ModelSlot] = []
for slot_idx in range(len(self.settings.modelSlots)):
-slotDir = os.path.join(
-    self.params.model_dir, RVC_MODEL_DIRNAME, str(slot_idx)
-)
+slotDir = os.path.join(self.params.model_dir, RVC_MODEL_DIRNAME, str(slot_idx))
jsonDict = os.path.join(slotDir, "params.json")
if os.path.exists(jsonDict):
jsonDict = json.load(open(os.path.join(slotDir, "params.json")))
@ -209,10 +199,7 @@ class RVC:
if val < 0:
return True
val = val % 1000  # Quick hack for when the same slot is selected
-if (
-    self.settings.modelSlots[val].modelFile is None
-    or self.settings.modelSlots[val].modelFile == ""
-):
+if self.settings.modelSlots[val].modelFile is None or self.settings.modelSlots[val].modelFile == "":
print("[Voice Changer] slot does not have model.")
return True
self.prepareModel(val)
@ -229,9 +216,7 @@ class RVC:
elif key in self.settings.strData:
setattr(self.settings, key, str(val))
if key == "f0Detector" and self.pipeline is not None:
-pitchExtractor = PitchExtractorManager.getPitchExtractor(
-    self.settings.f0Detector
-)
+pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
self.pipeline.setPitchExtractor(pitchExtractor)
else:
return False
@ -246,9 +231,7 @@ class RVC:
print("[Voice Changer] Prepare Model of slot:", slot)
# Create the pipeline
-self.next_pipeline = createPipeline(
-    modelSlot, self.settings.gpu, self.settings.f0Detector
-)
+self.next_pipeline = createPipeline(modelSlot, self.settings.gpu, self.settings.f0Detector)
# Other settings
self.next_trans = modelSlot.defaultTune
@ -293,9 +276,7 @@ class RVC:
crossfadeSize: int,
solaSearchFrame: int = 0,
):
-newData = (
-    newData.astype(np.float32) / 32768.0
-)  # Arrives at the RVC model's sampling rate; extraDataLength, crossfade, etc. are handled at the same SR (★1).
+newData = newData.astype(np.float32) / 32768.0  # Arrives at the RVC model's sampling rate; extraDataLength, crossfade, etc. are handled at the same SR (★1).
if self.audio_buffer is not None:
# Concatenate onto the past data
@ -303,18 +284,14 @@ class RVC:
else:
self.audio_buffer = newData
-convertSize = (
-    inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
-)
+convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
if convertSize % 128 != 0:  # Truncation occurs at the model's output hop size, so compensate for it.
convertSize = convertSize + (128 - (convertSize % 128))
# If the buffer has not accumulated enough samples, pad with zeros
if self.audio_buffer.shape[0] < convertSize:
-self.audio_buffer = np.concatenate(
-    [np.zeros([convertSize]), self.audio_buffer]
-)
+self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
convertOffset = -1 * convertSize
self.audio_buffer = self.audio_buffer[convertOffset:]  # Extract only the portion to be converted
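
The hunk above rounds convertSize up to the next multiple of 128 so the model's output hop size never truncates the window. A minimal standalone sketch of that rounding; the helper name and the concrete sizes are illustrative, not from the project:

```python
# Round a conversion window up to a multiple of the 128-sample hop size,
# mirroring the convertSize logic in the hunk above. Numbers are made up.
def round_to_hop(convert_size: int, hop: int = 128) -> int:
    if convert_size % hop != 0:
        convert_size += hop - (convert_size % hop)
    return convert_size

assert round_to_hop(4000) == 4096  # rounded up to the next multiple of 128
assert round_to_hop(4096) == 4096  # already aligned, left unchanged
```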
@ -324,9 +301,7 @@ class RVC:
else:
device = torch.device("cpu")
-audio_buffer = torch.from_numpy(self.audio_buffer).to(
-    device=device, dtype=torch.float32
-)
+audio_buffer = torch.from_numpy(self.audio_buffer).to(device=device, dtype=torch.float32)
# Cut out only the output portion and check its volume. (TODO: make the muting gradual)
cropOffset = -1 * (inputSize + crossfadeSize)
@ -354,7 +329,7 @@ class RVC:
# self.needSwitch = False
# half = self.deviceManager.halfPrecisionAvailable(self.settings.gpu)
-half = self.pipeline.isHalf
+# half = self.pipeline.isHalf
audio = data[0]
convertSize = data[1]
@ -363,11 +338,9 @@ class RVC:
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16)
-audio = torchaudio.functional.resample(
-    audio, self.settings.modelSamplingRate, 16000, rolloff=0.99
-)
-repeat = 3 if half else 1
-repeat *= self.settings.rvcQuality  # 0 or 3
+audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
+repeat = 1 if self.settings.rvcQuality else 0
+print()
sid = 0
f0_up_key = self.settings.tran
index_rate = self.settings.indexRatio
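
This hunk carries the "rvc quality" bugfix named in the commit title: previously repeat depended on half precision (3 if half else 1, multiplied by rvcQuality), so quality mode padded three times as much on half-precision devices; now the quality toggle maps directly to 0 or 1. A before/after sketch, with the settings passed in as plain arguments for illustration:

```python
def repeat_before(half: bool, rvc_quality: int) -> int:
    # Old behavior: half precision changed the padding multiplier.
    repeat = 3 if half else 1
    repeat *= rvc_quality  # rvcQuality is 0 or 1
    return repeat

def repeat_after(rvc_quality: int) -> int:
    # New behavior: quality alone decides, one unit of padding at most.
    return 1 if rvc_quality else 0

assert repeat_before(half=True, rvc_quality=1) == 3
assert repeat_after(rvc_quality=1) == 1
assert repeat_after(rvc_quality=0) == 0
```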
@ -382,8 +355,7 @@ class RVC:
f0_up_key,
index_rate,
if_f0,
-self.settings.extraConvertSize
-/ self.settings.modelSamplingRate,  # extraDataSize in seconds; handled at the RVC model's sampling rate (★1).
+self.settings.extraConvertSize / self.settings.modelSamplingRate,  # extraDataSize in seconds; handled at the RVC model's sampling rate (★1).
embOutputLayer,
useFinalProj,
repeat,
@ -393,9 +365,7 @@ class RVC:
return result
except DeviceCannotSupportHalfPrecisionException as e:
-print(
-    "[Device Manager] Device cannot support half precision. Fallback to float...."
-)
+print("[Device Manager] Device cannot support half precision. Fallback to float....")
self.deviceManager.setForceTensor(True)
self.prepareModel(self.settings.modelSlotIndex)
raise e
@ -461,9 +431,7 @@ class RVC:
"sampleId": "",
"files": {"rvcModel": storeFile},
}
-props: LoadModelParams = LoadModelParams(
-    slot=targetSlot, isHalf=True, params=params
-)
+props: LoadModelParams = LoadModelParams(slot=targetSlot, isHalf=True, params=params)
self.loadModel(props)
self.prepareModel(targetSlot)
self.settings.modelSlotIndex = targetSlot
@ -471,12 +439,8 @@ class RVC:
def update_model_default(self):
print("[Voice Changer] UPDATE MODEL DEFAULT!!")
-slotDir = os.path.join(
-    self.params.model_dir, RVC_MODEL_DIRNAME, str(self.currentSlot)
-)
-params = json.load(
-    open(os.path.join(slotDir, "params.json"), "r", encoding="utf-8")
-)
+slotDir = os.path.join(self.params.model_dir, RVC_MODEL_DIRNAME, str(self.currentSlot))
+params = json.load(open(os.path.join(slotDir, "params.json"), "r", encoding="utf-8"))
params["defaultTune"] = self.settings.tran
params["defaultIndexRatio"] = self.settings.indexRatio
params["defaultProtect"] = self.settings.protect
@ -488,14 +452,10 @@ class RVC:
print("[Voice Changer] UPDATE MODEL INFO", newData)
newDataDict = json.loads(newData)
try:
-slotDir = os.path.join(
-    self.params.model_dir, RVC_MODEL_DIRNAME, str(newDataDict["slot"])
-)
+slotDir = os.path.join(self.params.model_dir, RVC_MODEL_DIRNAME, str(newDataDict["slot"]))
except Exception as e:
print("Exception::::", e)
-params = json.load(
-    open(os.path.join(slotDir, "params.json"), "r", encoding="utf-8")
-)
+params = json.load(open(os.path.join(slotDir, "params.json"), "r", encoding="utf-8"))
params[newDataDict["key"]] = newDataDict["val"]
json.dump(params, open(os.path.join(slotDir, "params.json"), "w"))
self.loadSlots()
@ -504,9 +464,7 @@ class RVC:
print("[Voice Changer] UPLOAD ASSETS", params)
paramsDict = json.loads(params)
uploadPath = os.path.join(UPLOAD_DIR, paramsDict["file"])
-storeDir = os.path.join(
-    self.params.model_dir, RVC_MODEL_DIRNAME, str(paramsDict["slot"])
-)
+storeDir = os.path.join(self.params.model_dir, RVC_MODEL_DIRNAME, str(paramsDict["slot"]))
storePath = os.path.join(
storeDir,
paramsDict["file"],

File 2 of 3: sample download helpers (downloadInitialSampleModels and related)

@ -17,9 +17,7 @@ def checkRvcModelExist(model_dir: str):
return True
-def downloadInitialSampleModels(
-    sampleJsons: list[str], sampleModelIds: list[str], model_dir: str
-):
+def downloadInitialSampleModels(sampleJsons: list[str], sampleModelIds: list[str], model_dir: str):
sampleModels = getModelSamples(sampleJsons, "RVC")
if sampleModels is None:
return
@ -58,11 +56,7 @@ def downloadInitialSampleModels(
slotInfo.modelFile = modelFilePath
line_num += 1
-if (
-    initSampleId[1] is True
-    and hasattr(sample, "indexUrl")
-    and sample.indexUrl != ""
-):
+if initSampleId[1] is True and hasattr(sample, "indexUrl") and sample.indexUrl != "":
indexPath = os.path.join(
slotDir,
os.path.basename(sample.indexUrl),
@ -97,7 +91,7 @@ def downloadInitialSampleModels(
slotInfo.name = sample.name
slotInfo.termsOfUseUrl = sample.termsOfUseUrl
slotInfo.defaultTune = 0
-slotInfo.defaultIndexRatio = 1
+slotInfo.defaultIndexRatio = 0.0
slotInfo.defaultProtect = 0.5
slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
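
This one-line change is the "index default to 0.0" half of the commit title: sample slots now start with defaultIndexRatio = 0.0 instead of 1. As the Pipeline changes later in this diff show, index search only runs when index_rate != 0, so the new default leaves the faiss index inactive until the user raises the ratio. A minimal sketch of that gate; the stand-in objects below only illustrate the non-None checks:

```python
def should_search_index(index, big_npy, index_rate: float) -> bool:
    # Mirrors the search_index condition in Pipeline: all three must hold.
    return index is not None and big_npy is not None and index_rate != 0

assert should_search_index(object(), object(), 1.0) is True
assert should_search_index(object(), object(), 0.0) is False  # new default: index off
assert should_search_index(None, None, 1.0) is False          # no index loaded
```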
@ -141,11 +135,7 @@ def downloadModelFiles(sampleInfo: RVCModelSample, useIndex: bool = True):
)
indexPath = None
-if (
-    useIndex is True
-    and hasattr(sampleInfo, "indexUrl")
-    and sampleInfo.indexUrl != ""
-):
+if useIndex is True and hasattr(sampleInfo, "indexUrl") and sampleInfo.indexUrl != "":
print("[Voice Changer] Download sample with index.")
indexPath = os.path.join(TMP_DIR, os.path.basename(sampleInfo.indexUrl))
downloadParams.append(

File 3 of 3: class Pipeline

@ -47,9 +47,7 @@ class Pipeline(object):
print("GENERATE PITCH EXTRACTOR", self.pitchExtractor)
self.index = index
-self.big_npy = (
-    index.reconstruct_n(0, index.ntotal) if index is not None else None
-)
+self.big_npy = index.reconstruct_n(0, index.ntotal) if index is not None else None
# self.feature = feature
self.targetSR = targetSR
@ -63,11 +61,7 @@ class Pipeline(object):
inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
embedderInfo = self.embedder.getEmbedderInfo()
pitchExtractorInfo = self.pitchExtractor.getPitchExtractorInfo()
-return {
-    "inferencer": inferencerInfo,
-    "embedder": embedderInfo,
-    "pitchExtractor": pitchExtractorInfo,
-}
+return {"inferencer": inferencerInfo, "embedder": embedderInfo, "pitchExtractor": pitchExtractorInfo, "isHalf": self.isHalf}
def setPitchExtractor(self, pitchExtractor: PitchExtractor):
self.pitchExtractor = pitchExtractor
@ -87,17 +81,24 @@ class Pipeline(object):
):
# Audio comes in at a 16000 Hz sampling rate; from here on, everything is processed at 16000.
-search_index = (
-    self.index is not None and self.big_npy is not None and index_rate != 0
-)
-self.t_pad = self.sr * repeat
-self.t_pad_tgt = self.targetSR * repeat
-audio_pad = F.pad(
-    audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect"
-).squeeze(0)
+search_index = self.index is not None and self.big_npy is not None and index_rate != 0
+# self.t_pad = self.sr * repeat  # 1 second
+# self.t_pad_tgt = self.targetSR * repeat  # 1 second; trimmed at output time (output comes at the model's sampling rate)
+audio = audio.unsqueeze(0)
+quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr  # The reflect padding size must be smaller than the original size.
+self.t_pad = round(self.sr * quality_padding_sec)  # Pad audio before and after.
+self.t_pad_tgt = round(self.targetSR * quality_padding_sec)  # Padding before/after; trimmed at output time (output comes at the model's sampling rate).
+print("audio shape", self.t_pad, self.t_pad_tgt, audio.shape)
+audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
p_len = audio_pad.shape[0] // self.window
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+# When RVC Quality is on, turn silence_front off.
+silence_front = silence_front if repeat == 0 else 0
+print("silence_front", silence_front)
# Pitch detection
pitch, pitchf = None, None
try:
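
The padding rewrite above is the core of the quality bugfix: PyTorch's reflect padding requires the pad amount on each side to be smaller than the input length, so the old fixed self.sr * repeat pad could fail on buffers shorter than repeat seconds. The new code caps the pad at audio.shape[1] - 1 samples per side. A small sketch of the constraint, using an illustrative 1000-sample buffer:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 1000)   # a short 1000-sample buffer
pad = x.shape[1] - 1       # largest reflect pad allowed per side

padded = F.pad(x, (pad, pad), mode="reflect")
assert padded.shape[1] == 1000 + 2 * pad

# Padding by the full input length (or more) is rejected by PyTorch:
try:
    F.pad(x, (x.shape[1], x.shape[1]), mode="reflect")
except RuntimeError:
    print("reflect pad must be smaller than the input size")
```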
@ -112,9 +113,7 @@ class Pipeline(object):
pitch = pitch[:p_len]
pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
-pitchf = torch.tensor(
-    pitchf, device=self.device, dtype=torch.float
-).unsqueeze(0)
+pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
except IndexError:
# print(e)
raise NotEnoughDataExtimateF0()
@ -169,21 +168,14 @@ class Pipeline(object):
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
# recover silent front
-npy = np.concatenate(
-    [np.zeros([npyOffset, npy.shape[1]]).astype("float32"), npy]
-)
+npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]).astype("float32"), npy])
if self.isHalf is True:
npy = npy.astype("float16")
-feats = (
-    torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
-    + (1 - index_rate) * feats
-)
+feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if protect < 0.5 and search_index:
-feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
-    0, 2, 1
-)
+feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
# Adjust pitch size
p_len = audio_pad.shape[0] // self.window
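
For reference, the feats assignment above linearly blends the index-retrieved features into the HuBERT features with weight index_rate, which is why the new 0.0 default leaves the embedding untouched. A small standalone sketch of the blend; shapes and names are illustrative:

```python
import torch

def blend(retrieved: torch.Tensor, feats: torch.Tensor, index_rate: float) -> torch.Tensor:
    # index_rate 0.0 keeps feats unchanged; 1.0 fully replaces them.
    return retrieved * index_rate + (1 - index_rate) * feats

feats = torch.randn(1, 50, 256)
retrieved = torch.randn(1, 50, 256)
assert torch.equal(blend(retrieved, feats, 0.0), feats)
assert torch.equal(blend(retrieved, feats, 1.0), retrieved)
```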
@ -219,14 +211,11 @@ class Pipeline(object):
with torch.no_grad():
audio1 = (
torch.clip(
-self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][
-    0, 0
-].to(dtype=torch.float32),
+self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32),
-1.0,
1.0,
)
* 32767.5
- 0.5
).data.to(dtype=torch.int16)
except RuntimeError as e:
if "HALF" in e.__str__().upper():
@ -238,6 +227,8 @@ class Pipeline(object):
del feats, p_len, padding_mask
torch.cuda.empty_cache()
+# The sampling rate coming out of infer is the model's sampling rate.
+# Audio fed into the pipeline is 16k, for HuBERT.
if self.t_pad_tgt != 0:
offset = self.t_pad_tgt
end = -1 * self.t_pad_tgt