This commit is contained in:
w-okada 2023-08-26 13:48:03 +09:00
parent 8974bf78d2
commit 78af3b7fff
10 changed files with 97 additions and 3 deletions

View File

@ -21,7 +21,7 @@
{
"name": "configArea",
"options": {
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe"],
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx"],
"inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
}
}

File diff suppressed because one or more lines are too long

View File

@ -21,7 +21,7 @@
{
"name": "configArea",
"options": {
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe"],
"detectors": ["dio", "harvest", "crepe", "crepe_full", "crepe_tiny", "rmvpe", "rmvpe_onnx"],
"inputChunkNums": [8, 16, 24, 32, 40, 48, 64, 80, 96, 112, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048]
}
}

View File

@ -47,6 +47,8 @@ export const F0Detector = {
"crepe": "crepe",
"crepe_full": "crepe_full",
"crepe_tiny": "crepe_tiny",
"rmvpe": "rmvpe",
"rmvpe_onnx": "rmvpe_onnx",
} as const
export type F0Detector = typeof F0Detector[keyof typeof F0Detector]

View File

@ -62,6 +62,7 @@ def setupArgParser():
parser.add_argument("--crepe_onnx_full", type=str, default="pretrain/crepe_onnx_full.onnx", help="path to crepe_onnx_full")
parser.add_argument("--crepe_onnx_tiny", type=str, default="pretrain/crepe_onnx_tiny.onnx", help="path to crepe_onnx_tiny")
parser.add_argument("--rmvpe", type=str, default="pretrain/rmvpe.pt", help="path to rmvpe")
parser.add_argument("--rmvpe_onnx", type=str, default="pretrain/rmvpe.onnx", help="path to rmvpe onnx")
return parser
@ -103,6 +104,7 @@ voiceChangerParams = VoiceChangerParams(
crepe_onnx_full=args.crepe_onnx_full,
crepe_onnx_tiny=args.crepe_onnx_tiny,
rmvpe=args.rmvpe,
rmvpe_onnx=args.rmvpe_onnx,
sample_mode=args.sample_mode,
)
vcparams = VoiceChangerParamsManager.get_instance()

View File

@ -79,6 +79,7 @@ PitchExtractorType: TypeAlias = Literal[
"crepe_full",
"crepe_tiny",
"rmvpe",
"rmvpe_onnx",
]
ServerAudioDeviceType: TypeAlias = Literal[

View File

@ -18,6 +18,7 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
crepe_onnx_full = voiceChangerParams.crepe_onnx_full
crepe_onnx_tiny = voiceChangerParams.crepe_onnx_tiny
rmvpe = voiceChangerParams.rmvpe
rmvpe_onnx = voiceChangerParams.rmvpe_onnx
weight_files = [content_vec_500_onnx, hubert_base, hubert_base_jp, hubert_soft,
nsf_hifigan, crepe_onnx_full, crepe_onnx_tiny, rmvpe]
@ -109,6 +110,14 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
"position": 8,
}
)
if os.path.exists(rmvpe_onnx) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights_gpl/resolve/main/rmvpe/rmvpe_s.onnx",
"saveTo": rmvpe_onnx,
"position": 9,
}
)
with ThreadPoolExecutor() as pool:
pool.map(download, downloadParams)

View File

@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchE
from voice_changer.RVC.pitchExtractor.CrepePitchExtractor import CrepePitchExtractor
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.RVC.pitchExtractor.RMVPEPitchExtractor import RMVPEPitchExtractor
from voice_changer.RVC.pitchExtractor.RMVPOnnxEPitchExtractor import RMVPOnnxEPitchExtractor
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
@ -40,6 +41,8 @@ class PitchExtractorManager(Protocol):
return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
elif pitchExtractorType == "rmvpe":
return RMVPEPitchExtractor(cls.params.rmvpe, gpu)
elif pitchExtractorType == "rmvpe_onnx":
return RMVPOnnxEPitchExtractor(cls.params.rmvpe_onnx, gpu)
else:
# return hubert as default
print("[Voice Changer] PitchExctractor not found", pitchExtractorType)

View File

@ -0,0 +1,76 @@
import numpy as np
from const import PitchExtractorType
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
import onnxruntime
class RMVPOnnxEPitchExtractor(PitchExtractor):
    """Pitch (f0) extractor backed by an RMVPE model exported to ONNX.

    Runs the ONNX session over the tail of the input waveform (skipping a
    leading silent region), applies a semitone pitch shift, and quantizes the
    resulting f0 curve onto a 1..255 mel-scale coarse grid, mirroring the
    other RVC pitch extractors.
    """

    def __init__(self, file: str, gpu: int):
        """Load the ONNX model at *file* on the execution provider for *gpu*."""
        super().__init__()
        self.file = file
        self.pitchExtractorType: PitchExtractorType = "rmvpe_onnx"
        # f0 range and its mel-scale bounds used for coarse quantization.
        self.f0_min = 50
        self.f0_max = 1100
        self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
        self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)

        (
            onnxProviders,
            onnxProviderOptions,
        ) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
        self.onnxProviders = onnxProviders
        self.onnxProviderOptions = onnxProviderOptions

        so = onnxruntime.SessionOptions()
        # 3 = ERROR: suppress onnxruntime warning spam.
        so.log_severity_level = 3
        self.onnx_session = onnxruntime.InferenceSession(self.file, sess_options=so, providers=onnxProviders, provider_options=onnxProviderOptions)

    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        """Extract pitch from *audio*.

        Args:
            audio: 1-D waveform (numpy array or torch tensor).
            pitchf: 1-D f0 buffer to be updated in place (numpy or torch).
            f0_up_key: pitch shift in semitones.
            sr: sample rate of *audio*.
            window: hop size in samples used to align the silence offset.
            silence_front: leading silence to skip, in seconds.

        Returns:
            (f0_coarse, pitchf): integer mel-quantized f0 (1..255) and the
            updated continuous f0 buffer.

        Raises:
            RuntimeError: if inputs are not 1-D or ONNX inference fails.
        """
        try:
            # Convert torch tensors to numpy.
            # (fix: was `audio = audio = audio.cpu().numpy()` — duplicated assignment)
            if isinstance(audio, np.ndarray) is False:
                audio = audio.cpu().numpy()
            if isinstance(pitchf, np.ndarray) is False:
                pitchf = pitchf.cpu().numpy().astype(np.float32)
            if audio.ndim != 1:
                raise RuntimeError(f"Exeption in {self.__class__.__name__} audio.ndim is not 1 (size :{audio.ndim}, {audio.shape})")
            if pitchf.ndim != 1:
                raise RuntimeError(f"Exeption in {self.__class__.__name__} pitchf.ndim is not 1 (size :{pitchf.ndim}, {pitchf.shape})")

            # Skip the leading silent region, snapped down to a whole number
            # of analysis windows.
            silenceFrontFrame = silence_front * sr
            startWindow = int(silenceFrontFrame / window)  # floor
            slienceFrontFrameOffset = startWindow * window
            targetFrameLength = len(audio) - slienceFrontFrameOffset
            # Always process at least 10 ms of audio.
            # BUG FIX: 0.01 * sr is a float; if max() selected it, the slice
            # below raised "TypeError: slice indices must be integers".
            minimumFrames = int(0.01 * sr)
            targetFrameLength = max(minimumFrames, targetFrameLength)
            audio = audio[-targetFrameLength:]
            audio = np.expand_dims(audio, axis=0)  # add batch dim for ONNX

            output = self.onnx_session.run(
                ["f0", "uv"],
                {
                    "waveform": audio.astype(np.float32),
                    # 0.3 = voicing confidence threshold for RMVPE.
                    "threshold": np.array([0.3]).astype(np.float32),
                },
            )
            f0 = output[0].squeeze()

            # Apply semitone shift and write the result into the tail of pitchf.
            f0 *= pow(2, f0_up_key / 12)
            pitchf[-f0.shape[0]:] = f0[: pitchf.shape[0]]

            # Quantize to the 1..255 coarse mel grid (same scheme as the
            # other RVC pitch extractors).
            f0_mel = 1127.0 * np.log(1.0 + pitchf / 700.0)
            f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1
            f0_mel[f0_mel <= 1] = 1
            f0_mel[f0_mel > 255] = 255
            f0_coarse = np.rint(f0_mel).astype(int)
        except Exception as e:
            raise RuntimeError(f"Exeption in {self.__class__.__name__}", e)

        return f0_coarse, pitchf

View File

@ -15,3 +15,4 @@ class VoiceChangerParams:
crepe_onnx_full: str
crepe_onnx_tiny: str
rmvpe: str
rmvpe_onnx: str