From 3ffacaed9772ff7871213970941b57bd8ad2ad50 Mon Sep 17 00:00:00 2001 From: w-okada Date: Sat, 15 Jul 2023 09:34:29 +0900 Subject: [PATCH] WIP: RMVPE --- server/voice_changer/DiffusionSVC/pipeline/Pipeline.py | 1 - .../DiffusionSVC/pitchExtractor/RMVPEPitchExtractor.py | 6 ++++++ .../DiffusionSVC/pitchExtractor/rmvpe/rmvpe.py | 4 ++-- server/voice_changer/utils/VoiceChangerParams.py | 1 - 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py index 2d988442..2d5ae0ad 100644 --- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py +++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py @@ -9,7 +9,6 @@ from Exceptions import ( NotEnoughDataExtimateF0, ) from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer -from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.F0Extractor import F0_Extractor from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor from voice_changer.RVC.embedder.Embedder import Embedder diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/RMVPEPitchExtractor.py b/server/voice_changer/DiffusionSVC/pitchExtractor/RMVPEPitchExtractor.py index ce543b4b..8fe7ab1c 100644 --- a/server/voice_changer/DiffusionSVC/pitchExtractor/RMVPEPitchExtractor.py +++ b/server/voice_changer/DiffusionSVC/pitchExtractor/RMVPEPitchExtractor.py @@ -3,6 +3,7 @@ import numpy as np from const import PitchExtractorType from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor from voice_changer.DiffusionSVC.pitchExtractor.rmvpe.rmvpe import RMVPE +from scipy.ndimage import zoom class RMVPEPitchExtractor(PitchExtractor): @@ -24,7 +25,9 @@ class RMVPEPitchExtractor(PitchExtractor): def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0): start_frame = int(silence_front * self.sapmle_rate / window) real_silence_front = start_frame * window / self.sapmle_rate + audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):] + silented_frames = int(audio.size(0) // window) + 1 print("[RMVPE AUDI]", audio.device) print("[RMVPE RMVPE]", self.rmvpe.device) @@ -47,6 +50,9 @@ class RMVPEPitchExtractor(PitchExtractor): # pd = torchcrepe.filter.median(pd, 3) # f0[pd < 0.1] = 0 # f0 = f0.squeeze() + resize_factor = silented_frames / len(f0) + f0 = zoom(f0, resize_factor, order=0) + pitch[-f0.shape[0]:] = f0[:pitch.shape[0]] f0 = pitch diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/rmvpe/rmvpe.py b/server/voice_changer/DiffusionSVC/pitchExtractor/rmvpe/rmvpe.py index 8d36e504..60d7de81 100644 --- a/server/voice_changer/DiffusionSVC/pitchExtractor/rmvpe/rmvpe.py +++ b/server/voice_changer/DiffusionSVC/pitchExtractor/rmvpe/rmvpe.py @@ -249,7 +249,7 @@ class E2E(nn.Module): ) # else: # self.fc = nn.Sequential( - # nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + # nn.Linear(3 * N_MELS, N_CLASS), nn.Dropout(0.25), nn.Sigmoid() # ) def forward(self, mel): @@ -392,7 +392,7 @@ class RMVPE: hidden = hidden.astype("float32") f0 = self.decode(hidden, thred=thred) return f0 - + def to_local_average_cents(self, salience, thred=0.05): # t0 = ttime() center = np.argmax(salience, axis=1) # 帧长#index diff --git a/server/voice_changer/utils/VoiceChangerParams.py b/server/voice_changer/utils/VoiceChangerParams.py index b640227e..3c8f369d 100644 --- a/server/voice_changer/utils/VoiceChangerParams.py +++ b/server/voice_changer/utils/VoiceChangerParams.py @@ -15,4 +15,3 @@ class VoiceChangerParams: crepe_onnx_full: str crepe_onnx_tiny: str rmvpe: str -