WIP: RMVPE

This commit is contained in:
w-okada 2023-07-15 09:34:29 +09:00
parent 7d7702bb79
commit 3ffacaed97
4 changed files with 8 additions and 4 deletions

View File

@ -9,7 +9,6 @@ from Exceptions import (
NotEnoughDataExtimateF0, NotEnoughDataExtimateF0,
) )
from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.F0Extractor import F0_Extractor
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.RVC.embedder.Embedder import Embedder from voice_changer.RVC.embedder.Embedder import Embedder

View File

@ -3,6 +3,7 @@ import numpy as np
from const import PitchExtractorType from const import PitchExtractorType
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.DiffusionSVC.pitchExtractor.rmvpe.rmvpe import RMVPE from voice_changer.DiffusionSVC.pitchExtractor.rmvpe.rmvpe import RMVPE
from scipy.ndimage import zoom
class RMVPEPitchExtractor(PitchExtractor): class RMVPEPitchExtractor(PitchExtractor):
@ -24,7 +25,9 @@ class RMVPEPitchExtractor(PitchExtractor):
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0): def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
start_frame = int(silence_front * self.sapmle_rate / window) start_frame = int(silence_front * self.sapmle_rate / window)
real_silence_front = start_frame * window / self.sapmle_rate real_silence_front = start_frame * window / self.sapmle_rate
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):] audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
silented_frames = int(audio.size(0) // window) + 1
print("[RMVPE AUDI]", audio.device) print("[RMVPE AUDI]", audio.device)
print("[RMVPE RMVPE]", self.rmvpe.device) print("[RMVPE RMVPE]", self.rmvpe.device)
@ -47,6 +50,9 @@ class RMVPEPitchExtractor(PitchExtractor):
# pd = torchcrepe.filter.median(pd, 3) # pd = torchcrepe.filter.median(pd, 3)
# f0[pd < 0.1] = 0 # f0[pd < 0.1] = 0
# f0 = f0.squeeze() # f0 = f0.squeeze()
resize_factor = silented_frames / len(f0)
f0 = zoom(f0, resize_factor, order=0)
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]] pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
f0 = pitch f0 = pitch

View File

@ -15,4 +15,3 @@ class VoiceChangerParams:
crepe_onnx_full: str crepe_onnx_full: str
crepe_onnx_tiny: str crepe_onnx_tiny: str
rmvpe: str rmvpe: str