WIP: Japanese Hubert

wataru 2023-05-02 22:29:28 +09:00
parent 72fb482dc7
commit 6a01467ac8
10 changed files with 214 additions and 87 deletions

View File

@@ -82,6 +82,11 @@ class EnumInferenceTypes(Enum):
onnxRVCNono = "onnxRVCNono"
class EnumPitchExtractorTypes(Enum):
harvest = "harvest"
dio = "dio"
class EnumFrameworkTypes(Enum):
pyTorch = "pyTorch"
onnx = "onnx"

View File

@@ -1,6 +1,9 @@
import sys
import os
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
# avoid an argparse error inside RVC
sys.argv = ["MMVCServerSIO.py"]
@@ -55,10 +58,14 @@ class RVC:
audio_buffer: AudioInOut | None = None
embedder: Embedder | None = None
inferencer: Inferencer | None = None
pitchExtractor: PitchExtractor | None = None
def __init__(self, params: VoiceChangerParams):
self.initialLoad = True
self.settings = RVCSettings()
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(
self.settings.f0Detector
)
self.feature_file = None
self.index_file = None
@@ -102,6 +109,15 @@ class RVC:
return self.get_info()
def _getDevice(self):
if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
dev = torch.device("cpu")
elif self.mps_enabled:
dev = torch.device("mps")
else:
dev = torch.device("cuda", index=self.settings.gpu)
return dev
def prepareModel(self, slot: int):
if slot < 0:
return self.get_info()
@@ -110,20 +126,14 @@ class RVC:
filename = (
modelSlot.onnxModelFile if modelSlot.isONNX else modelSlot.pyTorchModelFile
)
if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
dev = torch.device("cpu")
elif self.mps_enabled:
dev = torch.device("mps")
else:
dev = torch.device("cuda", index=self.settings.gpu)
dev = self._getDevice()
# Load the Inferencer
inferencer = InferencerManager.getInferencer(
modelSlot.modelType,
filename,
self.settings.isHalf,
torch.device("cuda:0"),
dev,
)
self.next_inferencer = inferencer
@@ -156,8 +166,14 @@
def switchModel(self):
print("[Voice Changer] Switching model..")
# del self.net_g
# del self.onnx_session
dev = self._getDevice()
# The embedder can most likely be reused regardless of the model, so it is fetched here when switching
try:
self.embedder = EmbedderManager.getEmbedder(
self.next_embedder,
@@ -330,6 +346,7 @@ class RVC:
# self.hubert_model,
self.embedder,
self.onnx_session,
self.pitchExtractor,
sid,
audio,
f0_up_key,
@@ -391,6 +408,7 @@ class RVC:
audio_out = vc.pipeline(
self.embedder,
self.inferencer,
self.pitchExtractor,
sid,
audio,
f0_up_key,
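The _getDevice helper shown above centralizes the CPU/MPS/CUDA fallback that was previously duplicated at each call site. A standalone sketch of the same selection order, assuming PyTorch >= 1.12 for the MPS backend (the free-function form and name are illustrative, not part of the commit):

import torch

# Same fallback order as _getDevice: a negative gpu index forces CPU,
# MPS is preferred when available, otherwise the requested CUDA index.
def pick_device(gpu: int) -> torch.device:
    mps_ok = torch.backends.mps.is_available()
    if gpu < 0 or (torch.cuda.device_count() == 0 and not mps_ok):
        return torch.device("cpu")
    if mps_ok:
        return torch.device("mps")
    return torch.device("cuda", index=gpu)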

View File

@@ -3,10 +3,10 @@ import numpy as np
# import parselmouth
import torch
import torch.nn.functional as F
import scipy.signal as signal
import pyworld
from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class VC(object):
@@ -18,62 +18,11 @@ class VC(object):
self.device = device
self.is_half = is_half
def get_f0(self, audio, p_len, f0_up_key, f0_method, silence_front=0):
n_frames = int(len(audio) // self.window) + 1
start_frame = int(silence_front * self.sr / self.window)
real_silence_front = start_frame * self.window / self.sr
silence_front_offset = int(np.round(real_silence_front * self.sr))
audio = audio[silence_front_offset:]
# time_step = self.window / self.sr * 1000
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "dio":
_f0, t = pyworld.dio(
audio.astype(np.double),
self.sr,
f0_floor=f0_min,
f0_ceil=f0_max,
channels_in_octave=2,
frame_period=10,
)
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, self.sr)
f0 = np.pad(
f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)
)
else:
f0, t = pyworld.harvest(
audio.astype(np.double),
fs=self.sr,
f0_ceil=f0_max,
frame_period=10,
)
f0 = pyworld.stonemask(audio.astype(np.double), f0, t, self.sr)
f0 = signal.medfilt(f0, 3)
f0 = np.pad(
f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)
)
f0 *= pow(2, f0_up_key / 12)
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(np.int)
return f0_coarse, f0bak
def pipeline(
self,
embedder: Embedder,
model,
inferencer: Inferencer,
pitchExtractor: PitchExtractor,
sid,
audio,
f0_up_key,
@@ -92,11 +41,11 @@ class VC(object):
# Pitch detection
pitch, pitchf = None, None
if if_f0 == 1:
pitch, pitchf = self.get_f0(
pitch, pitchf = pitchExtractor.extract(
audio_pad,
p_len,
f0_up_key,
f0_method,
self.sr,
self.window,
silence_front=silence_front,
)
pitch = pitch[:p_len]
@@ -156,16 +105,19 @@ class VC(object):
with torch.no_grad():
if pitch is not None:
audio1 = (
(model.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768)
(
inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
* 32768
)
.data.cpu()
.float()
.numpy()
.astype(np.int16)
)
else:
if hasattr(model, "infer_pitchless"):
if hasattr(inferencer, "infer_pitchless"):
audio1 = (
(model.infer_pitchless(feats, p_len, sid)[0][0, 0] * 32768)
(inferencer.infer_pitchless(feats, p_len, sid)[0][0, 0] * 32768)
.data.cpu()
.float()
.numpy()
@@ -173,7 +125,7 @@
)
else:
audio1 = (
(model.infer(feats, p_len, sid)[0][0, 0] * 32768)
(inferencer.infer(feats, p_len, sid)[0][0, 0] * 32768)
.data.cpu()
.float()
.numpy()
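All three inference branches end with the same float-to-PCM conversion: scale by 32768, move to CPU, and cast to int16. A minimal sketch of that conversion; the clip guard is an addition here, not in the commit:

import numpy as np

def to_int16(audio: np.ndarray) -> np.ndarray:
    # Scale float audio in roughly [-1.0, 1.0] to 16-bit PCM; clipping
    # first makes out-of-range samples saturate instead of wrapping
    # around on the integer cast.
    return np.clip(audio * 32768, -32768, 32767).astype(np.int16)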

View File

@@ -29,12 +29,20 @@ class EmbedderManager:
def loadEmbedder(
cls, embederType: EnumEmbedderTypes, file: str, isHalf: bool, dev: device
) -> Embedder:
if embederType == EnumEmbedderTypes.hubert:
if (
embederType == EnumEmbedderTypes.hubert
or embederType == EnumEmbedderTypes.hubert.value
):
return FairseqHubert().loadModel(file, dev, isHalf)
elif embederType == EnumEmbedderTypes.hubert_jp: # same as hubert
elif (
embederType == EnumEmbedderTypes.hubert_jp
or embederType == EnumEmbedderTypes.hubert_jp.value
):
return FairseqHubertJp().loadModel(file, dev, isHalf)
elif embederType == EnumEmbedderTypes.contentvec: # same as hubert
elif (
embederType == EnumEmbedderTypes.contentvec
or embederType == EnumEmbedderTypes.contentvec.value
):
return FairseqContentvec().loadModel(file, dev, isHalf)
else:
# return hubert as default
return FairseqHubert().loadModel(file, dev, isHalf)
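The widened comparisons above (member or member.value) are needed because a plain Enum member never compares equal to its underlying string, and the type presumably arrives as a raw string in some code paths (e.g. deserialized settings). A minimal demonstration, with the str mixin that would collapse each pair back to a single check (demo class names are illustrative):

from enum import Enum

class PlainDemo(Enum):
    hubert = "hubert"

class StrDemo(str, Enum):
    hubert = "hubert"

assert PlainDemo.hubert != "hubert"  # plain Enum: string comparison fails
assert StrDemo.hubert == "hubert"    # str mixin: equal to its own value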

View File

@@ -37,8 +37,6 @@ class FairseqHubert(Embedder):
"padding_mask": padding_mask,
}
print("feat dev", self.dev)
with torch.no_grad():
logits = self.model.extract_features(**inputs)
if embChannels == 256:

View File

@@ -1,7 +1,6 @@
from torch import device
from const import EnumInferenceTypes
from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInference
from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferenceNono
@@ -24,19 +23,36 @@ class InferencerManager:
@classmethod
def loadInferencer(
cls, inferencerType: EnumInferenceTypes, file: str, isHalf: bool, dev: device
) -> Embedder:
if inferencerType == EnumInferenceTypes.pyTorchRVC:
) -> Inferencer:
if (
inferencerType == EnumInferenceTypes.pyTorchRVC
or inferencerType == EnumInferenceTypes.pyTorchRVC.value
):
return RVCInferencer().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.pyTorchRVCNono:
elif (
inferencerType == EnumInferenceTypes.pyTorchRVCNono
or inferencerType == EnumInferenceTypes.pyTorchRVCNono.value
):
return RVCInferencerNono().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.pyTorchWebUI:
elif (
inferencerType == EnumInferenceTypes.pyTorchWebUI
or inferencerType == EnumInferenceTypes.pyTorchWebUI.value
):
return WebUIInferencer().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.pyTorchWebUINono:
elif (
inferencerType == EnumInferenceTypes.pyTorchWebUINono
or inferencerType == EnumInferenceTypes.pyTorchWebUINono.value
):
return WebUIInferencerNono().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.onnxRVC:
elif (
inferencerType == EnumInferenceTypes.onnxRVC
or inferencerType == EnumInferenceTypes.onnxRVC.value
):
return OnnxRVCInference().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.onnxRVCNono:
elif (
inferencerType == EnumInferenceTypes.onnxRVCNono
or inferencerType == EnumInferenceTypes.onnxRVCNono.value
):
return OnnxRVCInferenceNono().loadModel(file, dev, isHalf)
else:
# no default inferencer; fail fast
raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)

View File

@@ -0,0 +1,42 @@
import pyworld
import numpy as np
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class DioPitchExtractor(PitchExtractor):
def extract(self, audio, p_len, f0_up_key, sr, window, silence_front=0):
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
silence_front_offset = int(np.round(real_silence_front * sr))
audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
_f0, t = pyworld.dio(
audio.astype(np.double),
sr,
f0_floor=f0_min,
f0_ceil=f0_max,
channels_in_octave=2,
frame_period=10,
)
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))
f0 *= pow(2, f0_up_key / 12)
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)  # builtin int: np.int was removed in NumPy 1.24
return f0_coarse, f0bak
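A hypothetical usage sketch for the new extractor on synthetic audio; the 16 kHz rate and 160-sample hop mirror RVC's usual configuration but are assumptions here:

import numpy as np

sr, window = 16000, 160                # 10 ms hop at 16 kHz
t = np.arange(sr) / sr
audio = np.sin(2 * np.pi * 220 * t)    # one second of 220 Hz
p_len = len(audio) // window + 1

coarse, f0 = DioPitchExtractor().extract(audio, p_len, 0, sr, window)
# coarse: integer bins in 1..255 fed to the model; f0: raw Hz after transpose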

View File

@@ -0,0 +1,43 @@
import pyworld
import numpy as np
import scipy.signal as signal
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class HarvestPitchExtractor(PitchExtractor):
def extract(self, audio, p_len, f0_up_key, sr, window, silence_front=0):
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
silence_front_offset = int(np.round(real_silence_front * sr))
audio = audio[silence_front_offset:]
f0_min = 50
f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
f0, t = pyworld.harvest(
audio.astype(np.double),
fs=sr,
f0_ceil=f0_max,
frame_period=10,
)
f0 = pyworld.stonemask(audio.astype(np.double), f0, t, sr)
f0 = signal.medfilt(f0, 3)
f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))
f0 *= pow(2, f0_up_key / 12)
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)  # builtin int: np.int was removed in NumPy 1.24
return f0_coarse, f0bak
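Dio and Harvest differ only in the pyworld estimator they call (plus Harvest's median filter); the transpose and mel-scale coarse mapping are duplicated verbatim. A sketch of that shared tail factored into one helper (names are illustrative, not from the commit):

import numpy as np

F0_MIN, F0_MAX = 50, 1100
MEL_MIN = 1127 * np.log(1 + F0_MIN / 700)
MEL_MAX = 1127 * np.log(1 + F0_MAX / 700)

def coarse_f0(f0: np.ndarray) -> np.ndarray:
    # Map Hz to the 1..255 coarse bins used as model input; unvoiced
    # frames (f0 == 0) collapse to bin 1.
    mel = 1127 * np.log(1 + f0 / 700)
    mel[mel > 0] = (mel[mel > 0] - MEL_MIN) * 254 / (MEL_MAX - MEL_MIN) + 1
    return np.rint(np.clip(mel, 1, 255)).astype(int)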

View File

@@ -0,0 +1,9 @@
from typing import Protocol
from const import EnumPitchExtractorTypes
class PitchExtractor(Protocol):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
def extract(self, audio, p_len, f0_up_key, sr, window, silence_front=0):
...
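Because PitchExtractor is a typing.Protocol, conformance is structural: any class with a matching extract method satisfies the type without inheriting from it. A hypothetical stub that type-checks against the protocol, e.g. for tests:

import numpy as np

class ConstantPitchExtractor:
    pitchExtractorType = EnumPitchExtractorTypes.harvest

    def extract(self, audio, p_len, f0_up_key, sr, window, silence_front=0):
        n_frames = int(len(audio) // window) + 1
        # Constant 220 Hz pitch track and an arbitrary mid-range coarse bin
        return np.full(n_frames, 128, dtype=int), np.full(n_frames, 220.0)

stub: PitchExtractor = ConstantPitchExtractor()  # accepted structurally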

View File

@@ -0,0 +1,36 @@
from typing import Protocol
from const import EnumPitchExtractorTypes
from voice_changer.RVC.pitchExtractor.DioPitchExtractor import DioPitchExtractor
from voice_changer.RVC.pitchExtractor.HarvestPitchExtractor import HarvestPitchExtractor
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class PitchExtractorManager(Protocol):
currentPitchExtractor: PitchExtractor | None = None
@classmethod
def getPitchExtractor(
cls, pitchExtractorType: EnumPitchExtractorTypes
) -> PitchExtractor:
cls.currentPitchExtractor = cls.loadPitchExtractor(pitchExtractorType)
return cls.currentPitchExtractor
@classmethod
def loadPitchExtractor(
cls, pitchExtractorType: EnumPitchExtractorTypes
) -> PitchExtractor:
if (
pitchExtractorType == EnumPitchExtractorTypes.harvest
or pitchExtractorType == EnumPitchExtractorTypes.harvest.value
):
return HarvestPitchExtractor()
elif (
pitchExtractorType == EnumPitchExtractorTypes.dio
or pitchExtractorType == EnumPitchExtractorTypes.dio.value
):
return DioPitchExtractor()
else:
# no default pitch extractor; fail fast
raise RuntimeError(
"[Voice Changer] PitchExtractor not found", pitchExtractorType
)
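As in the other managers, the comparisons accept either the enum member or its raw string value, so both spellings below resolve to the same extractor class despite the stricter type annotation (a short usage sketch):

from const import EnumPitchExtractorTypes

pe1 = PitchExtractorManager.getPitchExtractor(EnumPitchExtractorTypes.dio)
pe2 = PitchExtractorManager.getPitchExtractor("dio")
assert type(pe1) is type(pe2)  # both DioPitchExtractor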