mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-01-23 13:35:12 +03:00
WIP: vocoder
This commit is contained in:
parent
1d54687577
commit
72702ee70a
@ -60,6 +60,15 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
|
||||
"position": 4,
|
||||
}
|
||||
)
|
||||
nsf_hifigan_onnx = os.path.join(os.path.dirname(nsf_hifigan), "nsf_hifigan.onnx")
|
||||
if os.path.exists(nsf_hifigan_onnx) is False:
|
||||
downloadParams.append(
|
||||
{
|
||||
"url": "https://huggingface.co/wok000/weights/resolve/main/ddsp-svc30/nsf_hifigan_onnx_20221211/nsf_hifigan.onnx",
|
||||
"saveTo": nsf_hifigan_onnx,
|
||||
"position": 4,
|
||||
}
|
||||
)
|
||||
|
||||
if os.path.exists(crepe_onnx_full) is False:
|
||||
downloadParams.append(
|
||||
@ -86,8 +95,7 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
|
||||
"position": 7,
|
||||
}
|
||||
)
|
||||
|
||||
if os.path.exists(content_vec_500_onnx) is False:
|
||||
if os.path.exists(rmvpe) is False:
|
||||
downloadParams.append(
|
||||
{
|
||||
"url": "https://huggingface.co/wok000/weights/resolve/main/rmvpe/rmvpe.pt",
|
||||
|
@ -94,7 +94,9 @@ class MMVC_Rest_Fileuploader:
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] post_load_model ex:", e)
|
||||
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def get_onnx(self):
|
||||
try:
|
||||
info = self.voiceChangerManager.export2onnx()
|
||||
|
@ -2,6 +2,7 @@ from dataclasses import asdict
|
||||
import numpy as np
|
||||
from data.ModelSlot import DiffusionSVCModelSlot
|
||||
from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
|
||||
from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
|
||||
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
|
||||
from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
|
||||
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
|
||||
@ -21,6 +22,7 @@ class DiffusionSVC(VoiceChangerModel):
|
||||
self.deviceManager = DeviceManager.get_instance()
|
||||
EmbedderManager.initialize(params)
|
||||
PitchExtractorManager.initialize(params)
|
||||
InferencerManager.initialize(params)
|
||||
self.settings = DiffusionSVCSettings()
|
||||
self.params = params
|
||||
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
|
||||
|
@ -31,7 +31,7 @@ class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
|
||||
|
||||
@classmethod
|
||||
def _setInfoByPytorch(cls, slot: DiffusionSVCModelSlot):
|
||||
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(slot.modelFile, device="cpu")
|
||||
diff_model, diff_args, naive_model, naive_args = load_model_vocoder_from_combo(slot.modelFile, device="cpu")
|
||||
slot.kStepMax = diff_args.model.k_step_max
|
||||
slot.nLayers = diff_args.model.n_layers
|
||||
slot.nnLayers = naive_args.model.n_layers
|
||||
|
@ -4,41 +4,43 @@ from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
|
||||
from voice_changer.DiffusionSVC.inferencer.onnx.VocoderOnnx import VocoderOnnx
|
||||
|
||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||
from voice_changer.utils.Timer import Timer
|
||||
|
||||
|
||||
class DiffusionSVCInferencer(Inferencer):
|
||||
def __init__(self):
|
||||
def __init__(self, vocoder_torch_path, vocoder_onnx_path):
|
||||
self.diff_model: Unit2Mel | None = None
|
||||
self.naive_model: Unit2MelNaive | None = None
|
||||
self.vocoder: Vocoder | None = None
|
||||
self.vocoder_onnx_path = vocoder_onnx_path
|
||||
self.vocoder_torch_path = vocoder_torch_path
|
||||
self.vocoder_onnx = None
|
||||
|
||||
def loadModel(self, file: str, gpu: int):
|
||||
self.setProps("DiffusionSVCCombo", file, True, gpu)
|
||||
|
||||
self.dev = DeviceManager.get_instance().getDevice(gpu)
|
||||
# isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
|
||||
|
||||
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
|
||||
diff_model, diff_args, naive_model, naive_args = load_model_vocoder_from_combo(file, device=self.dev)
|
||||
# vocoder
|
||||
try: # try onnx
|
||||
self.vocoder_onnx = VocoderOnnx()
|
||||
self.vocoder_onnx.initialize(self.vocoder_onnx_path, gpu)
|
||||
print("[Voice Changer] load onnx nsf-hifigan")
|
||||
vocoder = None
|
||||
except Exception as e: # noqa
|
||||
print("[Voice Changer] load torch nsf-hifigan")
|
||||
vocoder = Vocoder("nsf-hifigan", self.vocoder_torch_path, device=self.dev)
|
||||
self.vocoder_onnx = None
|
||||
|
||||
self.diff_model = diff_model
|
||||
self.naive_model = naive_model
|
||||
self.vocoder = vocoder
|
||||
self.diff_args = diff_args
|
||||
self.naive_args = naive_args
|
||||
|
||||
# cpt = torch.load(file, map_location="cpu")
|
||||
# model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
|
||||
|
||||
# model.eval()
|
||||
# model.load_state_dict(cpt["weight"], strict=False)
|
||||
|
||||
# model = model.to(dev)
|
||||
# if isHalf:
|
||||
# model = model.half()
|
||||
|
||||
# self.model = model
|
||||
return self
|
||||
|
||||
def getConfig(self) -> tuple[int, int]:
|
||||
@ -123,9 +125,12 @@ class DiffusionSVCInferencer(Inferencer):
|
||||
|
||||
# print("[ ----Timer::2: ]", t.secs)
|
||||
with Timer("pre-process") as t: # NOQA
|
||||
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
|
||||
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
|
||||
out_wav *= mask
|
||||
# print("[ ----Timer::3: ]", t.secs, start_frame, out_mel.shape)
|
||||
if self.vocoder_onnx is None:
|
||||
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
|
||||
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
|
||||
out_wav *= mask
|
||||
else:
|
||||
out_wav = self.vocoder_onnx.infer(out_mel, pitch, silence_front, mask)
|
||||
# print("[ ----Timer::3: ]", t.secs)
|
||||
|
||||
return out_wav.squeeze()
|
||||
|
@ -1,10 +1,17 @@
|
||||
from const import DiffusionSVCInferenceType
|
||||
from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||
import os
|
||||
|
||||
|
||||
class InferencerManager:
|
||||
currentInferencer: Inferencer | None = None
|
||||
params: VoiceChangerParams
|
||||
|
||||
@classmethod
|
||||
def initialize(cls, params: VoiceChangerParams):
|
||||
cls.params = params
|
||||
|
||||
@classmethod
|
||||
def getInferencer(
|
||||
@ -24,6 +31,8 @@ class InferencerManager:
|
||||
gpu: int,
|
||||
) -> Inferencer:
|
||||
if inferencerType == "combo":
|
||||
return DiffusionSVCInferencer().loadModel(file, gpu)
|
||||
vocoder_onnx_path = os.path.join(os.path.dirname(cls.params.nsf_hifigan), "nsf_hifigan.onnx")
|
||||
return DiffusionSVCInferencer(cls.params.nsf_hifigan, vocoder_onnx_path).loadModel(file, gpu)
|
||||
|
||||
else:
|
||||
raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
|
||||
|
@ -49,22 +49,23 @@ def load_model_vocoder_from_combo(combo_model_path, device='cpu'):
|
||||
# args
|
||||
diff_args = DotDict(read_dict["diff_config_dict"])
|
||||
naive_args = DotDict(read_dict["naive_config_dict"])
|
||||
# vocoder
|
||||
vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)
|
||||
# # vocoder
|
||||
# vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)
|
||||
|
||||
# diff_model
|
||||
print(' [Loading] ' + combo_model_path)
|
||||
diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
|
||||
# diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
|
||||
diff_model = load_svc_model(args=diff_args, vocoder_dimension=128)
|
||||
diff_model.to(device)
|
||||
diff_model.load_state_dict(read_dict["diff_model"]['model'])
|
||||
diff_model.eval()
|
||||
|
||||
# naive_model
|
||||
naive_model = load_svc_model(args=naive_args, vocoder_dimension=vocoder.dimension)
|
||||
naive_model = load_svc_model(args=naive_args, vocoder_dimension=128)
|
||||
naive_model.to(device)
|
||||
naive_model.load_state_dict(read_dict["naive_model"]['model'])
|
||||
naive_model.eval()
|
||||
return diff_model, diff_args, naive_model, naive_args, vocoder
|
||||
return diff_model, diff_args, naive_model, naive_args # , vocoder
|
||||
|
||||
|
||||
def load_svc_model(args, vocoder_dimension):
|
||||
|
@ -0,0 +1,11 @@
|
||||
|
||||
|
||||
class VocoderOnnx:
    """Placeholder for an ONNX-backed vocoder; nothing is implemented yet.

    Both ``initialize`` and ``infer`` raise unconditionally. They raise
    ``NotImplementedError``, which is a subclass of ``Exception``, so any
    caller that guards these calls with ``except Exception`` keeps working
    exactly as it did when a bare ``Exception`` was raised.
    """

    def __init__(self) -> None:
        # The stub holds no state.
        pass

    def initialize(self, onnx_path: str, gpu: int):
        """Stub: accepts ``onnx_path`` and ``gpu`` but does not use them yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented")

    def infer(self, out_mel, pitch, silence_front, mask):
        """Stub: accepts the inference inputs but does not use them yet.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Not implemented")
|
@ -45,8 +45,8 @@ class EmbedderManager:
|
||||
raise Exception("[Voice Changer][Embedder] onnx is off")
|
||||
file = cls.params.content_vec_500_onnx
|
||||
return OnnxContentvec().loadModel(file, dev)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
except Exception as e: # noqa
|
||||
print("[Voice Changer] use torch contentvec")
|
||||
file = cls.params.hubert_base
|
||||
return FairseqHubert().loadModel(file, dev, isHalf)
|
||||
elif embederType == "hubert-base-japanese":
|
||||
|
Loading…
Reference in New Issue
Block a user