mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-01-23 21:45:00 +03:00
WIP: vocoder
This commit is contained in:
parent
1d54687577
commit
72702ee70a
@ -60,6 +60,15 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
|
|||||||
"position": 4,
|
"position": 4,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
nsf_hifigan_onnx = os.path.join(os.path.dirname(nsf_hifigan), "nsf_hifigan.onnx")
|
||||||
|
if os.path.exists(nsf_hifigan_onnx) is False:
|
||||||
|
downloadParams.append(
|
||||||
|
{
|
||||||
|
"url": "https://huggingface.co/wok000/weights/resolve/main/ddsp-svc30/nsf_hifigan_onnx_20221211/nsf_hifigan.onnx",
|
||||||
|
"saveTo": nsf_hifigan_onnx,
|
||||||
|
"position": 4,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
if os.path.exists(crepe_onnx_full) is False:
|
if os.path.exists(crepe_onnx_full) is False:
|
||||||
downloadParams.append(
|
downloadParams.append(
|
||||||
@ -86,8 +95,7 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
|
|||||||
"position": 7,
|
"position": 7,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
if os.path.exists(rmvpe) is False:
|
||||||
if os.path.exists(content_vec_500_onnx) is False:
|
|
||||||
downloadParams.append(
|
downloadParams.append(
|
||||||
{
|
{
|
||||||
"url": "https://huggingface.co/wok000/weights/resolve/main/rmvpe/rmvpe.pt",
|
"url": "https://huggingface.co/wok000/weights/resolve/main/rmvpe/rmvpe.pt",
|
||||||
|
@ -94,7 +94,9 @@ class MMVC_Rest_Fileuploader:
|
|||||||
return JSONResponse(content=json_compatible_item_data)
|
return JSONResponse(content=json_compatible_item_data)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("[Voice Changer] post_load_model ex:", e)
|
print("[Voice Changer] post_load_model ex:", e)
|
||||||
|
import traceback
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
def get_onnx(self):
|
def get_onnx(self):
|
||||||
try:
|
try:
|
||||||
info = self.voiceChangerManager.export2onnx()
|
info = self.voiceChangerManager.export2onnx()
|
||||||
|
@ -2,6 +2,7 @@ from dataclasses import asdict
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
from data.ModelSlot import DiffusionSVCModelSlot
|
from data.ModelSlot import DiffusionSVCModelSlot
|
||||||
from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
|
from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
|
||||||
|
from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
|
||||||
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
|
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
|
||||||
from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
|
from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
|
||||||
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
|
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
|
||||||
@ -21,6 +22,7 @@ class DiffusionSVC(VoiceChangerModel):
|
|||||||
self.deviceManager = DeviceManager.get_instance()
|
self.deviceManager = DeviceManager.get_instance()
|
||||||
EmbedderManager.initialize(params)
|
EmbedderManager.initialize(params)
|
||||||
PitchExtractorManager.initialize(params)
|
PitchExtractorManager.initialize(params)
|
||||||
|
InferencerManager.initialize(params)
|
||||||
self.settings = DiffusionSVCSettings()
|
self.settings = DiffusionSVCSettings()
|
||||||
self.params = params
|
self.params = params
|
||||||
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
|
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
|
||||||
|
@ -31,7 +31,7 @@ class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _setInfoByPytorch(cls, slot: DiffusionSVCModelSlot):
|
def _setInfoByPytorch(cls, slot: DiffusionSVCModelSlot):
|
||||||
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(slot.modelFile, device="cpu")
|
diff_model, diff_args, naive_model, naive_args = load_model_vocoder_from_combo(slot.modelFile, device="cpu")
|
||||||
slot.kStepMax = diff_args.model.k_step_max
|
slot.kStepMax = diff_args.model.k_step_max
|
||||||
slot.nLayers = diff_args.model.n_layers
|
slot.nLayers = diff_args.model.n_layers
|
||||||
slot.nnLayers = naive_args.model.n_layers
|
slot.nnLayers = naive_args.model.n_layers
|
||||||
|
@ -4,41 +4,43 @@ from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
|
|||||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
|
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
|
||||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo
|
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo
|
||||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
|
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
|
||||||
|
from voice_changer.DiffusionSVC.inferencer.onnx.VocoderOnnx import VocoderOnnx
|
||||||
|
|
||||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||||
from voice_changer.utils.Timer import Timer
|
from voice_changer.utils.Timer import Timer
|
||||||
|
|
||||||
|
|
||||||
class DiffusionSVCInferencer(Inferencer):
|
class DiffusionSVCInferencer(Inferencer):
|
||||||
def __init__(self):
|
def __init__(self, vocoder_torch_path, vocoder_onnx_path):
|
||||||
self.diff_model: Unit2Mel | None = None
|
self.diff_model: Unit2Mel | None = None
|
||||||
self.naive_model: Unit2MelNaive | None = None
|
self.naive_model: Unit2MelNaive | None = None
|
||||||
self.vocoder: Vocoder | None = None
|
self.vocoder: Vocoder | None = None
|
||||||
|
self.vocoder_onnx_path = vocoder_onnx_path
|
||||||
|
self.vocoder_torch_path = vocoder_torch_path
|
||||||
|
self.vocoder_onnx = None
|
||||||
|
|
||||||
def loadModel(self, file: str, gpu: int):
|
def loadModel(self, file: str, gpu: int):
|
||||||
self.setProps("DiffusionSVCCombo", file, True, gpu)
|
self.setProps("DiffusionSVCCombo", file, True, gpu)
|
||||||
|
|
||||||
self.dev = DeviceManager.get_instance().getDevice(gpu)
|
self.dev = DeviceManager.get_instance().getDevice(gpu)
|
||||||
# isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
|
|
||||||
|
|
||||||
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
|
diff_model, diff_args, naive_model, naive_args = load_model_vocoder_from_combo(file, device=self.dev)
|
||||||
|
# vocoder
|
||||||
|
try: # try onnx
|
||||||
|
self.vocoder_onnx = VocoderOnnx()
|
||||||
|
self.vocoder_onnx.initialize(self.vocoder_onnx_path, gpu)
|
||||||
|
print("[Voice Changer] load onnx nsf-hifigan")
|
||||||
|
vocoder = None
|
||||||
|
except Exception as e: # noqa
|
||||||
|
print("[Voice Changer] load torch nsf-hifigan")
|
||||||
|
vocoder = Vocoder("nsf-hifigan", self.vocoder_torch_path, device=self.dev)
|
||||||
|
self.vocoder_onnx = None
|
||||||
|
|
||||||
self.diff_model = diff_model
|
self.diff_model = diff_model
|
||||||
self.naive_model = naive_model
|
self.naive_model = naive_model
|
||||||
self.vocoder = vocoder
|
self.vocoder = vocoder
|
||||||
self.diff_args = diff_args
|
self.diff_args = diff_args
|
||||||
self.naive_args = naive_args
|
self.naive_args = naive_args
|
||||||
|
|
||||||
# cpt = torch.load(file, map_location="cpu")
|
|
||||||
# model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
|
|
||||||
|
|
||||||
# model.eval()
|
|
||||||
# model.load_state_dict(cpt["weight"], strict=False)
|
|
||||||
|
|
||||||
# model = model.to(dev)
|
|
||||||
# if isHalf:
|
|
||||||
# model = model.half()
|
|
||||||
|
|
||||||
# self.model = model
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def getConfig(self) -> tuple[int, int]:
|
def getConfig(self) -> tuple[int, int]:
|
||||||
@ -123,9 +125,12 @@ class DiffusionSVCInferencer(Inferencer):
|
|||||||
|
|
||||||
# print("[ ----Timer::2: ]", t.secs)
|
# print("[ ----Timer::2: ]", t.secs)
|
||||||
with Timer("pre-process") as t: # NOQA
|
with Timer("pre-process") as t: # NOQA
|
||||||
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
|
if self.vocoder_onnx is None:
|
||||||
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
|
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
|
||||||
out_wav *= mask
|
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
|
||||||
# print("[ ----Timer::3: ]", t.secs, start_frame, out_mel.shape)
|
out_wav *= mask
|
||||||
|
else:
|
||||||
|
out_wav = self.vocoder_onnx.infer(out_mel, pitch, silence_front, mask)
|
||||||
|
# print("[ ----Timer::3: ]", t.secs)
|
||||||
|
|
||||||
return out_wav.squeeze()
|
return out_wav.squeeze()
|
||||||
|
@ -1,10 +1,17 @@
|
|||||||
from const import DiffusionSVCInferenceType
|
from const import DiffusionSVCInferenceType
|
||||||
from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer
|
from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer
|
||||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||||
|
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
class InferencerManager:
|
class InferencerManager:
|
||||||
currentInferencer: Inferencer | None = None
|
currentInferencer: Inferencer | None = None
|
||||||
|
params: VoiceChangerParams
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def initialize(cls, params: VoiceChangerParams):
|
||||||
|
cls.params = params
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def getInferencer(
|
def getInferencer(
|
||||||
@ -24,6 +31,8 @@ class InferencerManager:
|
|||||||
gpu: int,
|
gpu: int,
|
||||||
) -> Inferencer:
|
) -> Inferencer:
|
||||||
if inferencerType == "combo":
|
if inferencerType == "combo":
|
||||||
return DiffusionSVCInferencer().loadModel(file, gpu)
|
vocoder_onnx_path = os.path.join(os.path.dirname(cls.params.nsf_hifigan), "nsf_hifigan.onnx")
|
||||||
|
return DiffusionSVCInferencer(cls.params.nsf_hifigan, vocoder_onnx_path).loadModel(file, gpu)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
|
raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
|
||||||
|
@ -49,22 +49,23 @@ def load_model_vocoder_from_combo(combo_model_path, device='cpu'):
|
|||||||
# args
|
# args
|
||||||
diff_args = DotDict(read_dict["diff_config_dict"])
|
diff_args = DotDict(read_dict["diff_config_dict"])
|
||||||
naive_args = DotDict(read_dict["naive_config_dict"])
|
naive_args = DotDict(read_dict["naive_config_dict"])
|
||||||
# vocoder
|
# # vocoder
|
||||||
vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)
|
# vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)
|
||||||
|
|
||||||
# diff_model
|
# diff_model
|
||||||
print(' [Loading] ' + combo_model_path)
|
print(' [Loading] ' + combo_model_path)
|
||||||
diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
|
# diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
|
||||||
|
diff_model = load_svc_model(args=diff_args, vocoder_dimension=128)
|
||||||
diff_model.to(device)
|
diff_model.to(device)
|
||||||
diff_model.load_state_dict(read_dict["diff_model"]['model'])
|
diff_model.load_state_dict(read_dict["diff_model"]['model'])
|
||||||
diff_model.eval()
|
diff_model.eval()
|
||||||
|
|
||||||
# naive_model
|
# naive_model
|
||||||
naive_model = load_svc_model(args=naive_args, vocoder_dimension=vocoder.dimension)
|
naive_model = load_svc_model(args=naive_args, vocoder_dimension=128)
|
||||||
naive_model.to(device)
|
naive_model.to(device)
|
||||||
naive_model.load_state_dict(read_dict["naive_model"]['model'])
|
naive_model.load_state_dict(read_dict["naive_model"]['model'])
|
||||||
naive_model.eval()
|
naive_model.eval()
|
||||||
return diff_model, diff_args, naive_model, naive_args, vocoder
|
return diff_model, diff_args, naive_model, naive_args # , vocoder
|
||||||
|
|
||||||
|
|
||||||
def load_svc_model(args, vocoder_dimension):
|
def load_svc_model(args, vocoder_dimension):
|
||||||
|
@ -0,0 +1,11 @@
|
|||||||
|
|
||||||
|
|
||||||
|
class VocoderOnnx:
    """Placeholder for an ONNX-backed vocoder.

    Neither method is implemented yet: both raise ``NotImplementedError``.
    ``NotImplementedError`` is a subclass of ``Exception``, so a caller that
    wraps ``initialize`` in ``except Exception`` to fall back to another
    vocoder still takes that fallback path.
    """

    def __init__(self) -> None:
        # Nothing to set up until the ONNX backend exists.
        pass

    def initialize(self, onnx_path: str, gpu: int):
        """Prepare the vocoder from the ONNX file at ``onnx_path``.

        Args:
            onnx_path: Path of the ONNX vocoder file.
            gpu: GPU identifier passed by the caller.

        Raises:
            NotImplementedError: Always; the backend is not implemented.
        """
        raise NotImplementedError("Not implemented")

    def infer(self, out_mel, pitch, silence_front, mask):
        """Run the vocoder on ``out_mel`` and ``pitch``.

        Args:
            out_mel: Mel input supplied by the caller.
            pitch: Pitch input supplied by the caller.
            silence_front: Leading-silence value supplied by the caller.
            mask: Mask supplied by the caller.

        Raises:
            NotImplementedError: Always; the backend is not implemented.
        """
        raise NotImplementedError("Not implemented")
|
@ -45,8 +45,8 @@ class EmbedderManager:
|
|||||||
raise Exception("[Voice Changer][Embedder] onnx is off")
|
raise Exception("[Voice Changer][Embedder] onnx is off")
|
||||||
file = cls.params.content_vec_500_onnx
|
file = cls.params.content_vec_500_onnx
|
||||||
return OnnxContentvec().loadModel(file, dev)
|
return OnnxContentvec().loadModel(file, dev)
|
||||||
except Exception as e:
|
except Exception as e: # noqa
|
||||||
print(e)
|
print("[Voice Changer] use torch contentvec")
|
||||||
file = cls.params.hubert_base
|
file = cls.params.hubert_base
|
||||||
return FairseqHubert().loadModel(file, dev, isHalf)
|
return FairseqHubert().loadModel(file, dev, isHalf)
|
||||||
elif embederType == "hubert-base-japanese":
|
elif embederType == "hubert-base-japanese":
|
||||||
|
Loading…
Reference in New Issue
Block a user