WIP: vocoder

This commit is contained in:
w-okada 2023-07-19 10:20:30 +09:00
parent 1d54687577
commit 72702ee70a
9 changed files with 69 additions and 31 deletions

View File

@ -60,6 +60,15 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
"position": 4, "position": 4,
} }
) )
nsf_hifigan_onnx = os.path.join(os.path.dirname(nsf_hifigan), "nsf_hifigan.onnx")
if os.path.exists(nsf_hifigan_onnx) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights/resolve/main/ddsp-svc30/nsf_hifigan_onnx_20221211/nsf_hifigan.onnx",
"saveTo": nsf_hifigan_onnx,
"position": 4,
}
)
if os.path.exists(crepe_onnx_full) is False: if os.path.exists(crepe_onnx_full) is False:
downloadParams.append( downloadParams.append(
@ -86,8 +95,7 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
"position": 7, "position": 7,
} }
) )
if os.path.exists(rmvpe) is False:
if os.path.exists(content_vec_500_onnx) is False:
downloadParams.append( downloadParams.append(
{ {
"url": "https://huggingface.co/wok000/weights/resolve/main/rmvpe/rmvpe.pt", "url": "https://huggingface.co/wok000/weights/resolve/main/rmvpe/rmvpe.pt",

View File

@ -94,7 +94,9 @@ class MMVC_Rest_Fileuploader:
return JSONResponse(content=json_compatible_item_data) return JSONResponse(content=json_compatible_item_data)
except Exception as e: except Exception as e:
print("[Voice Changer] post_load_model ex:", e) print("[Voice Changer] post_load_model ex:", e)
import traceback
traceback.print_exc()
def get_onnx(self): def get_onnx(self):
try: try:
info = self.voiceChangerManager.export2onnx() info = self.voiceChangerManager.export2onnx()

View File

@ -2,6 +2,7 @@ from dataclasses import asdict
import numpy as np import numpy as np
from data.ModelSlot import DiffusionSVCModelSlot from data.ModelSlot import DiffusionSVCModelSlot
from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
@ -21,6 +22,7 @@ class DiffusionSVC(VoiceChangerModel):
self.deviceManager = DeviceManager.get_instance() self.deviceManager = DeviceManager.get_instance()
EmbedderManager.initialize(params) EmbedderManager.initialize(params)
PitchExtractorManager.initialize(params) PitchExtractorManager.initialize(params)
InferencerManager.initialize(params)
self.settings = DiffusionSVCSettings() self.settings = DiffusionSVCSettings()
self.params = params self.params = params
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu) self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)

View File

@ -31,7 +31,7 @@ class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
@classmethod @classmethod
def _setInfoByPytorch(cls, slot: DiffusionSVCModelSlot): def _setInfoByPytorch(cls, slot: DiffusionSVCModelSlot):
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(slot.modelFile, device="cpu") diff_model, diff_args, naive_model, naive_args = load_model_vocoder_from_combo(slot.modelFile, device="cpu")
slot.kStepMax = diff_args.model.k_step_max slot.kStepMax = diff_args.model.k_step_max
slot.nLayers = diff_args.model.n_layers slot.nLayers = diff_args.model.n_layers
slot.nnLayers = naive_args.model.n_layers slot.nnLayers = naive_args.model.n_layers

View File

@ -4,41 +4,43 @@ from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
from voice_changer.DiffusionSVC.inferencer.onnx.VocoderOnnx import VocoderOnnx
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.utils.Timer import Timer from voice_changer.utils.Timer import Timer
class DiffusionSVCInferencer(Inferencer): class DiffusionSVCInferencer(Inferencer):
def __init__(self): def __init__(self, vocoder_torch_path, vocoder_onnx_path):
self.diff_model: Unit2Mel | None = None self.diff_model: Unit2Mel | None = None
self.naive_model: Unit2MelNaive | None = None self.naive_model: Unit2MelNaive | None = None
self.vocoder: Vocoder | None = None self.vocoder: Vocoder | None = None
self.vocoder_onnx_path = vocoder_onnx_path
self.vocoder_torch_path = vocoder_torch_path
self.vocoder_onnx = None
def loadModel(self, file: str, gpu: int): def loadModel(self, file: str, gpu: int):
self.setProps("DiffusionSVCCombo", file, True, gpu) self.setProps("DiffusionSVCCombo", file, True, gpu)
self.dev = DeviceManager.get_instance().getDevice(gpu) self.dev = DeviceManager.get_instance().getDevice(gpu)
# isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev) diff_model, diff_args, naive_model, naive_args = load_model_vocoder_from_combo(file, device=self.dev)
# vocoder
try: # try onnx
self.vocoder_onnx = VocoderOnnx()
self.vocoder_onnx.initialize(self.vocoder_onnx_path, gpu)
print("[Voice Changer] load onnx nsf-hifigan")
vocoder = None
except Exception as e: # noqa
print("[Voice Changer] load torch nsf-hifigan")
vocoder = Vocoder("nsf-hifigan", self.vocoder_torch_path, device=self.dev)
self.vocoder_onnx = None
self.diff_model = diff_model self.diff_model = diff_model
self.naive_model = naive_model self.naive_model = naive_model
self.vocoder = vocoder self.vocoder = vocoder
self.diff_args = diff_args self.diff_args = diff_args
self.naive_args = naive_args self.naive_args = naive_args
# cpt = torch.load(file, map_location="cpu")
# model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
# model.eval()
# model.load_state_dict(cpt["weight"], strict=False)
# model = model.to(dev)
# if isHalf:
# model = model.half()
# self.model = model
return self return self
def getConfig(self) -> tuple[int, int]: def getConfig(self) -> tuple[int, int]:
@ -123,9 +125,12 @@ class DiffusionSVCInferencer(Inferencer):
# print("[ ----Timer::2: ]", t.secs) # print("[ ----Timer::2: ]", t.secs)
with Timer("pre-process") as t: # NOQA with Timer("pre-process") as t: # NOQA
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size) if self.vocoder_onnx is None:
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame) start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
out_wav *= mask out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
# print("[ ----Timer::3: ]", t.secs, start_frame, out_mel.shape) out_wav *= mask
else:
out_wav = self.vocoder_onnx.infer(out_mel, pitch, silence_front, mask)
# print("[ ----Timer::3: ]", t.secs)
return out_wav.squeeze() return out_wav.squeeze()

View File

@ -1,10 +1,17 @@
from const import DiffusionSVCInferenceType from const import DiffusionSVCInferenceType
from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer
from voice_changer.RVC.inferencer.Inferencer import Inferencer from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
import os
class InferencerManager: class InferencerManager:
currentInferencer: Inferencer | None = None currentInferencer: Inferencer | None = None
params: VoiceChangerParams
@classmethod
def initialize(cls, params: VoiceChangerParams):
cls.params = params
@classmethod @classmethod
def getInferencer( def getInferencer(
@ -24,6 +31,8 @@ class InferencerManager:
gpu: int, gpu: int,
) -> Inferencer: ) -> Inferencer:
if inferencerType == "combo": if inferencerType == "combo":
return DiffusionSVCInferencer().loadModel(file, gpu) vocoder_onnx_path = os.path.join(os.path.dirname(cls.params.nsf_hifigan), "nsf_hifigan.onnx")
return DiffusionSVCInferencer(cls.params.nsf_hifigan, vocoder_onnx_path).loadModel(file, gpu)
else: else:
raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType) raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)

View File

@ -49,22 +49,23 @@ def load_model_vocoder_from_combo(combo_model_path, device='cpu'):
# args # args
diff_args = DotDict(read_dict["diff_config_dict"]) diff_args = DotDict(read_dict["diff_config_dict"])
naive_args = DotDict(read_dict["naive_config_dict"]) naive_args = DotDict(read_dict["naive_config_dict"])
# vocoder # # vocoder
vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device) # vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)
# diff_model # diff_model
print(' [Loading] ' + combo_model_path) print(' [Loading] ' + combo_model_path)
diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension) # diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
diff_model = load_svc_model(args=diff_args, vocoder_dimension=128)
diff_model.to(device) diff_model.to(device)
diff_model.load_state_dict(read_dict["diff_model"]['model']) diff_model.load_state_dict(read_dict["diff_model"]['model'])
diff_model.eval() diff_model.eval()
# naive_model # naive_model
naive_model = load_svc_model(args=naive_args, vocoder_dimension=vocoder.dimension) naive_model = load_svc_model(args=naive_args, vocoder_dimension=128)
naive_model.to(device) naive_model.to(device)
naive_model.load_state_dict(read_dict["naive_model"]['model']) naive_model.load_state_dict(read_dict["naive_model"]['model'])
naive_model.eval() naive_model.eval()
return diff_model, diff_args, naive_model, naive_args, vocoder return diff_model, diff_args, naive_model, naive_args # , vocoder
def load_svc_model(args, vocoder_dimension): def load_svc_model(args, vocoder_dimension):

View File

@ -0,0 +1,11 @@
class VocoderOnnx:
    """Placeholder for an ONNX-backed vocoder.

    Neither method is implemented yet: both raise ``NotImplementedError``.
    ``NotImplementedError`` is a subclass of ``Exception``, so callers that
    guard these calls with ``except Exception`` keep working unchanged.
    """

    def __init__(self) -> None:
        # No state is held until a real implementation exists.
        pass

    def initialize(self, onnx_path: str, gpu: int):
        """Prepare the vocoder for use (not implemented).

        Args:
            onnx_path: Path of the ONNX model file
                (presumably the nsf-hifigan export; TODO confirm).
            gpu: GPU identifier (exact semantics not defined here; TODO confirm).

        Raises:
            NotImplementedError: Always, until an implementation is provided.
        """
        raise NotImplementedError("Not implemented")

    def infer(self, out_mel, pitch, silence_front, mask):
        """Run vocoder inference (not implemented).

        Args:
            out_mel: Mel input (shape/dtype not defined here; TODO confirm).
            pitch: Pitch input (shape/dtype not defined here; TODO confirm).
            silence_front: Leading-silence amount (units not defined here; TODO confirm).
            mask: Mask for the output (semantics not defined here; TODO confirm).

        Raises:
            NotImplementedError: Always, until an implementation is provided.
        """
        raise NotImplementedError("Not implemented")

View File

@ -45,8 +45,8 @@ class EmbedderManager:
raise Exception("[Voice Changer][Embedder] onnx is off") raise Exception("[Voice Changer][Embedder] onnx is off")
file = cls.params.content_vec_500_onnx file = cls.params.content_vec_500_onnx
return OnnxContentvec().loadModel(file, dev) return OnnxContentvec().loadModel(file, dev)
except Exception as e: except Exception as e: # noqa
print(e) print("[Voice Changer] use torch contentvec")
file = cls.params.hubert_base file = cls.params.hubert_base
return FairseqHubert().loadModel(file, dev, isHalf) return FairseqHubert().loadModel(file, dev, isHalf)
elif embederType == "hubert-base-japanese": elif embederType == "hubert-base-japanese":