From 72702ee70ad9e410a52c7fc0c9f7dadb2c4c725d Mon Sep 17 00:00:00 2001
From: w-okada
Date: Wed, 19 Jul 2023 10:20:30 +0900
Subject: [PATCH] WIP: vocoder

---
NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Indentation was rebuilt from Python syntax
and blank context lines were placed so that every hunk header count and the
diffstat totals balance. The exact whitespace is therefore an inference, and
in the first DiffusionSVCInferencer.py hunk the position of one blank
context line (placed here directly before "self.diff_model = diff_model")
could not be determined uniquely. Verify against the tree, or apply with
`git apply --ignore-whitespace`, before relying on it.

What the hunks below do:
 * WeightDownloader: queue a download of nsf_hifigan.onnx into the directory
   of the nsf_hifigan weight, and gate the rmvpe.pt download on `rmvpe`
   instead of `content_vec_500_onnx`.
 * MMVC_Rest_Fileuploader: print a traceback when post_load_model fails.
 * DiffusionSVC / InferencerManager: InferencerManager now stores the
   VoiceChangerParams via initialize() and passes the torch and onnx vocoder
   paths to DiffusionSVCInferencer.
 * DiffusionSVCInferencer: build the vocoder itself (onnx first, torch as
   fallback) instead of receiving it from load_model_vocoder_from_combo,
   which now returns four values.
 * VocoderOnnx: new placeholder class; both methods raise "Not implemented".
 * EmbedderManager: replace print(e) with a fixed fallback message.

Review comments (not applied to the hunks):
 * The new nsf_hifigan.onnx download entry uses "position": 4, the same
   value as the entry directly above it. Presumably each entry should have
   its own position -- TODO confirm.
 * VocoderOnnx.initialize() always raises, so loadModel() always takes the
   torch fallback. The `except Exception as e:  # noqa` branches here and in
   EmbedderManager discard the exception text, which will hide real load
   errors once the onnx path is implemented.
 * unit2mel.py replaces `vocoder.dimension` with the literal 128; models
   whose vocoder dimension differs would be built with the wrong size.
 * InferencerManager.params is only assigned in initialize(); loading an
   inferencer before initialize() has been called raises AttributeError.
 * The nsf_hifigan.onnx path expression is duplicated in WeightDownloader
   and InferencerManager.

 server/downloader/WeightDownloader.py         | 12 +++++-
 server/restapi/MMVC_Rest_Fileuploader.py      |  4 +-
 .../DiffusionSVC/DiffusionSVC.py              |  2 +
 .../DiffusionSVCModelSlotGenerator.py         |  2 +-
 .../inferencer/DiffusionSVCInferencer.py      | 43 +++++++++++--------
 .../inferencer/InferencerManager.py           | 11 ++++-
 .../diffusion_svc_model/diffusion/unit2mel.py | 11 ++---
 .../inferencer/onnx/VocoderOnnx.py            | 11 +++++
 .../RVC/embedder/EmbedderManager.py           |  4 +-
 9 files changed, 69 insertions(+), 31 deletions(-)
 create mode 100644 server/voice_changer/DiffusionSVC/inferencer/onnx/VocoderOnnx.py

diff --git a/server/downloader/WeightDownloader.py b/server/downloader/WeightDownloader.py
index 26ca2fc0..3603af98 100644
--- a/server/downloader/WeightDownloader.py
+++ b/server/downloader/WeightDownloader.py
@@ -60,6 +60,15 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
                 "position": 4,
             }
         )
+    nsf_hifigan_onnx = os.path.join(os.path.dirname(nsf_hifigan), "nsf_hifigan.onnx")
+    if os.path.exists(nsf_hifigan_onnx) is False:
+        downloadParams.append(
+            {
+                "url": "https://huggingface.co/wok000/weights/resolve/main/ddsp-svc30/nsf_hifigan_onnx_20221211/nsf_hifigan.onnx",
+                "saveTo": nsf_hifigan_onnx,
+                "position": 4,
+            }
+        )
 
     if os.path.exists(crepe_onnx_full) is False:
         downloadParams.append(
@@ -86,8 +95,7 @@ def downloadWeight(voiceChangerParams: VoiceChangerParams):
                 "position": 7,
             }
         )
-
-    if os.path.exists(content_vec_500_onnx) is False:
+    if os.path.exists(rmvpe) is False:
         downloadParams.append(
             {
                 "url": "https://huggingface.co/wok000/weights/resolve/main/rmvpe/rmvpe.pt",
diff --git a/server/restapi/MMVC_Rest_Fileuploader.py b/server/restapi/MMVC_Rest_Fileuploader.py
index 5361d6df..7d2d8055 100644
--- a/server/restapi/MMVC_Rest_Fileuploader.py
+++ b/server/restapi/MMVC_Rest_Fileuploader.py
@@ -94,7 +94,9 @@ class MMVC_Rest_Fileuploader:
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
             print("[Voice Changer] post_load_model ex:", e)
-
+            import traceback
+            traceback.print_exc()
+
     def get_onnx(self):
         try:
             info = self.voiceChangerManager.export2onnx()
diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
index d202c000..f1da6a53 100644
--- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py
+++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
@@ -2,6 +2,7 @@ from dataclasses import asdict
 import numpy as np
 from data.ModelSlot import DiffusionSVCModelSlot
 from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
+from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
 from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
 from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
@@ -21,6 +22,7 @@ class DiffusionSVC(VoiceChangerModel):
         self.deviceManager = DeviceManager.get_instance()
         EmbedderManager.initialize(params)
         PitchExtractorManager.initialize(params)
+        InferencerManager.initialize(params)
         self.settings = DiffusionSVCSettings()
         self.params = params
         self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVCModelSlotGenerator.py b/server/voice_changer/DiffusionSVC/DiffusionSVCModelSlotGenerator.py
index a6bdfc88..29f2cf10 100644
--- a/server/voice_changer/DiffusionSVC/DiffusionSVCModelSlotGenerator.py
+++ b/server/voice_changer/DiffusionSVC/DiffusionSVCModelSlotGenerator.py
@@ -31,7 +31,7 @@ class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
 
     @classmethod
     def _setInfoByPytorch(cls, slot: DiffusionSVCModelSlot):
-        diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(slot.modelFile, device="cpu")
+        diff_model, diff_args, naive_model, naive_args = load_model_vocoder_from_combo(slot.modelFile, device="cpu")
         slot.kStepMax = diff_args.model.k_step_max
         slot.nLayers = diff_args.model.n_layers
         slot.nnLayers = naive_args.model.n_layers
diff --git a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
index f54f024c..557db7f2 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
@@ -4,41 +4,43 @@ from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
 from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
 from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo
 from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
+from voice_changer.DiffusionSVC.inferencer.onnx.VocoderOnnx import VocoderOnnx
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 from voice_changer.utils.Timer import Timer
 
 
 class DiffusionSVCInferencer(Inferencer):
-    def __init__(self):
+    def __init__(self, vocoder_torch_path, vocoder_onnx_path):
         self.diff_model: Unit2Mel | None = None
         self.naive_model: Unit2MelNaive | None = None
         self.vocoder: Vocoder | None = None
+        self.vocoder_onnx_path = vocoder_onnx_path
+        self.vocoder_torch_path = vocoder_torch_path
+        self.vocoder_onnx = None
 
     def loadModel(self, file: str, gpu: int):
         self.setProps("DiffusionSVCCombo", file, True, gpu)
-
         self.dev = DeviceManager.get_instance().getDevice(gpu)
-        # isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
 
-        diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
+        diff_model, diff_args, naive_model, naive_args = load_model_vocoder_from_combo(file, device=self.dev)
+        # vocoder
+        try:  # try onnx
+            self.vocoder_onnx = VocoderOnnx()
+            self.vocoder_onnx.initialize(self.vocoder_onnx_path, gpu)
+            print("[Voice Changer] load onnx nsf-hifigan")
+            vocoder = None
+        except Exception as e:  # noqa
+            print("[Voice Changer] load torch nsf-hifigan")
+            vocoder = Vocoder("nsf-hifigan", self.vocoder_torch_path, device=self.dev)
+            self.vocoder_onnx = None
+
 
         self.diff_model = diff_model
         self.naive_model = naive_model
         self.vocoder = vocoder
         self.diff_args = diff_args
         self.naive_args = naive_args
 
-        # cpt = torch.load(file, map_location="cpu")
-        # model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
-
-        # model.eval()
-        # model.load_state_dict(cpt["weight"], strict=False)
-
-        # model = model.to(dev)
-        # if isHalf:
-        #     model = model.half()
-
-        # self.model = model
         return self
 
     def getConfig(self) -> tuple[int, int]:
@@ -123,9 +125,12 @@ class DiffusionSVCInferencer(Inferencer):
             # print("[ ----Timer::2: ]", t.secs)
 
         with Timer("pre-process") as t:  # NOQA
-            start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
-            out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
-            out_wav *= mask
-            # print("[ ----Timer::3: ]", t.secs, start_frame, out_mel.shape)
+            if self.vocoder_onnx is None:
+                start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
+                out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+                out_wav *= mask
+            else:
+                out_wav = self.vocoder_onnx.infer(out_mel, pitch, silence_front, mask)
+            # print("[ ----Timer::3: ]", t.secs)
 
         return out_wav.squeeze()
diff --git a/server/voice_changer/DiffusionSVC/inferencer/InferencerManager.py b/server/voice_changer/DiffusionSVC/inferencer/InferencerManager.py
index 579dc892..3fdbc19a 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/InferencerManager.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/InferencerManager.py
@@ -1,10 +1,17 @@
 from const import DiffusionSVCInferenceType
 from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer
 from voice_changer.RVC.inferencer.Inferencer import Inferencer
+from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
+import os
 
 
 class InferencerManager:
     currentInferencer: Inferencer | None = None
+    params: VoiceChangerParams
+
+    @classmethod
+    def initialize(cls, params: VoiceChangerParams):
+        cls.params = params
 
     @classmethod
     def getInferencer(
@@ -24,6 +31,8 @@ class InferencerManager:
         gpu: int,
     ) -> Inferencer:
         if inferencerType == "combo":
-            return DiffusionSVCInferencer().loadModel(file, gpu)
+            vocoder_onnx_path = os.path.join(os.path.dirname(cls.params.nsf_hifigan), "nsf_hifigan.onnx")
+            return DiffusionSVCInferencer(cls.params.nsf_hifigan, vocoder_onnx_path).loadModel(file, gpu)
+
         else:
             raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
diff --git a/server/voice_changer/DiffusionSVC/inferencer/diffusion_svc_model/diffusion/unit2mel.py b/server/voice_changer/DiffusionSVC/inferencer/diffusion_svc_model/diffusion/unit2mel.py
index 1abdd3e8..73ff8076 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/diffusion_svc_model/diffusion/unit2mel.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/diffusion_svc_model/diffusion/unit2mel.py
@@ -49,22 +49,23 @@ def load_model_vocoder_from_combo(combo_model_path, device='cpu'):
     # args
     diff_args = DotDict(read_dict["diff_config_dict"])
     naive_args = DotDict(read_dict["naive_config_dict"])
-    # vocoder
-    vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)
+    # # vocoder
+    # vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)
 
     # diff_model
     print(' [Loading] ' + combo_model_path)
-    diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
+    # diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
+    diff_model = load_svc_model(args=diff_args, vocoder_dimension=128)
     diff_model.to(device)
     diff_model.load_state_dict(read_dict["diff_model"]['model'])
     diff_model.eval()
 
     # naive_model
-    naive_model = load_svc_model(args=naive_args, vocoder_dimension=vocoder.dimension)
+    naive_model = load_svc_model(args=naive_args, vocoder_dimension=128)
     naive_model.to(device)
     naive_model.load_state_dict(read_dict["naive_model"]['model'])
     naive_model.eval()
-    return diff_model, diff_args, naive_model, naive_args, vocoder
+    return diff_model, diff_args, naive_model, naive_args  # , vocoder
 
 
 def load_svc_model(args, vocoder_dimension):
diff --git a/server/voice_changer/DiffusionSVC/inferencer/onnx/VocoderOnnx.py b/server/voice_changer/DiffusionSVC/inferencer/onnx/VocoderOnnx.py
new file mode 100644
index 00000000..2203c2a7
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/inferencer/onnx/VocoderOnnx.py
@@ -0,0 +1,11 @@
+
+
+class VocoderOnnx:
+    def __init__(self) -> None:
+        pass
+
+    def initialize(self, onnx_path: str, gpu: int):
+        raise Exception("Not implemented")
+
+    def infer(self, out_mel, pitch, silence_front, mask):
+        raise Exception("Not implemented")
diff --git a/server/voice_changer/RVC/embedder/EmbedderManager.py b/server/voice_changer/RVC/embedder/EmbedderManager.py
index 65fcd093..a893524f 100644
--- a/server/voice_changer/RVC/embedder/EmbedderManager.py
+++ b/server/voice_changer/RVC/embedder/EmbedderManager.py
@@ -45,8 +45,8 @@ class EmbedderManager:
                     raise Exception("[Voice Changer][Embedder] onnx is off")
                 file = cls.params.content_vec_500_onnx
                 return OnnxContentvec().loadModel(file, dev)
-            except Exception as e:
-                print(e)
+            except Exception as e:  # noqa
+                print("[Voice Changer] use torch contentvec")
                 file = cls.params.hubert_base
                 return FairseqHubert().loadModel(file, dev, isHalf)
         elif embederType == "hubert-base-japanese":