From 80a5ba91b842ac0a05e8f4b151263401985d7c4f Mon Sep 17 00:00:00 2001
From: w-okada
Date: Wed, 6 Sep 2023 08:04:39 +0900
Subject: [PATCH] WIP onnx improve

---
 server/data/ModelSlot.py                           |  2 ++
 server/voice_changer/RVC/RVCModelSlotGenerator.py  | 10 ++++++++++
 .../RVC/inferencer/InferencerManager.py            |  8 +++++---
 .../RVC/inferencer/OnnxRVCInferencer.py            | 14 ++++++++++++--
 .../RVC/inferencer/OnnxRVCInferencerNono.py        | 11 ++++++++---
 .../voice_changer/RVC/inferencer/RVCInferencer.py  |  6 +++++-
 .../RVC/inferencer/RVCInferencerNono.py            |  5 ++++-
 .../RVC/inferencer/RVCInferencerv2.py              |  6 +++++-
 .../RVC/inferencer/RVCInferencerv2Nono.py          |  5 ++++-
 .../RVC/inferencer/WebUIInferencer.py              |  6 +++++-
 .../RVC/inferencer/WebUIInferencerNono.py          |  5 ++++-
 .../onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py |  6 ++++--
 .../voice_changer/RVC/onnxExporter/export2onnx.py  |  2 +-
 server/voice_changer/RVC/pipeline/Pipeline.py      | 10 ++--------
 .../RVC/pipeline/PipelineGenerator.py              |  2 +-
 15 files changed, 72 insertions(+), 26 deletions(-)

diff --git a/server/data/ModelSlot.py b/server/data/ModelSlot.py
index a54d84be..495af6f5 100644
--- a/server/data/ModelSlot.py
+++ b/server/data/ModelSlot.py
@@ -40,6 +40,8 @@ class RVCModelSlot(ModelSlot):
     sampleId: str = ""
     speakers: dict = field(default_factory=lambda: {0: "target"})
 
+    version:str = "v2"
+
 
 @dataclass
 class MMVCv13ModelSlot(ModelSlot):
diff --git a/server/voice_changer/RVC/RVCModelSlotGenerator.py b/server/voice_changer/RVC/RVCModelSlotGenerator.py
index 610a3713..6ac9aa9f 100644
--- a/server/voice_changer/RVC/RVCModelSlotGenerator.py
+++ b/server/voice_changer/RVC/RVCModelSlotGenerator.py
@@ -154,6 +154,16 @@ class RVCModelSlotGenerator(ModelSlotGenerator):
             slot.samplingRate = metadata["samplingRate"]
             slot.deprecated = False
 
+            if slot.embChannels == 256:
+                if metadata["version"] == "2.1":
+                    slot.version = "v1.1"  # 1.1はclipをonnx内部で実施. realtimeをdisable
+                else:
+                    slot.version = "v1"
+            elif metadata["version"] == "2":
+                slot.version = "v2"
+            elif metadata["version"] == "2.1":  # 2.1はclipをonnx内部で実施. realtimeをdisable
+                slot.version = "v2.1"
+
         except Exception as e:
             slot.modelType = EnumInferenceTypes.onnxRVC.value
             slot.embChannels = 256
diff --git a/server/voice_changer/RVC/inferencer/InferencerManager.py b/server/voice_changer/RVC/inferencer/InferencerManager.py
index 56e17112..884af19a 100644
--- a/server/voice_changer/RVC/inferencer/InferencerManager.py
+++ b/server/voice_changer/RVC/inferencer/InferencerManager.py
@@ -20,8 +20,9 @@ class InferencerManager:
         inferencerType: EnumInferenceTypes,
         file: str,
         gpu: int,
+        inferencerTypeVersion: str | None = None,
     ) -> Inferencer:
-        cls.currentInferencer = cls.loadInferencer(inferencerType, file, gpu)
+        cls.currentInferencer = cls.loadInferencer(inferencerType, file, gpu, inferencerTypeVersion)
         return cls.currentInferencer
 
     @classmethod
@@ -30,6 +31,7 @@
         inferencerType: EnumInferenceTypes,
         file: str,
         gpu: int,
+        inferencerTypeVersion: str | None = None,
    ) -> Inferencer:
         if inferencerType == EnumInferenceTypes.pyTorchRVC or inferencerType == EnumInferenceTypes.pyTorchRVC.value:
             return RVCInferencer().loadModel(file, gpu)
@@ -50,8 +52,8 @@
         elif inferencerType == EnumInferenceTypes.pyTorchWebUINono or inferencerType == EnumInferenceTypes.pyTorchWebUINono.value:
             return WebUIInferencerNono().loadModel(file, gpu)
         elif inferencerType == EnumInferenceTypes.onnxRVC or inferencerType == EnumInferenceTypes.onnxRVC.value:
-            return OnnxRVCInferencer().loadModel(file, gpu)
+            return OnnxRVCInferencer().loadModel(file, gpu, inferencerTypeVersion)
         elif inferencerType == EnumInferenceTypes.onnxRVCNono or inferencerType == EnumInferenceTypes.onnxRVCNono.value:
-            return OnnxRVCInferencerNono().loadModel(file, gpu)
+            return OnnxRVCInferencerNono().loadModel(file, gpu, inferencerTypeVersion)
         else:
             raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
diff --git a/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py b/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
index 9d3c8b7e..f95b5307 100644
--- a/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
+++ b/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
@@ -7,7 +7,7 @@ import numpy as np
 
 
 class OnnxRVCInferencer(Inferencer):
-    def loadModel(self, file: str, gpu: int):
+    def loadModel(self, file: str, gpu: int, inferencerTypeVersion: str | None = None):
         self.setProps(EnumInferenceTypes.onnxRVC, file, True, gpu)
         (
             onnxProviders,
@@ -26,6 +26,9 @@
         self.isHalf = True
 
         self.model = onnx_session
+
+        self.inferencerTypeVersion = inferencerTypeVersion
+
         return self
 
     def infer(
@@ -66,7 +69,14 @@
             },
         )
 
-        return torch.tensor(np.array(audio1))
+        if self.inferencerTypeVersion == "v2.1" or self.inferencerTypeVersion == "v1.1":
+            res = audio1[0]
+        else:
+            res = np.array(audio1)[0][0, 0]
+        res = np.clip(res, -1.0, 1.0)
+        return torch.tensor(res)
+
+        # return torch.tensor(np.array(audio1))
 
     def getInferencerInfo(self):
         inferencer = super().getInferencerInfo()
diff --git a/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py b/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
index 9c8bad34..fb18985a 100644
--- a/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
+++ b/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
@@ -6,8 +6,8 @@ from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
 
 
 class OnnxRVCInferencerNono(OnnxRVCInferencer):
-    def loadModel(self, file: str, gpu: int):
-        super().loadModel(file, gpu)
+    def loadModel(self, file: str, gpu: int, inferencerTypeVersion: str | None = None):
+        super().loadModel(file, gpu, inferencerTypeVersion)
         self.setProps(EnumInferenceTypes.onnxRVCNono, file, self.isHalf, gpu)
         return self
 
@@ -39,4 +39,9 @@ class OnnxRVCInferencerNono(OnnxRVCInferencer):
             },
         )
 
-        return torch.tensor(np.array(audio1))
+        if self.inferencerTypeVersion == "v2.1" or self.inferencerTypeVersion == "v1.1":
+            res = audio1[0]
+        else:
+            res = np.array(audio1)[0][0, 0]
+        res = np.clip(res, -1.0, 1.0)
+        return torch.tensor(res)
diff --git a/server/voice_changer/RVC/inferencer/RVCInferencer.py b/server/voice_changer/RVC/inferencer/RVCInferencer.py
index 9539d77c..a332036f 100644
--- a/server/voice_changer/RVC/inferencer/RVCInferencer.py
+++ b/server/voice_changer/RVC/inferencer/RVCInferencer.py
@@ -35,4 +35,8 @@ class RVCInferencer(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
+
diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py
index 30355b9a..b6086e5f 100644
--- a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py
+++ b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py
@@ -35,4 +35,7 @@ class RVCInferencerNono(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv2.py b/server/voice_changer/RVC/inferencer/RVCInferencerv2.py
index 31fbe484..ff814d6a 100644
--- a/server/voice_changer/RVC/inferencer/RVCInferencerv2.py
+++ b/server/voice_changer/RVC/inferencer/RVCInferencerv2.py
@@ -34,4 +34,8 @@ class RVCInferencerv2(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
+
diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py b/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py
index 7b85dc96..c2a40252 100644
--- a/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py
+++ b/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py
@@ -35,4 +35,7 @@ class RVCInferencerv2Nono(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
\ No newline at end of file
diff --git a/server/voice_changer/RVC/inferencer/WebUIInferencer.py b/server/voice_changer/RVC/inferencer/WebUIInferencer.py
index d884f4c7..88a6890a 100644
--- a/server/voice_changer/RVC/inferencer/WebUIInferencer.py
+++ b/server/voice_changer/RVC/inferencer/WebUIInferencer.py
@@ -35,4 +35,8 @@ class WebUIInferencer(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
+
diff --git a/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py b/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
index 7bc54f14..382aa14a 100644
--- a/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
+++ b/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
@@ -35,4 +35,7 @@ class WebUIInferencerNono(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
\ No newline at end of file
diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py
index a7193feb..40ed3e5e 100644
--- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py
+++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py
@@ -64,5 +64,7 @@ class SynthesizerTrnMs768NSFsid_ONNX(nn.Module):
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
-        return o, x_mask, (z, z_p, m_p, logs_p)
+        # o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        o = torch.clip(o[0, 0], -1.0, 1.0)
+        return o
\ No newline at end of file
diff --git a/server/voice_changer/RVC/onnxExporter/export2onnx.py b/server/voice_changer/RVC/onnxExporter/export2onnx.py
index 66855f1c..eb1405c6 100644
--- a/server/voice_changer/RVC/onnxExporter/export2onnx.py
+++ b/server/voice_changer/RVC/onnxExporter/export2onnx.py
@@ -37,7 +37,7 @@ def export2onnx(gpu: int, modelSlot: RVCModelSlot):
     output_path_simple = os.path.join(TMP_DIR, output_file_simple)
     metadata = {
         "application": "VC_CLIENT",
-        "version": "2",
+        "version": "2.1",
         "modelType": modelSlot.modelType,
         "samplingRate": modelSlot.samplingRate,
         "f0": modelSlot.f0,
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index eb80b525..05a17177 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -118,14 +118,8 @@ class Pipeline(object):
         try:
             with torch.no_grad():
                 with autocast(enabled=self.isHalf):
-                    audio1 = (
-                        torch.clip(
-                            self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
-                            -1.0,
-                            1.0,
-                        )
-                        * 32767.5
-                    ).data.to(dtype=torch.int16)
+                    audio1 = self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)
+                    audio1 = (audio1 * 32767.5).data.to(dtype=torch.int16)
             return audio1
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
diff --git a/server/voice_changer/RVC/pipeline/PipelineGenerator.py b/server/voice_changer/RVC/pipeline/PipelineGenerator.py
index 4bab0965..6f12b91d 100644
--- a/server/voice_changer/RVC/pipeline/PipelineGenerator.py
+++ b/server/voice_changer/RVC/pipeline/PipelineGenerator.py
@@ -19,7 +19,7 @@ def createPipeline(params: VoiceChangerParams, modelSlot: RVCModelSlot, gpu: int
     # Inferencer 生成
     try:
         modelPath = os.path.join(params.model_dir, str(modelSlot.slotIndex), os.path.basename(modelSlot.modelFile))
-        inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelPath, gpu)
+        inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelPath, gpu, modelSlot.version)
     except Exception as e:
         print("[Voice Changer] exception! loading inferencer", e)
         traceback.print_exc()