Mirror of https://github.com/w-okada/voice-changer.git

WIP onnx improve

commit 80a5ba91b8
parent 93480636a3
@@ -40,6 +40,8 @@ class RVCModelSlot(ModelSlot):
     sampleId: str = ""
     speakers: dict = field(default_factory=lambda: {0: "target"})

+    version: str = "v2"
+

 @dataclass
 class MMVCv13ModelSlot(ModelSlot):
@@ -154,6 +154,16 @@ class RVCModelSlotGenerator(ModelSlotGenerator):
             slot.samplingRate = metadata["samplingRate"]
             slot.deprecated = False

+            if slot.embChannels == 256:
+                if metadata["version"] == "2.1":
+                    slot.version = "v1.1"  # v1.1 performs the clip inside the onnx graph; realtime is disabled
+                else:
+                    slot.version = "v1"
+            elif metadata["version"] == "2":
+                slot.version = "v2"
+            elif metadata["version"] == "2.1":  # 2.1 performs the clip inside the onnx graph; realtime is disabled
+                slot.version = "v2.1"
+
         except Exception as e:
             slot.modelType = EnumInferenceTypes.onnxRVC.value
             slot.embChannels = 256
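Taken together, this hunk derives the slot's version tag from the ONNX metadata and the embedder width: 256-channel embedders are v1-family, wider ones are v2-family, and metadata "2.1" marks graphs that clip internally. A minimal sketch of the same dispatch as a standalone function (the helper name and the fallback are illustrative, not part of the repo):

def resolve_slot_version(emb_channels: int, metadata_version: str) -> str:
    # Hypothetical helper mirroring the dispatch added above.
    if emb_channels == 256:
        # "2.1" metadata on a 256-channel model means the v1.1 export
        return "v1.1" if metadata_version == "2.1" else "v1"
    if metadata_version == "2":
        return "v2"
    if metadata_version == "2.1":
        return "v2.1"
    return "v2"  # assumption: fall back to the dataclass default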
@@ -20,8 +20,9 @@ class InferencerManager:
         inferencerType: EnumInferenceTypes,
         file: str,
         gpu: int,
+        inferencerTypeVersion: str | None = None,
     ) -> Inferencer:
-        cls.currentInferencer = cls.loadInferencer(inferencerType, file, gpu)
+        cls.currentInferencer = cls.loadInferencer(inferencerType, file, gpu, inferencerTypeVersion)
         return cls.currentInferencer

     @classmethod
@@ -30,6 +31,7 @@ class InferencerManager:
         inferencerType: EnumInferenceTypes,
         file: str,
         gpu: int,
+        inferencerTypeVersion: str | None = None,
     ) -> Inferencer:
         if inferencerType == EnumInferenceTypes.pyTorchRVC or inferencerType == EnumInferenceTypes.pyTorchRVC.value:
             return RVCInferencer().loadModel(file, gpu)
@@ -50,8 +52,8 @@ class InferencerManager:
         elif inferencerType == EnumInferenceTypes.pyTorchWebUINono or inferencerType == EnumInferenceTypes.pyTorchWebUINono.value:
             return WebUIInferencerNono().loadModel(file, gpu)
         elif inferencerType == EnumInferenceTypes.onnxRVC or inferencerType == EnumInferenceTypes.onnxRVC.value:
-            return OnnxRVCInferencer().loadModel(file, gpu)
+            return OnnxRVCInferencer().loadModel(file, gpu, inferencerTypeVersion)
         elif inferencerType == EnumInferenceTypes.onnxRVCNono or inferencerType == EnumInferenceTypes.onnxRVCNono.value:
-            return OnnxRVCInferencerNono().loadModel(file, gpu)
+            return OnnxRVCInferencerNono().loadModel(file, gpu, inferencerTypeVersion)
         else:
             raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
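With these changes the version tag threads from the model slot through the manager into the concrete ONNX inferencers; the final hunk below does exactly this with modelSlot.version. A hedged usage sketch (import paths assumed from the repo layout, path and slot values made up):

from const import EnumInferenceTypes  # assumed location of the enum
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager

inferencer = InferencerManager.getInferencer(
    EnumInferenceTypes.onnxRVC,  # model type stored on the slot
    "model_dir/0/model.onnx",    # hypothetical model path
    0,                           # gpu index
    "v2.1",                      # forwarded to OnnxRVCInferencer.loadModel
)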
@@ -7,7 +7,7 @@ import numpy as np


 class OnnxRVCInferencer(Inferencer):
-    def loadModel(self, file: str, gpu: int):
+    def loadModel(self, file: str, gpu: int, inferencerTypeVersion: str | None = None):
         self.setProps(EnumInferenceTypes.onnxRVC, file, True, gpu)
         (
             onnxProviders,
@@ -26,6 +26,9 @@ class OnnxRVCInferencer(Inferencer):
         self.isHalf = True

         self.model = onnx_session
+
+        self.inferencerTypeVersion = inferencerTypeVersion
+
         return self

     def infer(
@@ -66,7 +69,14 @@ class OnnxRVCInferencer(Inferencer):
             },
         )

-        return torch.tensor(np.array(audio1))
+        if self.inferencerTypeVersion == "v2.1" or self.inferencerTypeVersion == "v1.1":
+            res = audio1[0]
+        else:
+            res = np.array(audio1)[0][0, 0]
+        res = np.clip(res, -1.0, 1.0)
+        return torch.tensor(res)
+
+        # return torch.tensor(np.array(audio1))

     def getInferencerInfo(self):
         inferencer = super().getInferencerInfo()
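The new branch reflects two output layouts: v1.1/v2.1 graphs return an already-squeezed, pre-clipped waveform as their first output, while older graphs return a nested [1, 1, T] array that still needs squeezing. A toy shape check with dummy data (not the real session output):

import numpy as np

audio_old = [np.zeros((1, 1, 480), dtype=np.float32)]  # pre-v1.1/v2.1 layout
res_old = np.array(audio_old)[0][0, 0]                 # -> shape (480,)

audio_new = [np.zeros(480, dtype=np.float32)]          # v1.1/v2.1 layout
res_new = audio_new[0]                                 # -> shape (480,)

# Both paths end clamped to [-1, 1] before being wrapped in a tensor.
assert np.clip(res_old, -1.0, 1.0).shape == res_new.shape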
@@ -6,8 +6,8 @@ from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer


 class OnnxRVCInferencerNono(OnnxRVCInferencer):
-    def loadModel(self, file: str, gpu: int):
-        super().loadModel(file, gpu)
+    def loadModel(self, file: str, gpu: int, inferencerTypeVersion: str | None = None):
+        super().loadModel(file, gpu, inferencerTypeVersion)
         self.setProps(EnumInferenceTypes.onnxRVCNono, file, self.isHalf, gpu)
         return self

@@ -39,4 +39,9 @@ class OnnxRVCInferencerNono(OnnxRVCInferencer):
             },
         )

-        return torch.tensor(np.array(audio1))
+        if self.inferencerTypeVersion == "v2.1" or self.inferencerTypeVersion == "v1.1":
+            res = audio1[0]
+        else:
+            res = np.array(audio1)[0][0, 0]
+        res = np.clip(res, -1.0, 1.0)
+        return torch.tensor(res)
@@ -35,4 +35,8 @@ class RVCInferencer(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
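This hunk and the five that follow move the same postprocessing (squeeze the nested model output to a 1-D float32 waveform, then clamp to [-1, 1]) into every PyTorch inferencer's infer() instead of the pipeline. A compact sketch of the shared pattern, assuming the model returns its usual nested [batch][1, 1, T] output (the helper name is illustrative):

import torch

def postprocess(raw) -> torch.Tensor:
    # raw: nested model output, indexed as [0][0, 0] -> 1-D waveform of T samples
    res = raw[0][0, 0].to(dtype=torch.float32)
    return torch.clip(res, -1.0, 1.0)  # clamp before the pipeline's int16 scaling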
@@ -35,4 +35,7 @@ class RVCInferencerNono(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
@@ -34,4 +34,8 @@ class RVCInferencerv2(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
@@ -35,4 +35,7 @@ class RVCInferencerv2Nono(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
@@ -35,4 +35,8 @@ class WebUIInferencer(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
@@ -35,4 +35,7 @@ class WebUIInferencerNono(Inferencer):
         sid: torch.Tensor,
         convert_length: int | None,
     ) -> torch.Tensor:
-        return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
+        res = res[0][0, 0].to(dtype=torch.float32)
+        res = torch.clip(res, -1.0, 1.0)
+        return res
@@ -64,5 +64,7 @@ class SynthesizerTrnMs768NSFsid_ONNX(nn.Module):
         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
         z = self.flow(z_p, x_mask, g=g, reverse=True)
-        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
-        return o, x_mask, (z, z_p, m_p, logs_p)
+        # o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        o = torch.clip(o[0, 0], -1.0, 1.0)
+        return o
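Baking the squeeze and clip into the exported forward is what lets the ONNX inferencers take audio1[0] directly for v1.1/v2.1 graphs: the graph now emits a single pre-clipped 1-D waveform instead of the (o, x_mask, (z, z_p, m_p, logs_p)) tuple. A toy illustration of the contract change (shapes assumed, not taken from the model):

import torch

dec_out = torch.randn(1, 1, 480)                  # decoder output: (batch, 1, T)
new_style = torch.clip(dec_out[0, 0], -1.0, 1.0)  # v2.1 export: 1-D, already in [-1, 1]
assert new_style.shape == (480,)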
@@ -37,7 +37,7 @@ def export2onnx(gpu: int, modelSlot: RVCModelSlot):
     output_path_simple = os.path.join(TMP_DIR, output_file_simple)
     metadata = {
         "application": "VC_CLIENT",
-        "version": "2",
+        "version": "2.1",
         "modelType": modelSlot.modelType,
         "samplingRate": modelSlot.samplingRate,
         "f0": modelSlot.f0,
@@ -118,14 +118,8 @@ class Pipeline(object):
         try:
             with torch.no_grad():
                 with autocast(enabled=self.isHalf):
-                    audio1 = (
-                        torch.clip(
-                            self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
-                            -1.0,
-                            1.0,
-                        )
-                        * 32767.5
-                    ).data.to(dtype=torch.int16)
+                    audio1 = self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)
+                    audio1 = (audio1 * 32767.5).data.to(dtype=torch.int16)
             return audio1
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
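Because every inferencer now hands back a clipped float waveform in [-1, 1], the pipeline's only remaining job is scaling to 16-bit PCM. A quick check of the arithmetic (sample values illustrative):

import torch

audio = torch.tensor([-1.0, 0.0, 0.5, 1.0])    # clipped inferencer output
pcm = (audio * 32767.5).to(dtype=torch.int16)  # int conversion truncates toward zero
# -> tensor([-32767,      0,  16383,  32767], dtype=torch.int16)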
@@ -19,7 +19,7 @@ def createPipeline(params: VoiceChangerParams, modelSlot: RVCModelSlot, gpu: int
     # Create the Inferencer
     try:
         modelPath = os.path.join(params.model_dir, str(modelSlot.slotIndex), os.path.basename(modelSlot.modelFile))
-        inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelPath, gpu)
+        inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelPath, gpu, modelSlot.version)
     except Exception as e:
         print("[Voice Changer] exception! loading inferencer", e)
         traceback.print_exc()