diff --git a/server/voice_changer/RVC/ModelSlot.py b/server/voice_changer/RVC/ModelSlot.py
index 55b22b91..46f8ce65 100644
--- a/server/voice_changer/RVC/ModelSlot.py
+++ b/server/voice_changer/RVC/ModelSlot.py
@@ -17,6 +17,8 @@ class ModelSlot:
     samplingRate: int = -1
     f0: bool = True
     embChannels: int = 256
+    embOutputLayer: int = 9
+    useFinalProj: bool = True
     deprecated: bool = False
     embedder: EnumEmbedderTypes = EnumEmbedderTypes.hubert
diff --git a/server/voice_changer/RVC/ModelSlotGenerator.py b/server/voice_changer/RVC/ModelSlotGenerator.py
index 04798ba8..580b9888 100644
--- a/server/voice_changer/RVC/ModelSlotGenerator.py
+++ b/server/voice_changer/RVC/ModelSlotGenerator.py
@@ -54,7 +54,9 @@ def generateModelSlot(slotDir: str):
 def _setInfoByPytorch(slot: ModelSlot):
     cpt = torch.load(slot.modelFile, map_location="cpu")
     config_len = len(cpt["config"])
+
     if config_len == 18:
+        # Original RVC
         slot.f0 = True if cpt["f0"] == 1 else False
         version = cpt.get("version", "v1")
         if version is None or version == "v1":
@@ -64,6 +66,8 @@ def _setInfoByPytorch(slot: ModelSlot):
                 else EnumInferenceTypes.pyTorchRVCNono
             )
             slot.embChannels = 256
+            slot.embOutputLayer = 9
+            slot.useFinalProj = True
             slot.embedder = EnumEmbedderTypes.hubert
         else:
             slot.modelType = (
@@ -72,9 +76,12 @@ def _setInfoByPytorch(slot: ModelSlot):
                 else EnumInferenceTypes.pyTorchRVCv2Nono
             )
             slot.embChannels = 768
+            slot.embOutputLayer = 12
+            slot.useFinalProj = False
             slot.embedder = EnumEmbedderTypes.hubert
 
     else:
+        # DDPN RVC
         slot.f0 = True if cpt["f0"] == 1 else False
         slot.modelType = (
             EnumInferenceTypes.pyTorchWebUI
@@ -82,6 +89,32 @@ def _setInfoByPytorch(slot: ModelSlot):
             else EnumInferenceTypes.pyTorchWebUINono
         )
         slot.embChannels = cpt["config"][17]
+        slot.embOutputLayer = (
+            cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
+        )
+        if slot.embChannels == 256:
+            slot.useFinalProj = True
+        else:
+            slot.useFinalProj = False
+
+        # Print the DDPN model info
+        if (
+            slot.embChannels == 256
+            and slot.embOutputLayer == 9
+            and slot.useFinalProj is True
+        ):
+            print("[Voice Changer] DDPN Model: Original v1 like")
+        elif (
+            slot.embChannels == 768
+            and slot.embOutputLayer == 12
+            and slot.useFinalProj is False
+        ):
+            print("[Voice Changer] DDPN Model: Original v2 like")
+        else:
+            print(
+                f"[Voice Changer] DDPN Model: ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}"
+            )
+
         slot.embedder = cpt["embedder_name"]
         if slot.embedder.endswith("768"):
             slot.embedder = slot.embedder[:-3]
@@ -111,6 +144,19 @@ def _setInfoByONNX(slot: ModelSlot):
         # slot.modelType = metadata["modelType"]
         slot.embChannels = metadata["embChannels"]
+        slot.embOutputLayer = (
+            metadata["embedder_output_layer"]
+            if "embedder_output_layer" in metadata
+            else 9
+        )
+
+        if slot.embChannels == 256:
+            slot.useFinalProj = True
+        else:
+            slot.useFinalProj = False
+
+        print("ONNX", slot)
+
         if "embedder" not in metadata:
             slot.embedder = EnumEmbedderTypes.hubert
         elif metadata["embedder"] == EnumEmbedderTypes.hubert.value:
diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index deafe0a2..b78f7b82 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -333,8 +333,8 @@ class RVC:
         f0_up_key = self.settings.tran
         index_rate = self.settings.indexRatio
         if_f0 = 1 if self.settings.modelSlots[self.currentSlot].f0 else 0
-
-        embChannels = self.settings.modelSlots[self.currentSlot].embChannels
+        embOutputLayer = self.settings.modelSlots[self.currentSlot].embOutputLayer
+        useFinalProj = self.settings.modelSlots[self.currentSlot].useFinalProj
 
         audio_out = self.pipeline.exec(
             sid,
@@ -343,7 +343,8 @@ class RVC:
             index_rate,
             if_f0,
             self.settings.extraConvertSize / self.settings.modelSamplingRate,
-            embChannels,
+            embOutputLayer,
+            useFinalProj,
             repeat,
         )
diff --git a/server/voice_changer/RVC/embedder/Embedder.py b/server/voice_changer/RVC/embedder/Embedder.py
index be8d1c49..374044d0 100644
--- a/server/voice_changer/RVC/embedder/Embedder.py
+++ b/server/voice_changer/RVC/embedder/Embedder.py
@@ -17,7 +17,9 @@ class Embedder(Protocol):
     def loadModel(self, file: str, dev: device, isHalf: bool = True):
         ...
 
-    def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
+    def extractFeatures(
+        self, feats: torch.Tensor, embOutputLayer=9, useFinalProj=True
+    ) -> torch.Tensor:
         ...
 
     def setProps(
diff --git a/server/voice_changer/RVC/embedder/FairseqHubert.py b/server/voice_changer/RVC/embedder/FairseqHubert.py
index 2a981502..b62150dd 100644
--- a/server/voice_changer/RVC/embedder/FairseqHubert.py
+++ b/server/voice_changer/RVC/embedder/FairseqHubert.py
@@ -23,23 +23,23 @@ class FairseqHubert(Embedder):
         self.model = model
         return self
 
-    def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
+    def extractFeatures(
+        self, feats: torch.Tensor, embOutputLayer=9, useFinalProj=True
+    ) -> torch.Tensor:
         padding_mask = torch.BoolTensor(feats.shape).to(self.dev).fill_(False)
-        if embChannels == 256:
-            inputs = {
-                "source": feats.to(self.dev),
-                "padding_mask": padding_mask,
-                "output_layer": 9,  # layer 9
-            }
-        else:
-            inputs = {
-                "source": feats.to(self.dev),
-                "padding_mask": padding_mask,
-            }
+
+        # Original v1 applied final_proj to the layer 9 output. (-> 256)
+        # Original v2 uses the layer 12 output without final_proj. (-> 768)
+
+        inputs = {
+            "source": feats.to(self.dev),
+            "padding_mask": padding_mask,
+            "output_layer": embOutputLayer,  # 9 or 12
+        }
         with torch.no_grad():
             logits = self.model.extract_features(**inputs)
-            if embChannels == 256:
+            if useFinalProj:
                 feats = self.model.final_proj(logits[0])
             else:
                 feats = logits[0]
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index fad1b634..8c4364e5 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -82,7 +82,8 @@ class Pipeline(object):
         index_rate,
         if_f0,
         silence_front,
-        embChannels,
+        embOutputLayer,
+        useFinalProj,
         repeat,
     ):
         self.t_pad = self.sr * repeat
@@ -127,7 +128,7 @@ class Pipeline(object):
         # embedding
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
         try:
-            feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
+            feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
                 raise HalfPrecisionChangingException()
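For context, here is a minimal, self-contained sketch of the convention the new embOutputLayer/useFinalProj parameters encode: v1-style models take the HuBERT layer 9 output and pass it through final_proj (256 channels), while v2-style models take the layer 12 output as-is (768 channels). It is not part of the patch; DummyHubert, extract, and the 320-samples-per-frame shape are illustrative assumptions, not the project's API.

import torch


class DummyHubert:
    """Illustrative stand-in for the fairseq HuBERT model wrapped by FairseqHubert."""

    def __init__(self):
        self.final_proj = torch.nn.Linear(768, 256)

    def extract_features(self, source, padding_mask, output_layer):
        # The real model returns (features, ...); fake a 768-dim feature sequence
        # with one frame per 320 input samples.
        frames = source.shape[-1] // 320
        return (torch.randn(source.shape[0], frames, 768),)


def extract(model, feats, embOutputLayer=9, useFinalProj=True):
    # Mirrors the shape of FairseqHubert.extractFeatures after this patch.
    padding_mask = torch.BoolTensor(feats.shape).fill_(False)
    inputs = {
        "source": feats,
        "padding_mask": padding_mask,
        "output_layer": embOutputLayer,  # 9 (v1-like) or 12 (v2-like)
    }
    with torch.no_grad():
        logits = model.extract_features(**inputs)
        return model.final_proj(logits[0]) if useFinalProj else logits[0]


model = DummyHubert()
audio = torch.randn(1, 16000)
print(extract(model, audio, embOutputLayer=9, useFinalProj=True).shape)    # torch.Size([1, 50, 256])
print(extract(model, audio, embOutputLayer=12, useFinalProj=False).shape)  # torch.Size([1, 50, 768])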