WIP: configure diffusion svc

w-okada 2023-07-15 10:01:42 +09:00
parent 3ffacaed97
commit 01291dc4e3
6 changed files with 39 additions and 106 deletions

View File

@@ -112,7 +112,9 @@ class DiffusionSVCModelSlot(ModelSlot):
     sampleId: str = ""
     defaultTune: int = 0
-    kstep: int = 100
+    defaultKstep: int = 20
+    defaultSpeedup: int = 10
+    kStepMax: int = 100
     speakers: dict = field(default_factory=lambda: {1: "user"})
     embedder: EmbedderType = "hubert_base"
     samplingRate: int = 44100

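Taken together, the old single `kstep` field (default 100) is split three ways: `defaultKstep` is the initial k-step, `defaultSpeedup` the initial speedup, and `kStepMax` an upper bound that the slot generator later reads from the model file. A minimal sketch of how such a bound could be enforced; `SlotDefaults` and `clamp_kstep` are illustrative, not part of this commit:

from dataclasses import dataclass

@dataclass
class SlotDefaults:
    # Field names mirror the diff above; the class itself is hypothetical.
    defaultKstep: int = 20   # initial k-step offered to the user
    defaultSpeedup: int = 10
    kStepMax: int = 100      # upper bound reported by the loaded model

def clamp_kstep(slot: SlotDefaults, requested: int) -> int:
    # Keep a user-requested k-step within what the model supports.
    return max(1, min(requested, slot.kStepMax))

assert clamp_kstep(SlotDefaults(), 500) == 100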
View File

@@ -42,7 +42,7 @@ class DiffusionSVC(VoiceChangerModel):
         # Other settings
         self.settings.tran = self.slotInfo.defaultTune
         self.settings.dstId = self.slotInfo.dstId
-        self.settings.kstep = self.slotInfo.kstep
+        self.settings.kstep = self.slotInfo.defaultKstep
         print("[Voice Changer] [DiffusionSVC] Initializing... done")
@@ -190,18 +190,18 @@ class DiffusionSVC(VoiceChangerModel):
         #     "filename": output_file_simple,
         # }

-    # def get_model_current(self):
-    #     return [
-    #         {
-    #             "key": "defaultTune",
-    #             "val": self.settings.tran,
-    #         },
-    #         {
-    #             "key": "defaultIndexRatio",
-    #             "val": self.settings.indexRatio,
-    #         },
-    #         {
-    #             "key": "defaultProtect",
-    #             "val": self.settings.protect,
-    #         },
-    #     ]
+    def get_model_current(self):
+        return [
+            {
+                "key": "defaultTune",
+                "val": self.settings.tran,
+            },
+            {
+                "key": "defaultKstep",
+                "val": self.settings.kstep,
+            },
+            {
+                "key": "defaultSpeedup",
+                "val": self.settings.speedup,
+            },
+        ]

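The commented-out `get_model_current` comes back to life, reporting the runtime `kstep` and `speedup` in place of RVC's `indexRatio` and `protect`. A hedged usage sketch, assuming `model` is an initialized `DiffusionSVC` instance:

# Collapse the key/val pairs into a dict for easy lookup (illustrative only).
current = {entry["key"]: entry["val"] for entry in model.get_model_current()}
print(current["defaultKstep"], current["defaultSpeedup"])  # e.g. 20 10 after init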
View File

@@ -1,11 +1,11 @@
 import os
 from const import EnumInferenceTypes
 from dataclasses import asdict
-import torch
 import onnxruntime
 import json
 from data.ModelSlot import DiffusionSVCModelSlot, ModelSlot, RVCModelSlot
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import load_model_vocoder_from_combo
 from voice_changer.utils.LoadModelParams import LoadModelParams
 from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator
@@ -23,90 +23,16 @@ class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
         slotInfo.iconFile = "/assets/icons/noimage.png"
         slotInfo.embChannels = 768
-        # if slotInfo.isONNX:
-        #     slotInfo = cls._setInfoByONNX(slotInfo)
-        # else:
-        #     slotInfo = cls._setInfoByPytorch(slotInfo)
+        if slotInfo.isONNX:
+            slotInfo = cls._setInfoByONNX(slotInfo)
+        else:
+            slotInfo = cls._setInfoByPytorch(slotInfo)
         return slotInfo

     @classmethod
-    def _setInfoByPytorch(cls, slot: ModelSlot):
-        cpt = torch.load(slot.modelFile, map_location="cpu")
-        config_len = len(cpt["config"])
-        version = cpt.get("version", "v1")
-        slot = RVCModelSlot(**asdict(slot))
-
-        if version == "voras_beta":
-            slot.f0 = True if cpt["f0"] == 1 else False
-            slot.modelType = EnumInferenceTypes.pyTorchVoRASbeta.value
-            slot.embChannels = 768
-            slot.embOutputLayer = cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
-            slot.useFinalProj = False
-
-            slot.embedder = cpt["embedder_name"]
-            if slot.embedder.endswith("768"):
-                slot.embedder = slot.embedder[:-3]
-            # if slot.embedder == "hubert":
-            #     slot.embedder = "hubert"
-            # elif slot.embedder == "contentvec":
-            #     slot.embedder = "contentvec"
-            # elif slot.embedder == "hubert_jp":
-            #     slot.embedder = "hubert_jp"
-            else:
-                raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder")
-        elif config_len == 18:
-            # Original RVC
-            slot.f0 = True if cpt["f0"] == 1 else False
-            version = cpt.get("version", "v1")
-            if version is None or version == "v1":
-                slot.modelType = EnumInferenceTypes.pyTorchRVC.value if slot.f0 else EnumInferenceTypes.pyTorchRVCNono.value
-                slot.embChannels = 256
-                slot.embOutputLayer = 9
-                slot.useFinalProj = True
-                slot.embedder = "hubert_base"
-                print("[Voice Changer] Official Model(pyTorch) : v1")
-            else:
-                slot.modelType = EnumInferenceTypes.pyTorchRVCv2.value if slot.f0 else EnumInferenceTypes.pyTorchRVCv2Nono.value
-                slot.embChannels = 768
-                slot.embOutputLayer = 12
-                slot.useFinalProj = False
-                slot.embedder = "hubert_base"
-                print("[Voice Changer] Official Model(pyTorch) : v2")
-        else:
-            # DDPN RVC
-            slot.f0 = True if cpt["f0"] == 1 else False
-            slot.modelType = EnumInferenceTypes.pyTorchWebUI.value if slot.f0 else EnumInferenceTypes.pyTorchWebUINono.value
-            slot.embChannels = cpt["config"][17]
-            slot.embOutputLayer = cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
-            if slot.embChannels == 256:
-                slot.useFinalProj = True
-            else:
-                slot.useFinalProj = False
-
-            # Display the DDPN model info
-            if slot.embChannels == 256 and slot.embOutputLayer == 9 and slot.useFinalProj is True:
-                print("[Voice Changer] DDPN Model(pyTorch) : Official v1 like")
-            elif slot.embChannels == 768 and slot.embOutputLayer == 12 and slot.useFinalProj is False:
-                print("[Voice Changer] DDPN Model(pyTorch): Official v2 like")
-            else:
-                print(f"[Voice Changer] DDPN Model(pyTorch): ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
-
-            slot.embedder = cpt["embedder_name"]
-            if slot.embedder.endswith("768"):
-                slot.embedder = slot.embedder[:-3]
-
-            if "speaker_info" in cpt.keys():
-                for k, v in cpt["speaker_info"].items():
-                    slot.speakers[int(k)] = str(v)
-
-        slot.samplingRate = cpt["config"][-1]
-        del cpt
+    def _setInfoByPytorch(cls, slot: DiffusionSVCModelSlot):
+        diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(slot.modelFile, device="cpu")
+        slot.kStepMax = diff_args.model.k_step_max
         return slot

     @classmethod

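PyTorch combo models are no longer parsed with the RVC checkpoint logic; the generator loads the combined diffusion/naive pair and keeps only `k_step_max`. A slightly defensive variant, sketched under the assumption that `diff_args.model` might omit the attribute (the fallback is not in the commit):

diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(
    slot.modelFile, device="cpu"
)
# Fall back to 100 (the old hard-coded ceiling) when k_step_max is absent.
slot.kStepMax = getattr(diff_args.model, "k_step_max", None) or 100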
View File

@@ -11,7 +11,8 @@ class DiffusionSVCSettings:
     silentThreshold: float = 0.00001
     extraConvertSize: int = 1024 * 4
-    kstep: int = 100
+    kstep: int = 20
+    speedup: int = 10
     silenceFront: int = 1  # 0:off, 1:on
     modelSamplingRate: int = 44100

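The settings default drops from a 100-step full diffusion to a 20-step shallow one. If `speedup` is the usual Diffusion-SVC step-skip factor (an assumption; the diff itself does not say), the sampler runs roughly `kstep // speedup` denoising steps:

kstep, speedup = 20, 10               # new defaults from the diff above
steps_run = max(1, kstep // speedup)  # assumed relation -> 2 steps per inference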
View File

@@ -39,9 +39,13 @@ class PitchExtractorManager(Protocol):
         elif pitchExtractorType == "crepe_full":
             return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
         elif pitchExtractorType == "rmvpe":
+            print("pitchExtractorType", pitchExtractorType)
             return RMVPEPitchExtractor(cls.params.rmvpe, gpu)
         else:
             # return hubert as default
-            raise RuntimeError(
-                "[Voice Changer] PitchExctractor not found", pitchExtractorType
-            )
+            print("[Voice Changer] PitchExctractor not found", pitchExtractorType)
+            print(" fallback to dio")
+            return DioPitchExtractor()
+            # raise RuntimeError(
+            #     "[Voice Changer] PitchExctractor not found", pitchExtractorType
+            # )

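An unknown extractor type now degrades gracefully instead of raising. A usage sketch; the factory method name `getPitchExtractor` is assumed, since this hunk does not show the enclosing signature:

pe = PitchExtractorManager.getPitchExtractor("no_such_extractor", gpu=-1)
# Expected console output (typo is verbatim from the code):
#   [Voice Changer] PitchExctractor not found no_such_extractor
#    fallback to dio
assert isinstance(pe, DioPitchExtractor)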
View File

@@ -39,6 +39,6 @@ class PitchExtractorManager(Protocol):
             return CrepeOnnxPitchExtractor(pitchExtractorType, cls.params.crepe_onnx_full, gpu)
         else:
             # return hubert as default
-            raise RuntimeError(
-                "[Voice Changer] PitchExctractor not found", pitchExtractorType
-            )
+            print("[Voice Changer] PitchExctractor not found", pitchExtractorType)
+            print(" fallback to dio")
+            return DioPitchExtractor()