mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-02-02 16:23:58 +03:00
WIP: Japanese Hubert
This commit is contained in:
parent
7c70999f33
commit
72fb482dc7
@ -66,7 +66,22 @@ def getFrontendPath():
|
||||
return frontend_path
|
||||
|
||||
|
||||
# "hubert_base", "contentvec", "distilhubert"
|
||||
class EnumEmbedderTypes(Enum):
|
||||
hubert = "hubert"
|
||||
hubert = "hubert_base"
|
||||
contentvec = "contentvec"
|
||||
hubert_jp = "hubert_jp"
|
||||
hubert_jp = "hubert-base-japanese"
|
||||
|
||||
|
||||
class EnumInferenceTypes(Enum):
|
||||
pyTorchRVC = "pyTorchRVC"
|
||||
pyTorchRVCNono = "pyTorchRVCNono"
|
||||
pyTorchWebUI = "pyTorchWebUI"
|
||||
pyTorchWebUINono = "pyTorchWebUINono"
|
||||
onnxRVC = "onnxRVC"
|
||||
onnxRVCNono = "onnxRVCNono"
|
||||
|
||||
|
||||
class EnumFrameworkTypes(Enum):
|
||||
pyTorch = "pyTorch"
|
||||
onnx = "onnx"
|
||||
|
@ -1,5 +1,6 @@
|
||||
from const import EnumInferenceTypes, EnumEmbedderTypes
|
||||
|
||||
from dataclasses import dataclass
|
||||
from voice_changer.RVC.const import RVC_MODEL_TYPE_RVC
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -9,9 +10,10 @@ class ModelSlot:
|
||||
featureFile: str = ""
|
||||
indexFile: str = ""
|
||||
defaultTrans: int = 0
|
||||
modelType: int = RVC_MODEL_TYPE_RVC
|
||||
isONNX: bool = False
|
||||
modelType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
|
||||
samplingRate: int = -1
|
||||
f0: bool = True
|
||||
embChannels: int = 256
|
||||
deprecated: bool = False
|
||||
embedder: str = "hubert_base" # "hubert_base", "contentvec", "distilhubert"
|
||||
embedder: EnumEmbedderTypes = EnumEmbedderTypes.hubert
|
||||
|
88
server/voice_changer/RVC/ModelSlotGenerator.py
Normal file
88
server/voice_changer/RVC/ModelSlotGenerator.py
Normal file
@ -0,0 +1,88 @@
|
||||
from const import EnumEmbedderTypes, EnumInferenceTypes
|
||||
from voice_changer.RVC.ModelSlot import ModelSlot
|
||||
|
||||
from voice_changer.utils.LoadModelParams import FilePaths
|
||||
import torch
|
||||
import onnxruntime
|
||||
import json
|
||||
|
||||
|
||||
def generateModelSlot(files: FilePaths, params):
|
||||
modelSlot = ModelSlot()
|
||||
modelSlot.pyTorchModelFile = files.pyTorchModelFilename
|
||||
modelSlot.onnxModelFile = files.onnxModelFilename
|
||||
modelSlot.featureFile = files.featureFilename
|
||||
modelSlot.indexFile = files.indexFilename
|
||||
modelSlot.defaultTrans = params["trans"] if "trans" in params else 0
|
||||
|
||||
modelSlot.isONNX = True if modelSlot.onnxModelFile is not None else False
|
||||
|
||||
if modelSlot.isONNX:
|
||||
_setInfoByONNX(modelSlot, modelSlot.onnxModelFile)
|
||||
else:
|
||||
_setInfoByPytorch(modelSlot, modelSlot.pyTorchModelFile)
|
||||
return modelSlot
|
||||
|
||||
|
||||
def _setInfoByPytorch(slot: ModelSlot, file: str):
|
||||
cpt = torch.load(file, map_location="cpu")
|
||||
config_len = len(cpt["config"])
|
||||
if config_len == 18:
|
||||
slot.f0 = True if cpt["f0"] == 1 else False
|
||||
slot.modelType = (
|
||||
EnumInferenceTypes.pyTorchRVC
|
||||
if slot.f0
|
||||
else EnumInferenceTypes.pyTorchRVCNono
|
||||
)
|
||||
slot.embChannels = 256
|
||||
slot.embedder = EnumEmbedderTypes.hubert
|
||||
else:
|
||||
slot.f0 = True if cpt["f0"] == 1 else False
|
||||
slot.modelType = (
|
||||
EnumInferenceTypes.pyTorchWebUI
|
||||
if slot.f0
|
||||
else EnumInferenceTypes.pyTorchWebUINono
|
||||
)
|
||||
slot.embChannels = cpt["config"][17]
|
||||
slot.embedder = cpt["embedder_name"]
|
||||
if slot.embedder.endswith("768"):
|
||||
slot.embedder = slot.embedder[:-3]
|
||||
|
||||
slot.samplingRate = cpt["config"][-1]
|
||||
|
||||
del cpt
|
||||
|
||||
|
||||
def _setInfoByONNX(slot: ModelSlot, file: str):
|
||||
tmp_onnx_session = onnxruntime.InferenceSession(
|
||||
file, providers=["CPUExecutionProvider"]
|
||||
)
|
||||
modelmeta = tmp_onnx_session.get_modelmeta()
|
||||
try:
|
||||
metadata = json.loads(modelmeta.custom_metadata_map["metadata"])
|
||||
|
||||
slot.modelType = metadata["modelType"]
|
||||
slot.embChannels = metadata["embChannels"]
|
||||
slot.embedder = (
|
||||
metadata["embedder"] if "embedder" in metadata else EnumEmbedderTypes.hubert
|
||||
)
|
||||
slot.f0 = metadata["f0"]
|
||||
slot.modelType = (
|
||||
EnumInferenceTypes.onnxRVC if slot.f0 else EnumInferenceTypes.onnxRVCNono
|
||||
)
|
||||
slot.samplingRate = metadata["samplingRate"]
|
||||
slot.deprecated = False
|
||||
|
||||
except:
|
||||
slot.modelType = EnumInferenceTypes.onnxRVC
|
||||
slot.embChannels = 256
|
||||
slot.embedder = EnumEmbedderTypes.hubert
|
||||
slot.f0 = True
|
||||
slot.samplingRate = 48000
|
||||
slot.deprecated = True
|
||||
|
||||
print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
|
||||
print("[Voice Changer] This onnxfie is depricated. Please regenerate onnxfile.")
|
||||
print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
|
||||
|
||||
del tmp_onnx_session
|
@ -1,29 +1,5 @@
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import resampy
|
||||
from voice_changer.RVC.MergeModel import merge_model
|
||||
from voice_changer.RVC.MergeModelRequest import MergeModelRequest
|
||||
from voice_changer.RVC.ModelWrapper import ModelWrapper
|
||||
from Exceptions import NoModeLoadedException
|
||||
from voice_changer.RVC.RVCSettings import RVCSettings
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
|
||||
from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||
|
||||
from dataclasses import asdict
|
||||
from typing import cast
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
# from fairseq import checkpoint_utils
|
||||
import traceback
|
||||
import faiss
|
||||
|
||||
from const import TMP_DIR, UPLOAD_DIR # type:ignore
|
||||
|
||||
|
||||
# avoiding parse arg error in RVC
|
||||
sys.argv = ["MMVCServerSIO.py"]
|
||||
@ -37,16 +13,35 @@ if sys.platform.startswith("darwin"):
|
||||
sys.path.append(modulePath)
|
||||
else:
|
||||
sys.path.append("RVC")
|
||||
import json
|
||||
import resampy
|
||||
from voice_changer.RVC.MergeModel import merge_model
|
||||
from voice_changer.RVC.MergeModelRequest import MergeModelRequest
|
||||
from voice_changer.RVC.ModelSlotGenerator import generateModelSlot
|
||||
from Exceptions import NoModeLoadedException
|
||||
from voice_changer.RVC.RVCSettings import RVCSettings
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
|
||||
from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||
|
||||
from dataclasses import asdict
|
||||
from typing import cast
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
# from fairseq import checkpoint_utils
|
||||
import traceback
|
||||
import faiss
|
||||
|
||||
from const import TMP_DIR, UPLOAD_DIR
|
||||
|
||||
|
||||
from .models import SynthesizerTrnMsNSFsid as SynthesizerTrnMsNSFsid_webui
|
||||
from .models import SynthesizerTrnMsNSFsidNono as SynthesizerTrnMsNSFsidNono_webui
|
||||
from .const import RVC_MODEL_TYPE_RVC, RVC_MODEL_TYPE_WEBUI
|
||||
from voice_changer.RVC.custom_vc_infer_pipeline import VC
|
||||
from infer_pack.models import ( # type:ignore
|
||||
SynthesizerTrnMs256NSFsid,
|
||||
SynthesizerTrnMs256NSFsid_nono,
|
||||
)
|
||||
|
||||
providers = [
|
||||
"OpenVINOExecutionProvider",
|
||||
@ -59,13 +54,12 @@ providers = [
|
||||
class RVC:
|
||||
audio_buffer: AudioInOut | None = None
|
||||
embedder: Embedder | None = None
|
||||
inferencer: Inferencer | None = None
|
||||
|
||||
def __init__(self, params: VoiceChangerParams):
|
||||
self.initialLoad = True
|
||||
self.settings = RVCSettings()
|
||||
|
||||
self.net_g = None
|
||||
self.onnx_session = None
|
||||
self.feature_file = None
|
||||
self.index_file = None
|
||||
|
||||
@ -83,173 +77,66 @@ class RVC:
|
||||
|
||||
def loadModel(self, props: LoadModelParams):
|
||||
"""
|
||||
loadModelはスロットへのエントリ(推論向けにはロードしない)。
|
||||
例外的に、まだ一つも推論向けにロードされていない場合は、ロードする。
|
||||
loadModelはスロットへのエントリ(推論向けにはロードしない)。
|
||||
例外的に、まだ一つも推論向けにロードされていない場合と稼働中スロットの場合は、ロードする。
|
||||
"""
|
||||
self.is_half = props.isHalf
|
||||
tmp_slot = props.slot
|
||||
target_slot_idx = props.slot
|
||||
params_str = props.params
|
||||
params = json.loads(params_str)
|
||||
|
||||
self.settings.modelSlots[
|
||||
tmp_slot
|
||||
].pyTorchModelFile = props.files.pyTorchModelFilename
|
||||
self.settings.modelSlots[tmp_slot].onnxModelFile = props.files.onnxModelFilename
|
||||
self.settings.modelSlots[tmp_slot].featureFile = props.files.featureFilename
|
||||
self.settings.modelSlots[tmp_slot].indexFile = props.files.indexFilename
|
||||
self.settings.modelSlots[tmp_slot].defaultTrans = (
|
||||
params["trans"] if "trans" in params else 0
|
||||
)
|
||||
|
||||
isONNX = (
|
||||
True
|
||||
if self.settings.modelSlots[tmp_slot].onnxModelFile is not None
|
||||
else False
|
||||
)
|
||||
|
||||
# メタデータ設定
|
||||
if isONNX:
|
||||
self._setInfoByONNX(
|
||||
tmp_slot, self.settings.modelSlots[tmp_slot].onnxModelFile
|
||||
)
|
||||
else:
|
||||
self._setInfoByPytorch(
|
||||
tmp_slot, self.settings.modelSlots[tmp_slot].pyTorchModelFile
|
||||
)
|
||||
|
||||
modelSlot = generateModelSlot(props.files, params)
|
||||
self.settings.modelSlots[target_slot_idx] = modelSlot
|
||||
print(
|
||||
f"[Voice Changer] RVC loading... slot:{tmp_slot}",
|
||||
asdict(self.settings.modelSlots[tmp_slot]),
|
||||
f"[Voice Changer] RVC new model is uploaded,{target_slot_idx}",
|
||||
asdict(modelSlot),
|
||||
)
|
||||
# hubertロード
|
||||
# try:
|
||||
# hubert_path = self.params.hubert_base
|
||||
# hubert_path_jp = self.params.hubert_base_jp
|
||||
# print(hubert_path, hubert_path_jp)
|
||||
|
||||
# models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
|
||||
# [hubert_path],
|
||||
# suffix="",
|
||||
# )
|
||||
# model = models[0]
|
||||
# model.eval()
|
||||
# if self.is_half:
|
||||
# model = model.half()
|
||||
# self.hubert_model = model
|
||||
|
||||
# except Exception as e:
|
||||
# print("EXCEPTION during loading hubert/contentvec model", e)
|
||||
# print(" hubert_path:", hubert_path)
|
||||
|
||||
# 初回のみロード
|
||||
if self.initialLoad or tmp_slot == self.currentSlot:
|
||||
self.prepareModel(tmp_slot)
|
||||
self.settings.modelSlotIndex = tmp_slot
|
||||
self.currentSlot = self.settings.modelSlotIndex
|
||||
if self.initialLoad or target_slot_idx == self.currentSlot:
|
||||
self.prepareModel(target_slot_idx)
|
||||
self.settings.modelSlotIndex = target_slot_idx
|
||||
# self.currentSlot = self.settings.modelSlotIndex
|
||||
self.switchModel()
|
||||
self.initialLoad = False
|
||||
|
||||
return self.get_info()
|
||||
|
||||
def _setInfoByPytorch(self, slot, file):
|
||||
cpt = torch.load(file, map_location="cpu")
|
||||
config_len = len(cpt["config"])
|
||||
if config_len == 18:
|
||||
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_RVC
|
||||
self.settings.modelSlots[slot].embChannels = 256
|
||||
self.settings.modelSlots[slot].embedder = "hubert_base"
|
||||
else:
|
||||
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI
|
||||
self.settings.modelSlots[slot].embChannels = cpt["config"][17]
|
||||
self.settings.modelSlots[slot].embedder = cpt["embedder_name"]
|
||||
if self.settings.modelSlots[slot].embedder.endswith("768"):
|
||||
self.settings.modelSlots[slot].embedder = self.settings.modelSlots[
|
||||
slot
|
||||
].embedder[:-3]
|
||||
|
||||
self.settings.modelSlots[slot].f0 = True if cpt["f0"] == 1 else False
|
||||
self.settings.modelSlots[slot].samplingRate = cpt["config"][-1]
|
||||
|
||||
# self.settings.modelSamplingRate = cpt["config"][-1]
|
||||
|
||||
def _setInfoByONNX(self, slot, file):
|
||||
tmp_onnx_session = ModelWrapper(file)
|
||||
self.settings.modelSlots[slot].modelType = tmp_onnx_session.getModelType()
|
||||
self.settings.modelSlots[slot].embChannels = tmp_onnx_session.getEmbChannels()
|
||||
self.settings.modelSlots[slot].embedder = tmp_onnx_session.getEmbedder()
|
||||
self.settings.modelSlots[slot].f0 = tmp_onnx_session.getF0()
|
||||
self.settings.modelSlots[slot].samplingRate = tmp_onnx_session.getSamplingRate()
|
||||
self.settings.modelSlots[slot].deprecated = tmp_onnx_session.getDeprecated()
|
||||
|
||||
def prepareModel(self, slot: int):
|
||||
if slot < 0:
|
||||
return self.get_info()
|
||||
print("[Voice Changer] Prepare Model of slot:", slot)
|
||||
onnxModelFile = self.settings.modelSlots[slot].onnxModelFile
|
||||
isONNX = (
|
||||
True if self.settings.modelSlots[slot].onnxModelFile is not None else False
|
||||
modelSlot = self.settings.modelSlots[slot]
|
||||
filename = (
|
||||
modelSlot.onnxModelFile if modelSlot.isONNX else modelSlot.pyTorchModelFile
|
||||
)
|
||||
|
||||
# モデルのロード
|
||||
if isONNX:
|
||||
print("[Voice Changer] Loading ONNX Model...")
|
||||
self.next_onnx_session = ModelWrapper(onnxModelFile)
|
||||
self.next_net_g = None
|
||||
if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
|
||||
dev = torch.device("cpu")
|
||||
elif self.mps_enabled:
|
||||
dev = torch.device("mps")
|
||||
else:
|
||||
print("[Voice Changer] Loading Pytorch Model...")
|
||||
torchModelSlot = self.settings.modelSlots[slot]
|
||||
cpt = torch.load(torchModelSlot.pyTorchModelFile, map_location="cpu")
|
||||
dev = torch.device("cuda", index=self.settings.gpu)
|
||||
|
||||
if (
|
||||
torchModelSlot.modelType == RVC_MODEL_TYPE_RVC
|
||||
and torchModelSlot.f0 is True
|
||||
):
|
||||
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
|
||||
elif (
|
||||
torchModelSlot.modelType == RVC_MODEL_TYPE_RVC
|
||||
and torchModelSlot.f0 is False
|
||||
):
|
||||
net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
|
||||
elif (
|
||||
torchModelSlot.modelType == RVC_MODEL_TYPE_WEBUI
|
||||
and torchModelSlot.f0 is True
|
||||
):
|
||||
net_g = SynthesizerTrnMsNSFsid_webui(
|
||||
**cpt["params"], is_half=self.is_half
|
||||
)
|
||||
else:
|
||||
net_g = SynthesizerTrnMsNSFsidNono_webui(
|
||||
**cpt["params"], is_half=self.is_half
|
||||
)
|
||||
net_g.eval()
|
||||
net_g.load_state_dict(cpt["weight"], strict=False)
|
||||
|
||||
if self.is_half:
|
||||
net_g = net_g.half()
|
||||
|
||||
self.next_net_g = net_g
|
||||
self.next_onnx_session = None
|
||||
# Inferencerのロード
|
||||
inferencer = InferencerManager.getInferencer(
|
||||
modelSlot.modelType,
|
||||
filename,
|
||||
self.settings.isHalf,
|
||||
torch.device("cuda:0"),
|
||||
)
|
||||
self.next_inferencer = inferencer
|
||||
|
||||
# Indexのロード
|
||||
print("[Voice Changer] Loading index...")
|
||||
self.next_feature_file = self.settings.modelSlots[slot].featureFile
|
||||
self.next_index_file = self.settings.modelSlots[slot].indexFile
|
||||
|
||||
if (
|
||||
self.settings.modelSlots[slot].featureFile is not None
|
||||
and self.settings.modelSlots[slot].indexFile is not None
|
||||
):
|
||||
if modelSlot.featureFile is not None and modelSlot.indexFile is not None:
|
||||
if (
|
||||
os.path.exists(self.settings.modelSlots[slot].featureFile) is True
|
||||
and os.path.exists(self.settings.modelSlots[slot].indexFile) is True
|
||||
os.path.exists(modelSlot.featureFile) is True
|
||||
and os.path.exists(modelSlot.indexFile) is True
|
||||
):
|
||||
try:
|
||||
self.next_index = faiss.read_index(
|
||||
self.settings.modelSlots[slot].indexFile
|
||||
)
|
||||
self.next_feature = np.load(
|
||||
self.settings.modelSlots[slot].featureFile
|
||||
)
|
||||
self.next_index = faiss.read_index(modelSlot.indexFile)
|
||||
self.next_feature = np.load(modelSlot.featureFile)
|
||||
except:
|
||||
print("[Voice Changer] load index failed. Use no index.")
|
||||
traceback.print_exc()
|
||||
@ -260,12 +147,10 @@ class RVC:
|
||||
else:
|
||||
self.next_index = self.next_feature = None
|
||||
|
||||
self.next_trans = self.settings.modelSlots[slot].defaultTrans
|
||||
self.next_samplingRate = self.settings.modelSlots[slot].samplingRate
|
||||
self.next_embedder = self.settings.modelSlots[slot].embedder
|
||||
self.next_framework = (
|
||||
"ONNX" if self.next_onnx_session is not None else "PyTorch"
|
||||
)
|
||||
self.next_trans = modelSlot.defaultTrans
|
||||
self.next_samplingRate = modelSlot.samplingRate
|
||||
self.next_embedder = modelSlot.embedder
|
||||
self.next_framework = "ONNX" if modelSlot.isONNX else "PyTorch"
|
||||
print("[Voice Changer] Prepare done.")
|
||||
return self.get_info()
|
||||
|
||||
@ -284,15 +169,13 @@ class RVC:
|
||||
print("[Voice Changer] load hubert error", e)
|
||||
traceback.print_exc()
|
||||
|
||||
self.net_g = self.next_net_g
|
||||
self.onnx_session = self.next_onnx_session
|
||||
self.feature_file = self.next_feature_file
|
||||
self.index_file = self.next_index_file
|
||||
self.inferencer = self.next_inferencer
|
||||
self.feature = self.next_feature
|
||||
self.index = self.next_index
|
||||
self.settings.tran = self.next_trans
|
||||
self.settings.framework = self.next_framework
|
||||
self.settings.modelSamplingRate = self.next_samplingRate
|
||||
|
||||
self.next_net_g = None
|
||||
self.next_onnx_session = None
|
||||
print(
|
||||
@ -300,41 +183,41 @@ class RVC:
|
||||
)
|
||||
|
||||
def update_settings(self, key: str, val: int | float | str):
|
||||
if key == "onnxExecutionProvider" and self.onnx_session is not None:
|
||||
if val == "CUDAExecutionProvider":
|
||||
if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
|
||||
self.settings.gpu = 0
|
||||
provider_options = [{"device_id": self.settings.gpu}]
|
||||
self.onnx_session.set_providers(
|
||||
providers=[val], provider_options=provider_options
|
||||
)
|
||||
if hasattr(self, "hubert_onnx"):
|
||||
self.hubert_onnx.set_providers(
|
||||
providers=[val], provider_options=provider_options
|
||||
)
|
||||
else:
|
||||
self.onnx_session.set_providers(providers=[val])
|
||||
if hasattr(self, "hubert_onnx"):
|
||||
self.hubert_onnx.set_providers(providers=[val])
|
||||
elif key == "onnxExecutionProvider" and self.onnx_session is None:
|
||||
print("Onnx is not enabled. Please load model.")
|
||||
return False
|
||||
elif key in self.settings.intData:
|
||||
# if key == "onnxExecutionProvider" and self.onnx_session is not None:
|
||||
# if val == "CUDAExecutionProvider":
|
||||
# if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
|
||||
# self.settings.gpu = 0
|
||||
# provider_options = [{"device_id": self.settings.gpu}]
|
||||
# self.onnx_session.set_providers(
|
||||
# providers=[val], provider_options=provider_options
|
||||
# )
|
||||
# if hasattr(self, "hubert_onnx"):
|
||||
# self.hubert_onnx.set_providers(
|
||||
# providers=[val], provider_options=provider_options
|
||||
# )
|
||||
# else:
|
||||
# self.onnx_session.set_providers(providers=[val])
|
||||
# if hasattr(self, "hubert_onnx"):
|
||||
# self.hubert_onnx.set_providers(providers=[val])
|
||||
# elif key == "onnxExecutionProvider" and self.onnx_session is None:
|
||||
# print("Onnx is not enabled. Please load model.")
|
||||
# return False
|
||||
if key in self.settings.intData:
|
||||
val = cast(int, val)
|
||||
if (
|
||||
key == "gpu"
|
||||
and val >= 0
|
||||
and val < self.gpu_num
|
||||
and self.onnx_session is not None
|
||||
):
|
||||
providers = self.onnx_session.get_providers()
|
||||
print("Providers:", providers)
|
||||
if "CUDAExecutionProvider" in providers:
|
||||
provider_options = [{"device_id": self.settings.gpu}]
|
||||
self.onnx_session.set_providers(
|
||||
providers=["CUDAExecutionProvider"],
|
||||
provider_options=provider_options,
|
||||
)
|
||||
# if (
|
||||
# key == "gpu"
|
||||
# and val >= 0
|
||||
# and val < self.gpu_num
|
||||
# and self.onnx_session is not None
|
||||
# ):
|
||||
# providers = self.onnx_session.get_providers()
|
||||
# print("Providers:", providers)
|
||||
# if "CUDAExecutionProvider" in providers:
|
||||
# provider_options = [{"device_id": self.settings.gpu}]
|
||||
# self.onnx_session.set_providers(
|
||||
# providers=["CUDAExecutionProvider"],
|
||||
# provider_options=provider_options,
|
||||
# )
|
||||
if key == "modelSlotIndex":
|
||||
if int(val) < 0:
|
||||
return True
|
||||
@ -355,9 +238,9 @@ class RVC:
|
||||
def get_info(self):
|
||||
data = asdict(self.settings)
|
||||
|
||||
data["onnxExecutionProviders"] = (
|
||||
self.onnx_session.get_providers() if self.onnx_session is not None else []
|
||||
)
|
||||
# data["onnxExecutionProviders"] = (
|
||||
# self.onnx_session.get_providers() if self.onnx_session is not None else []
|
||||
# )
|
||||
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
|
||||
for f in files:
|
||||
if data[f] is not None and os.path.exists(data[f]):
|
||||
@ -430,7 +313,12 @@ class RVC:
|
||||
with torch.no_grad():
|
||||
repeat = 3 if self.is_half else 1
|
||||
repeat *= self.settings.rvcQuality # 0 or 3
|
||||
vc = VC(self.settings.modelSamplingRate, dev, self.is_half, repeat)
|
||||
vc = VC(
|
||||
self.settings.modelSamplingRate,
|
||||
torch.device("cuda:0"),
|
||||
self.is_half,
|
||||
repeat,
|
||||
)
|
||||
sid = 0
|
||||
f0_up_key = self.settings.tran
|
||||
f0_method = self.settings.f0Detector
|
||||
@ -459,13 +347,13 @@ class RVC:
|
||||
return result
|
||||
|
||||
def _pyTorch_inference(self, data):
|
||||
if hasattr(self, "net_g") is False or self.net_g is None:
|
||||
print(
|
||||
"[Voice Changer] No pyTorch session.",
|
||||
hasattr(self, "net_g"),
|
||||
self.net_g,
|
||||
)
|
||||
raise NoModeLoadedException("pytorch")
|
||||
# if hasattr(self, "net_g") is False or self.net_g is None:
|
||||
# print(
|
||||
# "[Voice Changer] No pyTorch session.",
|
||||
# hasattr(self, "net_g"),
|
||||
# self.net_g,
|
||||
# )
|
||||
# raise NoModeLoadedException("pytorch")
|
||||
|
||||
if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
|
||||
dev = torch.device("cpu")
|
||||
@ -475,7 +363,10 @@ class RVC:
|
||||
dev = torch.device("cuda", index=self.settings.gpu)
|
||||
|
||||
self.embedder = self.embedder.to(dev)
|
||||
self.net_g = self.net_g.to(dev)
|
||||
self.inferencer = self.inferencer.to(dev)
|
||||
|
||||
# self.embedder.printDevice()
|
||||
# self.inferencer.printDevice()
|
||||
|
||||
audio = data[0]
|
||||
convertSize = data[1]
|
||||
@ -498,9 +389,8 @@ class RVC:
|
||||
|
||||
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
|
||||
audio_out = vc.pipeline(
|
||||
# self.hubert_model,
|
||||
self.embedder,
|
||||
self.net_g,
|
||||
self.inferencer,
|
||||
sid,
|
||||
audio,
|
||||
f0_up_key,
|
||||
|
@ -28,7 +28,7 @@ class RVCSettings:
|
||||
modelSlotIndex: int = -1
|
||||
|
||||
speakers: dict[str, int] = field(default_factory=lambda: {})
|
||||
|
||||
isHalf: int = 1 # 0:off, 1:on
|
||||
# ↓mutableな物だけ列挙
|
||||
intData = [
|
||||
"gpu",
|
||||
@ -39,6 +39,7 @@ class RVCSettings:
|
||||
"modelSamplingRate",
|
||||
"silenceFront",
|
||||
"modelSlotIndex",
|
||||
"isHalf",
|
||||
]
|
||||
floatData = ["silentThreshold", "indexRatio"]
|
||||
strData = ["framework", "f0Detector"]
|
||||
|
@ -1,2 +0,0 @@
|
||||
RVC_MODEL_TYPE_RVC = 0
|
||||
RVC_MODEL_TYPE_WEBUI = 1
|
@ -15,14 +15,24 @@ class Embedder(Protocol):
|
||||
model: Any | None = None
|
||||
|
||||
def loadModel(self, file: str, dev: device, isHalf: bool = True):
|
||||
self.embedderType = EnumEmbedderTypes.hubert
|
||||
self.file = file
|
||||
self.isHalf = isHalf
|
||||
self.dev = dev
|
||||
...
|
||||
|
||||
def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
|
||||
...
|
||||
|
||||
def setProps(
|
||||
self,
|
||||
embedderType: EnumEmbedderTypes,
|
||||
file: str,
|
||||
dev: device,
|
||||
isHalf: bool = True,
|
||||
):
|
||||
self.embedderType = embedderType
|
||||
self.file = file
|
||||
self.isHalf = isHalf
|
||||
self.dev = dev
|
||||
print("hubert initialize dev::::", self.dev, dev)
|
||||
|
||||
def setHalf(self, isHalf: bool):
|
||||
self.isHalf = isHalf
|
||||
if self.model is not None and isHalf:
|
||||
@ -59,3 +69,6 @@ class Embedder(Protocol):
|
||||
if self.model is not None:
|
||||
self.model = self.model.to(dev)
|
||||
return self
|
||||
|
||||
def printDevice(self):
|
||||
print("embedder device:", self.model.device)
|
||||
|
@ -16,7 +16,6 @@ class EmbedderManager:
|
||||
) -> Embedder:
|
||||
if cls.currentEmbedder is None:
|
||||
print("[Voice Changer] generate new embedder. (no embedder)")
|
||||
cls.loadEmbedder(embederType, file, isHalf, dev)
|
||||
cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev)
|
||||
elif cls.currentEmbedder.matchCondition(embederType, file) is False:
|
||||
print("[Voice Changer] generate new embedder. (not match)")
|
||||
@ -24,7 +23,6 @@ class EmbedderManager:
|
||||
else:
|
||||
cls.currentEmbedder.setDevice(dev)
|
||||
cls.currentEmbedder.setHalf(isHalf)
|
||||
print("RETURN", cls.currentEmbedder)
|
||||
return cls.currentEmbedder
|
||||
|
||||
@classmethod
|
||||
|
@ -7,5 +7,5 @@ from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
|
||||
class FairseqContentvec(FairseqHubert):
|
||||
def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
|
||||
super().loadModel(file, dev, isHalf)
|
||||
self.embedderType = EnumEmbedderTypes.contentvec
|
||||
super().setProps(EnumEmbedderTypes.contentvec, file, dev, isHalf)
|
||||
return self
|
||||
|
@ -7,7 +7,7 @@ from fairseq import checkpoint_utils
|
||||
|
||||
class FairseqHubert(Embedder):
|
||||
def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
|
||||
super().loadModel(file, dev, isHalf)
|
||||
super().setProps(EnumEmbedderTypes.hubert, file, dev, isHalf)
|
||||
|
||||
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
|
||||
[file],
|
||||
@ -21,7 +21,6 @@ class FairseqHubert(Embedder):
|
||||
model = model.half()
|
||||
|
||||
self.model = model
|
||||
self.embedderType = EnumEmbedderTypes.hubert
|
||||
return self
|
||||
|
||||
def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
|
||||
@ -38,6 +37,8 @@ class FairseqHubert(Embedder):
|
||||
"padding_mask": padding_mask,
|
||||
}
|
||||
|
||||
print("feat dev", self.dev)
|
||||
|
||||
with torch.no_grad():
|
||||
logits = self.model.extract_features(**inputs)
|
||||
if embChannels == 256:
|
||||
|
@ -7,5 +7,5 @@ from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
|
||||
class FairseqHubertJp(FairseqHubert):
|
||||
def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
|
||||
super().loadModel(file, dev, isHalf)
|
||||
self.embedderType = EnumEmbedderTypes.hubert_jp
|
||||
super().setProps(EnumEmbedderTypes.hubert_jp, file, dev, isHalf)
|
||||
return self
|
||||
|
58
server/voice_changer/RVC/inferencer/Inferencer.py
Normal file
58
server/voice_changer/RVC/inferencer/Inferencer.py
Normal file
@ -0,0 +1,58 @@
|
||||
from typing import Any, Protocol
|
||||
|
||||
import torch
|
||||
from torch import device
|
||||
|
||||
from const import EnumInferenceTypes
|
||||
|
||||
|
||||
class Inferencer(Protocol):
|
||||
inferencerType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
|
||||
file: str
|
||||
isHalf: bool = True
|
||||
dev: device
|
||||
|
||||
model: Any | None = None
|
||||
|
||||
def loadModel(self, file: str, dev: device, isHalf: bool = True):
|
||||
...
|
||||
|
||||
def infer(
|
||||
self,
|
||||
feats: torch.Tensor,
|
||||
pitch_length: torch.Tensor,
|
||||
pitch: torch.Tensor | None,
|
||||
pitchf: torch.Tensor | None,
|
||||
sid: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
...
|
||||
|
||||
def setProps(
|
||||
self,
|
||||
inferencerType: EnumInferenceTypes,
|
||||
file: str,
|
||||
dev: device,
|
||||
isHalf: bool = True,
|
||||
):
|
||||
self.inferencerType = inferencerType
|
||||
self.file = file
|
||||
self.isHalf = isHalf
|
||||
self.dev = dev
|
||||
|
||||
def setHalf(self, isHalf: bool):
|
||||
self.isHalf = isHalf
|
||||
if self.model is not None and isHalf:
|
||||
self.model = self.model.half()
|
||||
|
||||
def setDevice(self, dev: device):
|
||||
self.dev = dev
|
||||
if self.model is not None:
|
||||
self.model = self.model.to(self.dev)
|
||||
|
||||
def to(self, dev: torch.device):
|
||||
if self.model is not None:
|
||||
self.model = self.model.to(dev)
|
||||
return self
|
||||
|
||||
def printDevice(self):
|
||||
print("inferencer device:", self.model.device)
|
42
server/voice_changer/RVC/inferencer/InferencerManager.py
Normal file
42
server/voice_changer/RVC/inferencer/InferencerManager.py
Normal file
@ -0,0 +1,42 @@
|
||||
from torch import device
|
||||
|
||||
from const import EnumInferenceTypes
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInference
|
||||
from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferenceNono
|
||||
from voice_changer.RVC.inferencer.RVCInferencer import RVCInferencer
|
||||
from voice_changer.RVC.inferencer.RVCInferencerNono import RVCInferencerNono
|
||||
from voice_changer.RVC.inferencer.WebUIInferencer import WebUIInferencer
|
||||
from voice_changer.RVC.inferencer.WebUIInferencerNono import WebUIInferencerNono
|
||||
|
||||
|
||||
class InferencerManager:
|
||||
currentInferencer: Inferencer | None = None
|
||||
|
||||
@classmethod
|
||||
def getInferencer(
|
||||
cls, inferencerType: EnumInferenceTypes, file: str, isHalf: bool, dev: device
|
||||
) -> Inferencer:
|
||||
cls.currentInferencer = cls.loadInferencer(inferencerType, file, isHalf, dev)
|
||||
return cls.currentInferencer
|
||||
|
||||
@classmethod
|
||||
def loadInferencer(
|
||||
cls, inferencerType: EnumInferenceTypes, file: str, isHalf: bool, dev: device
|
||||
) -> Embedder:
|
||||
if inferencerType == EnumInferenceTypes.pyTorchRVC:
|
||||
return RVCInferencer().loadModel(file, dev, isHalf)
|
||||
elif inferencerType == EnumInferenceTypes.pyTorchRVCNono:
|
||||
return RVCInferencerNono().loadModel(file, dev, isHalf)
|
||||
elif inferencerType == EnumInferenceTypes.pyTorchWebUI:
|
||||
return WebUIInferencer().loadModel(file, dev, isHalf)
|
||||
elif inferencerType == EnumInferenceTypes.pyTorchWebUINono:
|
||||
return WebUIInferencerNono().loadModel(file, dev, isHalf)
|
||||
elif inferencerType == EnumInferenceTypes.onnxRVC:
|
||||
return OnnxRVCInference().loadModel(file, dev, isHalf)
|
||||
elif inferencerType == EnumInferenceTypes.onnxRVCNono:
|
||||
return OnnxRVCInferenceNono().loadModel(file, dev, isHalf)
|
||||
else:
|
||||
# return hubert as default
|
||||
raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
|
78
server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
Normal file
78
server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py
Normal file
@ -0,0 +1,78 @@
|
||||
import torch
|
||||
from torch import device
|
||||
import onnxruntime
|
||||
from const import EnumInferenceTypes
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
import numpy as np
|
||||
|
||||
providers = ["CPUExecutionProvider"]
|
||||
|
||||
|
||||
class OnnxRVCInference(Inferencer):
    """ONNX Runtime based inferencer for RVC models that use pitch (f0)."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True):
        """Create an onnxruntime session from *file* and detect model precision.

        The requested *isHalf* is overridden by the model itself: a
        float32-typed first input means the exported graph is full precision.
        """
        super().setProps(EnumInferenceTypes.onnxRVC, file, dev, isHalf)
        # ort_options = onnxruntime.SessionOptions()
        # ort_options.intra_op_num_threads = 8

        # BUGFIX: build the session from the given model path; the original
        # referenced self.onnx_model, which was never assigned.
        onnx_session = onnxruntime.InferenceSession(file, providers=providers)

        # BUGFIX: inspect the session just created (the original read
        # self.onnx_session before any such attribute existed).
        first_input_type = onnx_session.get_inputs()[0].type
        if first_input_type == "tensor(float)":
            self.isHalf = False
        else:
            self.isHalf = True

        self.model = onnx_session
        return self

    def infer(
        self,
        feats: torch.Tensor,
        pitch_length: torch.Tensor,
        pitch: torch.Tensor | None,
        pitchf: torch.Tensor | None,
        sid: torch.Tensor,
    ) -> torch.Tensor:
        """Run the f0 model. pitch and pitchf are mandatory for this variant.

        Raises:
            RuntimeError: if pitch or pitchf is missing.
        """
        if pitch is None or pitchf is None:
            raise RuntimeError("[Voice Changer] Pitch or Pitchf is not found.")

        # Only the content features change dtype with precision; the remaining
        # inputs have fixed dtypes in the exported graph.
        feats_dtype = np.float16 if self.isHalf else np.float32
        audio1 = self.model.run(
            ["audio"],
            {
                "feats": feats.cpu().numpy().astype(feats_dtype),
                "p_len": pitch_length.cpu().numpy().astype(np.int64),
                "pitch": pitch.cpu().numpy().astype(np.int64),
                "pitchf": pitchf.cpu().numpy().astype(np.float32),
                "sid": sid.cpu().numpy().astype(np.int64),
            },
        )

        return torch.tensor(np.array(audio1))

    def setHalf(self, isHalf: bool):
        """Precision is fixed by the exported ONNX graph and cannot be changed."""
        raise RuntimeError("half-precision is not changable.", self.isHalf)

    def setDevice(self, dev: device):
        # BUGFIX: an onnxruntime.InferenceSession has no .to(); the original
        # would raise AttributeError. Providers are bound at session creation,
        # so only record the requested device here.
        self.dev = dev

    def to(self, dev: torch.device):
        # See setDevice: the session cannot be moved after creation.
        self.dev = dev
        return self
|
71
server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
Normal file
71
server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py
Normal file
@ -0,0 +1,71 @@
|
||||
import torch
|
||||
from torch import device
|
||||
import onnxruntime
|
||||
from const import EnumInferenceTypes
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
import numpy as np
|
||||
|
||||
providers = ["CPUExecutionProvider"]
|
||||
|
||||
|
||||
class OnnxRVCInferenceNono(Inferencer):
    """ONNX Runtime based inferencer for RVC models without pitch (no f0)."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True):
        """Create an onnxruntime session from *file* and detect model precision.

        The requested *isHalf* is overridden by the model itself: a
        float32-typed first input means the exported graph is full precision.
        """
        # BUGFIX: register as onnxRVCNono — InferencerManager dispatches
        # EnumInferenceTypes.onnxRVCNono to this class; the original tagged
        # itself onnxRVC.
        super().setProps(EnumInferenceTypes.onnxRVCNono, file, dev, isHalf)
        # ort_options = onnxruntime.SessionOptions()
        # ort_options.intra_op_num_threads = 8

        # BUGFIX: build the session from the given model path; the original
        # referenced self.onnx_model, which was never assigned.
        onnx_session = onnxruntime.InferenceSession(file, providers=providers)

        # BUGFIX: inspect the session just created (the original read
        # self.onnx_session before any such attribute existed).
        first_input_type = onnx_session.get_inputs()[0].type
        if first_input_type == "tensor(float)":
            self.isHalf = False
        else:
            self.isHalf = True

        self.model = onnx_session
        return self

    def infer(
        self,
        feats: torch.Tensor,
        pitch_length: torch.Tensor,
        pitch: torch.Tensor | None,
        pitchf: torch.Tensor | None,
        sid: torch.Tensor,
    ) -> torch.Tensor:
        """Run the no-f0 model; pitch and pitchf are accepted for interface
        compatibility but ignored."""
        # Only the content features change dtype with precision.
        feats_dtype = np.float16 if self.isHalf else np.float32
        audio1 = self.model.run(
            ["audio"],
            {
                "feats": feats.cpu().numpy().astype(feats_dtype),
                "p_len": pitch_length.cpu().numpy().astype(np.int64),
                "sid": sid.cpu().numpy().astype(np.int64),
            },
        )

        return torch.tensor(np.array(audio1))

    def setHalf(self, isHalf: bool):
        """Precision is fixed by the exported ONNX graph and cannot be changed."""
        raise RuntimeError("half-precision is not changable.", self.isHalf)

    def setDevice(self, dev: device):
        # BUGFIX: an onnxruntime.InferenceSession has no .to(); the original
        # would raise AttributeError. Providers are bound at session creation,
        # so only record the requested device here.
        self.dev = dev

    def to(self, dev: torch.device):
        # See setDevice: the session cannot be moved after creation.
        self.dev = dev
        return self
|
33
server/voice_changer/RVC/inferencer/RVCInferencer.py
Normal file
33
server/voice_changer/RVC/inferencer/RVCInferencer.py
Normal file
@ -0,0 +1,33 @@
|
||||
import torch
|
||||
from torch import device
|
||||
|
||||
from const import EnumInferenceTypes
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from infer_pack.models import ( # type:ignore
|
||||
SynthesizerTrnMs256NSFsid,
|
||||
)
|
||||
|
||||
|
||||
class RVCInferencer(Inferencer):
    """PyTorch inferencer for standard (f0) RVC checkpoints."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True):
        """Load the checkpoint at *file*, build the synthesizer, and keep it on self.model."""
        super().setProps(EnumInferenceTypes.pyTorchRVC, file, dev, isHalf)

        checkpoint = torch.load(file, map_location="cpu")
        synthesizer = SynthesizerTrnMs256NSFsid(*checkpoint["config"], is_half=isHalf)
        synthesizer.eval()
        synthesizer.load_state_dict(checkpoint["weight"], strict=False)
        if isHalf:
            synthesizer = synthesizer.half()

        self.model = synthesizer
        return self

    def infer(
        self,
        feats: torch.Tensor,
        pitch_length: torch.Tensor,
        pitch: torch.Tensor | None,
        pitchf: torch.Tensor | None,
        sid: torch.Tensor,
    ) -> torch.Tensor:
        """Delegate to the loaded synthesizer's infer()."""
        return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
|
33
server/voice_changer/RVC/inferencer/RVCInferencerNono.py
Normal file
33
server/voice_changer/RVC/inferencer/RVCInferencerNono.py
Normal file
@ -0,0 +1,33 @@
|
||||
import torch
|
||||
from torch import device
|
||||
|
||||
from const import EnumInferenceTypes
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from infer_pack.models import ( # type:ignore
|
||||
SynthesizerTrnMs256NSFsid_nono,
|
||||
)
|
||||
|
||||
|
||||
class RVCInferencerNono(Inferencer):
    """PyTorch inferencer for RVC checkpoints without pitch (no f0)."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True):
        """Load the checkpoint at *file*, build the no-f0 synthesizer, and keep it on self.model."""
        # BUGFIX: register as pyTorchRVCNono — InferencerManager dispatches
        # EnumInferenceTypes.pyTorchRVCNono to this class; the original tagged
        # itself pyTorchRVC.
        super().setProps(EnumInferenceTypes.pyTorchRVCNono, file, dev, isHalf)
        cpt = torch.load(file, map_location="cpu")
        model = SynthesizerTrnMs256NSFsid_nono(*cpt["config"], is_half=isHalf)

        model.eval()
        model.load_state_dict(cpt["weight"], strict=False)
        if isHalf:
            model = model.half()

        self.model = model
        return self

    def infer(
        self,
        feats: torch.Tensor,
        pitch_length: torch.Tensor,
        pitch: torch.Tensor | None,
        pitchf: torch.Tensor | None,
        sid: torch.Tensor,
    ) -> torch.Tensor:
        """Run the no-f0 synthesizer; pitch and pitchf are accepted for
        interface compatibility but ignored."""
        return self.model.infer(feats, pitch_length, sid)
|
31
server/voice_changer/RVC/inferencer/WebUIInferencer.py
Normal file
31
server/voice_changer/RVC/inferencer/WebUIInferencer.py
Normal file
@ -0,0 +1,31 @@
|
||||
import torch
|
||||
from torch import device
|
||||
|
||||
from const import EnumInferenceTypes
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from .models import SynthesizerTrnMsNSFsid
|
||||
|
||||
|
||||
class WebUIInferencer(Inferencer):
    """PyTorch inferencer for WebUI-trained (f0) checkpoints, which store
    keyword params instead of a positional config tuple."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True):
        """Load the checkpoint at *file*, build the WebUI synthesizer, and keep it on self.model."""
        # BUGFIX: register as pyTorchWebUI — InferencerManager dispatches
        # EnumInferenceTypes.pyTorchWebUI to this class; the original tagged
        # itself pyTorchRVC.
        super().setProps(EnumInferenceTypes.pyTorchWebUI, file, dev, isHalf)
        cpt = torch.load(file, map_location="cpu")
        model = SynthesizerTrnMsNSFsid(**cpt["params"], is_half=isHalf)

        model.eval()
        model.load_state_dict(cpt["weight"], strict=False)
        if isHalf:
            model = model.half()

        self.model = model
        return self

    def infer(
        self,
        feats: torch.Tensor,
        pitch_length: torch.Tensor,
        pitch: torch.Tensor | None,
        pitchf: torch.Tensor | None,
        sid: torch.Tensor,
    ) -> torch.Tensor:
        """Delegate to the loaded synthesizer's infer()."""
        return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
|
31
server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
Normal file
31
server/voice_changer/RVC/inferencer/WebUIInferencerNono.py
Normal file
@ -0,0 +1,31 @@
|
||||
import torch
|
||||
from torch import device
|
||||
|
||||
from const import EnumInferenceTypes
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from .models import SynthesizerTrnMsNSFsidNono
|
||||
|
||||
|
||||
class WebUIInferencerNono(Inferencer):
    """PyTorch inferencer for WebUI-trained checkpoints without pitch (no f0)."""

    def loadModel(self, file: str, dev: device, isHalf: bool = True):
        """Load the checkpoint at *file*, build the no-f0 WebUI synthesizer, and keep it on self.model."""
        # BUGFIX: register as pyTorchWebUINono — InferencerManager dispatches
        # EnumInferenceTypes.pyTorchWebUINono to this class; the original
        # tagged itself pyTorchRVC.
        super().setProps(EnumInferenceTypes.pyTorchWebUINono, file, dev, isHalf)
        cpt = torch.load(file, map_location="cpu")
        model = SynthesizerTrnMsNSFsidNono(**cpt["params"], is_half=isHalf)

        model.eval()
        model.load_state_dict(cpt["weight"], strict=False)
        if isHalf:
            model = model.half()

        self.model = model
        return self

    def infer(
        self,
        feats: torch.Tensor,
        pitch_length: torch.Tensor,
        pitch: torch.Tensor | None,
        pitchf: torch.Tensor | None,
        sid: torch.Tensor,
    ) -> torch.Tensor:
        """Run the no-f0 synthesizer; pitch and pitchf are accepted for
        interface compatibility but ignored."""
        return self.model.infer(feats, pitch_length, sid)
|
277
server/voice_changer/RVC/inferencer/models.py
Normal file
277
server/voice_changer/RVC/inferencer/models.py
Normal file
@ -0,0 +1,277 @@
|
||||
import math
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from infer_pack.models import ( # type:ignore
|
||||
GeneratorNSF,
|
||||
PosteriorEncoder,
|
||||
ResidualCouplingBlock,
|
||||
Generator,
|
||||
)
|
||||
from infer_pack import commons, attentions # type:ignore
|
||||
|
||||
|
||||
class TextEncoder(nn.Module):
    """Content-feature encoder producing the prior distribution (m, logs).

    Embeds phone features (and, when f0 is enabled, coarse pitch codes) into
    ``hidden_channels``, runs an attention encoder, and projects to
    ``out_channels * 2`` — the mean and log-variance of the prior.
    ``emb_channels`` is configurable so embedders with different feature
    dimensions (e.g. 256 or 768) can be used.
    """

    def __init__(
        self,
        out_channels,
        hidden_channels,
        filter_channels,
        emb_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        f0=True,
    ):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.emb_channels = emb_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.emb_phone = nn.Linear(emb_channels, hidden_channels)
        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
        # Pitch embedding only exists for f0 models; no-f0 models never pass
        # a pitch tensor to forward().
        if f0 is True:
            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
        self.encoder = attentions.Encoder(
            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
        )
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Return (m, logs, x_mask); *pitch* may be None for no-f0 models."""
        if pitch is None:
            x = self.emb_phone(phone)
        else:
            x = self.emb_phone(phone) + self.emb_pitch(pitch)
        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = self.lrelu(x)
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        # Mask padded frames so attention and the projection ignore them.
        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
            x.dtype
        )
        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        # Split the 2*out_channels projection into mean and log-variance.
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return m, logs, x_mask
|
||||
|
||||
|
||||
class SynthesizerTrnMsNSFsid(nn.Module):
    """VITS-style synthesizer with an NSF decoder for f0-conditioned models.

    Variant of infer_pack's SynthesizerTrnMs256NSFsid with a configurable
    ``emb_channels`` so embedders other than a fixed 256-dim one can be used.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        emb_channels,
        sr,
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.emb_channels = emb_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        # Prior encoder over content features (+ pitch).
        self.enc_p = TextEncoder(
            inter_channels,
            hidden_channels,
            filter_channels,
            emb_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
        )
        # NSF (neural source-filter) decoder — synthesizes waveform from
        # latents plus the f0 contour.
        self.dec = GeneratorNSF(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
            sr=sr,
            is_half=kwargs["is_half"],
        )
        # Posterior encoder over linear spectrograms (training only).
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        # Normalizing flow between prior and posterior latent spaces.
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        # Speaker embedding table; index = speaker id.
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Strip weight norm from sub-modules (call before export/inference)."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(
        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
    ):  # ds is the speaker id, shape [bs, 1]
        # print(1,pitch.shape)#[bs,t]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1]; trailing 1 broadcasts over time
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        # Train the decoder on random slices to bound memory.
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
        # print(-2,pitchf.shape,z_slice.shape)
        o = self.dec(z_slice, pitchf, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
        """Inference path: sample from the prior, invert the flow, decode with NSF f0."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        # Sample latent from the prior with a fixed temperature of 0.66666.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
|
||||
|
||||
|
||||
class SynthesizerTrnMsNSFsidNono(nn.Module):
    """VITS-style synthesizer for no-f0 models (plain Generator decoder).

    Mirrors SynthesizerTrnMsNSFsid but uses no pitch conditioning: the text
    encoder is built with f0=False and the decoder is a plain Generator.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        filter_channels,
        n_heads,
        n_layers,
        kernel_size,
        p_dropout,
        resblock,
        resblock_kernel_sizes,
        resblock_dilation_sizes,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        spk_embed_dim,
        gin_channels,
        emb_channels,
        sr=None,  # accepted for signature parity with the f0 variant; unused here
        **kwargs
    ):
        super().__init__()
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.emb_channels = emb_channels
        # self.hop_length = hop_length#
        self.spk_embed_dim = spk_embed_dim
        # Prior encoder over content features only (no pitch embedding).
        self.enc_p = TextEncoder(
            inter_channels,
            hidden_channels,
            filter_channels,
            emb_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout,
            f0=False,
        )
        # Plain HiFi-GAN-style decoder (no NSF source module).
        self.dec = Generator(
            inter_channels,
            resblock,
            resblock_kernel_sizes,
            resblock_dilation_sizes,
            upsample_rates,
            upsample_initial_channel,
            upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        # Posterior encoder over linear spectrograms (training only).
        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
        )
        # Normalizing flow between prior and posterior latent spaces.
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )
        # Speaker embedding table; index = speaker id.
        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)

    def remove_weight_norm(self):
        """Strip weight norm from sub-modules (call before export/inference)."""
        self.dec.remove_weight_norm()
        self.flow.remove_weight_norm()
        self.enc_q.remove_weight_norm()

    def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, shape [bs, 1]
        g = self.emb_g(ds).unsqueeze(-1)  # [b, gin_channels, 1]; trailing 1 broadcasts over time
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)
        # Train the decoder on random slices to bound memory.
        z_slice, ids_slice = commons.rand_slice_segments(
            z, y_lengths, self.segment_size
        )
        o = self.dec(z_slice, g=g)
        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def infer(self, phone, phone_lengths, sid, max_len=None):
        """Inference path: sample from the prior, invert the flow, decode."""
        g = self.emb_g(sid).unsqueeze(-1)
        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
        # Sample latent from the prior with a fixed temperature of 0.66666.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        o = self.dec((z * x_mask)[:, :, :max_len], g=g)
        return o, x_mask, (z, z_p, m_p, logs_p)
|
7
server/voice_changer/RVC/pipeline/PipelineGenerator.py
Normal file
7
server/voice_changer/RVC/pipeline/PipelineGenerator.py
Normal file
@ -0,0 +1,7 @@
|
||||
from voice_changer.RVC.ModelSlot import ModelSlot
|
||||
|
||||
|
||||
class PipelineGenerator:
    """Factory for inference pipelines built from a ModelSlot (WIP stub)."""

    @classmethod
    def generatePipeline(cls, modelSlot: ModelSlot):
        # TODO: not yet implemented — intended to assemble the pipeline
        # (inferencer, embedder, index) described by *modelSlot*.
        pass
|
Loading…
Reference in New Issue
Block a user