WIP: Japanese Hubert

wataru 2023-05-02 20:57:12 +09:00
parent 7c70999f33
commit 72fb482dc7
22 changed files with 915 additions and 248 deletions

View File

@ -66,7 +66,22 @@ def getFrontendPath():
return frontend_path
# "hubert_base", "contentvec", "distilhubert"
class EnumEmbedderTypes(Enum):
hubert = "hubert"
hubert = "hubert_base"
contentvec = "contentvec"
hubert_jp = "hubert_jp"
hubert_jp = "hubert-base-japanese"
class EnumInferenceTypes(Enum):
pyTorchRVC = "pyTorchRVC"
pyTorchRVCNono = "pyTorchRVCNono"
pyTorchWebUI = "pyTorchWebUI"
pyTorchWebUINono = "pyTorchWebUINono"
onnxRVC = "onnxRVC"
onnxRVCNono = "onnxRVCNono"
class EnumFrameworkTypes(Enum):
pyTorch = "pyTorch"
onnx = "onnx"

View File

@ -1,5 +1,6 @@
from const import EnumInferenceTypes, EnumEmbedderTypes
from dataclasses import dataclass
from voice_changer.RVC.const import RVC_MODEL_TYPE_RVC
@dataclass
@ -9,9 +10,10 @@ class ModelSlot:
featureFile: str = ""
indexFile: str = ""
defaultTrans: int = 0
modelType: int = RVC_MODEL_TYPE_RVC
isONNX: bool = False
modelType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
samplingRate: int = -1
f0: bool = True
embChannels: int = 256
deprecated: bool = False
embedder: str = "hubert_base" # "hubert_base", "contentvec", "distilhubert"
embedder: EnumEmbedderTypes = EnumEmbedderTypes.hubert
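
Because ModelSlot is a dataclass, a populated slot serializes cleanly via asdict for the logging and get_info responses used below. A small sketch with illustrative values (pyTorchModelFile is assumed to be a field defined just above this hunk):

from dataclasses import asdict

slot = ModelSlot(
    pyTorchModelFile="model.pth",               # illustrative path
    defaultTrans=12,
    modelType=EnumInferenceTypes.pyTorchWebUI,
    embChannels=768,                            # hubert-base-japanese features are 768-dim
    embedder=EnumEmbedderTypes.hubert_jp,
)
print(asdict(slot)["embedder"])                 # EnumEmbedderTypes.hubert_jp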

View File

@ -0,0 +1,88 @@
from const import EnumEmbedderTypes, EnumInferenceTypes
from voice_changer.RVC.ModelSlot import ModelSlot
from voice_changer.utils.LoadModelParams import FilePaths
import torch
import onnxruntime
import json
def generateModelSlot(files: FilePaths, params):
modelSlot = ModelSlot()
modelSlot.pyTorchModelFile = files.pyTorchModelFilename
modelSlot.onnxModelFile = files.onnxModelFilename
modelSlot.featureFile = files.featureFilename
modelSlot.indexFile = files.indexFilename
modelSlot.defaultTrans = params["trans"] if "trans" in params else 0
modelSlot.isONNX = modelSlot.onnxModelFile is not None
if modelSlot.isONNX:
_setInfoByONNX(modelSlot, modelSlot.onnxModelFile)
else:
_setInfoByPytorch(modelSlot, modelSlot.pyTorchModelFile)
return modelSlot
def _setInfoByPytorch(slot: ModelSlot, file: str):
cpt = torch.load(file, map_location="cpu")
config_len = len(cpt["config"])
if config_len == 18:
slot.f0 = cpt["f0"] == 1
slot.modelType = (
EnumInferenceTypes.pyTorchRVC
if slot.f0
else EnumInferenceTypes.pyTorchRVCNono
)
slot.embChannels = 256
slot.embedder = EnumEmbedderTypes.hubert
else:
slot.f0 = cpt["f0"] == 1
slot.modelType = (
EnumInferenceTypes.pyTorchWebUI
if slot.f0
else EnumInferenceTypes.pyTorchWebUINono
)
slot.embChannels = cpt["config"][17]
slot.embedder = cpt["embedder_name"]
if slot.embedder.endswith("768"):
slot.embedder = slot.embedder[:-3]
slot.samplingRate = cpt["config"][-1]
del cpt
def _setInfoByONNX(slot: ModelSlot, file: str):
tmp_onnx_session = onnxruntime.InferenceSession(
file, providers=["CPUExecutionProvider"]
)
modelmeta = tmp_onnx_session.get_modelmeta()
try:
metadata = json.loads(modelmeta.custom_metadata_map["metadata"])
slot.modelType = metadata["modelType"]
slot.embChannels = metadata["embChannels"]
slot.embedder = (
metadata["embedder"] if "embedder" in metadata else EnumEmbedderTypes.hubert
)
slot.f0 = metadata["f0"]
slot.modelType = (
EnumInferenceTypes.onnxRVC if slot.f0 else EnumInferenceTypes.onnxRVCNono
)
slot.samplingRate = metadata["samplingRate"]
slot.deprecated = False
except Exception:
slot.modelType = EnumInferenceTypes.onnxRVC
slot.embChannels = 256
slot.embedder = EnumEmbedderTypes.hubert
slot.f0 = True
slot.samplingRate = 48000
slot.deprecated = True
print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
print("[Voice Changer] This onnxfie is depricated. Please regenerate onnxfile.")
print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
del tmp_onnx_session
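
_setInfoByONNX expects a single JSON blob under the "metadata" key of the model's custom metadata. A sketch of how a matching blob might be attached at export time (key names mirror the reader above; file paths are illustrative):

import json
import onnx

model = onnx.load("exported_rvc.onnx")
meta = model.metadata_props.add()   # StringStringEntryProto on the ModelProto
meta.key = "metadata"
meta.value = json.dumps({
    "modelType": "onnxRVC",
    "embChannels": 768,
    "embedder": "hubert-base-japanese",
    "f0": True,
    "samplingRate": 40000,
})
onnx.save(model, "exported_rvc_with_meta.onnx")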

View File

@ -1,29 +1,5 @@
import sys
import os
import json
import resampy
from voice_changer.RVC.MergeModel import merge_model
from voice_changer.RVC.MergeModelRequest import MergeModelRequest
from voice_changer.RVC.ModelWrapper import ModelWrapper
from Exceptions import NoModeLoadedException
from voice_changer.RVC.RVCSettings import RVCSettings
from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
from voice_changer.utils.VoiceChangerModel import AudioInOut
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from dataclasses import asdict
from typing import cast
import numpy as np
import torch
# from fairseq import checkpoint_utils
import traceback
import faiss
from const import TMP_DIR, UPLOAD_DIR # type:ignore
# avoiding parse arg error in RVC
sys.argv = ["MMVCServerSIO.py"]
@ -37,16 +13,35 @@ if sys.platform.startswith("darwin"):
sys.path.append(modulePath)
else:
sys.path.append("RVC")
import json
import resampy
from voice_changer.RVC.MergeModel import merge_model
from voice_changer.RVC.MergeModelRequest import MergeModelRequest
from voice_changer.RVC.ModelSlotGenerator import generateModelSlot
from Exceptions import NoModeLoadedException
from voice_changer.RVC.RVCSettings import RVCSettings
from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
from voice_changer.utils.VoiceChangerModel import AudioInOut
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from dataclasses import asdict
from typing import cast
import numpy as np
import torch
# from fairseq import checkpoint_utils
import traceback
import faiss
from const import TMP_DIR, UPLOAD_DIR
from .models import SynthesizerTrnMsNSFsid as SynthesizerTrnMsNSFsid_webui
from .models import SynthesizerTrnMsNSFsidNono as SynthesizerTrnMsNSFsidNono_webui
from .const import RVC_MODEL_TYPE_RVC, RVC_MODEL_TYPE_WEBUI
from voice_changer.RVC.custom_vc_infer_pipeline import VC
from infer_pack.models import ( # type:ignore
SynthesizerTrnMs256NSFsid,
SynthesizerTrnMs256NSFsid_nono,
)
providers = [
"OpenVINOExecutionProvider",
@ -59,13 +54,12 @@ providers = [
class RVC:
audio_buffer: AudioInOut | None = None
embedder: Embedder | None = None
inferencer: Inferencer | None = None
def __init__(self, params: VoiceChangerParams):
self.initialLoad = True
self.settings = RVCSettings()
self.net_g = None
self.onnx_session = None
self.feature_file = None
self.index_file = None
@ -83,173 +77,66 @@ class RVC:
def loadModel(self, props: LoadModelParams):
"""
loadModel registers an entry into a slot; it does not load the model for inference.
As an exception, it loads when no model has been loaded for inference yet.
loadModel registers an entry into a slot (it does not load the model for inference).
As an exception, it loads when no model has been loaded for inference yet, or when the target is the currently active slot.
"""
self.is_half = props.isHalf
tmp_slot = props.slot
target_slot_idx = props.slot
params_str = props.params
params = json.loads(params_str)
self.settings.modelSlots[
tmp_slot
].pyTorchModelFile = props.files.pyTorchModelFilename
self.settings.modelSlots[tmp_slot].onnxModelFile = props.files.onnxModelFilename
self.settings.modelSlots[tmp_slot].featureFile = props.files.featureFilename
self.settings.modelSlots[tmp_slot].indexFile = props.files.indexFilename
self.settings.modelSlots[tmp_slot].defaultTrans = (
params["trans"] if "trans" in params else 0
)
isONNX = (
True
if self.settings.modelSlots[tmp_slot].onnxModelFile is not None
else False
)
# set metadata
if isONNX:
self._setInfoByONNX(
tmp_slot, self.settings.modelSlots[tmp_slot].onnxModelFile
)
else:
self._setInfoByPytorch(
tmp_slot, self.settings.modelSlots[tmp_slot].pyTorchModelFile
)
modelSlot = generateModelSlot(props.files, params)
self.settings.modelSlots[target_slot_idx] = modelSlot
print(
f"[Voice Changer] RVC loading... slot:{tmp_slot}",
asdict(self.settings.modelSlots[tmp_slot]),
f"[Voice Changer] RVC new model is uploaded,{target_slot_idx}",
asdict(modelSlot),
)
# load hubert
# try:
# hubert_path = self.params.hubert_base
# hubert_path_jp = self.params.hubert_base_jp
# print(hubert_path, hubert_path_jp)
# models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
# [hubert_path],
# suffix="",
# )
# model = models[0]
# model.eval()
# if self.is_half:
# model = model.half()
# self.hubert_model = model
# except Exception as e:
# print("EXCEPTION during loading hubert/contentvec model", e)
# print(" hubert_path:", hubert_path)
# load only on the first run
if self.initialLoad or tmp_slot == self.currentSlot:
self.prepareModel(tmp_slot)
self.settings.modelSlotIndex = tmp_slot
self.currentSlot = self.settings.modelSlotIndex
if self.initialLoad or target_slot_idx == self.currentSlot:
self.prepareModel(target_slot_idx)
self.settings.modelSlotIndex = target_slot_idx
# self.currentSlot = self.settings.modelSlotIndex
self.switchModel()
self.initialLoad = False
return self.get_info()
def _setInfoByPytorch(self, slot, file):
cpt = torch.load(file, map_location="cpu")
config_len = len(cpt["config"])
if config_len == 18:
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_RVC
self.settings.modelSlots[slot].embChannels = 256
self.settings.modelSlots[slot].embedder = "hubert_base"
else:
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI
self.settings.modelSlots[slot].embChannels = cpt["config"][17]
self.settings.modelSlots[slot].embedder = cpt["embedder_name"]
if self.settings.modelSlots[slot].embedder.endswith("768"):
self.settings.modelSlots[slot].embedder = self.settings.modelSlots[
slot
].embedder[:-3]
self.settings.modelSlots[slot].f0 = True if cpt["f0"] == 1 else False
self.settings.modelSlots[slot].samplingRate = cpt["config"][-1]
# self.settings.modelSamplingRate = cpt["config"][-1]
def _setInfoByONNX(self, slot, file):
tmp_onnx_session = ModelWrapper(file)
self.settings.modelSlots[slot].modelType = tmp_onnx_session.getModelType()
self.settings.modelSlots[slot].embChannels = tmp_onnx_session.getEmbChannels()
self.settings.modelSlots[slot].embedder = tmp_onnx_session.getEmbedder()
self.settings.modelSlots[slot].f0 = tmp_onnx_session.getF0()
self.settings.modelSlots[slot].samplingRate = tmp_onnx_session.getSamplingRate()
self.settings.modelSlots[slot].deprecated = tmp_onnx_session.getDeprecated()
def prepareModel(self, slot: int):
if slot < 0:
return self.get_info()
print("[Voice Changer] Prepare Model of slot:", slot)
onnxModelFile = self.settings.modelSlots[slot].onnxModelFile
isONNX = (
True if self.settings.modelSlots[slot].onnxModelFile is not None else False
modelSlot = self.settings.modelSlots[slot]
filename = (
modelSlot.onnxModelFile if modelSlot.isONNX else modelSlot.pyTorchModelFile
)
# load the model
if isONNX:
print("[Voice Changer] Loading ONNX Model...")
self.next_onnx_session = ModelWrapper(onnxModelFile)
self.next_net_g = None
if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
dev = torch.device("cpu")
elif self.mps_enabled:
dev = torch.device("mps")
else:
print("[Voice Changer] Loading Pytorch Model...")
torchModelSlot = self.settings.modelSlots[slot]
cpt = torch.load(torchModelSlot.pyTorchModelFile, map_location="cpu")
dev = torch.device("cuda", index=self.settings.gpu)
if (
torchModelSlot.modelType == RVC_MODEL_TYPE_RVC
and torchModelSlot.f0 is True
):
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
elif (
torchModelSlot.modelType == RVC_MODEL_TYPE_RVC
and torchModelSlot.f0 is False
):
net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
elif (
torchModelSlot.modelType == RVC_MODEL_TYPE_WEBUI
and torchModelSlot.f0 is True
):
net_g = SynthesizerTrnMsNSFsid_webui(
**cpt["params"], is_half=self.is_half
)
else:
net_g = SynthesizerTrnMsNSFsidNono_webui(
**cpt["params"], is_half=self.is_half
)
net_g.eval()
net_g.load_state_dict(cpt["weight"], strict=False)
if self.is_half:
net_g = net_g.half()
self.next_net_g = net_g
self.next_onnx_session = None
# load the inferencer
inferencer = InferencerManager.getInferencer(
modelSlot.modelType,
filename,
self.settings.isHalf,
torch.device("cuda:0"),
)
self.next_inferencer = inferencer
# load the index
print("[Voice Changer] Loading index...")
self.next_feature_file = self.settings.modelSlots[slot].featureFile
self.next_index_file = self.settings.modelSlots[slot].indexFile
if (
self.settings.modelSlots[slot].featureFile is not None
and self.settings.modelSlots[slot].indexFile is not None
):
if modelSlot.featureFile is not None and modelSlot.indexFile is not None:
if (
os.path.exists(self.settings.modelSlots[slot].featureFile) is True
and os.path.exists(self.settings.modelSlots[slot].indexFile) is True
os.path.exists(modelSlot.featureFile) is True
and os.path.exists(modelSlot.indexFile) is True
):
try:
self.next_index = faiss.read_index(
self.settings.modelSlots[slot].indexFile
)
self.next_feature = np.load(
self.settings.modelSlots[slot].featureFile
)
self.next_index = faiss.read_index(modelSlot.indexFile)
self.next_feature = np.load(modelSlot.featureFile)
except Exception:
print("[Voice Changer] load index failed. Use no index.")
traceback.print_exc()
@ -260,12 +147,10 @@ class RVC:
else:
self.next_index = self.next_feature = None
self.next_trans = self.settings.modelSlots[slot].defaultTrans
self.next_samplingRate = self.settings.modelSlots[slot].samplingRate
self.next_embedder = self.settings.modelSlots[slot].embedder
self.next_framework = (
"ONNX" if self.next_onnx_session is not None else "PyTorch"
)
self.next_trans = modelSlot.defaultTrans
self.next_samplingRate = modelSlot.samplingRate
self.next_embedder = modelSlot.embedder
self.next_framework = "ONNX" if modelSlot.isONNX else "PyTorch"
print("[Voice Changer] Prepare done.")
return self.get_info()
@ -284,15 +169,13 @@ class RVC:
print("[Voice Changer] load hubert error", e)
traceback.print_exc()
self.net_g = self.next_net_g
self.onnx_session = self.next_onnx_session
self.feature_file = self.next_feature_file
self.index_file = self.next_index_file
self.inferencer = self.next_inferencer
self.feature = self.next_feature
self.index = self.next_index
self.settings.tran = self.next_trans
self.settings.framework = self.next_framework
self.settings.modelSamplingRate = self.next_samplingRate
self.next_net_g = None
self.next_onnx_session = None
print(
@ -300,41 +183,41 @@ class RVC:
)
def update_settings(self, key: str, val: int | float | str):
if key == "onnxExecutionProvider" and self.onnx_session is not None:
if val == "CUDAExecutionProvider":
if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
self.settings.gpu = 0
provider_options = [{"device_id": self.settings.gpu}]
self.onnx_session.set_providers(
providers=[val], provider_options=provider_options
)
if hasattr(self, "hubert_onnx"):
self.hubert_onnx.set_providers(
providers=[val], provider_options=provider_options
)
else:
self.onnx_session.set_providers(providers=[val])
if hasattr(self, "hubert_onnx"):
self.hubert_onnx.set_providers(providers=[val])
elif key == "onnxExecutionProvider" and self.onnx_session is None:
print("Onnx is not enabled. Please load model.")
return False
elif key in self.settings.intData:
# if key == "onnxExecutionProvider" and self.onnx_session is not None:
# if val == "CUDAExecutionProvider":
# if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
# self.settings.gpu = 0
# provider_options = [{"device_id": self.settings.gpu}]
# self.onnx_session.set_providers(
# providers=[val], provider_options=provider_options
# )
# if hasattr(self, "hubert_onnx"):
# self.hubert_onnx.set_providers(
# providers=[val], provider_options=provider_options
# )
# else:
# self.onnx_session.set_providers(providers=[val])
# if hasattr(self, "hubert_onnx"):
# self.hubert_onnx.set_providers(providers=[val])
# elif key == "onnxExecutionProvider" and self.onnx_session is None:
# print("Onnx is not enabled. Please load model.")
# return False
if key in self.settings.intData:
val = cast(int, val)
if (
key == "gpu"
and val >= 0
and val < self.gpu_num
and self.onnx_session is not None
):
providers = self.onnx_session.get_providers()
print("Providers:", providers)
if "CUDAExecutionProvider" in providers:
provider_options = [{"device_id": self.settings.gpu}]
self.onnx_session.set_providers(
providers=["CUDAExecutionProvider"],
provider_options=provider_options,
)
# if (
# key == "gpu"
# and val >= 0
# and val < self.gpu_num
# and self.onnx_session is not None
# ):
# providers = self.onnx_session.get_providers()
# print("Providers:", providers)
# if "CUDAExecutionProvider" in providers:
# provider_options = [{"device_id": self.settings.gpu}]
# self.onnx_session.set_providers(
# providers=["CUDAExecutionProvider"],
# provider_options=provider_options,
# )
if key == "modelSlotIndex":
if int(val) < 0:
return True
@ -355,9 +238,9 @@ class RVC:
def get_info(self):
data = asdict(self.settings)
data["onnxExecutionProviders"] = (
self.onnx_session.get_providers() if self.onnx_session is not None else []
)
# data["onnxExecutionProviders"] = (
# self.onnx_session.get_providers() if self.onnx_session is not None else []
# )
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
for f in files:
if data[f] is not None and os.path.exists(data[f]):
@ -430,7 +313,12 @@ class RVC:
with torch.no_grad():
repeat = 3 if self.is_half else 1
repeat *= self.settings.rvcQuality # 0 or 3
vc = VC(self.settings.modelSamplingRate, dev, self.is_half, repeat)
vc = VC(
self.settings.modelSamplingRate,
torch.device("cuda:0"),
self.is_half,
repeat,
)
sid = 0
f0_up_key = self.settings.tran
f0_method = self.settings.f0Detector
@ -459,13 +347,13 @@ class RVC:
return result
def _pyTorch_inference(self, data):
if hasattr(self, "net_g") is False or self.net_g is None:
print(
"[Voice Changer] No pyTorch session.",
hasattr(self, "net_g"),
self.net_g,
)
raise NoModeLoadedException("pytorch")
# if hasattr(self, "net_g") is False or self.net_g is None:
# print(
# "[Voice Changer] No pyTorch session.",
# hasattr(self, "net_g"),
# self.net_g,
# )
# raise NoModeLoadedException("pytorch")
if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
dev = torch.device("cpu")
@ -475,7 +363,10 @@ class RVC:
dev = torch.device("cuda", index=self.settings.gpu)
self.embedder = self.embedder.to(dev)
self.net_g = self.net_g.to(dev)
self.inferencer = self.inferencer.to(dev)
# self.embedder.printDevice()
# self.inferencer.printDevice()
audio = data[0]
convertSize = data[1]
@ -498,9 +389,8 @@ class RVC:
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
audio_out = vc.pipeline(
# self.hubert_model,
self.embedder,
self.net_g,
self.inferencer,
sid,
audio,
f0_up_key,

View File

@ -28,7 +28,7 @@ class RVCSettings:
modelSlotIndex: int = -1
speakers: dict[str, int] = field(default_factory=lambda: {})
isHalf: int = 1 # 0:off, 1:on
# only the mutable fields are listed below
intData = [
"gpu",
@ -39,6 +39,7 @@ class RVCSettings:
"modelSamplingRate",
"silenceFront",
"modelSlotIndex",
"isHalf",
]
floatData = ["silentThreshold", "indexRatio"]
strData = ["framework", "f0Detector"]

View File

@ -1,2 +0,0 @@
RVC_MODEL_TYPE_RVC = 0
RVC_MODEL_TYPE_WEBUI = 1

View File

@ -15,14 +15,24 @@ class Embedder(Protocol):
model: Any | None = None
def loadModel(self, file: str, dev: device, isHalf: bool = True):
self.embedderType = EnumEmbedderTypes.hubert
self.file = file
self.isHalf = isHalf
self.dev = dev
...
def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
...
def setProps(
self,
embedderType: EnumEmbedderTypes,
file: str,
dev: device,
isHalf: bool = True,
):
self.embedderType = embedderType
self.file = file
self.isHalf = isHalf
self.dev = dev
print("hubert initialize dev::::", self.dev, dev)
def setHalf(self, isHalf: bool):
self.isHalf = isHalf
if self.model is not None and isHalf:
@ -59,3 +69,6 @@ class Embedder(Protocol):
if self.model is not None:
self.model = self.model.to(dev)
return self
def printDevice(self):
print("embedder device:", self.model.device)

View File

@ -16,7 +16,6 @@ class EmbedderManager:
) -> Embedder:
if cls.currentEmbedder is None:
print("[Voice Changer] generate new embedder. (no embedder)")
cls.loadEmbedder(embederType, file, isHalf, dev)
cls.currentEmbedder = cls.loadEmbedder(embederType, file, isHalf, dev)
elif cls.currentEmbedder.matchCondition(embederType, file) is False:
print("[Voice Changer] generate new embedder. (not match)")
@ -24,7 +23,6 @@ class EmbedderManager:
else:
cls.currentEmbedder.setDevice(dev)
cls.currentEmbedder.setHalf(isHalf)
print("RETURN", cls.currentEmbedder)
return cls.currentEmbedder
@classmethod
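
matchCondition itself is outside this hunk; judging from how it is called above, a plausible (hypothetical) body compares the embedder type and weights file of the cached instance:

def matchCondition(self, embederType, file: str) -> bool:
    # hypothetical sketch: reuse the cached embedder only when type and file both match
    return self.embedderType == embederType and self.file == file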

View File

@ -7,5 +7,5 @@ from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
class FairseqContentvec(FairseqHubert):
def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
super().loadModel(file, dev, isHalf)
self.embedderType = EnumEmbedderTypes.contentvec
super().setProps(EnumEmbedderTypes.contentvec, file, dev, isHalf)
return self

View File

@ -7,7 +7,7 @@ from fairseq import checkpoint_utils
class FairseqHubert(Embedder):
def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
super().loadModel(file, dev, isHalf)
super().setProps(EnumEmbedderTypes.hubert, file, dev, isHalf)
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
[file],
@ -21,7 +21,6 @@ class FairseqHubert(Embedder):
model = model.half()
self.model = model
self.embedderType = EnumEmbedderTypes.hubert
return self
def extractFeatures(self, feats: torch.Tensor, embChannels=256) -> torch.Tensor:
@ -38,6 +37,8 @@ class FairseqHubert(Embedder):
"padding_mask": padding_mask,
}
print("feat dev", self.dev)
with torch.no_grad():
logits = self.model.extract_features(**inputs)
if embChannels == 256:

View File

@ -7,5 +7,5 @@ from voice_changer.RVC.embedder.FairseqHubert import FairseqHubert
class FairseqHubertJp(FairseqHubert):
def loadModel(self, file: str, dev: device, isHalf: bool = True) -> Embedder:
super().loadModel(file, dev, isHalf)
self.embedderType = EnumEmbedderTypes.hubert_jp
super().setProps(EnumEmbedderTypes.hubert_jp, file, dev, isHalf)
return self

View File

@ -0,0 +1,58 @@
from typing import Any, Protocol
import torch
from torch import device
from const import EnumInferenceTypes
class Inferencer(Protocol):
inferencerType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC
file: str
isHalf: bool = True
dev: device
model: Any | None = None
def loadModel(self, file: str, dev: device, isHalf: bool = True):
...
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
) -> torch.Tensor:
...
def setProps(
self,
inferencerType: EnumInferenceTypes,
file: str,
dev: device,
isHalf: bool = True,
):
self.inferencerType = inferencerType
self.file = file
self.isHalf = isHalf
self.dev = dev
def setHalf(self, isHalf: bool):
self.isHalf = isHalf
if self.model is not None and isHalf:
self.model = self.model.half()
def setDevice(self, dev: device):
self.dev = dev
if self.model is not None:
self.model = self.model.to(self.dev)
def to(self, dev: torch.device):
if self.model is not None:
self.model = self.model.to(dev)
return self
def printDevice(self):
print("inferencer device:", self.model.device)

View File

@ -0,0 +1,42 @@
from torch import device
from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInference
from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferenceNono
from voice_changer.RVC.inferencer.RVCInferencer import RVCInferencer
from voice_changer.RVC.inferencer.RVCInferencerNono import RVCInferencerNono
from voice_changer.RVC.inferencer.WebUIInferencer import WebUIInferencer
from voice_changer.RVC.inferencer.WebUIInferencerNono import WebUIInferencerNono
class InferencerManager:
currentInferencer: Inferencer | None = None
@classmethod
def getInferencer(
cls, inferencerType: EnumInferenceTypes, file: str, isHalf: bool, dev: device
) -> Inferencer:
cls.currentInferencer = cls.loadInferencer(inferencerType, file, isHalf, dev)
return cls.currentInferencer
@classmethod
def loadInferencer(
cls, inferencerType: EnumInferenceTypes, file: str, isHalf: bool, dev: device
) -> Inferencer:
if inferencerType == EnumInferenceTypes.pyTorchRVC:
return RVCInferencer().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.pyTorchRVCNono:
return RVCInferencerNono().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.pyTorchWebUI:
return WebUIInferencer().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.pyTorchWebUINono:
return WebUIInferencerNono().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.onnxRVC:
return OnnxRVCInference().loadModel(file, dev, isHalf)
elif inferencerType == EnumInferenceTypes.onnxRVCNono:
return OnnxRVCInferenceNono().loadModel(file, dev, isHalf)
else:
# unknown inferencer type: fail loudly instead of guessing a default
raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)

View File

@ -0,0 +1,78 @@
import torch
from torch import device
import onnxruntime
from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.Inferencer import Inferencer
import numpy as np
providers = ["CPUExecutionProvider"]
class OnnxRVCInference(Inferencer):
def loadModel(self, file: str, dev: device, isHalf: bool = True):
super().setProps(EnumInferenceTypes.onnxRVC, file, dev, isHalf)
# ort_options = onnxruntime.SessionOptions()
# ort_options.intra_op_num_threads = 8
onnx_session = onnxruntime.InferenceSession(
file, providers=providers
)
# check half-precision from the first input's tensor type
first_input_type = onnx_session.get_inputs()[0].type
if first_input_type == "tensor(float)":
self.isHalf = False
else:
self.isHalf = True
self.model = onnx_session
return self
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
) -> torch.Tensor:
if pitch is None or pitchf is None:
raise RuntimeError("[Voice Changer] Pitch or Pitchf is not found.")
if self.isHalf:
audio1 = self.model.run(
["audio"],
{
"feats": feats.cpu().numpy().astype(np.float16),
"p_len": pitch_length.cpu().numpy().astype(np.int64),
"pitch": pitch.cpu().numpy().astype(np.int64),
"pitchf": pitchf.cpu().numpy().astype(np.float32),
"sid": sid.cpu().numpy().astype(np.int64),
},
)
else:
audio1 = self.model.run(
["audio"],
{
"feats": feats.cpu().numpy().astype(np.float32),
"p_len": pitch_length.cpu().numpy().astype(np.int64),
"pitch": pitch.cpu().numpy().astype(np.int64),
"pitchf": pitchf.cpu().numpy().astype(np.float32),
"sid": sid.cpu().numpy().astype(np.int64),
},
)
return torch.tensor(np.array(audio1))
def setHalf(self, isHalf: bool):
raise RuntimeError("half-precision is not changable.", self.isHalf)
def setDevice(self, dev: device):
# onnxruntime sessions are bound to their execution provider; just record the device
self.dev = dev
def to(self, dev: torch.device):
# nothing to move for an onnxruntime InferenceSession
return self

View File

@ -0,0 +1,71 @@
import torch
from torch import device
import onnxruntime
from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.Inferencer import Inferencer
import numpy as np
providers = ["CPUExecutionProvider"]
class OnnxRVCInferenceNono(Inferencer):
def loadModel(self, file: str, dev: device, isHalf: bool = True):
super().setProps(EnumInferenceTypes.onnxRVCNono, file, dev, isHalf)
# ort_options = onnxruntime.SessionOptions()
# ort_options.intra_op_num_threads = 8
onnx_session = onnxruntime.InferenceSession(
file, providers=providers
)
# check half-precision from the first input's tensor type
first_input_type = onnx_session.get_inputs()[0].type
if first_input_type == "tensor(float)":
self.isHalf = False
else:
self.isHalf = True
self.model = onnx_session
return self
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
) -> torch.Tensor:
if self.isHalf:
audio1 = self.model.run(
["audio"],
{
"feats": feats.cpu().numpy().astype(np.float16),
"p_len": pitch_length.cpu().numpy().astype(np.int64),
"sid": sid.cpu().numpy().astype(np.int64),
},
)
else:
audio1 = self.model.run(
["audio"],
{
"feats": feats.cpu().numpy().astype(np.float32),
"p_len": pitch_length.cpu().numpy().astype(np.int64),
"sid": sid.cpu().numpy().astype(np.int64),
},
)
return torch.tensor(np.array(audio1))
def setHalf(self, isHalf: bool):
raise RuntimeError("half-precision is not changable.", self.isHalf)
def setDevice(self, dev: device):
# onnxruntime sessions are bound to their execution provider; just record the device
self.dev = dev
def to(self, dev: torch.device):
# nothing to move for an onnxruntime InferenceSession
return self

View File

@ -0,0 +1,33 @@
import torch
from torch import device
from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from infer_pack.models import ( # type:ignore
SynthesizerTrnMs256NSFsid,
)
class RVCInferencer(Inferencer):
def loadModel(self, file: str, dev: device, isHalf: bool = True):
super().setProps(EnumInferenceTypes.pyTorchRVC, file, dev, isHalf)
cpt = torch.load(file, map_location="cpu")
model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
model.eval()
model.load_state_dict(cpt["weight"], strict=False)
if isHalf:
model = model.half()
self.model = model
return self
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid)

View File

@ -0,0 +1,33 @@
import torch
from torch import device
from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from infer_pack.models import ( # type:ignore
SynthesizerTrnMs256NSFsid_nono,
)
class RVCInferencerNono(Inferencer):
def loadModel(self, file: str, dev: device, isHalf: bool = True):
super().setProps(EnumInferenceTypes.pyTorchRVCNono, file, dev, isHalf)
cpt = torch.load(file, map_location="cpu")
model = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
model.eval()
model.load_state_dict(cpt["weight"], strict=False)
if isHalf:
model = model.half()
self.model = model
return self
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, sid)

View File

@ -0,0 +1,31 @@
import torch
from torch import device
from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from .models import SynthesizerTrnMsNSFsid
class WebUIInferencer(Inferencer):
def loadModel(self, file: str, dev: device, isHalf: bool = True):
super().setProps(EnumInferenceTypes.pyTorchWebUI, file, dev, isHalf)
cpt = torch.load(file, map_location="cpu")
model = SynthesizerTrnMsNSFsid(**cpt["params"], is_half=isHalf)
model.eval()
model.load_state_dict(cpt["weight"], strict=False)
if isHalf:
model = model.half()
self.model = model
return self
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid)

View File

@ -0,0 +1,31 @@
import torch
from torch import device
from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from .models import SynthesizerTrnMsNSFsidNono
class WebUIInferencerNono(Inferencer):
def loadModel(self, file: str, dev: device, isHalf: bool = True):
super().setProps(EnumInferenceTypes.pyTorchWebUINono, file, dev, isHalf)
cpt = torch.load(file, map_location="cpu")
model = SynthesizerTrnMsNSFsidNono(**cpt["params"], is_half=isHalf)
model.eval()
model.load_state_dict(cpt["weight"], strict=False)
if isHalf:
model = model.half()
self.model = model
return self
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, sid)

View File

@ -0,0 +1,277 @@
import math
import torch
from torch import nn
from infer_pack.models import ( # type:ignore
GeneratorNSF,
PosteriorEncoder,
ResidualCouplingBlock,
Generator,
)
from infer_pack import commons, attentions # type:ignore
class TextEncoder(nn.Module):
def __init__(
self,
out_channels,
hidden_channels,
filter_channels,
emb_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=True,
):
super().__init__()
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.emb_channels = emb_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb_phone = nn.Linear(emb_channels, hidden_channels)
self.lrelu = nn.LeakyReLU(0.1, inplace=True)
if f0 is True:
self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
self.encoder = attentions.Encoder(
hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, phone, pitch, lengths):
if pitch is None:
x = self.emb_phone(phone)
else:
x = self.emb_phone(phone) + self.emb_pitch(pitch)
x = x * math.sqrt(self.hidden_channels) # [b, t, h]
x = self.lrelu(x)
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
x.dtype
)
x = self.encoder(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return m, logs, x_mask
class SynthesizerTrnMsNSFsid(nn.Module):
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
spk_embed_dim,
gin_channels,
emb_channels,
sr,
**kwargs
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.gin_channels = gin_channels
self.emb_channels = emb_channels
# self.hop_length = hop_length#
self.spk_embed_dim = spk_embed_dim
self.enc_p = TextEncoder(
inter_channels,
hidden_channels,
filter_channels,
emb_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
)
self.dec = GeneratorNSF(
inter_channels,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels,
sr=sr,
is_half=kwargs["is_half"],
)
self.enc_q = PosteriorEncoder(
spec_channels,
inter_channels,
hidden_channels,
5,
1,
16,
gin_channels=gin_channels,
)
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def forward(
self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
):  # here, ds is the speaker id, shape [bs, 1]
# print(1,pitch.shape)#[bs,t]
g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 broadcasts over t
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
z_slice, ids_slice = commons.rand_slice_segments(
z, y_lengths, self.segment_size
)
# print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
# print(-2,pitchf.shape,z_slice.shape)
o = self.dec(z_slice, pitchf, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMsNSFsidNono(nn.Module):
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
spk_embed_dim,
gin_channels,
emb_channels,
sr=None,
**kwargs
):
super().__init__()
self.spec_channels = spec_channels
self.inter_channels = inter_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.gin_channels = gin_channels
self.emb_channels = emb_channels
# self.hop_length = hop_length#
self.spk_embed_dim = spk_embed_dim
self.enc_p = TextEncoder(
inter_channels,
hidden_channels,
filter_channels,
emb_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
f0=False,
)
self.dec = Generator(
inter_channels,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=gin_channels,
)
self.enc_q = PosteriorEncoder(
spec_channels,
inter_channels,
hidden_channels,
5,
1,
16,
gin_channels=gin_channels,
)
self.flow = ResidualCouplingBlock(
inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
)
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def remove_weight_norm(self):
self.dec.remove_weight_norm()
self.flow.remove_weight_norm()
self.enc_q.remove_weight_norm()
def forward(self, phone, phone_lengths, y, y_lengths, ds):  # here, ds is the speaker id, shape [bs, 1]
g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 broadcasts over t
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
z_slice, ids_slice = commons.rand_slice_segments(
z, y_lengths, self.segment_size
)
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, sid, max_len=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
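
The emb_channels argument is what lets these WebUI-style synthesizers accept the 768-dimensional features produced by hubert-base-japanese, where the stock RVC encoder is fixed at 256. A small shape check on TextEncoder alone (parameter values illustrative):

import torch

enc = TextEncoder(
    out_channels=192, hidden_channels=192, filter_channels=768,
    emb_channels=768, n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.0,
)
phone = torch.randn(1, 100, 768)          # 100 frames of 768-dim embedder output
pitch = torch.randint(0, 256, (1, 100))   # coarse pitch ids
lengths = torch.tensor([100])
m, logs, x_mask = enc(phone, pitch, lengths)
print(m.shape)                            # torch.Size([1, 192, 100])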

View File

@ -0,0 +1,7 @@
from voice_changer.RVC.ModelSlot import ModelSlot
class PipelineGenerator:
@classmethod
def generatePipeline(cls, modelSlot: ModelSlot):
pass
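
The stub is empty at this point in the work. Judging from the managers introduced elsewhere in this commit, generatePipeline will presumably pair an embedder and an inferencer for a slot; a purely hypothetical sketch, not part of this commit:

from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager

def _generate(modelSlot: ModelSlot, file: str, isHalf: bool, dev):
    # hypothetical pairing; the actual signature and fields may differ
    embedder = EmbedderManager.getEmbedder(modelSlot.embedder, file, isHalf, dev)
    inferencer = InferencerManager.getInferencer(modelSlot.modelType, file, isHalf, dev)
    return embedder, inferencer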