mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-03-16 21:03:58 +03:00
WIP: integrate vcs to new gui 2
This commit is contained in:
parent
e6b191abd2
commit
fa7894de50
@ -2,7 +2,6 @@ import sys
|
|||||||
import os
|
import os
|
||||||
from data.ModelSlot import MMVCv13ModelSlot
|
from data.ModelSlot import MMVCv13ModelSlot
|
||||||
|
|
||||||
from voice_changer.utils.LoadModelParams import LoadModelParams, LoadModelParams2
|
|
||||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||||
|
|
||||||
if sys.platform.startswith("darwin"):
|
if sys.platform.startswith("darwin"):
|
||||||
@ -22,8 +21,10 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
import onnxruntime
|
import onnxruntime
|
||||||
|
|
||||||
from symbols import symbols # type:ignore
|
# from symbols import symbols # type:ignore
|
||||||
from models import SynthesizerTrn # type:ignore
|
# from models import SynthesizerTrn # type:ignore
|
||||||
|
from voice_changer.MMVCv13.models.models import SynthesizerTrn
|
||||||
|
from voice_changer.MMVCv13.models.symbols import symbols
|
||||||
from voice_changer.MMVCv13.TrainerFunctions import (
|
from voice_changer.MMVCv13.TrainerFunctions import (
|
||||||
TextAudioSpeakerCollate,
|
TextAudioSpeakerCollate,
|
||||||
spectrogram_torch,
|
spectrogram_torch,
|
||||||
@ -40,21 +41,15 @@ class MMVCv13Settings:
|
|||||||
srcId: int = 0
|
srcId: int = 0
|
||||||
dstId: int = 101
|
dstId: int = 101
|
||||||
|
|
||||||
framework: str = "PyTorch" # PyTorch or ONNX
|
|
||||||
pyTorchModelFile: str = ""
|
|
||||||
onnxModelFile: str = ""
|
|
||||||
configFile: str = ""
|
|
||||||
|
|
||||||
# ↓mutableな物だけ列挙
|
# ↓mutableな物だけ列挙
|
||||||
intData = ["gpu", "srcId", "dstId"]
|
intData = ["gpu", "srcId", "dstId"]
|
||||||
floatData: list[str] = field(default_factory=lambda: [])
|
floatData: list[str] = field(default_factory=lambda: [])
|
||||||
strData = ["framework"]
|
strData: list[str] = field(default_factory=lambda: [])
|
||||||
|
|
||||||
|
|
||||||
class MMVCv13:
|
class MMVCv13:
|
||||||
audio_buffer: AudioInOut | None = None
|
def __init__(self, slotInfo: MMVCv13ModelSlot):
|
||||||
|
print("[Voice Changer] [MMVCv13] Creating instance ")
|
||||||
def __init__(self):
|
|
||||||
self.settings = MMVCv13Settings()
|
self.settings = MMVCv13Settings()
|
||||||
self.net_g = None
|
self.net_g = None
|
||||||
self.onnx_session = None
|
self.onnx_session = None
|
||||||
@ -62,43 +57,35 @@ class MMVCv13:
|
|||||||
self.gpu_num = torch.cuda.device_count()
|
self.gpu_num = torch.cuda.device_count()
|
||||||
self.text_norm = torch.LongTensor([0, 6, 0])
|
self.text_norm = torch.LongTensor([0, 6, 0])
|
||||||
|
|
||||||
def loadModel(self, props: LoadModelParams):
|
self.audio_buffer: AudioInOut | None = None
|
||||||
params = props.params
|
self.slotInfo = slotInfo
|
||||||
|
self.initialize()
|
||||||
|
|
||||||
self.settings.configFile = params["files"]["mmvcv13Config"]
|
def initialize(self):
|
||||||
self.hps = get_hparams_from_file(self.settings.configFile)
|
print("[Voice Changer] [MMVCv13] Initializing... ")
|
||||||
|
|
||||||
modelFile = params["files"]["mmvcv13Model"]
|
self.hps = get_hparams_from_file(self.slotInfo.configFile)
|
||||||
if modelFile.endswith(".onnx"):
|
if self.slotInfo.isONNX:
|
||||||
self.settings.pyTorchModelFile = None
|
|
||||||
self.settings.onnxModelFile = modelFile
|
|
||||||
else:
|
|
||||||
self.settings.pyTorchModelFile = modelFile
|
|
||||||
self.settings.onnxModelFile = None
|
|
||||||
|
|
||||||
# PyTorchモデル生成
|
|
||||||
if self.settings.pyTorchModelFile is not None:
|
|
||||||
self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model)
|
|
||||||
self.net_g.eval()
|
|
||||||
load_checkpoint(self.settings.pyTorchModelFile, self.net_g, None)
|
|
||||||
|
|
||||||
# ONNXモデル生成
|
|
||||||
if self.settings.onnxModelFile is not None:
|
|
||||||
# ort_options = onnxruntime.SessionOptions()
|
|
||||||
# ort_options.intra_op_num_threads = 8
|
|
||||||
# ort_options.execution_mode = ort_options.ExecutionMode.ORT_PARALLEL
|
|
||||||
# ort_options.inter_op_num_threads = 8
|
|
||||||
providers, options = self.getOnnxExecutionProvider()
|
providers, options = self.getOnnxExecutionProvider()
|
||||||
self.onnx_session = onnxruntime.InferenceSession(
|
self.onnx_session = onnxruntime.InferenceSession(
|
||||||
self.settings.onnxModelFile,
|
self.slotInfo.modelFile,
|
||||||
providers=providers,
|
providers=providers,
|
||||||
provider_options=options,
|
provider_options=options,
|
||||||
)
|
)
|
||||||
return self.get_info()
|
else:
|
||||||
|
self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model)
|
||||||
|
self.net_g.eval()
|
||||||
|
load_checkpoint(self.slotInfo.modelFile, self.net_g, None)
|
||||||
|
|
||||||
|
# その他の設定
|
||||||
|
self.settings.srcId = self.slotInfo.srcId
|
||||||
|
self.settings.dstId = self.slotInfo.dstId
|
||||||
|
print("[Voice Changer] [MMVCv13] Initializing... done")
|
||||||
|
|
||||||
def getOnnxExecutionProvider(self):
|
def getOnnxExecutionProvider(self):
|
||||||
availableProviders = onnxruntime.get_available_providers()
|
availableProviders = onnxruntime.get_available_providers()
|
||||||
if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders:
|
devNum = torch.cuda.device_count()
|
||||||
|
if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0:
|
||||||
return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}]
|
return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}]
|
||||||
elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders:
|
elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders:
|
||||||
return ["DmlExecutionProvider"], [{}]
|
return ["DmlExecutionProvider"], [{}]
|
||||||
@ -111,21 +98,15 @@ class MMVCv13:
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
def isOnnx(self):
|
|
||||||
if self.settings.onnxModelFile is not None:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def update_settings(self, key: str, val: int | float | str):
|
def update_settings(self, key: str, val: int | float | str):
|
||||||
if key in self.settings.intData:
|
if key in self.settings.intData:
|
||||||
val = int(val)
|
val = int(val)
|
||||||
setattr(self.settings, key, val)
|
setattr(self.settings, key, val)
|
||||||
|
|
||||||
if key == "gpu" and self.isOnnx():
|
if key == "gpu" and self.slotInfo.isONNX:
|
||||||
providers, options = self.getOnnxExecutionProvider()
|
providers, options = self.getOnnxExecutionProvider()
|
||||||
self.onnx_session = onnxruntime.InferenceSession(
|
self.onnx_session = onnxruntime.InferenceSession(
|
||||||
self.settings.onnxModelFile,
|
self.slotInfo.modelFile,
|
||||||
providers=providers,
|
providers=providers,
|
||||||
provider_options=options,
|
provider_options=options,
|
||||||
)
|
)
|
||||||
@ -150,13 +131,6 @@ class MMVCv13:
|
|||||||
data = asdict(self.settings)
|
data = asdict(self.settings)
|
||||||
|
|
||||||
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
|
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
|
||||||
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
|
|
||||||
for f in files:
|
|
||||||
if data[f] is not None and os.path.exists(data[f]):
|
|
||||||
data[f] = os.path.basename(data[f])
|
|
||||||
else:
|
|
||||||
data[f] = ""
|
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def get_processing_sampling_rate(self):
|
def get_processing_sampling_rate(self):
|
||||||
@ -211,7 +185,7 @@ class MMVCv13:
|
|||||||
return data
|
return data
|
||||||
|
|
||||||
def _onnx_inference(self, data):
|
def _onnx_inference(self, data):
|
||||||
if hasattr(self, "onnx_session") is False or self.onnx_session is None:
|
if self.onnx_session is None:
|
||||||
print("[Voice Changer] No ONNX session.")
|
print("[Voice Changer] No ONNX session.")
|
||||||
raise NoModeLoadedException("ONNX")
|
raise NoModeLoadedException("ONNX")
|
||||||
|
|
||||||
@ -254,24 +228,12 @@ class MMVCv13:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
def inference(self, data):
|
def inference(self, data):
|
||||||
if self.isOnnx():
|
if self.slotInfo.isONNX:
|
||||||
audio = self._onnx_inference(data)
|
audio = self._onnx_inference(data)
|
||||||
else:
|
else:
|
||||||
audio = self._pyTorch_inference(data)
|
audio = self._pyTorch_inference(data)
|
||||||
return audio
|
return audio
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def loadModel2(cls, props: LoadModelParams2):
|
|
||||||
slotInfo: MMVCv13ModelSlot = MMVCv13ModelSlot()
|
|
||||||
for file in props.files:
|
|
||||||
if file.kind == "mmvcv13Model":
|
|
||||||
slotInfo.modelFile = file.name
|
|
||||||
elif file.kind == "mmvcv13Config":
|
|
||||||
slotInfo.configFile = file.name
|
|
||||||
slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
|
|
||||||
slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
|
|
||||||
return slotInfo
|
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
del self.net_g
|
del self.net_g
|
||||||
del self.onnx_session
|
del self.onnx_session
|
||||||
|
19
server/voice_changer/MMVCv13/MMVCv13ModelSlotGenerator.py
Normal file
19
server/voice_changer/MMVCv13/MMVCv13ModelSlotGenerator.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from data.ModelSlot import MMVCv13ModelSlot
|
||||||
|
from voice_changer.utils.LoadModelParams import LoadModelParams
|
||||||
|
from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class MMVCv13ModelSlotGenerator(ModelSlotGenerator):
    """Builds an ``MMVCv13ModelSlot`` from the files listed in a load request."""

    @classmethod
    def loadModel(cls, props: LoadModelParams):
        """Create and return the slot description for an MMVC v1.3 model.

        Files in ``props.files`` whose ``kind`` is ``"mmvcv13Model"`` or
        ``"mmvcv13Config"`` supply the model and config file names; any other
        kind is ignored.  ``isONNX`` and ``name`` are derived from the model
        file name (its ``.onnx`` suffix and its basename without extension).
        """
        # Which slot attribute each recognised file kind fills in.
        attr_by_kind = {
            "mmvcv13Model": "modelFile",
            "mmvcv13Config": "configFile",
        }

        slot: MMVCv13ModelSlot = MMVCv13ModelSlot()
        for entry in props.files:
            target = attr_by_kind.get(entry.kind)
            if target is not None:
                setattr(slot, target, entry.name)

        model_path = slot.modelFile
        slot.isONNX = model_path.endswith(".onnx")
        stem, _extension = os.path.splitext(os.path.basename(model_path))
        slot.name = stem
        return slot
|
27
server/voice_changer/MMVCv13/models/commons.py
Normal file
27
server/voice_changer/MMVCv13/models/commons.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise the weights of convolution modules in place.

    If the class name of ``m`` contains ``"Conv"``, its ``weight`` is filled
    with samples from a normal distribution with the given ``mean`` and
    ``std``.  Any other module is left untouched.  Returns ``None``.
    """
    if "Conv" not in m.__class__.__name__:
        return
    m.weight.data.normal_(mean, std)
|
||||||
|
|
||||||
|
|
||||||
|
def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int."""
    dilated_extent = kernel_size * dilation - dilation
    half_extent = dilated_extent / 2
    return int(half_extent)
|
||||||
|
|
||||||
|
|
||||||
|
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Add two tensors and gate the sum along dimension 1.

    The sum ``input_a + input_b`` is split along dimension 1 at
    ``n_channels[0]``: ``tanh`` is applied to the channels before the split
    point, ``sigmoid`` to the channels from the split point onward, and the
    element-wise product of the two is returned.
    """
    split_at = n_channels[0]
    summed = input_a + input_b
    tanh_part = summed[:, :split_at, :]
    sigmoid_part = summed[:, split_at:, :]
    return torch.tanh(tanh_part) * torch.sigmoid(sigmoid_part)
|
||||||
|
|
||||||
|
|
||||||
|
def sequence_mask(length, max_length=None):
    """Build a boolean mask marking the positions below each length.

    Element ``[i, j]`` of the result is ``True`` exactly when
    ``j < length[i]``.  The number of positions is ``max_length``, or
    ``length.max()`` when ``max_length`` is ``None``.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]
|
166
server/voice_changer/MMVCv13/models/models.py
Normal file
166
server/voice_changer/MMVCv13/models/models.py
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
from .modules import ResidualCouplingLayer, Flip, WN, ResBlock1, ResBlock2, LRELU_SLOPE
|
||||||
|
|
||||||
|
|
||||||
|
from torch.nn import Conv1d, ConvTranspose1d
|
||||||
|
from torch.nn.utils import weight_norm, remove_weight_norm
|
||||||
|
from .commons import init_weights, sequence_mask
|
||||||
|
|
||||||
|
|
||||||
|
class ResidualCouplingBlock(nn.Module):
    """Chain of ``n_flows`` (``ResidualCouplingLayer``, ``Flip``) pairs.

    The chain can be applied front-to-back, or back-to-front with every
    member called in reverse mode.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        # Each flow step is a mean-only coupling layer followed by a channel flip.
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)
            self.flows.append(coupling)
            self.flows.append(Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run ``x`` through the flow chain and return the transformed tensor.

        In forward mode each member returns ``(x, logdet)`` and the logdet is
        discarded.  In reverse mode the members are visited last-to-first and
        each returns only ``x``.
        """
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
|
||||||
|
|
||||||
|
|
||||||
|
class PosteriorEncoder(nn.Module):
    """Encodes an input into a mean, a log-scale and a noisy sample ``z``.

    Pipeline: 1x1 convolution (``pre``) -> ``WN`` stack (``enc``) -> 1x1
    convolution (``proj``) whose output is split into ``m`` and ``logs``.
    """

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        # proj emits 2 * out_channels channels: the mean half and the log-scale half.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` for input ``x``.

        ``x_mask`` is built from ``x_lengths`` over ``x.size(2)`` positions and
        cast to the dtype of ``x``.  ``z`` is ``m + noise * exp(logs)``, masked.
        """
        frame_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)
        hidden = self.enc(self.pre(x) * frame_mask, frame_mask, g=g)
        m, logs = torch.split(self.proj(hidden) * frame_mask, self.out_channels, dim=1)

        # NOTE: the noise is re-sampled on every call and kept on the module as
        # ``self.randn``; an earlier variant that reused it when the shape was
        # unchanged is disabled.
        self.randn = torch.randn_like(m)
        z = (m + self.randn * torch.exp(logs)) * frame_mask
        return z, m, logs, frame_mask
|
||||||
|
|
||||||
|
|
||||||
|
class Generator(torch.nn.Module):
    """Upsampling decoder ending in a single-channel ``tanh`` output.

    Structure: ``conv_pre`` -> for each upsample stage a weight-normalised
    transposed convolution followed by the average of ``num_kernels``
    residual blocks -> ``conv_post`` -> ``tanh``.

    ``gin_channels`` (constructor) and ``g`` (forward) are accepted but have
    no effect: the conditioning projection is disabled in this version.
    """

    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        # The string "1" selects ResBlock1; anything else selects ResBlock2.
        block_cls = ResBlock1 if resblock == "1" else ResBlock2

        # Every upsample stage halves the channel count.
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            in_ch = upsample_initial_channel // (2**stage)
            out_ch = upsample_initial_channel // (2 ** (stage + 1))
            self.ups.append(weight_norm(ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)))

        # num_kernels residual blocks per stage, stored flat in stage order.
        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (stage + 1))
            for kernel, dilations in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(block_cls(ch, kernel, dilations))

        # ``ch`` is the channel count of the last stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

    def forward(self, x, g=None):
        """Decode ``x``; ``g`` is ignored."""
        x = self.conv_pre(x)

        for stage in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[stage](x)

            # Average the outputs of this stage's residual blocks.
            offset = stage * self.num_kernels
            xs = None
            for k in range(self.num_kernels):
                branch = self.resblocks[offset + k](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels

        # The final activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalisation from the upsample layers and residual blocks."""
        print("Removing weight norm...")
        for up_layer in self.ups:
            remove_weight_norm(up_layer)
        for block in self.resblocks:
            block.remove_weight_norm()
|
||||||
|
|
||||||
|
|
||||||
|
class SynthesizerTrn(nn.Module):
    """Voice-conversion model: posterior encoder, flow and decoder.

    ``forward`` delegates to ``voice_conversion``.  Several constructor
    arguments (``n_vocab``, ``filter_channels``, ``n_heads``, ``n_layers``,
    ``kernel_size``, ``p_dropout``, ``use_sdp``) are only stored as
    attributes here; extra keyword arguments are accepted and ignored.
    """

    def __init__(self, n_vocab, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, n_flow, n_speakers=0, gin_channels=0, use_sdp=True, **kwargs):
        super().__init__()

        # Plain bookkeeping of the hyper-parameters.
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.segment_size = segment_size
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.use_sdp = use_sdp

        # Sub-modules (construction order kept: dec, enc_q, flow, emb_g).
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, n_flows=n_flow, gin_channels=gin_channels)

        # The speaker embedding table exists only for multi-speaker models.
        if n_speakers > 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)

    def forward(self, y, y_lengths, sid_src, sid_tgt):
        """Alias for :meth:`voice_conversion`."""
        return self.voice_conversion(y, y_lengths, sid_src, sid_tgt)

    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
        """Convert ``y`` from speaker ``sid_src`` to speaker ``sid_tgt``.

        The input is encoded with the source speaker embedding, passed through
        the flow forward (source embedding) and then in reverse (target
        embedding), and finally decoded.  Returns the decoder output.
        """
        # NOTE(review): this check accepts n_speakers == 1, but ``emb_g`` is
        # only created when n_speakers > 1 -- confirm which bound is intended.
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."

        src_embedding = self.emb_g(sid_src).unsqueeze(-1)
        tgt_embedding = self.emb_g(sid_tgt).unsqueeze(-1)

        z, _m_q, _logs_q, y_mask = self.enc_q(y, y_lengths, g=src_embedding)
        z_prior = self.flow(z, y_mask, g=src_embedding)
        z_converted = self.flow(z_prior, y_mask, g=tgt_embedding, reverse=True)
        return self.dec(z_converted * y_mask, g=tgt_embedding)
|
186
server/voice_changer/MMVCv13/models/modules.py
Normal file
186
server/voice_changer/MMVCv13/models/modules.py
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
from torch.nn import Conv1d
|
||||||
|
from torch.nn.utils import weight_norm, remove_weight_norm
|
||||||
|
|
||||||
|
from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
|
||||||
|
|
||||||
|
|
||||||
|
LRELU_SLOPE = 0.1
|
||||||
|
|
||||||
|
|
||||||
|
class WN(torch.nn.Module):
    """Stack of dilated, weight-normalised 1-D convolutions with gated activations.

    Each layer applies a dilated convolution, adds an optional slice of the
    conditioning signal, gates the sum (tanh * sigmoid), and feeds a 1x1
    convolution whose output is split into a residual part (added back to the
    running input) and a skip part (accumulated into the output).  The last
    layer contributes only to the output.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = (kernel_size,)  # stored as a 1-tuple
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        # A single 1x1 convolution produces the conditioning slices for all layers.
        if gin_channels != 0:
            cond_conv = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_conv, name="weight")

        last_index = n_layers - 1
        for layer_index in range(n_layers):
            dilation = dilation_rate**layer_index
            padding = int((kernel_size * dilation - dilation) / 2)
            gate_conv = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding)
            self.in_layers.append(torch.nn.utils.weight_norm(gate_conv, name="weight"))

            # The last layer needs no residual half, only the skip half.
            mix_channels = 2 * hidden_channels if layer_index < last_index else hidden_channels
            mix_conv = torch.nn.Conv1d(hidden_channels, mix_channels, 1)
            self.res_skip_layers.append(torch.nn.utils.weight_norm(mix_conv, name="weight"))

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of all layers' skip outputs.

        ``g`` is the optional conditioning input; when it is ``None`` a zero
        tensor is used in its place for every layer.
        """
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        slice_width = 2 * self.hidden_channels
        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                start = i * slice_width
                g_l = g[:, start : start + slice_width, :]

            acts = self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))
            res_skip_acts = self.res_skip_layers[i](acts)

            if i < self.n_layers - 1:
                # First half feeds the next layer, second half goes to the output.
                x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution owned by this module."""
        strip = torch.nn.utils.remove_weight_norm
        if self.gin_channels != 0:
            strip(self.cond_layer)
        for layer in self.in_layers:
            strip(layer)
        for layer in self.res_skip_layers:
            strip(layer)
|
||||||
|
|
||||||
|
|
||||||
|
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, undilated conv) pairs.

    The first convolution of each pair uses ``dilation[0]``, ``dilation[1]``
    and ``dilation[2]`` respectively; the second always uses dilation 1.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()

        def make_conv(d):
            # Length-preserving, weight-normalised convolution with dilation d.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=get_padding(kernel_size, d)))

        self.convs1 = nn.ModuleList([make_conv(dilation[0]), make_conv(dilation[1]), make_conv(dilation[2])])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1), make_conv(1), make_conv(1)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; ``x_mask``, if given, is applied before each conv and at the end."""
        use_mask = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            h = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                h = h * x_mask
            h = dilated_conv(h)
            h = F.leaky_relu(h, LRELU_SLOPE)
            if use_mask:
                h = h * x_mask
            h = plain_conv(h)
            x = h + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from all six convolutions."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
|
||||||
|
|
||||||
|
|
||||||
|
class ResBlock2(torch.nn.Module):
    """Residual block of two dilated convolutions (``dilation[0]``, ``dilation[1]``)."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()

        def make_conv(d):
            # Length-preserving, weight-normalised convolution with dilation d.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=get_padding(kernel_size, d)))

        self.convs = nn.ModuleList([make_conv(dilation[0]), make_conv(dilation[1])])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions; ``x_mask``, if given, is applied before each conv and at the end."""
        use_mask = x_mask is not None
        for conv in self.convs:
            h = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                h = h * x_mask
            x = conv(h) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)
|
||||||
|
|
||||||
|
|
||||||
|
class Flip(nn.Module):
    """Flow step that reverses the order of the entries along dimension 1."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Flip ``x`` along dimension 1.

        In forward mode returns ``(flipped, logdet)`` where ``logdet`` is a
        zero vector with one entry per batch element, matching the dtype and
        device of ``x``.  In reverse mode returns only the flipped tensor.
        Extra positional and keyword arguments are ignored.
        """
        flipped = x.flip(1)
        if reverse:
            return flipped
        logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
|
||||||
|
|
||||||
|
|
||||||
|
class ResidualCouplingLayer(nn.Module):
    """Coupling layer that transforms one half of the channels using the other half.

    The first half of the channels passes through unchanged and drives a
    ``pre`` -> ``WN`` -> ``post`` network that predicts a shift ``m`` (and,
    unless ``mean_only`` is set, a log-scale ``logs``) for the second half.
    ``post`` starts with all-zero weights and bias.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        # One output half (shift only) when mean_only, otherwise two (shift + log-scale).
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the coupling transform, or undo it when ``reverse`` is true.

        Forward mode returns ``(x, logdet)`` with ``logdet`` the sum of
        ``logs`` over dimensions 1 and 2; reverse mode returns only ``x``.
        """
        halves = [self.half_channels] * 2
        x0, x1 = torch.split(x, halves, 1)

        h = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
        stats = self.post(h) * x_mask
        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, halves, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        x = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return x, logdet
|
1
server/voice_changer/MMVCv13/models/readme.txt
Normal file
1
server/voice_changer/MMVCv13/models/readme.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
The modules in this folder are taken from https://github.com/isletennos/MMVC_Client.git at commit 04f3fec4fd82dea6657026ec4e1cd80fb29a415c.
|
64
server/voice_changer/MMVCv13/models/symbols.py
Normal file
64
server/voice_changer/MMVCv13/models/symbols.py
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
""" The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d """
|
||||||
|
""" from https://github.com/keithito/tacotron """
|
||||||
|
|
||||||
|
"""
|
||||||
|
Defines the set of symbols used in text input to the model.
|
||||||
|
"""
|
||||||
|
_pad = "_"
|
||||||
|
_punctuation = ';:,.!?¡¿—…"«»“” '
|
||||||
|
_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||||
|
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
|
||||||
|
|
||||||
|
|
||||||
|
# Export all symbols:
|
||||||
|
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
|
||||||
|
|
||||||
|
# Special symbol ids
|
||||||
|
SPACE_ID = symbols.index(" ")
|
||||||
|
|
||||||
|
# Final symbol table: a whitespace-separated listing, split into a list of
# symbol strings in the order written.
symbols = (
    "A E I N O U "
    "a b by ch cl d dy e f g gy h hy i j k ky "
    "m my n ny o p py r ry s sh t ts ty u v w y z "
    "pau sil"
).split()
|
@ -1,8 +1,6 @@
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
from data.ModelSlot import MMVCv15ModelSlot
|
from data.ModelSlot import MMVCv15ModelSlot
|
||||||
|
|
||||||
from voice_changer.utils.LoadModelParams import LoadModelParams, LoadModelParams2
|
|
||||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||||
|
|
||||||
if sys.platform.startswith("darwin"):
|
if sys.platform.startswith("darwin"):
|
||||||
@ -49,42 +47,29 @@ class MMVCv15Settings:
|
|||||||
f0Factor: float = 1.0
|
f0Factor: float = 1.0
|
||||||
f0Detector: str = "dio" # dio or harvest
|
f0Detector: str = "dio" # dio or harvest
|
||||||
|
|
||||||
framework: str = "PyTorch" # PyTorch or ONNX
|
|
||||||
pyTorchModelFile: str = ""
|
|
||||||
onnxModelFile: str = ""
|
|
||||||
configFile: str = ""
|
|
||||||
|
|
||||||
# ↓mutableな物だけ列挙
|
# ↓mutableな物だけ列挙
|
||||||
intData = ["gpu", "srcId", "dstId"]
|
intData = ["gpu", "srcId", "dstId"]
|
||||||
floatData = ["f0Factor"]
|
floatData = ["f0Factor"]
|
||||||
strData = ["framework", "f0Detector"]
|
strData = ["f0Detector"]
|
||||||
|
|
||||||
|
|
||||||
class MMVCv15:
|
class MMVCv15:
|
||||||
audio_buffer: AudioInOut | None = None
|
def __init__(self, slotInfo: MMVCv15ModelSlot):
|
||||||
|
print("[Voice Changer] [MMVCv15] Creating instance ")
|
||||||
def __init__(self):
|
|
||||||
self.settings = MMVCv15Settings()
|
self.settings = MMVCv15Settings()
|
||||||
self.net_g = None
|
self.net_g = None
|
||||||
self.onnx_session = None
|
self.onnx_session: onnxruntime.InferenceSession | None = None
|
||||||
|
|
||||||
self.gpu_num = torch.cuda.device_count()
|
self.gpu_num = torch.cuda.device_count()
|
||||||
|
|
||||||
def loadModel(self, props: LoadModelParams):
|
self.slotInfo = slotInfo
|
||||||
params = props.params
|
self.audio_buffer: AudioInOut | None = None
|
||||||
|
self.initialize()
|
||||||
|
|
||||||
self.settings.configFile = params["files"]["mmvcv15Config"]
|
def initialize(self):
|
||||||
self.hps = get_hparams_from_file(self.settings.configFile)
|
print("[Voice Changer] [MMVCv15] Initializing... ")
|
||||||
|
self.hps = get_hparams_from_file(self.slotInfo.configFile)
|
||||||
|
|
||||||
modelFile = params["files"]["mmvcv15Model"]
|
|
||||||
if modelFile.endswith(".onnx"):
|
|
||||||
self.settings.pyTorchModelFile = None
|
|
||||||
self.settings.onnxModelFile = modelFile
|
|
||||||
else:
|
|
||||||
self.settings.pyTorchModelFile = modelFile
|
|
||||||
self.settings.onnxModelFile = None
|
|
||||||
|
|
||||||
# PyTorchモデル生成
|
|
||||||
self.net_g = SynthesizerTrn(
|
self.net_g = SynthesizerTrn(
|
||||||
spec_channels=self.hps.data.filter_length // 2 + 1,
|
spec_channels=self.hps.data.filter_length // 2 + 1,
|
||||||
segment_size=self.hps.train.segment_size // self.hps.data.hop_length,
|
segment_size=self.hps.train.segment_size // self.hps.data.hop_length,
|
||||||
@ -103,18 +88,12 @@ class MMVCv15:
|
|||||||
requires_grad_text_enc=self.hps.requires_grad.text_enc,
|
requires_grad_text_enc=self.hps.requires_grad.text_enc,
|
||||||
requires_grad_dec=self.hps.requires_grad.dec,
|
requires_grad_dec=self.hps.requires_grad.dec,
|
||||||
)
|
)
|
||||||
if self.settings.pyTorchModelFile is not None:
|
|
||||||
self.settings.framework = "PyTorch"
|
|
||||||
self.net_g.eval()
|
|
||||||
load_checkpoint(self.settings.pyTorchModelFile, self.net_g, None)
|
|
||||||
|
|
||||||
# ONNXモデル生成
|
if self.slotInfo.isONNX:
|
||||||
self.onxx_input_length = 8192
|
self.onxx_input_length = 8192
|
||||||
if self.settings.onnxModelFile is not None:
|
|
||||||
self.settings.framework = "ONNX"
|
|
||||||
providers, options = self.getOnnxExecutionProvider()
|
providers, options = self.getOnnxExecutionProvider()
|
||||||
self.onnx_session = onnxruntime.InferenceSession(
|
self.onnx_session = onnxruntime.InferenceSession(
|
||||||
self.settings.onnxModelFile,
|
self.slotInfo.modelFile,
|
||||||
providers=providers,
|
providers=providers,
|
||||||
provider_options=options,
|
provider_options=options,
|
||||||
)
|
)
|
||||||
@ -123,11 +102,21 @@ class MMVCv15:
|
|||||||
# print("ONNX INPUT SHAPE", i.name, i.shape)
|
# print("ONNX INPUT SHAPE", i.name, i.shape)
|
||||||
if i.name == "sin":
|
if i.name == "sin":
|
||||||
self.onxx_input_length = i.shape[2]
|
self.onxx_input_length = i.shape[2]
|
||||||
return self.get_info()
|
else:
|
||||||
|
self.net_g.eval()
|
||||||
|
load_checkpoint(self.slotInfo.modelFile, self.net_g, None)
|
||||||
|
|
||||||
|
# その他の設定
|
||||||
|
self.settings.srcId = self.slotInfo.srcId
|
||||||
|
self.settings.dstId = self.slotInfo.dstId
|
||||||
|
self.settings.f0Factor = self.slotInfo.f0Factor
|
||||||
|
|
||||||
|
print("[Voice Changer] [MMVCv15] Initializing... done")
|
||||||
|
|
||||||
def getOnnxExecutionProvider(self):
|
def getOnnxExecutionProvider(self):
|
||||||
availableProviders = onnxruntime.get_available_providers()
|
availableProviders = onnxruntime.get_available_providers()
|
||||||
if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders:
|
devNum = torch.cuda.device_count()
|
||||||
|
if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0:
|
||||||
return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}]
|
return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}]
|
||||||
elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders:
|
elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders:
|
||||||
return ["DmlExecutionProvider"], [{}]
|
return ["DmlExecutionProvider"], [{}]
|
||||||
@ -140,20 +129,14 @@ class MMVCv15:
|
|||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
||||||
def isOnnx(self):
|
|
||||||
if self.settings.onnxModelFile is not None:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return False
|
|
||||||
|
|
||||||
def update_settings(self, key: str, val: int | float | str):
|
def update_settings(self, key: str, val: int | float | str):
|
||||||
if key in self.settings.intData:
|
if key in self.settings.intData:
|
||||||
val = int(val)
|
val = int(val)
|
||||||
setattr(self.settings, key, val)
|
setattr(self.settings, key, val)
|
||||||
if key == "gpu" and self.isOnnx():
|
if key == "gpu" and self.slotInfo.isONNX:
|
||||||
providers, options = self.getOnnxExecutionProvider()
|
providers, options = self.getOnnxExecutionProvider()
|
||||||
self.onnx_session = onnxruntime.InferenceSession(
|
self.onnx_session = onnxruntime.InferenceSession(
|
||||||
self.settings.onnxModelFile,
|
self.slotInfo.modelFile,
|
||||||
providers=providers,
|
providers=providers,
|
||||||
provider_options=options,
|
provider_options=options,
|
||||||
)
|
)
|
||||||
@ -174,12 +157,6 @@ class MMVCv15:
|
|||||||
data = asdict(self.settings)
|
data = asdict(self.settings)
|
||||||
|
|
||||||
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.settings.onnxModelFile != "" and self.settings.onnxModelFile is not None else []
|
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.settings.onnxModelFile != "" and self.settings.onnxModelFile is not None else []
|
||||||
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
|
|
||||||
for f in files:
|
|
||||||
if data[f] is not None and os.path.exists(data[f]):
|
|
||||||
data[f] = os.path.basename(data[f])
|
|
||||||
else:
|
|
||||||
data[f] = ""
|
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@ -241,7 +218,7 @@ class MMVCv15:
|
|||||||
convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
|
convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
|
||||||
|
|
||||||
# ONNX は固定長
|
# ONNX は固定長
|
||||||
if self.settings.framework == "ONNX":
|
if self.slotInfo.isONNX:
|
||||||
convertSize = self.onxx_input_length
|
convertSize = self.onxx_input_length
|
||||||
|
|
||||||
convertOffset = -1 * convertSize
|
convertOffset = -1 * convertSize
|
||||||
@ -286,10 +263,6 @@ class MMVCv15:
|
|||||||
return audio1
|
return audio1
|
||||||
|
|
||||||
def _pyTorch_inference(self, data):
|
def _pyTorch_inference(self, data):
|
||||||
if self.settings.pyTorchModelFile == "" or self.settings.pyTorchModelFile is None:
|
|
||||||
print("[Voice Changer] No pyTorch session.")
|
|
||||||
raise NoModeLoadedException("pytorch")
|
|
||||||
|
|
||||||
if self.settings.gpu < 0 or self.gpu_num == 0:
|
if self.settings.gpu < 0 or self.gpu_num == 0:
|
||||||
dev = torch.device("cpu")
|
dev = torch.device("cpu")
|
||||||
else:
|
else:
|
||||||
@ -309,7 +282,7 @@ class MMVCv15:
|
|||||||
|
|
||||||
def inference(self, data):
|
def inference(self, data):
|
||||||
try:
|
try:
|
||||||
if self.isOnnx():
|
if self.slotInfo.isONNX:
|
||||||
audio = self._onnx_inference(data)
|
audio = self._onnx_inference(data)
|
||||||
else:
|
else:
|
||||||
audio = self._pyTorch_inference(data)
|
audio = self._pyTorch_inference(data)
|
||||||
@ -318,18 +291,6 @@ class MMVCv15:
|
|||||||
print(_e)
|
print(_e)
|
||||||
raise ONNXInputArgumentException()
|
raise ONNXInputArgumentException()
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def loadModel2(cls, props: LoadModelParams2):
|
|
||||||
slotInfo: MMVCv15ModelSlot = MMVCv15ModelSlot()
|
|
||||||
for file in props.files:
|
|
||||||
if file.kind == "mmvcv15Model":
|
|
||||||
slotInfo.modelFile = file.name
|
|
||||||
elif file.kind == "mmvcv15Config":
|
|
||||||
slotInfo.configFile = file.name
|
|
||||||
slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
|
|
||||||
slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
|
|
||||||
return slotInfo
|
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
del self.net_g
|
del self.net_g
|
||||||
del self.onnx_session
|
del self.onnx_session
|
||||||
|
19
server/voice_changer/MMVCv15/MMVCv15ModelSlotGenerator.py
Normal file
19
server/voice_changer/MMVCv15/MMVCv15ModelSlotGenerator.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
from data.ModelSlot import MMVCv15ModelSlot
|
||||||
|
from voice_changer.utils.LoadModelParams import LoadModelParams
|
||||||
|
from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class MMVCv15ModelSlotGenerator(ModelSlotGenerator):
|
||||||
|
@classmethod
|
||||||
|
def loadModel(cls, props: LoadModelParams):
|
||||||
|
slotInfo: MMVCv15ModelSlot = MMVCv15ModelSlot()
|
||||||
|
for file in props.files:
|
||||||
|
if file.kind == "mmvcv15Model":
|
||||||
|
slotInfo.modelFile = file.name
|
||||||
|
elif file.kind == "mmvcv15Config":
|
||||||
|
slotInfo.configFile = file.name
|
||||||
|
slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
|
||||||
|
slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
|
||||||
|
return slotInfo
|
@ -35,26 +35,17 @@ from Exceptions import DeviceCannotSupportHalfPrecisionException
|
|||||||
|
|
||||||
|
|
||||||
class RVC(VoiceChangerModel):
|
class RVC(VoiceChangerModel):
|
||||||
initialLoad: bool = True
|
|
||||||
settings: RVCSettings = RVCSettings()
|
|
||||||
|
|
||||||
pipeline: Pipeline | None = None
|
|
||||||
|
|
||||||
deviceManager = DeviceManager.get_instance()
|
|
||||||
|
|
||||||
audio_buffer: AudioInOut | None = None
|
|
||||||
prevVol: float = 0
|
|
||||||
params: VoiceChangerParams
|
|
||||||
currentSlot: int = 0
|
|
||||||
needSwitch: bool = False
|
|
||||||
|
|
||||||
def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot):
|
def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot):
|
||||||
print("[Voice Changer] [RVC] Creating instance ")
|
print("[Voice Changer] [RVC] Creating instance ")
|
||||||
|
self.deviceManager = DeviceManager.get_instance()
|
||||||
EmbedderManager.initialize(params)
|
EmbedderManager.initialize(params)
|
||||||
|
self.settings = RVCSettings()
|
||||||
self.params = params
|
self.params = params
|
||||||
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
|
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector)
|
||||||
|
|
||||||
|
self.pipeline: Pipeline | None = None
|
||||||
|
|
||||||
|
self.audio_buffer: AudioInOut | None = None
|
||||||
self.prevVol = 0.0
|
self.prevVol = 0.0
|
||||||
self.slotInfo = slotInfo
|
self.slotInfo = slotInfo
|
||||||
self.initialize()
|
self.initialize()
|
||||||
|
@ -116,14 +116,14 @@ class VoiceChangerManager(ServerDeviceCallbacks):
|
|||||||
slotInfo = RVCModelSlotGenerator.loadModel(params)
|
slotInfo = RVCModelSlotGenerator.loadModel(params)
|
||||||
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
|
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
|
||||||
elif params.voiceChangerType == "MMVCv13":
|
elif params.voiceChangerType == "MMVCv13":
|
||||||
from voice_changer.MMVCv13.MMVCv13 import MMVCv13
|
from voice_changer.MMVCv13.MMVCv13ModelSlotGenerator import MMVCv13ModelSlotGenerator
|
||||||
|
|
||||||
slotInfo = MMVCv13.loadModel(params)
|
slotInfo = MMVCv13ModelSlotGenerator.loadModel(params)
|
||||||
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
|
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
|
||||||
elif params.voiceChangerType == "MMVCv15":
|
elif params.voiceChangerType == "MMVCv15":
|
||||||
from voice_changer.MMVCv15.MMVCv15 import MMVCv15
|
from voice_changer.MMVCv15.MMVCv15ModelSlotGenerator import MMVCv15ModelSlotGenerator
|
||||||
|
|
||||||
slotInfo = MMVCv15.loadModel(params)
|
slotInfo = MMVCv15ModelSlotGenerator.loadModel(params)
|
||||||
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
|
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
|
||||||
elif params.voiceChangerType == "so-vits-svc-40":
|
elif params.voiceChangerType == "so-vits-svc-40":
|
||||||
from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40
|
from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40
|
||||||
@ -174,7 +174,20 @@ class VoiceChangerManager(ServerDeviceCallbacks):
|
|||||||
self.voiceChangerModel = RVC(self.params, slotInfo)
|
self.voiceChangerModel = RVC(self.params, slotInfo)
|
||||||
self.voiceChanger = VoiceChanger(self.params)
|
self.voiceChanger = VoiceChanger(self.params)
|
||||||
self.voiceChanger.setModel(self.voiceChangerModel)
|
self.voiceChanger.setModel(self.voiceChangerModel)
|
||||||
|
elif slotInfo.voiceChangerType == "MMVCv13":
|
||||||
|
print("................MMVCv13")
|
||||||
|
from voice_changer.MMVCv13.MMVCv13 import MMVCv13
|
||||||
|
|
||||||
|
self.voiceChangerModel = MMVCv13(slotInfo)
|
||||||
|
self.voiceChanger = VoiceChanger(self.params)
|
||||||
|
self.voiceChanger.setModel(self.voiceChangerModel)
|
||||||
|
elif slotInfo.voiceChangerType == "MMVCv15":
|
||||||
|
print("................MMVCv15")
|
||||||
|
from voice_changer.MMVCv15.MMVCv15 import MMVCv15
|
||||||
|
|
||||||
|
self.voiceChangerModel = MMVCv15(slotInfo)
|
||||||
|
self.voiceChanger = VoiceChanger(self.params)
|
||||||
|
self.voiceChanger.setModel(self.voiceChangerModel)
|
||||||
else:
|
else:
|
||||||
print(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")
|
print(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")
|
||||||
del self.voiceChangerModel
|
del self.voiceChangerModel
|
||||||
|
Loading…
x
Reference in New Issue
Block a user