diff --git a/client/demo/src/components/demo/components/301-2-5_ModelSelectRow copy.tsx b/client/demo/src/components/demo/components/301-2-5_ModelSelectRow.tsx similarity index 100% rename from client/demo/src/components/demo/components/301-2-5_ModelSelectRow copy.tsx rename to client/demo/src/components/demo/components/301-2-5_ModelSelectRow.tsx diff --git a/client/demo/src/components/demo/components/301_ModelUploaderRow.tsx b/client/demo/src/components/demo/components/301_ModelUploaderRow.tsx index c2aec6d8..b4578550 100644 --- a/client/demo/src/components/demo/components/301_ModelUploaderRow.tsx +++ b/client/demo/src/components/demo/components/301_ModelUploaderRow.tsx @@ -1,7 +1,7 @@ import React, { useMemo, useEffect } from "react" import { useGuiState } from "../001_GuiStateProvider" import { ConfigSelectRow } from "./301-1_ConfigSelectRow" -import { ModelSelectRow } from "./301-2-5_ModelSelectRow copy" +import { ModelSelectRow } from "./301-2-5_ModelSelectRow" import { ONNXSelectRow } from "./301-2_ONNXSelectRow" import { PyTorchSelectRow } from "./301-3_PyTorchSelectRow" import { CorrespondenceSelectRow } from "./301-4_CorrespondenceSelectRow" diff --git a/server/voice_changer/RVC/ModelWrapper.py b/server/voice_changer/RVC/ModelWrapper.py index eef7c240..5f2650f2 100644 --- a/server/voice_changer/RVC/ModelWrapper.py +++ b/server/voice_changer/RVC/ModelWrapper.py @@ -27,6 +27,7 @@ class ModelWrapper: metadata = json.loads(modelmeta.custom_metadata_map["metadata"]) self.samplingRate = metadata["samplingRate"] self.f0 = metadata["f0"] + self.embChannels = metadata["embChannels"] print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}") except: self.samplingRate = -1 @@ -40,6 +41,9 @@ class ModelWrapper: def getF0(self): return self.f0 + def getEmbChannels(self): + return self.embChannels + def set_providers(self, providers, provider_options=[{}]): self.onnx_session.set_providers(providers=providers, provider_options=provider_options) diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index 86ff9ad3..555d8a5e 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -52,6 +52,7 @@ class ModelSlot(): embChannels: int = 256 samplingRateOnnx: int = -1 f0Onnx: bool = True + embChannelsOnnx: int = 256 @dataclass @@ -169,9 +170,6 @@ class RVC: (2-2) rvc-webuiの、(256 or 768) x (ノーマルor pitchレス)判定 ⇒ 256, or 768 は17番目の要素で判定。, ノーマルor pitchレスはckp["f0"]で判定 ''' - - # print("config shape:1::::", cpt["config"], cpt["f0"]) - # print("config shape:2::::", (cpt).keys) config_len = len(cpt["config"]) if config_len == 18: self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_RVC @@ -217,11 +215,12 @@ class RVC: self.settings.modelSlots[slot].f0Onnx = self.next_onnx_session.getF0() if self.settings.modelSlots[slot].samplingRate == -1: # ONNXにsampling rateが入っていない self.settings.modelSlots[slot].samplingRate = self.settings.modelSamplingRate + self.settings.modelSlots[slot].embChannelsOnnx = self.next_onnx_session.getEmbChannels() # ONNXがある場合は、ONNXの設定を優先 self.settings.modelSlots[slot].samplingRate = self.settings.modelSlots[slot].samplingRateOnnx self.settings.modelSlots[slot].f0 = self.settings.modelSlots[slot].f0Onnx - + self.settings.modelSlots[slot].embChannels = self.settings.modelSlots[slot].embChannelsOnnx else: print("[Voice Changer] Skip Loading ONNX Model...") self.next_onnx_session = None @@ -357,6 +356,7 @@ class RVC: f0 = self.settings.modelSlots[self.currentSlot].f0 embChannels = self.settings.modelSlots[self.currentSlot].embChannels + print("embChannels::1:", embChannels) audio_out = vc.pipeline(self.hubert_model, self.onnx_session, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, f0=f0, embChannels=embChannels) result = audio_out * np.sqrt(vol) @@ -403,7 +403,6 @@ class RVC: f0_file = None f0 = self.settings.modelSlots[self.currentSlot].f0 - embChannels = self.settings.modelSlots[self.currentSlot].embChannels audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, f0=f0, embChannels=embChannels) diff --git a/server/voice_changer/RVC/export2onnx.py b/server/voice_changer/RVC/export2onnx.py index d40523f6..07a6a60a 100644 --- a/server/voice_changer/RVC/export2onnx.py +++ b/server/voice_changer/RVC/export2onnx.py @@ -6,6 +6,8 @@ from onnxsim import simplify import onnx from infer_pack.models import TextEncoder256, GeneratorNSF, PosteriorEncoder, ResidualCouplingBlock, Generator +from .models import TextEncoder +from .const import RVC_MODEL_TYPE_RVC, RVC_MODEL_TYPE_WEBUI class SynthesizerTrnMs256NSFsid_ONNX(nn.Module): @@ -182,6 +184,185 @@ class SynthesizerTrnMs256NSFsid_nono_ONNX(nn.Module): return o, x_mask, (z, z_p, m_p, logs_p) +class SynthesizerTrnMsNSFsid_webui_ONNX(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + emb_channels, + sr, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.emb_channels = emb_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + emb_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMsNSFsidNono_webui_ONNX(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + emb_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.emb_channels = emb_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + emb_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def forward(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + def export2onnx(input_model, output_model, output_model_simple, is_half, metadata): cpt = torch.load(input_model, map_location="cpu") @@ -190,10 +371,14 @@ def export2onnx(input_model, output_model, output_model_simple, is_half, metadat else: dev = torch.device("cpu") - if metadata["f0"] == True: + if metadata["f0"] == True and metadata["ModelType"] == RVC_MODEL_TYPE_RVC: net_g_onnx = SynthesizerTrnMs256NSFsid_ONNX(*cpt["config"], is_half=is_half) - elif metadata["f0"] == False: + elif metadata["f0"] == True and metadata["ModelType"] == RVC_MODEL_TYPE_WEBUI: + net_g_onnx = SynthesizerTrnMsNSFsid_webui_ONNX(**cpt["params"], is_half=is_half) + elif metadata["f0"] == False and metadata["ModelType"] == RVC_MODEL_TYPE_RVC: net_g_onnx = SynthesizerTrnMs256NSFsid_nono_ONNX(*cpt["config"]) + elif metadata["f0"] == False and metadata["ModelType"] == RVC_MODEL_TYPE_WEBUI: + net_g_onnx = SynthesizerTrnMsNSFsidNono_webui_ONNX(**cpt["params"]) net_g_onnx.eval().to(dev) net_g_onnx.load_state_dict(cpt["weight"], strict=False) @@ -201,9 +386,9 @@ def export2onnx(input_model, output_model, output_model_simple, is_half, metadat net_g_onnx = net_g_onnx.half() if is_half: - feats = torch.HalfTensor(1, 2192, 256).to(dev) + feats = torch.HalfTensor(1, 2192, metadata["embChannels"]).to(dev) else: - feats = torch.FloatTensor(1, 2192, 256).to(dev) + feats = torch.FloatTensor(1, 2192, metadata["embChannels"]).to(dev) p_len = torch.LongTensor([2192]).to(dev) sid = torch.LongTensor([0]).to(dev)