From 4fc57153e708e9f288fb6a7bb1e4171121d5e7ba Mon Sep 17 00:00:00 2001 From: wataru Date: Fri, 28 Apr 2023 08:46:34 +0900 Subject: [PATCH] WIP: refactoring --- server/.vscode/settings.json | 2 +- server/voice_changer/RVC/ModelWrapper.py | 44 +- server/voice_changer/RVC/export2onnx.py | 426 ++---------------- server/voice_changer/RVC/models.py | 16 +- .../onnx/SynthesizerTrnMs256NSFsid_ONNX.py | 95 ++++ .../SynthesizerTrnMs256NSFsid_nono_ONNX.py | 94 ++++ .../SynthesizerTrnMsNSFsidNono_webui_ONNX.py | 97 ++++ .../onnx/SynthesizerTrnMsNSFsid_webui_ONNX.py | 98 ++++ 8 files changed, 472 insertions(+), 400 deletions(-) create mode 100644 server/voice_changer/RVC/onnx/SynthesizerTrnMs256NSFsid_ONNX.py create mode 100644 server/voice_changer/RVC/onnx/SynthesizerTrnMs256NSFsid_nono_ONNX.py create mode 100644 server/voice_changer/RVC/onnx/SynthesizerTrnMsNSFsidNono_webui_ONNX.py create mode 100644 server/voice_changer/RVC/onnx/SynthesizerTrnMsNSFsid_webui_ONNX.py diff --git a/server/.vscode/settings.json b/server/.vscode/settings.json index 810d061e..7bc5a813 100644 --- a/server/.vscode/settings.json +++ b/server/.vscode/settings.json @@ -9,7 +9,7 @@ "editor.formatOnSave": true // ファイル保存時に自動フォーマット }, "flake8.args": [ - "--ignore=E501,E402,W503" + "--ignore=E501,E402,E722,W503" // "--max-line-length=150", // "--max-complexity=20" ] diff --git a/server/voice_changer/RVC/ModelWrapper.py b/server/voice_changer/RVC/ModelWrapper.py index 45d7ab3f..a659a05a 100644 --- a/server/voice_changer/RVC/ModelWrapper.py +++ b/server/voice_changer/RVC/ModelWrapper.py @@ -2,6 +2,7 @@ import onnxruntime import torch import numpy as np import json + # providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"] providers = ["CPUExecutionProvider"] @@ -13,8 +14,7 @@ class ModelWrapper: # ort_options = onnxruntime.SessionOptions() # ort_options.intra_op_num_threads = 8 self.onnx_session = onnxruntime.InferenceSession( - self.onnx_model, - providers=providers + self.onnx_model, providers=providers ) # input_info = s first_input_type = self.onnx_session.get_inputs()[0].type @@ -30,8 +30,12 @@ class ModelWrapper: self.embChannels = metadata["embChannels"] self.modelType = metadata["modelType"] self.deprecated = False - self.embedder = metadata["embedder"] if "embedder" in metadata else "hubert_base" - print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}, embedder:{self.embedder}") + self.embedder = ( + metadata["embedder"] if "embedder" in metadata else "hubert_base" + ) + print( + f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}, embedder:{self.embedder}" + ) except: self.samplingRate = 48000 self.f0 = True @@ -39,10 +43,18 @@ class ModelWrapper: self.modelType = 0 self.deprecated = True self.embedder = "hubert_base" - print(f"[Voice Changer] ############## !!!! CAUTION !!!! ####################") - print(f"[Voice Changer] This onnx's version is depricated. Please regenerate onnxfile. Fallback to default") - print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}") - print(f"[Voice Changer] ############## !!!! CAUTION !!!! ####################") + print( + "[Voice Changer] ############## !!!! CAUTION !!!! ####################" + ) + print( + "[Voice Changer] This onnx's version is depricated. Please regenerate onnxfile. Fallback to default" + ) + print( + f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}" + ) + print( + "[Voice Changer] ############## !!!! CAUTION !!!! 
####################" + ) def getSamplingRate(self): return self.samplingRate @@ -63,7 +75,9 @@ class ModelWrapper: return self.embedder def set_providers(self, providers, provider_options=[{}]): - self.onnx_session.set_providers(providers=providers, provider_options=provider_options) + self.onnx_session.set_providers( + providers=providers, provider_options=provider_options + ) def get_providers(self): return self.onnx_session.get_providers() @@ -76,7 +90,8 @@ class ModelWrapper: "feats": feats.cpu().numpy().astype(np.float16), "p_len": p_len.cpu().numpy().astype(np.int64), "sid": sid.cpu().numpy().astype(np.int64), - }) + }, + ) else: audio1 = self.onnx_session.run( ["audio"], @@ -84,7 +99,8 @@ class ModelWrapper: "feats": feats.cpu().numpy().astype(np.float32), "p_len": p_len.cpu().numpy().astype(np.int64), "sid": sid.cpu().numpy().astype(np.int64), - }) + }, + ) return torch.tensor(np.array(audio1)) def infer(self, feats, p_len, pitch, pitchf, sid): @@ -97,7 +113,8 @@ class ModelWrapper: "pitch": pitch.cpu().numpy().astype(np.int64), "pitchf": pitchf.cpu().numpy().astype(np.float32), "sid": sid.cpu().numpy().astype(np.int64), - }) + }, + ) else: audio1 = self.onnx_session.run( ["audio"], @@ -107,6 +124,7 @@ class ModelWrapper: "pitch": pitch.cpu().numpy().astype(np.int64), "pitchf": pitchf.cpu().numpy().astype(np.float32), "sid": sid.cpu().numpy().astype(np.int64), - }) + }, + ) return torch.tensor(np.array(audio1)) diff --git a/server/voice_changer/RVC/export2onnx.py b/server/voice_changer/RVC/export2onnx.py index 69a53ff7..1b3bb2e9 100644 --- a/server/voice_changer/RVC/export2onnx.py +++ b/server/voice_changer/RVC/export2onnx.py @@ -1,383 +1,37 @@ -from distutils.util import strtobool import json import torch -from torch import nn from onnxsim import simplify import onnx -from infer_pack.models import TextEncoder256, GeneratorNSF, PosteriorEncoder, ResidualCouplingBlock, Generator -from .models import TextEncoder +from voice_changer.RVC.onnx.SynthesizerTrnMs256NSFsid_ONNX import ( + SynthesizerTrnMs256NSFsid_ONNX, +) +from voice_changer.RVC.onnx.SynthesizerTrnMs256NSFsid_nono_ONNX import ( + SynthesizerTrnMs256NSFsid_nono_ONNX, +) +from voice_changer.RVC.onnx.SynthesizerTrnMsNSFsidNono_webui_ONNX import ( + SynthesizerTrnMsNSFsidNono_webui_ONNX, +) +from voice_changer.RVC.onnx.SynthesizerTrnMsNSFsid_webui_ONNX import ( + SynthesizerTrnMsNSFsid_webui_ONNX, +) from .const import RVC_MODEL_TYPE_RVC, RVC_MODEL_TYPE_WEBUI -class SynthesizerTrnMs256NSFsid_ONNX(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr, - **kwargs - ): - - super().__init__() - if (type(sr) == type("strr")): - sr = sr2sr[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - 
self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - ) - self.dec = GeneratorNSF( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, sr=sr, is_half=kwargs["is_half"] - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMs256NSFsid_nono_ONNX(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - sr=None, - **kwargs - ): - - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder256( - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, f0=False - ) - self.dec = Generator( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def forward(self, phone, phone_lengths, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) - return o, x_mask, (z, z_p, m_p, 
logs_p) - - -class SynthesizerTrnMsNSFsid_webui_ONNX(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - emb_channels, - sr, - **kwargs - ): - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.emb_channels = emb_channels - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder( - inter_channels, - hidden_channels, - filter_channels, - emb_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - ) - self.dec = GeneratorNSF( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - is_half=kwargs["is_half"], - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - -class SynthesizerTrnMsNSFsidNono_webui_ONNX(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - emb_channels, - sr=None, - **kwargs - ): - super().__init__() - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.emb_channels = emb_channels - # self.hop_length = hop_length# - 
self.spk_embed_dim = spk_embed_dim - self.enc_p = TextEncoder( - inter_channels, - hidden_channels, - filter_channels, - emb_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - f0=False, - ) - self.dec = Generator( - inter_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - ) - self.enc_q = PosteriorEncoder( - spec_channels, - inter_channels, - hidden_channels, - 5, - 1, - 16, - gin_channels=gin_channels, - ) - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - - def forward(self, phone, phone_lengths, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) - return o, x_mask, (z, z_p, m_p, logs_p) - - def export2onnx(input_model, output_model, output_model_simple, is_half, metadata): - cpt = torch.load(input_model, map_location="cpu") if is_half: dev = torch.device("cuda", index=0) else: dev = torch.device("cpu") - if metadata["f0"] == True and metadata["modelType"] == RVC_MODEL_TYPE_RVC: + if metadata["f0"] is True and metadata["modelType"] == RVC_MODEL_TYPE_RVC: net_g_onnx = SynthesizerTrnMs256NSFsid_ONNX(*cpt["config"], is_half=is_half) - elif metadata["f0"] == True and metadata["modelType"] == RVC_MODEL_TYPE_WEBUI: + elif metadata["f0"] is True and metadata["modelType"] == RVC_MODEL_TYPE_WEBUI: net_g_onnx = SynthesizerTrnMsNSFsid_webui_ONNX(**cpt["params"], is_half=is_half) - elif metadata["f0"] == False and metadata["modelType"] == RVC_MODEL_TYPE_RVC: + elif metadata["f0"] is False and metadata["modelType"] == RVC_MODEL_TYPE_RVC: net_g_onnx = SynthesizerTrnMs256NSFsid_nono_ONNX(*cpt["config"]) - elif metadata["f0"] == False and metadata["modelType"] == RVC_MODEL_TYPE_WEBUI: + elif metadata["f0"] is False and metadata["modelType"] == RVC_MODEL_TYPE_WEBUI: net_g_onnx = SynthesizerTrnMsNSFsidNono_webui_ONNX(**cpt["params"]) net_g_onnx.eval().to(dev) @@ -392,31 +46,45 @@ def export2onnx(input_model, output_model, output_model_simple, is_half, metadat p_len = torch.LongTensor([2192]).to(dev) sid = torch.LongTensor([0]).to(dev) - if metadata["f0"] == True: + if metadata["f0"] is True: pitch = torch.zeros(1, 2192, dtype=torch.int64).to(dev) pitchf = torch.FloatTensor(1, 2192).to(dev) input_names = ["feats", "p_len", "pitch", "pitchf", "sid"] - inputs = (feats, p_len, pitch, pitchf, sid,) + inputs = ( + feats, + p_len, + pitch, + pitchf, + sid, + ) else: input_names = ["feats", "p_len", "sid"] - inputs = (feats, p_len, sid,) + inputs = ( + feats, + p_len, + sid, + ) - output_names = ["audio", ] + output_names = [ + "audio", + ] - torch.onnx.export(net_g_onnx, - inputs, - output_model, - dynamic_axes={ - "feats": [1], - "pitch": [1], - "pitchf": [1], - }, - do_constant_folding=False, - opset_version=17, - verbose=False, - input_names=input_names, - output_names=output_names) + torch.onnx.export( + net_g_onnx, + inputs, + output_model, + dynamic_axes={ + "feats": [1], + "pitch": [1], + "pitchf": [1], + }, + do_constant_folding=False, + opset_version=17, + verbose=False, + input_names=input_names, + output_names=output_names, + 
) model_onnx2 = onnx.load(output_model) model_simp, check = simplify(model_onnx2) diff --git a/server/voice_changer/RVC/models.py b/server/voice_changer/RVC/models.py index 35d8538b..4bd04651 100644 --- a/server/voice_changer/RVC/models.py +++ b/server/voice_changer/RVC/models.py @@ -1,10 +1,14 @@ import math import torch from torch import nn -import numpy as np -from infer_pack.models import sr2sr, GeneratorNSF, PosteriorEncoder, ResidualCouplingBlock, Generator -from infer_pack import commons, attentions +from infer_pack.models import ( # type:ignore + GeneratorNSF, + PosteriorEncoder, + ResidualCouplingBlock, + Generator, +) +from infer_pack import commons, attentions # type:ignore class TextEncoder(nn.Module): @@ -31,7 +35,7 @@ class TextEncoder(nn.Module): self.p_dropout = p_dropout self.emb_phone = nn.Linear(emb_channels, hidden_channels) self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: + if f0 is True: self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 self.encoder = attentions.Encoder( hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout @@ -39,7 +43,7 @@ class TextEncoder(nn.Module): self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) def forward(self, phone, pitch, lengths): - if pitch == None: + if pitch is None: x = self.emb_phone(phone) else: x = self.emb_phone(phone) + self.emb_pitch(pitch) @@ -81,8 +85,6 @@ class SynthesizerTrnMsNSFsid(nn.Module): **kwargs ): super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] self.spec_channels = spec_channels self.inter_channels = inter_channels self.hidden_channels = hidden_channels diff --git a/server/voice_changer/RVC/onnx/SynthesizerTrnMs256NSFsid_ONNX.py b/server/voice_changer/RVC/onnx/SynthesizerTrnMs256NSFsid_ONNX.py new file mode 100644 index 00000000..4a08afae --- /dev/null +++ b/server/voice_changer/RVC/onnx/SynthesizerTrnMs256NSFsid_ONNX.py @@ -0,0 +1,95 @@ +from torch import nn +from infer_pack.models import ( # type:ignore + TextEncoder256, + GeneratorNSF, + PosteriorEncoder, + ResidualCouplingBlock, +) +import torch + + +class SynthesizerTrnMs256NSFsid_ONNX(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + 
gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnx/SynthesizerTrnMs256NSFsid_nono_ONNX.py b/server/voice_changer/RVC/onnx/SynthesizerTrnMs256NSFsid_nono_ONNX.py new file mode 100644 index 00000000..87445d70 --- /dev/null +++ b/server/voice_changer/RVC/onnx/SynthesizerTrnMs256NSFsid_nono_ONNX.py @@ -0,0 +1,94 @@ +from torch import nn +from infer_pack.models import ( # type:ignore + TextEncoder256, + PosteriorEncoder, + ResidualCouplingBlock, + Generator, +) +import torch + + +class SynthesizerTrnMs256NSFsid_nono_ONNX(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def forward(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, 
:max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnx/SynthesizerTrnMsNSFsidNono_webui_ONNX.py b/server/voice_changer/RVC/onnx/SynthesizerTrnMsNSFsidNono_webui_ONNX.py new file mode 100644 index 00000000..ac654a93 --- /dev/null +++ b/server/voice_changer/RVC/onnx/SynthesizerTrnMsNSFsidNono_webui_ONNX.py @@ -0,0 +1,97 @@ +from torch import nn +from infer_pack.models import ( # type:ignore + PosteriorEncoder, + ResidualCouplingBlock, + Generator, +) +from voice_changer.RVC.models import TextEncoder +import torch + + +class SynthesizerTrnMsNSFsidNono_webui_ONNX(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + emb_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.emb_channels = emb_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + emb_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def forward(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnx/SynthesizerTrnMsNSFsid_webui_ONNX.py b/server/voice_changer/RVC/onnx/SynthesizerTrnMsNSFsid_webui_ONNX.py new file mode 100644 index 00000000..6f280297 --- /dev/null +++ b/server/voice_changer/RVC/onnx/SynthesizerTrnMsNSFsid_webui_ONNX.py @@ -0,0 +1,98 @@ +from torch import nn +from infer_pack.models import ( # type:ignore + GeneratorNSF, + PosteriorEncoder, + ResidualCouplingBlock, +) +from voice_changer.RVC.models import TextEncoder +import torch + + +class SynthesizerTrnMsNSFsid_webui_ONNX(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + 
filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + emb_channels, + sr, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.emb_channels = emb_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + emb_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p)
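
Reviewer note (not part of the patch): a minimal, hypothetical driver sketch showing how the refactored export2onnx entry point might be called after this change. File paths and metadata values are placeholders; the only keys export2onnx itself reads in this patch are "modelType" and "f0", and the remaining keys mirror what ModelWrapper.py expects to read back from the exported ONNX metadata.

from voice_changer.RVC.export2onnx import export2onnx
from voice_changer.RVC.const import RVC_MODEL_TYPE_RVC

# "modelType" and "f0" select which *_ONNX synthesizer class is exported;
# samplingRate / embChannels / embedder are assumed placeholder values that
# ModelWrapper.py later reads from the ONNX metadata.
metadata = {
    "modelType": RVC_MODEL_TYPE_RVC,
    "f0": True,            # True -> the exported graph takes pitch/pitchf inputs
    "samplingRate": 40000,
    "embChannels": 256,
    "embedder": "hubert_base",
}

export2onnx(
    input_model="model.pth",                  # RVC checkpoint (placeholder path)
    output_model="model.onnx",                # raw torch.onnx.export output
    output_model_simple="model.simple.onnx",  # onnx-simplifier output
    is_half=False,                            # half precision assumes CUDA device 0
    metadata=metadata,
)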