From 2fa33aad8db60f7039d5f91987ed3845ec46e7ae Mon Sep 17 00:00:00 2001
From: wataru
Date: Sun, 23 Apr 2023 06:19:48 +0900
Subject: [PATCH] WIP: support pitch-less and 768

---
 server/voice_changer/RVC/RVC.py               |  60 ++++++-
 server/voice_changer/RVC/const.py             |   4 +
 .../RVC/custom_vc_infer_pipeline.py           |  40 +++--
 server/voice_changer/RVC/models.py            | 170 ++++++++++++++++++
 4 files changed, 251 insertions(+), 23 deletions(-)
 create mode 100644 server/voice_changer/RVC/const.py
 create mode 100644 server/voice_changer/RVC/models.py

diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index a314017c..dadcb033 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -31,6 +31,8 @@ import pyworld as pw
 
 from voice_changer.RVC.custom_vc_infer_pipeline import VC
 from infer_pack.models import SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono
+from .models import SynthesizerTrnMsNSFsid as SynthesizerTrnMs768NSFsid
+from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_NORMAL_768, RVC_MODEL_TYPE_UNKNOWN
 from fairseq import checkpoint_utils
 
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
@@ -42,6 +44,7 @@ class ModelSlot():
     featureFile: str = ""
     indexFile: str = ""
     defaultTrans: int = ""
+    modelType: int = RVC_MODEL_TYPE_UNKNOWN
 
 
 @dataclass
@@ -116,7 +119,8 @@
             onnxModelFile=props["files"]["onnxModelFilename"],
             featureFile=props["files"]["featureFilename"],
             indexFile=props["files"]["indexFilename"],
-            defaultTrans=params["trans"]
+            defaultTrans=params["trans"],
+            modelType=RVC_MODEL_TYPE_UNKNOWN
         )
         print("[Voice Changer] RVC loading... slot:", self.tmp_slot)
 
@@ -150,9 +154,50 @@
         # PyTorchモデル生成
         if pyTorchModelFile != None and pyTorchModelFile != "":
             cpt = torch.load(pyTorchModelFile, map_location="cpu")
+            '''
+            Normal vs. pitch-less detection: decided from the shape of the upsample_rates entry in the config.
+            - Normal
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
+            - Pitch-less
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4], 109, 256, 40000]
+
+            Judged by the number of entries in element 12 (upsample_rates): 4 = pitch-less, 5 = normal.
+
+
+            256 vs. 768 detection: decided from the overall shape of the config.
+            - Normal (256)
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
+            - Normal (768)
+            [1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 768, 48000]
+            Judged by the total length of the config (18 entries vs. 19).
+            '''
+
+            config_len = len(cpt["config"])
+            upsamplingRateDims = len(cpt["config"][12])
+            if config_len == 18 and upsamplingRateDims == 4:
+                print("[Voice Changer] RVC Model Type: Pitch-Less")
+                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_PITCH_LESS
+            elif config_len == 18 and upsamplingRateDims == 5:
+                print("[Voice Changer] RVC Model Type: Normal")
+                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL
+            elif config_len == 19:
+                print("[Voice Changer] RVC Model Type: Normal_768")
+                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_NORMAL_768
+            else:
+                print("[Voice Changer] RVC Model Type: UNKNOWN")
+                self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_UNKNOWN
+
             self.settings.modelSamplingRate = cpt["config"][-1]
-            # net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
-            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+
+            if self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_NORMAL:
+                net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
+            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_PITCH_LESS:
+                net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+            elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_NORMAL_768:
+                net_g = SynthesizerTrnMs768NSFsid(**cpt["params"], is_half=self.is_half)
+            else:
+                print("unknown")
+
             net_g.eval()
             net_g.load_state_dict(cpt["weight"], strict=False)
             if self.is_half:
@@ -340,12 +385,9 @@
         if_f0 = 1
         f0_file = None
 
-        if self.settings.silenceFront == 0:
-            audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
-                                    file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=0)
-        else:
-            audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
-                                    file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate)
+        modelType = self.settings.modelSlots[self.currentSlot].modelType
+        audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
+                                file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, modelType=modelType)
 
         result = audio_out * np.sqrt(vol)
 
diff --git a/server/voice_changer/RVC/const.py b/server/voice_changer/RVC/const.py
new file mode 100644
index 00000000..aacb42df
--- /dev/null
+++ b/server/voice_changer/RVC/const.py
@@ -0,0 +1,4 @@
+RVC_MODEL_TYPE_NORMAL = 0
+RVC_MODEL_TYPE_PITCH_LESS = 1
+RVC_MODEL_TYPE_NORMAL_768 = 2
+RVC_MODEL_TYPE_UNKNOWN = 99
diff --git a/server/voice_changer/RVC/custom_vc_infer_pipeline.py b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
index 53507c9b..29419c17 100644
--- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py
+++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
@@ -10,6 +10,7 @@ import pyworld
 import os
 import traceback
 import faiss
+from .const import RVC_MODEL_TYPE_NORMAL, RVC_MODEL_TYPE_PITCH_LESS, RVC_MODEL_TYPE_NORMAL_768
 
 
 class VC(object):
@@ -82,7 +83,7 @@
         f0_coarse = np.rint(f0_mel).astype(np.int)
         return f0_coarse, f0bak  # 1-0
 
-    def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate):  # ,file_index,file_big_npy
+    def vc(self, model, net_g, sid, audio0, pitch, pitchf, times, index, big_npy, index_rate, modelType):  # ,file_index,file_big_npy
         feats = torch.from_numpy(audio0)
         if (self.is_half == True):
             feats = feats.half()
@@ -93,16 +94,25 @@
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
+        if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS:
+            inputs = {
+                "source": feats.to(self.device),
+                "padding_mask": padding_mask,
+                "output_layer": 9,  # layer 9
+            }
+        else:
+            inputs = {
+                "source": feats.to(self.device),
+                "padding_mask": padding_mask,
+            }
 
-        inputs = {
-            "source": feats.to(self.device),
-            "padding_mask": padding_mask,
-            "output_layer": 9,  # layer 9
-        }
         t0 = ttime()
         with torch.no_grad():
             logits = model.extract_features(**inputs)
-            feats = model.final_proj(logits[0])
+            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_PITCH_LESS:
+                feats = model.final_proj(logits[0])
+            else:
+                feats = logits[0]
         if (isinstance(index, type(None)) == False and isinstance(big_npy, type(None)) == False and index_rate != 0):
             npy = feats[0].cpu().numpy()
@@ -126,8 +136,10 @@
         p_len = torch.tensor([p_len], device=self.device).long()
 
         with torch.no_grad():
-            # audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
-            audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+            if modelType == RVC_MODEL_TYPE_NORMAL or modelType == RVC_MODEL_TYPE_NORMAL_768:
+                audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
+            else:
+                audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
 
         del feats, p_len, padding_mask
         torch.cuda.empty_cache()
@@ -136,7 +148,7 @@
         times[2] += (t2 - t1)
         return audio1
 
-    def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0):
+    def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None, silence_front=0, modelType: int = RVC_MODEL_TYPE_NORMAL):
         if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0):
             try:
                 index = faiss.read_index(file_index)
@@ -166,11 +178,11 @@
             t2 = ttime()
             times[1] += (t2 - t1)
             if self.t_pad_tgt == 0:
-                audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch, pitchf[:,
-                                 t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate))
+                audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch,
+                                 pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, modelType))
             else:
-                audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch, pitchf[:,
-                                 t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate)[self.t_pad_tgt:-self.t_pad_tgt])
+                audio_opt.append(self.vc(model, net_g, sid, audio_pad[t:], pitch[:, t // self.window:]if t is not None else pitch,
+                                 pitchf[:, t // self.window:]if t is not None else pitchf, times, index, big_npy, index_rate, modelType)[self.t_pad_tgt:-self.t_pad_tgt])
 
         audio_opt = np.concatenate(audio_opt)
         del pitch, pitchf, sid
diff --git a/server/voice_changer/RVC/models.py b/server/voice_changer/RVC/models.py
new file mode 100644
index 00000000..70b3ad18
--- /dev/null
+++ b/server/voice_changer/RVC/models.py
@@ -0,0 +1,170 @@
+import math
+import torch
+from torch import nn
+import numpy as np
+
+from infer_pack.models import sr2sr, GeneratorNSF, PosteriorEncoder, ResidualCouplingBlock
+from infer_pack import commons, attentions
+
+
+class TextEncoder(nn.Module):
+    def __init__(
+        self,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        emb_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        f0=True,
+    ):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.emb_channels = emb_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.emb_phone = nn.Linear(emb_channels, hidden_channels)
+        self.lrelu = nn.LeakyReLU(0.1, inplace=True)
+        if f0 == True:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
+        self.encoder = attentions.Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+
+    def forward(self, phone, pitch, lengths):
+        if pitch == None:
+            x = self.emb_phone(phone)
+        else:
+            x = self.emb_phone(phone) + self.emb_pitch(pitch)
+        x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
+        x = self.lrelu(x)
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.encoder(x * x_mask, x_mask)
+        stats = self.proj(x) * x_mask
+
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return m, logs, x_mask
+
+
+class SynthesizerTrnMsNSFsid(nn.Module):
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        spk_embed_dim,
+        gin_channels,
+        emb_channels,
+        sr,
+        **kwargs
+    ):
+        super().__init__()
+        if type(sr) == type("strr"):
+            sr = sr2sr[sr]
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.emb_channels = emb_channels
+        # self.hop_length = hop_length#
+        self.spk_embed_dim = spk_embed_dim
+        self.enc_p = TextEncoder(
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            emb_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+        )
+        self.dec = GeneratorNSF(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+            sr=sr,
+            is_half=kwargs["is_half"],
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
+        print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
+
+    def remove_weight_norm(self):
+        self.dec.remove_weight_norm()
+        self.flow.remove_weight_norm()
+        self.enc_q.remove_weight_norm()
+
+    def forward(
+        self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
+    ):  # ds is the speaker id, shape [bs, 1]
+        # print(1, pitch.shape)  # [bs, t]
+        g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast later
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        z_p = self.flow(z, y_mask, g=g)
+        z_slice, ids_slice = commons.rand_slice_segments(
+            z, y_lengths, self.segment_size
+        )
+        # print(-1, pitchf.shape, ids_slice, self.segment_size, self.hop_length, self.segment_size // self.hop_length)
+        pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
+        # print(-2, pitchf.shape, z_slice.shape)
+        o = self.dec(z_slice, pitchf, g=g)
+        return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
+
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
+        return o, x_mask, (z, z_p, m_p, logs_p)
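
Note: as a quick reference, the checkpoint classification added to RVC.py above boils down to two checks on cpt["config"]: its total length (18 entries for the 256-dim models, 19 for the 768-dim variant, where the extra entry is emb_channels before the sampling rate) and the number of upsample-rate entries at index 12 (5 for normal, 4 for pitch-less). The sketch below only restates that rule as a standalone helper; detect_model_type is a hypothetical name used for illustration and is not part of this patch.

    import torch

    RVC_MODEL_TYPE_NORMAL = 0
    RVC_MODEL_TYPE_PITCH_LESS = 1
    RVC_MODEL_TYPE_NORMAL_768 = 2
    RVC_MODEL_TYPE_UNKNOWN = 99


    def detect_model_type(checkpoint_path: str) -> int:
        # Hypothetical helper: restates the branching used in RVC.loadModel above.
        cpt = torch.load(checkpoint_path, map_location="cpu")
        config = cpt["config"]
        if len(config) == 19:
            # one extra entry (emb_channels, e.g. 768) before the sampling rate
            return RVC_MODEL_TYPE_NORMAL_768
        if len(config) == 18:
            # element at index 12 is upsample_rates: 5 entries = normal, 4 = pitch-less
            return RVC_MODEL_TYPE_PITCH_LESS if len(config[12]) == 4 else RVC_MODEL_TYPE_NORMAL
        return RVC_MODEL_TYPE_UNKNOWN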