From 0936b0228782517889c679a8aa83a371bdd04c20 Mon Sep 17 00:00:00 2001 From: nadare <1na2da0re3@gmail.com> Date: Sun, 18 Jun 2023 22:23:50 +0900 Subject: [PATCH 1/4] =?UTF-8?q?v3=E3=83=A2=E3=83=87=E3=83=AB=E3=81=AE?= =?UTF-8?q?=E3=83=87=E3=83=A2=E3=83=B3=E3=82=B9=E3=83=88=E3=83=AC=E3=83=BC?= =?UTF-8?q?=E3=82=B7=E3=83=A7=E3=83=B3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/const.py | 1 + .../voice_changer/RVC/ModelSlotGenerator.py | 28 +- .../RVC/inferencer/InferencerManager.py | 4 +- .../RVC/inferencer/RVCInferencerv3.py | 40 ++ .../RVC/inferencer/model_v3/attentions.py | 343 ++++++++++ .../RVC/inferencer/model_v3/commons.py | 165 +++++ .../RVC/inferencer/model_v3/config.py | 71 ++ .../RVC/inferencer/model_v3/models.py | 522 +++++++++++++++ .../RVC/inferencer/model_v3/modules.py | 626 ++++++++++++++++++ .../RVC/inferencer/model_v3/transforms.py | 207 ++++++ .../RVC/inferencer/model_v3/utils.py | 286 ++++++++ 11 files changed, 2291 insertions(+), 2 deletions(-) create mode 100644 server/voice_changer/RVC/inferencer/RVCInferencerv3.py create mode 100644 server/voice_changer/RVC/inferencer/model_v3/attentions.py create mode 100644 server/voice_changer/RVC/inferencer/model_v3/commons.py create mode 100644 server/voice_changer/RVC/inferencer/model_v3/config.py create mode 100644 server/voice_changer/RVC/inferencer/model_v3/models.py create mode 100644 server/voice_changer/RVC/inferencer/model_v3/modules.py create mode 100644 server/voice_changer/RVC/inferencer/model_v3/transforms.py create mode 100644 server/voice_changer/RVC/inferencer/model_v3/utils.py diff --git a/server/const.py b/server/const.py index 35cd747b..64127f92 100644 --- a/server/const.py +++ b/server/const.py @@ -65,6 +65,7 @@ class EnumInferenceTypes(Enum): pyTorchRVCNono = "pyTorchRVCNono" pyTorchRVCv2 = "pyTorchRVCv2" pyTorchRVCv2Nono = "pyTorchRVCv2Nono" + pyTorchRVCv3 = "pyTorchRVCv3" pyTorchWebUI = "pyTorchWebUI" pyTorchWebUINono = "pyTorchWebUINono" onnxRVC = "onnxRVC" diff --git a/server/voice_changer/RVC/ModelSlotGenerator.py b/server/voice_changer/RVC/ModelSlotGenerator.py index aa86d6b1..b897cf2c 100644 --- a/server/voice_changer/RVC/ModelSlotGenerator.py +++ b/server/voice_changer/RVC/ModelSlotGenerator.py @@ -11,7 +11,33 @@ def _setInfoByPytorch(slot: ModelSlot): cpt = torch.load(slot.modelFile, map_location="cpu") config_len = len(cpt["config"]) - if config_len == 18: + if cpt["version"] == "v3": + slot.f0 = True if cpt["f0"] == 1 else False + slot.modelType = EnumInferenceTypes.pyTorchRVCv3.value + slot.embChannels = cpt["config"][17] + slot.embOutputLayer = ( + cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9 + ) + if slot.embChannels == 256: + slot.useFinalProj = True + else: + slot.useFinalProj = False + + slot.embedder = cpt["embedder_name"] + if slot.embedder.endswith("768"): + slot.embedder = slot.embedder[:-3] + + if slot.embedder == EnumEmbedderTypes.hubert.value: + slot.embedder = EnumEmbedderTypes.hubert.value + elif slot.embedder == EnumEmbedderTypes.contentvec.value: + slot.embedder = EnumEmbedderTypes.contentvec.value + elif slot.embedder == EnumEmbedderTypes.hubert_jp.value: + slot.embedder = EnumEmbedderTypes.hubert_jp.value + print("nadare v3 loaded") + else: + raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder") + + elif config_len == 18: # Original RVC slot.f0 = True if cpt["f0"] == 1 else False version = cpt.get("version", "v1") diff --git a/server/voice_changer/RVC/inferencer/InferencerManager.py 
b/server/voice_changer/RVC/inferencer/InferencerManager.py index cd5f0a6e..0bc63bd9 100644 --- a/server/voice_changer/RVC/inferencer/InferencerManager.py +++ b/server/voice_changer/RVC/inferencer/InferencerManager.py @@ -8,7 +8,7 @@ from voice_changer.RVC.inferencer.RVCInferencerv2 import RVCInferencerv2 from voice_changer.RVC.inferencer.RVCInferencerv2Nono import RVCInferencerv2Nono from voice_changer.RVC.inferencer.WebUIInferencer import WebUIInferencer from voice_changer.RVC.inferencer.WebUIInferencerNono import WebUIInferencerNono - +from voice_changer.RVC.inferencer.RVCInferencerv3 import RVCInferencerv3 class InferencerManager: currentInferencer: Inferencer | None = None @@ -37,6 +37,8 @@ class InferencerManager: return RVCInferencerNono().loadModel(file, gpu) elif inferencerType == EnumInferenceTypes.pyTorchRVCv2 or inferencerType == EnumInferenceTypes.pyTorchRVCv2.value: return RVCInferencerv2().loadModel(file, gpu) + elif inferencerType == EnumInferenceTypes.pyTorchRVCv3 or inferencerType == EnumInferenceTypes.pyTorchRVCv3.value: + return RVCInferencerv3().loadModel(file, gpu) elif inferencerType == EnumInferenceTypes.pyTorchRVCv2Nono or inferencerType == EnumInferenceTypes.pyTorchRVCv2Nono.value: return RVCInferencerv2Nono().loadModel(file, gpu) elif inferencerType == EnumInferenceTypes.pyTorchWebUI or inferencerType == EnumInferenceTypes.pyTorchWebUI.value: diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv3.py b/server/voice_changer/RVC/inferencer/RVCInferencerv3.py new file mode 100644 index 00000000..ed6eb63f --- /dev/null +++ b/server/voice_changer/RVC/inferencer/RVCInferencerv3.py @@ -0,0 +1,40 @@ +import torch +from torch import device + +from const import EnumInferenceTypes +from voice_changer.RVC.inferencer.Inferencer import Inferencer +from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager +from .model_v3.models import SynthesizerTrnMs256NSFSid + + +class RVCInferencerv3(Inferencer): + def loadModel(self, file: str, gpu: device): + print("nadare v3 load start") + super().setProps(EnumInferenceTypes.pyTorchRVCv3, file, True, gpu) + + dev = DeviceManager.get_instance().getDevice(gpu) + isHalf = False # DeviceManager.get_instance().halfPrecisionAvailable(gpu) + + cpt = torch.load(file, map_location="cpu") + model = SynthesizerTrnMs256NSFSid(**cpt["params"]) + + model.eval() + model.load_state_dict(cpt["weight"], strict=False) + + model = model.to(dev) + if isHalf: + model = model.half() + + self.model = model + print("load model comprete") + return self + + def infer( + self, + feats: torch.Tensor, + pitch_length: torch.Tensor, + pitch: torch.Tensor, + pitchf: torch.Tensor, + sid: torch.Tensor, + ) -> torch.Tensor: + return self.model.infer(feats, pitch_length, pitch, pitchf, sid) diff --git a/server/voice_changer/RVC/inferencer/model_v3/attentions.py b/server/voice_changer/RVC/inferencer/model_v3/attentions.py new file mode 100644 index 00000000..0a9ecf37 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/model_v3/attentions.py @@ -0,0 +1,343 @@ +import math + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, weight_norm + +from . 
import commons +from .modules import LayerNorm, LoRALinear1d + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + gin_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=25, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + gin_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + gin_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, g): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, g, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask, g) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.attn_layers: + l.remove_weight_norm() + for l in self.ffn_layers: + l.remove_weight_norm() + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + gin_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=False, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = LoRALinear1d(channels, channels, gin_channels, 2) + self.conv_k = LoRALinear1d(channels, channels, gin_channels, 2) + self.conv_v = LoRALinear1d(channels, channels, gin_channels, 2) + self.conv_qkw = weight_norm(nn.Conv1d(channels, channels, 5, 1, groups=channels, padding=2)) + self.conv_vw = weight_norm(nn.Conv1d(channels, channels, 5, 1, groups=channels, padding=2)) + self.conv_o = LoRALinear1d(channels, out_channels, gin_channels, 2) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + def forward(self, x, c, g, attn_mask=None): + q = self.conv_qkw(self.conv_q(x, g)) + k = self.conv_qkw(self.conv_k(c, g)) + v = self.conv_vw(self.conv_v(c, g)) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x, g) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = 
(*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. 
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + def remove_weight_norm(self): + self.conv_q.remove_weight_norm() + self.conv_k.remove_weight_norm() + self.conv_v.remove_weight_norm() + self.conv_o.remove_weight_norm() + remove_weight_norm(self.conv_qkw) + remove_weight_norm(self.conv_vw) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + gin_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + self.conv_1 = LoRALinear1d(in_channels, filter_channels, gin_channels, 2) + self.conv_2 = LoRALinear1d(filter_channels, out_channels, gin_channels, 2) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask, g): + x = self.conv_1(x * x_mask, g) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(x * x_mask, g) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + + def remove_weight_norm(self): + self.conv_1.remove_weight_norm() + self.conv_2.remove_weight_norm() diff --git a/server/voice_changer/RVC/inferencer/model_v3/commons.py b/server/voice_changer/RVC/inferencer/model_v3/commons.py new file mode 100644 index 00000000..79731d2b --- /dev/null +++ b/server/voice_changer/RVC/inferencer/model_v3/commons.py @@ -0,0 +1,165 @@ +import math + +import torch +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): 
+ """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + r = x[i, :, idx_str:idx_end] + ret[i, :, :r.size(1)] = r + return ret + + +def slice_segments2(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + r = x[i, idx_str:idx_end] + ret[i, :r.size(0)] = r + return ret + + +def rand_slice_segments(x, x_lengths, segment_size=4, ids_str=None): + b, d, t = x.size() + if ids_str is None: + ids_str = torch.zeros([b]).to(device=x.device, dtype=x_lengths.dtype) + ids_str_max = torch.maximum(torch.zeros_like(x_lengths).to(device=x_lengths.device ,dtype=x_lengths.dtype), x_lengths - segment_size + 1 - ids_str) + ids_str += (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def 
generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/server/voice_changer/RVC/inferencer/model_v3/config.py b/server/voice_changer/RVC/inferencer/model_v3/config.py new file mode 100644 index 00000000..6a0dc817 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/model_v3/config.py @@ -0,0 +1,71 @@ +from typing import * + +from pydantic import BaseModel + + +class TrainConfigTrain(BaseModel): + log_interval: int + seed: int + epochs: int + learning_rate: float + betas: List[float] + eps: float + batch_size: int + fp16_run: bool + lr_decay: float + segment_size: int + init_lr_ratio: int + warmup_epochs: int + c_mel: int + c_kl: float + + +class TrainConfigData(BaseModel): + max_wav_value: float + sampling_rate: int + filter_length: int + hop_length: int + win_length: int + n_mel_channels: int + mel_fmin: float + mel_fmax: Any + + +class TrainConfigModel(BaseModel): + inter_channels: int + hidden_channels: int + filter_channels: int + n_heads: int + n_layers: int + kernel_size: int + p_dropout: int + resblock: str + resblock_kernel_sizes: List[int] + resblock_dilation_sizes: List[List[int]] + upsample_rates: List[int] + upsample_initial_channel: int + upsample_kernel_sizes: List[int] + use_spectral_norm: bool + gin_channels: int + emb_channels: int + spk_embed_dim: int + + +class TrainConfig(BaseModel): + version: Literal["v1", "v2"] = "v2" + train: TrainConfigTrain + data: TrainConfigData + model: TrainConfigModel + + +class DatasetMetaItem(BaseModel): + gt_wav: str + co256: str + f0: Optional[str] + f0nsf: Optional[str] + speaker_id: int + + +class DatasetMetadata(BaseModel): + files: Dict[str, DatasetMetaItem] + # mute: DatasetMetaItem diff --git a/server/voice_changer/RVC/inferencer/model_v3/models.py b/server/voice_changer/RVC/inferencer/model_v3/models.py new file mode 100644 index 00000000..2f8cc064 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/model_v3/models.py @@ -0,0 +1,522 @@ +import math +import os +import sys + +import numpy as np +import torch +from torch import nn +from torch.nn import Conv1d, Conv2d, ConvTranspose1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from . 
import attentions, commons, modules +from .commons import get_padding, init_weights +from .modules import (CausalConvTranspose1d, ConvNext2d, DilatedCausalConv1d, + LoRALinear1d, ResBlock1, WaveConv1D) + +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + + +class TextEncoder(nn.Module): + def __init__( + self, + out_channels: int, + hidden_channels: int, + filter_channels: int, + emb_channels: int, + gin_channels: int, + n_heads: int, + n_layers: int, + kernel_size: int, + p_dropout: int, + f0: bool = True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.emb_channels = emb_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(emb_channels, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.emb_g = nn.Conv1d(gin_channels, hidden_channels, 1) + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, gin_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + + def forward(self, phone, pitch, lengths, g): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask, g) + x = self.proj(x) + + return x, None, x_mask + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + 
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + gin_channels, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Conv1d(gin_channels, harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + sine_raw = torch.transpose(sine_wavs, 1, 2).to(device=x.device, dtype=x.dtype) + return sine_raw, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + harmonic_num=16, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.upsample_rates = upsample_rates + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, gin_channels=gin_channels, harmonic_num=harmonic_num, is_half=is_half + ) + self.gpre = Conv1d(gin_channels, initial_channel, 1) + self.conv_pre = ResBlock1(initial_channel, upsample_initial_channel, gin_channels, [7] * 5, [1] * 5, [1, 2, 4, 8, 1], 1, 2) + + self.ups = nn.ModuleList() + self.resblocks = 
nn.ModuleList() + c_cur = upsample_initial_channel + for i, u in enumerate(upsample_rates): + c_pre = c_cur + c_cur = c_pre // 2 + self.ups.append( + CausalConvTranspose1d( + c_pre, + c_pre, + kernel_rate=3, + stride=u, + groups=c_pre, + ) + ) + self.resblocks.append(ResBlock1(c_pre, c_cur, gin_channels, [11] * 5, [1] * 5, [1, 2, 4, 8, 1], 1, r=2)) + self.conv_post = DilatedCausalConv1d(c_cur, 1, 5, stride=1, groups=1, dilation=1, bias=False) + self.noise_convs = nn.ModuleList() + self.noise_pre = LoRALinear1d(1 + harmonic_num, c_pre, gin_channels, r=2+harmonic_num) + for i, u in enumerate(upsample_rates[::-1]): + c_pre = c_pre * 2 + c_cur = c_cur * 2 + if i + 1 < len(upsample_rates): + self.noise_convs.append(DilatedCausalConv1d(c_cur, c_pre, kernel_size=u*3, stride=u, groups=c_cur, dilation=1)) + else: + self.noise_convs.append(DilatedCausalConv1d(c_cur, initial_channel, kernel_size=u*3, stride=u, groups=math.gcd(c_cur, initial_channel), dilation=1)) + self.upp = np.prod(upsample_rates) + + def forward(self, x, x_mask, f0f, g): + har_source, noi_source, uv = self.m_source(f0f, self.upp) + har_source = self.noise_pre(har_source, g) + x_sources = [har_source] + for c in self.noise_convs: + har_source = c(har_source) + x_sources.append(har_source) + + x = x + x_sources[-1] + x = x + self.gpre(g) + x = self.conv_pre(x, x_mask, g) + for i, u in enumerate(self.upsample_rates): + x_mask = torch.repeat_interleave(x_mask, u, 2) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x = self.resblocks[i](x + x_sources[-i-2], x_mask, g) + + x = F.leaky_relu(x) + x = self.conv_post(x) + if x_mask is not None: + x *= x_mask + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + remove_weight_norm(self.noise_pre) + remove_weight_norm(self.conv_post) + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFSid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + emb_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.emb_channels = emb_channels + self.sr = sr + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + + self.emb_pitch = nn.Embedding(256, emb_channels) # pitch 256 + self.dec = GeneratorNSF( + emb_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + ) + + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print( + 
"gin_channels:", + gin_channels, + "self.spk_embed_dim:", + self.spk_embed_dim, + "emb_channels:", + emb_channels, + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + # m_p, _, x_mask = self.enc_p(phone, pitch, phone_lengths, g) + # z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + # z_p = self.flow(m_p * x_mask, x_mask, g=g) + + x = phone + self.emb_pitch(pitch) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to( + phone.dtype + ) + + m_p_slice, ids_slice = commons.rand_slice_segments( + x, phone_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + mask_slice = commons.slice_segments(x_mask, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(m_p_slice, mask_slice, pitchf, g) + return o, ids_slice, x_mask, g + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + x = phone + self.emb_pitch(pitch) + x = torch.transpose(x, 1, -1) + x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to( + phone.dtype + ) + o = self.dec((x * x_mask)[:, :, :max_len], x_mask, nsff0, g) + return o, x_mask, (None, None, None, None) + + +class DiscriminatorS(torch.nn.Module): + def __init__( + self, + hidden_channels: int, + filter_channels: int, + gin_channels: int, + n_heads: int, + n_layers: int, + kernel_size: int, + p_dropout: int, + ): + super(DiscriminatorS, self).__init__() + self.convs = WaveConv1D(2, hidden_channels, gin_channels, [10, 7, 7, 7, 5, 3, 3], [5, 4, 4, 4, 3, 2, 2], [1] * 7, hidden_channels // 2, False) + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, gin_channels, n_heads, n_layers//2, kernel_size, p_dropout + ) + self.cross = weight_norm(torch.nn.Conv1d(gin_channels, hidden_channels, 1, 1)) + self.conv_post = weight_norm(torch.nn.Conv1d(hidden_channels, 1, 3, 1, padding=get_padding(5, 1))) + + def forward(self, x, g): + x = self.convs(x) + x_mask = torch.ones([x.shape[0], 1, x.shape[2]], device=x.device, dtype=x.dtype) + x = self.encoder(x, x_mask, g) + fmap = [x] + x = x + x * self.cross(g) + y = self.conv_post(x) + return y, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, gin_channels, upsample_rates, final_dim=256, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + self.init_kernel_size = upsample_rates[-1] * 3 + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + N = len(upsample_rates) + self.init_conv = norm_f(Conv2d(1, final_dim // (2 ** (N - 1)), (self.init_kernel_size, 1), (upsample_rates[-1], 1))) + self.convs = nn.ModuleList() + for i, u in enumerate(upsample_rates[::-1][1:], start=1): + self.convs.append( + ConvNext2d( + final_dim // (2 ** (N - i)), + final_dim // (2 ** (N - i - 1)), + gin_channels, + (u*3, 1), + (u, 1), + 4, + r=2 + ) + ) + self.conv_post = weight_norm(Conv2d(final_dim, 1, (3, 1), (1, 1))) + + def forward(self, x, g): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + 
x = F.pad(x, (n_pad, 0), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + x = torch.flip(x, dims=[2]) + x = F.pad(x, [0, 0, 0, self.init_kernel_size - 1], mode="constant") + x = self.init_conv(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = torch.flip(x, dims=[2]) + + for i, l in enumerate(self.convs): + x = l(x, g) + if i >= 1: + fmap.append(x) + + x = F.pad(x, [0, 0, 2, 0], mode="constant") + x = self.conv_post(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, upsample_rates, gin_channels, periods=[2, 3, 5, 7, 11, 17], **kwargs): + super(MultiPeriodDiscriminator, self).__init__() + + # discs = [DiscriminatorS(hidden_channels, filter_channels, gin_channels, n_heads, n_layers, kernel_size, p_dropout)] + discs = [ + DiscriminatorP(i, gin_channels, upsample_rates, use_spectral_norm=False) for i in periods + ] + self.ups = np.prod(upsample_rates) + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat, g): + fmap_rs = [] + fmap_gs = [] + y_d_rs = [] + y_d_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y, g) + y_d_g, fmap_g = d(y_hat, g) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs diff --git a/server/voice_changer/RVC/inferencer/model_v3/modules.py b/server/voice_changer/RVC/inferencer/model_v3/modules.py new file mode 100644 index 00000000..fb0db86d --- /dev/null +++ b/server/voice_changer/RVC/inferencer/model_v3/modules.py @@ -0,0 +1,626 @@ +import math + +import torch +from torch import nn +from torch.nn import Conv1d, Conv2d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from . import commons, modules +from .commons import get_padding, init_weights +from .transforms import piecewise_rational_quadratic_transform + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + 
n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class DilatedCausalConv1d(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, groups=1, dilation=1, bias=True): + super(DilatedCausalConv1d, self).__init__() + self.kernel_size = kernel_size + self.dilation = dilation + self.stride = stride + self.conv = weight_norm(nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups, dilation=dilation, bias=bias)) + init_weights(self.conv) + + def forward(self, x): + x = torch.flip(x, [2]) + x = F.pad(x, [0, (self.kernel_size - 1) * self.dilation], mode="constant", value=0.) + size = x.shape[2] // self.stride + x = self.conv(x)[:, :, :size] + x = torch.flip(x, [2]) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv) + + +class CausalConvTranspose1d(nn.Module): + """ + padding = 0, dilation = 1のとき + + Lout = (Lin - 1) * stride + kernel_rate * stride + output_padding + Lout = Lin * stride + (kernel_rate - 1) * stride + output_padding + output_paddingいらないね + """ + def __init__(self, in_channels, out_channels, kernel_rate=3, stride=1, groups=1): + super(CausalConvTranspose1d, self).__init__() + kernel_size = kernel_rate * stride + self.trim_size = (kernel_rate - 1) * stride + self.conv = weight_norm(nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups)) + + def forward(self, x): + x = self.conv(x) + return x[:, :, :-self.trim_size] + + def remove_weight_norm(self): + remove_weight_norm(self.conv) + + +class LoRALinear1d(nn.Module): + def __init__(self, in_channels, out_channels, info_channels, r): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.info_channels = info_channels + self.r = r + self.main_fc = weight_norm(nn.Conv1d(in_channels, out_channels, 1)) + self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) + self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) + nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) + nn.init.constant_(self.adapter_out.weight.data, 1e-6) + init_weights(self.main_fc) + self.adapter_in = weight_norm(self.adapter_in) + self.adapter_out = weight_norm(self.adapter_out) + + def forward(self, x, g): + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + x = self.main_fc(x) + torch.einsum("brl,brc->bcl", torch.einsum("bcl,bcr->brl", x, a_in), a_out) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.main_fc) + remove_weight_norm(self.adapter_in) + 
remove_weight_norm(self.adapter_out) + + +class LoRALinear2d(nn.Module): + def __init__(self, in_channels, out_channels, info_channels, r): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.info_channels = info_channels + self.r = r + self.main_fc = weight_norm(nn.Conv2d(in_channels, out_channels, (1, 1), (1, 1))) + self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) + self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) + nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) + nn.init.constant_(self.adapter_out.weight.data, 1e-6) + self.adapter_in = weight_norm(self.adapter_in) + self.adapter_out = weight_norm(self.adapter_out) + + def forward(self, x, g): + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + x = self.main_fc(x) + torch.einsum("brhw,brc->bchw", torch.einsum("bchw,bcr->brhw", x, a_in), a_out) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.main_fc) + remove_weight_norm(self.adapter_in) + remove_weight_norm(self.adapter_out) + + +class WaveConv1D(torch.nn.Module): + def __init__(self, in_channels, out_channels, gin_channels, kernel_sizes, strides, dilations, extend_ratio, r, use_spectral_norm=False): + super(WaveConv1D, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + inner_channels = int(in_channels * extend_ratio) + self.convs = [] + # self.norms = [] + self.convs.append(LoRALinear1d(in_channels, inner_channels, gin_channels, r)) + for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations), start=1): + self.convs.append(norm_f(Conv1d(inner_channels, inner_channels, k, s, dilation=d, groups=inner_channels, padding=get_padding(k, d)))) + if i < len(kernel_sizes): + self.convs.append(norm_f(Conv1d(inner_channels, inner_channels, 1, 1))) + else: + self.convs.append(norm_f(Conv1d(inner_channels, out_channels, 1, 1))) + self.convs = nn.ModuleList(self.convs) + + def forward(self, x, g, x_mask=None): + for i, l in enumerate(self.convs): + if i % 2: + x_ = l(x) + else: + x_ = l(x, g) + x = F.leaky_relu(x_, modules.LRELU_SLOPE) + if x_mask is not None: + x *= x_mask + return x + + def remove_weight_norm(self): + for i, c in enumerate(self.convs): + if i % 2: + remove_weight_norm(c) + else: + c.remove_weight_norm() + + +class MBConv2d(torch.nn.Module): + """ + Causal MBConv2D + """ + def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): + super(MBConv2d, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + inner_channels = int(in_channels * extend_ratio) + self.kernel_size = kernel_size + self.pre_pointwise = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) + self.depthwise = norm_f(Conv2d(inner_channels, inner_channels, kernel_size, stride, groups=inner_channels)) + self.post_pointwise = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) + + def forward(self, x, g): + x = self.pre_pointwise(x, g) + x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") + x = self.depthwise(x) + x = self.post_pointwise(x, g) + return x + + +class ConvNext2d(torch.nn.Module): + """ + Causal ConvNext Block + stride = 1 only + """ + def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): + super(ConvNext2d, self).__init__() + norm_f = weight_norm if use_spectral_norm == False 
else spectral_norm + inner_channels = int(in_channels * extend_ratio) + self.kernel_size = kernel_size + self.dwconv = norm_f(Conv2d(in_channels, in_channels, kernel_size, stride, groups=in_channels)) + self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) + self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) + self.act = nn.GELU() + self.norm = LayerNorm(in_channels) + + def forward(self, x, g): + x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") + x = self.dwconv(x) + x = self.norm(x) + x = self.pwconv1(x, g) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.pwconv2(x, g) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.dwconv) + + +class SqueezeExcitation1D(torch.nn.Module): + def __init__(self, input_channels, squeeze_channels, gin_channels, use_spectral_norm=False): + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + super(SqueezeExcitation1D, self).__init__() + self.fc1 = LoRALinear1d(input_channels, squeeze_channels, gin_channels, 2) + self.fc2 = LoRALinear1d(squeeze_channels, input_channels, gin_channels, 2) + + def _scale(self, x, x_mask, g): + x_length = torch.sum(x_mask, dim=2, keepdim=True) + x_length = torch.maximum(x_length, torch.ones_like(x_length)) + scale = torch.sum(x * x_mask, dim=2, keepdim=True) / x_length + scale = self.fc1(scale, g) + scale = F.leaky_relu(scale, modules.LRELU_SLOPE) + scale = self.fc2(scale, g) + return torch.sigmoid(scale) + + def forward(self, x, x_mask, g): + scale = self._scale(x, x_mask, g) + return scale * x + + def remove_weight_norm(self): + self.fc1.remove_weight_norm() + self.fc2.remove_weight_norm() + + +class ResBlock1(torch.nn.Module): + def __init__(self, in_channels, out_channels, gin_channels, kernel_sizes, strides, dilations, extend_ratio, r): + super(ResBlock1, self).__init__() + norm_f = weight_norm + inner_channels = int(in_channels * extend_ratio) + self.dconvs = nn.ModuleList() + self.pconvs = nn.ModuleList() + # self.ses = nn.ModuleList() + self.norms = nn.ModuleList() + self.init_conv = LoRALinear1d(in_channels, inner_channels, gin_channels, r) + for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations)): + self.norms.append(LayerNorm(inner_channels)) + self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, k, stride=s, dilation=d, groups=inner_channels)) + if i < len(kernel_sizes) - 1: + self.pconvs.append(LoRALinear1d(inner_channels, inner_channels, gin_channels, r)) + self.out_conv = LoRALinear1d(inner_channels, out_channels, gin_channels, r) + init_weights(self.init_conv) + init_weights(self.out_conv) + + def forward(self, x, x_mask, g): + x *= x_mask + x = self.init_conv(x, g) + for i in range(len(self.dconvs)): + x *= x_mask + x = self.norms[i](x) + x_ = self.dconvs[i](x) + x_ = F.leaky_relu(x_, modules.LRELU_SLOPE) + if i < len(self.dconvs) - 1: + x = x + self.pconvs[i](x_, g) + x = self.out_conv(x_, g) + return x + + def remove_weight_norm(self): + for c in self.dconvs: + c.remove_weight_norm() + for c in self.pconvs: + c.remove_weight_norm() + self.init_conv.remove_weight_norm() + self.out_conv.remove_weight_norm() + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, 
**kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class ConvFlow(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
+ + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] + + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x diff --git a/server/voice_changer/RVC/inferencer/model_v3/transforms.py b/server/voice_changer/RVC/inferencer/model_v3/transforms.py new file mode 100644 index 00000000..6f30b717 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/model_v3/transforms.py @@ -0,0 +1,207 @@ +import numpy as np +import torch +from torch.nn import functional as F + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + + +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == "linear": + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError("{} tails are not implemented.".format(tails)) + + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + 
min_derivative=min_derivative, + ) + + return outputs, logabsdet + + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if torch.min(inputs) < left or torch.max(inputs) > right: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError("Minimal bin width too large for the number of bins") + if min_bin_height * num_bins > 1.0: + raise ValueError("Minimal bin height too large for the number of bins") + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one 
- 2 * input_delta) + * theta_one_minus_theta + ) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/server/voice_changer/RVC/inferencer/model_v3/utils.py b/server/voice_changer/RVC/inferencer/model_v3/utils.py new file mode 100644 index 00000000..a5152da2 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/model_v3/utils.py @@ -0,0 +1,286 @@ +import glob +import logging +import os +import shutil +import socket +import sys + +import ffmpeg +import matplotlib +import matplotlib.pylab as plt +import numpy as np +import torch +from scipy.io.wavfile import read +from torch.nn import functional as F + +from modules.shared import ROOT_DIR + +from .config import TrainConfig + +matplotlib.use("Agg") +logging.getLogger("matplotlib").setLevel(logging.WARNING) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +class AWP: + """ + Fast AWP + https://www.kaggle.com/code/junkoda/fast-awp + """ + def __init__(self, model, optimizer, *, adv_param='weight', + adv_lr=0.01, adv_eps=0.01): + self.model = model + self.optimizer = optimizer + self.adv_param = adv_param + self.adv_lr = adv_lr + self.adv_eps = adv_eps + self.backup = {} + + def perturb(self): + """ + Perturb model parameters for AWP gradient + Call before loss and loss.backward() + """ + self._save() # save model parameters + self._attack_step() # perturb weights + + def _attack_step(self): + e = 1e-6 + for name, param in self.model.named_parameters(): + if param.requires_grad and param.grad is not None and self.adv_param in name: + grad = self.optimizer.state[param]['exp_avg'] + norm_grad = torch.norm(grad) + norm_data = torch.norm(param.detach()) + + if norm_grad != 0 and not torch.isnan(norm_grad): + # Set lower and upper limit in change + limit_eps = self.adv_eps * param.detach().abs() + param_min = param.data - limit_eps + param_max = param.data + limit_eps + + # Perturb along gradient + # w += (adv_lr * |w| / |grad|) * grad + param.data.add_(grad, alpha=(self.adv_lr * (norm_data + e) / (norm_grad + e))) + + # Apply the limit to the change + param.data.clamp_(param_min, param_max) + + def _save(self): + for name, param in self.model.named_parameters(): + if param.requires_grad and param.grad is not None and self.adv_param in name: + if name not in self.backup: + self.backup[name] = param.clone().detach() + else: + self.backup[name].copy_(param.data) + + def restore(self): + """ + Restore model parameter to correct position; AWP do not perturbe weights, it perturb gradients + Call after loss.backward(), before optimizer.step() + """ + for name, param in self.model.named_parameters(): + if name in self.backup: + param.data.copy_(self.backup[name]) + + +def load_audio(file: str, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
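+        # The ffmpeg pipeline below decodes any input ffmpeg understands to raw
+        # 32-bit float PCM, down-mixed to mono (ac=1) and resampled to `sr` (ar=sr),
+        # so np.frombuffer at the end yields a 1-D float32 waveform. Typical call
+        # (path is illustrative only): load_audio("dataset/raw/sample.wav", 40000).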
+ file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # Prevent small white copy path head and tail with spaces and " and return + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() + + +def find_empty_port(): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + s.close() + return port + + +def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + print( + f"shape-{k}-mismatch|need-{state_dict[k].shape}|get-{saved_state_dict[k].shape}" + ) + if saved_state_dict[k].dim() == 2: # NOTE: check is this ok? + # for embedded input 256 <==> 768 + # this achieves we can continue training from original's pretrained checkpoints when using embedder that 768-th dim output etc. + if saved_state_dict[k].dtype == torch.half: + new_state_dict[k] = ( + F.interpolate( + saved_state_dict[k].float().unsqueeze(0).unsqueeze(0), + size=state_dict[k].shape, + mode="bilinear", + ) + .half() + .squeeze(0) + .squeeze(0) + ) + else: + new_state_dict[k] = ( + F.interpolate( + saved_state_dict[k].unsqueeze(0).unsqueeze(0), + size=state_dict[k].shape, + mode="bilinear", + ) + .squeeze(0) + .squeeze(0) + ) + print( + "interpolated new_state_dict", + k, + "from", + saved_state_dict[k].shape, + "to", + new_state_dict[k].shape, + ) + else: + raise KeyError + except Exception as e: + # print(traceback.format_exc()) + print(f"{k} is not in the checkpoint") + print("error: %s" % e) + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + print("Loaded model weights") + + epoch = checkpoint_dict["epoch"] + learning_rate = checkpoint_dict["learning_rate"] + if optimizer is not None and load_opt == 1: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, epoch)) + return model, optimizer, learning_rate, epoch + + +def save_state(model, optimizer, learning_rate, epoch, checkpoint_path): + print( + "Saving model and optimizer state at epoch {} to {}".format( + epoch, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "epoch": epoch, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v 
in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + filelist = glob.glob(os.path.join(dir_path, regex)) + if len(filelist) == 0: + return None + filelist.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + filepath = filelist[-1] + return filepath + + +def plot_spectrogram_to_numpy(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_config(training_dir: str, sample_rate: int, emb_channels: int): + if emb_channels == 256: + config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json") + else: + config_path = os.path.join( + ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json" + ) + config_save_path = os.path.join(training_dir, "config.json") + + shutil.copyfile(config_path, config_save_path) + + return TrainConfig.parse_file(config_save_path) From c8dc2a2637d5d64de039b365d045d79ec0421444 Mon Sep 17 00:00:00 2001 From: nadare <1na2da0re3@gmail.com> Date: Sun, 18 Jun 2023 23:18:10 +0900 Subject: [PATCH 2/4] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 23b9c72e..d6cb6a81 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +これはVCClientでv3モデルを試すためのフォーク版です。それ以外の用途ではフォーク元を利用してください + ## VC Client [English](/README_en.md) From daff5098f86a08233f7214ba5b177f157995c248 Mon Sep 17 00:00:00 2001 From: nadare <1na2da0re3@gmail.com> Date: Sat, 24 Jun 2023 10:33:57 +0900 Subject: [PATCH 3/4] =?UTF-8?q?voras=5Fbeta=E3=81=AE=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/const.py | 2 +- .../voice_changer/RVC/ModelSlotGenerator.py | 14 +- .../RVC/inferencer/Inferencer.py | 2 +- .../RVC/inferencer/InferencerManager.py | 6 +- ...CInferencerv3.py => VorasInferencebeta.py} | 15 +- .../RVC/inferencer/model_v3/attentions.py | 343 ---------- .../RVC/inferencer/model_v3/models.py | 522 --------------- .../RVC/inferencer/model_v3/modules.py | 626 ------------------ .../{model_v3 => voras_beta}/commons.py | 0 .../{model_v3 => voras_beta}/config.py | 14 +- .../RVC/inferencer/voras_beta/models.py | 238 +++++++ .../RVC/inferencer/voras_beta/modules.py | 496 ++++++++++++++ .../{model_v3 => voras_beta}/transforms.py | 0 .../{model_v3 => voras_beta}/utils.py | 0 server/voice_changer/RVC/pipeline/Pipeline.py | 53 +- 15 files changed, 778 insertions(+), 1553 deletions(-) rename 
server/voice_changer/RVC/inferencer/{RVCInferencerv3.py => VorasInferencebeta.py} (68%) delete mode 100644 server/voice_changer/RVC/inferencer/model_v3/attentions.py delete mode 100644 server/voice_changer/RVC/inferencer/model_v3/models.py delete mode 100644 server/voice_changer/RVC/inferencer/model_v3/modules.py rename server/voice_changer/RVC/inferencer/{model_v3 => voras_beta}/commons.py (100%) rename server/voice_changer/RVC/inferencer/{model_v3 => voras_beta}/config.py (77%) create mode 100644 server/voice_changer/RVC/inferencer/voras_beta/models.py create mode 100644 server/voice_changer/RVC/inferencer/voras_beta/modules.py rename server/voice_changer/RVC/inferencer/{model_v3 => voras_beta}/transforms.py (100%) rename server/voice_changer/RVC/inferencer/{model_v3 => voras_beta}/utils.py (100%) diff --git a/server/const.py b/server/const.py index 64127f92..b5d0af63 100644 --- a/server/const.py +++ b/server/const.py @@ -65,9 +65,9 @@ class EnumInferenceTypes(Enum): pyTorchRVCNono = "pyTorchRVCNono" pyTorchRVCv2 = "pyTorchRVCv2" pyTorchRVCv2Nono = "pyTorchRVCv2Nono" - pyTorchRVCv3 = "pyTorchRVCv3" pyTorchWebUI = "pyTorchWebUI" pyTorchWebUINono = "pyTorchWebUINono" + pyTorchVoRASbeta = "pyTorchVoRASbeta" onnxRVC = "onnxRVC" onnxRVCNono = "onnxRVCNono" diff --git a/server/voice_changer/RVC/ModelSlotGenerator.py b/server/voice_changer/RVC/ModelSlotGenerator.py index b897cf2c..99a8b14f 100644 --- a/server/voice_changer/RVC/ModelSlotGenerator.py +++ b/server/voice_changer/RVC/ModelSlotGenerator.py @@ -10,18 +10,15 @@ from data.ModelSlot import ModelSlot def _setInfoByPytorch(slot: ModelSlot): cpt = torch.load(slot.modelFile, map_location="cpu") config_len = len(cpt["config"]) - - if cpt["version"] == "v3": + print(cpt["version"]) + if cpt["version"] == "voras_beta": slot.f0 = True if cpt["f0"] == 1 else False - slot.modelType = EnumInferenceTypes.pyTorchRVCv3.value - slot.embChannels = cpt["config"][17] + slot.modelType = EnumInferenceTypes.pyTorchVoRASbeta.value + slot.embChannels = 768 slot.embOutputLayer = ( cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9 ) - if slot.embChannels == 256: - slot.useFinalProj = True - else: - slot.useFinalProj = False + slot.useFinalProj = False slot.embedder = cpt["embedder_name"] if slot.embedder.endswith("768"): @@ -33,7 +30,6 @@ def _setInfoByPytorch(slot: ModelSlot): slot.embedder = EnumEmbedderTypes.contentvec.value elif slot.embedder == EnumEmbedderTypes.hubert_jp.value: slot.embedder = EnumEmbedderTypes.hubert_jp.value - print("nadare v3 loaded") else: raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder") diff --git a/server/voice_changer/RVC/inferencer/Inferencer.py b/server/voice_changer/RVC/inferencer/Inferencer.py index 3e5a62d8..cf5913bf 100644 --- a/server/voice_changer/RVC/inferencer/Inferencer.py +++ b/server/voice_changer/RVC/inferencer/Inferencer.py @@ -8,7 +8,7 @@ from const import EnumInferenceTypes class Inferencer(Protocol): inferencerType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC file: str - isHalf: bool = True + isHalf: bool = False gpu: int = 0 model: onnxruntime.InferenceSession | Any | None = None diff --git a/server/voice_changer/RVC/inferencer/InferencerManager.py b/server/voice_changer/RVC/inferencer/InferencerManager.py index 0bc63bd9..ef4bea5e 100644 --- a/server/voice_changer/RVC/inferencer/InferencerManager.py +++ b/server/voice_changer/RVC/inferencer/InferencerManager.py @@ -8,7 +8,7 @@ from voice_changer.RVC.inferencer.RVCInferencerv2 import RVCInferencerv2 from 
voice_changer.RVC.inferencer.RVCInferencerv2Nono import RVCInferencerv2Nono from voice_changer.RVC.inferencer.WebUIInferencer import WebUIInferencer from voice_changer.RVC.inferencer.WebUIInferencerNono import WebUIInferencerNono -from voice_changer.RVC.inferencer.RVCInferencerv3 import RVCInferencerv3 +from voice_changer.RVC.inferencer.VorasInferencebeta import VoRASInferencer class InferencerManager: currentInferencer: Inferencer | None = None @@ -37,8 +37,8 @@ class InferencerManager: return RVCInferencerNono().loadModel(file, gpu) elif inferencerType == EnumInferenceTypes.pyTorchRVCv2 or inferencerType == EnumInferenceTypes.pyTorchRVCv2.value: return RVCInferencerv2().loadModel(file, gpu) - elif inferencerType == EnumInferenceTypes.pyTorchRVCv3 or inferencerType == EnumInferenceTypes.pyTorchRVCv3.value: - return RVCInferencerv3().loadModel(file, gpu) + elif inferencerType == EnumInferenceTypes.pyTorchVoRASbeta or inferencerType == EnumInferenceTypes.pyTorchVoRASbeta.value: + return VoRASInferencer().loadModel(file, gpu) elif inferencerType == EnumInferenceTypes.pyTorchRVCv2Nono or inferencerType == EnumInferenceTypes.pyTorchRVCv2Nono.value: return RVCInferencerv2Nono().loadModel(file, gpu) elif inferencerType == EnumInferenceTypes.pyTorchWebUI or inferencerType == EnumInferenceTypes.pyTorchWebUI.value: diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv3.py b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py similarity index 68% rename from server/voice_changer/RVC/inferencer/RVCInferencerv3.py rename to server/voice_changer/RVC/inferencer/VorasInferencebeta.py index ed6eb63f..a5b02f40 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerv3.py +++ b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py @@ -4,26 +4,25 @@ from torch import device from const import EnumInferenceTypes from voice_changer.RVC.inferencer.Inferencer import Inferencer from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager -from .model_v3.models import SynthesizerTrnMs256NSFSid +from .voras_beta.models import Synthesizer -class RVCInferencerv3(Inferencer): +class VoRASInferencer(Inferencer): def loadModel(self, file: str, gpu: device): - print("nadare v3 load start") - super().setProps(EnumInferenceTypes.pyTorchRVCv3, file, True, gpu) + super().setProps(EnumInferenceTypes.pyTorchVoRASbeta, file, False, gpu) dev = DeviceManager.get_instance().getDevice(gpu) - isHalf = False # DeviceManager.get_instance().halfPrecisionAvailable(gpu) + self.isHalf = False # DeviceManager.get_instance().halfPrecisionAvailable(gpu) cpt = torch.load(file, map_location="cpu") - model = SynthesizerTrnMs256NSFSid(**cpt["params"]) + model = Synthesizer(**cpt["params"]) model.eval() model.load_state_dict(cpt["weight"], strict=False) + model.remove_weight_norm() + model.change_speaker(0) model = model.to(dev) - if isHalf: - model = model.half() self.model = model print("load model comprete") diff --git a/server/voice_changer/RVC/inferencer/model_v3/attentions.py b/server/voice_changer/RVC/inferencer/model_v3/attentions.py deleted file mode 100644 index 0a9ecf37..00000000 --- a/server/voice_changer/RVC/inferencer/model_v3/attentions.py +++ /dev/null @@ -1,343 +0,0 @@ -import math - -import torch -from torch import nn -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, weight_norm - -from . 
import commons -from .modules import LayerNorm, LoRALinear1d - - -class Encoder(nn.Module): - def __init__( - self, - hidden_channels, - filter_channels, - gin_channels, - n_heads, - n_layers, - kernel_size=1, - p_dropout=0.0, - window_size=25, - **kwargs - ): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append( - MultiHeadAttention( - hidden_channels, - hidden_channels, - gin_channels, - n_heads, - p_dropout=p_dropout, - window_size=window_size, - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN( - hidden_channels, - hidden_channels, - filter_channels, - gin_channels, - kernel_size, - p_dropout=p_dropout, - ) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask, g): - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.attn_layers[i](x, x, g, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask, g) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.attn_layers: - l.remove_weight_norm() - for l in self.ffn_layers: - l.remove_weight_norm() - - -class MultiHeadAttention(nn.Module): - def __init__( - self, - channels, - out_channels, - gin_channels, - n_heads, - p_dropout=0.0, - window_size=None, - heads_share=False, - block_length=None, - proximal_bias=False, - proximal_init=False, - ): - super().__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.p_dropout = p_dropout - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = LoRALinear1d(channels, channels, gin_channels, 2) - self.conv_k = LoRALinear1d(channels, channels, gin_channels, 2) - self.conv_v = LoRALinear1d(channels, channels, gin_channels, 2) - self.conv_qkw = weight_norm(nn.Conv1d(channels, channels, 5, 1, groups=channels, padding=2)) - self.conv_vw = weight_norm(nn.Conv1d(channels, channels, 5, 1, groups=channels, padding=2)) - self.conv_o = LoRALinear1d(channels, out_channels, gin_channels, 2) - self.drop = nn.Dropout(p_dropout) - - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) - * rel_stddev - ) - - def forward(self, x, c, g, attn_mask=None): - q = self.conv_qkw(self.conv_q(x, g)) - k = self.conv_qkw(self.conv_k(c, g)) - v = self.conv_vw(self.conv_v(c, g)) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x, g) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = 
(*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) - if self.window_size is not None: - assert ( - t_s == t_t - ), "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys( - query / math.sqrt(self.k_channels), key_relative_embeddings - ) - scores_local = self._relative_position_to_absolute_position(rel_logits) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to( - device=scores.device, dtype=scores.dtype - ) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert ( - t_s == t_t - ), "Local attention is only available for self-attention." - block_mask = ( - torch.ones_like(scores) - .triu(-self.block_length) - .tril(self.block_length) - ) - scores = scores.masked_fill(block_mask == 0, -1e4) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position(p_attn) - value_relative_embeddings = self._get_relative_embeddings( - self.emb_rel_v, t_s - ) - output = output + self._matmul_with_relative_values( - relative_weights, value_relative_embeddings - ) - output = ( - output.transpose(2, 3).contiguous().view(b, d, t_t) - ) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - """ - x: [b, h, l, m] - y: [h or 1, m, d] - ret: [b, h, l, d] - """ - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - """ - x: [b, h, l, d] - y: [h or 1, m, d] - ret: [b, h, l, m] - """ - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), - ) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[ - :, slice_start_position:slice_end_position - ] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - """ - x: [b, h, l, 2*l-1] - ret: [b, h, l, l] - """ - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad( - x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) - ) - - # Reshape and slice out the padded elements. 
- x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ - :, :, :length, length - 1 : - ] - return x_final - - def _absolute_position_to_relative_position(self, x): - """ - x: [b, h, l, l] - ret: [b, h, l, 2*l-1] - """ - batch, heads, length, _ = x.size() - # padd along column - x = F.pad( - x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) - ) - x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - """Bias for self-attention to encourage attention to close positions. - Args: - length: an integer scalar. - Returns: - a Tensor with shape [1, 1, length, length] - """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - def remove_weight_norm(self): - self.conv_q.remove_weight_norm() - self.conv_k.remove_weight_norm() - self.conv_v.remove_weight_norm() - self.conv_o.remove_weight_norm() - remove_weight_norm(self.conv_qkw) - remove_weight_norm(self.conv_vw) - - -class FFN(nn.Module): - def __init__( - self, - in_channels, - out_channels, - filter_channels, - gin_channels, - kernel_size, - p_dropout=0.0, - activation=None, - causal=False, - ): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - self.causal = causal - - self.conv_1 = LoRALinear1d(in_channels, filter_channels, gin_channels, 2) - self.conv_2 = LoRALinear1d(filter_channels, out_channels, gin_channels, 2) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask, g): - x = self.conv_1(x * x_mask, g) - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - x = self.conv_2(x * x_mask, g) - return x * x_mask - - def _causal_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = self.kernel_size - 1 - pad_r = 0 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x - - def _same_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = (self.kernel_size - 1) // 2 - pad_r = self.kernel_size // 2 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x - - - def remove_weight_norm(self): - self.conv_1.remove_weight_norm() - self.conv_2.remove_weight_norm() diff --git a/server/voice_changer/RVC/inferencer/model_v3/models.py b/server/voice_changer/RVC/inferencer/model_v3/models.py deleted file mode 100644 index 2f8cc064..00000000 --- a/server/voice_changer/RVC/inferencer/model_v3/models.py +++ /dev/null @@ -1,522 +0,0 @@ -import math -import os -import sys - -import numpy as np -import torch -from torch import nn -from torch.nn import Conv1d, Conv2d, ConvTranspose1d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - -from . 
import attentions, commons, modules -from .commons import get_padding, init_weights -from .modules import (CausalConvTranspose1d, ConvNext2d, DilatedCausalConv1d, - LoRALinear1d, ResBlock1, WaveConv1D) - -parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -sys.path.append(parent_dir) - - -class TextEncoder(nn.Module): - def __init__( - self, - out_channels: int, - hidden_channels: int, - filter_channels: int, - emb_channels: int, - gin_channels: int, - n_heads: int, - n_layers: int, - kernel_size: int, - p_dropout: int, - f0: bool = True, - ): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.emb_channels = emb_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.emb_phone = nn.Linear(emb_channels, hidden_channels) - self.lrelu = nn.LeakyReLU(0.1, inplace=True) - if f0 == True: - self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 - self.emb_g = nn.Conv1d(gin_channels, hidden_channels, 1) - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, gin_channels, n_heads, n_layers, kernel_size, p_dropout - ) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - - def forward(self, phone, pitch, lengths, g): - if pitch == None: - x = self.emb_phone(phone) - else: - x = self.emb_phone(phone) + self.emb_pitch(pitch) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( - x.dtype - ) - x = self.encoder(x * x_mask, x_mask, g) - x = self.proj(x) - - return x, None, x_mask - - -class SineGen(torch.nn.Module): - """Definition of sine generator - SineGen(samp_rate, harmonic_num = 0, - sine_amp = 0.1, noise_std = 0.003, - voiced_threshold = 0, - flag_for_pulse=False) - samp_rate: sampling rate in Hz - harmonic_num: number of harmonic overtones (default 0) - sine_amp: amplitude of sine-wavefrom (default 0.1) - noise_std: std of Gaussian noise (default 0.003) - voiced_thoreshold: F0 threshold for U/V classification (default 0) - flag_for_pulse: this SinGen is used inside PulseGen (default False) - Note: when flag_for_pulse is True, the first time step of a voiced - segment is always sin(np.pi) or cos(0) - """ - - def __init__( - self, - samp_rate, - harmonic_num=0, - sine_amp=0.1, - noise_std=0.003, - voiced_threshold=0, - flag_for_pulse=False, - ): - super(SineGen, self).__init__() - self.sine_amp = sine_amp - self.noise_std = noise_std - self.harmonic_num = harmonic_num - self.dim = self.harmonic_num + 1 - self.sampling_rate = samp_rate - self.voiced_threshold = voiced_threshold - - def _f02uv(self, f0): - # generate uv signal - uv = torch.ones_like(f0) - uv = uv * (f0 > self.voiced_threshold) - return uv - - def forward(self, f0, upp): - """sine_tensor, uv = forward(f0) - input F0: tensor(batchsize=1, length, dim=1) - f0 for unvoiced steps should be 0 - output sine_tensor: tensor(batchsize=1, length, dim) - output uv: tensor(batchsize=1, length, 1) - """ - with torch.no_grad(): - f0 = f0[:, None].transpose(1, 2) - f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) - # fundamental component - f0_buf[:, :, 0] = f0[:, :, 0] - for idx in np.arange(self.harmonic_num): - f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( - idx + 2 - ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic - rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 - rand_ini = torch.rand( - 
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device - ) - rand_ini[:, 0] = 0 - rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini - tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 - tmp_over_one *= upp - tmp_over_one = F.interpolate( - tmp_over_one.transpose(2, 1), - scale_factor=upp, - mode="linear", - align_corners=True, - ).transpose(2, 1) - rad_values = F.interpolate( - rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose( - 2, 1 - ) ####### - tmp_over_one %= 1 - tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 - cumsum_shift = torch.zeros_like(rad_values) - cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 - sine_waves = torch.sin( - torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi - ) - sine_waves = sine_waves * self.sine_amp - uv = self._f02uv(f0) - uv = F.interpolate( - uv.transpose(2, 1), scale_factor=upp, mode="nearest" - ).transpose(2, 1) - noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 - noise = noise_amp * torch.randn_like(sine_waves) - sine_waves = sine_waves * uv + noise - return sine_waves, uv, noise - - -class SourceModuleHnNSF(torch.nn.Module): - """SourceModule for hn-nsf - SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, - add_noise_std=0.003, voiced_threshod=0) - sampling_rate: sampling_rate in Hz - harmonic_num: number of harmonic above F0 (default: 0) - sine_amp: amplitude of sine source signal (default: 0.1) - add_noise_std: std of additive Gaussian noise (default: 0.003) - note that amplitude of noise in unvoiced is decided - by sine_amp - voiced_threshold: threhold to set U/V given F0 (default: 0) - Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) - F0_sampled (batchsize, length, 1) - Sine_source (batchsize, length, 1) - noise_source (batchsize, length 1) - uv (batchsize, length, 1) - """ - - def __init__( - self, - sampling_rate, - gin_channels, - harmonic_num=0, - sine_amp=0.1, - add_noise_std=0.003, - voiced_threshod=0, - is_half=True, - ): - super(SourceModuleHnNSF, self).__init__() - - self.sine_amp = sine_amp - self.noise_std = add_noise_std - self.is_half = is_half - # to produce sine waveforms - self.l_sin_gen = SineGen( - sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod - ) - - # to merge source harmonics into a single excitation - self.l_linear = torch.nn.Conv1d(gin_channels, harmonic_num + 1, 1) - self.l_tanh = torch.nn.Tanh() - - def forward(self, x, upp=None): - sine_wavs, uv, _ = self.l_sin_gen(x, upp) - sine_raw = torch.transpose(sine_wavs, 1, 2).to(device=x.device, dtype=x.dtype) - return sine_raw, None, None # noise, uv - - -class GeneratorNSF(torch.nn.Module): - def __init__( - self, - initial_channel, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels, - sr, - harmonic_num=16, - is_half=False, - ): - super(GeneratorNSF, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.upsample_rates = upsample_rates - - self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) - self.m_source = SourceModuleHnNSF( - sampling_rate=sr, gin_channels=gin_channels, harmonic_num=harmonic_num, is_half=is_half - ) - self.gpre = Conv1d(gin_channels, initial_channel, 1) - self.conv_pre = ResBlock1(initial_channel, upsample_initial_channel, gin_channels, [7] * 5, [1] * 5, [1, 2, 4, 8, 1], 1, 2) - - self.ups = nn.ModuleList() - self.resblocks = 
nn.ModuleList() - c_cur = upsample_initial_channel - for i, u in enumerate(upsample_rates): - c_pre = c_cur - c_cur = c_pre // 2 - self.ups.append( - CausalConvTranspose1d( - c_pre, - c_pre, - kernel_rate=3, - stride=u, - groups=c_pre, - ) - ) - self.resblocks.append(ResBlock1(c_pre, c_cur, gin_channels, [11] * 5, [1] * 5, [1, 2, 4, 8, 1], 1, r=2)) - self.conv_post = DilatedCausalConv1d(c_cur, 1, 5, stride=1, groups=1, dilation=1, bias=False) - self.noise_convs = nn.ModuleList() - self.noise_pre = LoRALinear1d(1 + harmonic_num, c_pre, gin_channels, r=2+harmonic_num) - for i, u in enumerate(upsample_rates[::-1]): - c_pre = c_pre * 2 - c_cur = c_cur * 2 - if i + 1 < len(upsample_rates): - self.noise_convs.append(DilatedCausalConv1d(c_cur, c_pre, kernel_size=u*3, stride=u, groups=c_cur, dilation=1)) - else: - self.noise_convs.append(DilatedCausalConv1d(c_cur, initial_channel, kernel_size=u*3, stride=u, groups=math.gcd(c_cur, initial_channel), dilation=1)) - self.upp = np.prod(upsample_rates) - - def forward(self, x, x_mask, f0f, g): - har_source, noi_source, uv = self.m_source(f0f, self.upp) - har_source = self.noise_pre(har_source, g) - x_sources = [har_source] - for c in self.noise_convs: - har_source = c(har_source) - x_sources.append(har_source) - - x = x + x_sources[-1] - x = x + self.gpre(g) - x = self.conv_pre(x, x_mask, g) - for i, u in enumerate(self.upsample_rates): - x_mask = torch.repeat_interleave(x_mask, u, 2) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - x = self.resblocks[i](x + x_sources[-i-2], x_mask, g) - - x = F.leaky_relu(x) - x = self.conv_post(x) - if x_mask is not None: - x *= x_mask - x = torch.tanh(x) - return x - - def remove_weight_norm(self): - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - remove_weight_norm(self.noise_pre) - remove_weight_norm(self.conv_post) - - -sr2sr = { - "32k": 32000, - "40k": 40000, - "48k": 48000, -} - - -class SynthesizerTrnMs256NSFSid(nn.Module): - def __init__( - self, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - spk_embed_dim, - gin_channels, - emb_channels, - sr, - **kwargs - ): - super().__init__() - if type(sr) == type("strr"): - sr = sr2sr[sr] - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.emb_channels = emb_channels - self.sr = sr - # self.hop_length = hop_length# - self.spk_embed_dim = spk_embed_dim - - self.emb_pitch = nn.Embedding(256, emb_channels) # pitch 256 - self.dec = GeneratorNSF( - emb_channels, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - gin_channels=gin_channels, - sr=sr, - ) - - self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) - print( - 
"gin_channels:", - gin_channels, - "self.spk_embed_dim:", - self.spk_embed_dim, - "emb_channels:", - emb_channels, - ) - - def remove_weight_norm(self): - self.dec.remove_weight_norm() - - def forward( - self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds - ): # 这里ds是id,[bs,1] - # print(1,pitch.shape)#[bs,t] - g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 - # m_p, _, x_mask = self.enc_p(phone, pitch, phone_lengths, g) - # z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - # z_p = self.flow(m_p * x_mask, x_mask, g=g) - - x = phone + self.emb_pitch(pitch) - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to( - phone.dtype - ) - - m_p_slice, ids_slice = commons.rand_slice_segments( - x, phone_lengths, self.segment_size - ) - # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) - pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) - mask_slice = commons.slice_segments(x_mask, ids_slice, self.segment_size) - # print(-2,pitchf.shape,z_slice.shape) - o = self.dec(m_p_slice, mask_slice, pitchf, g) - return o, ids_slice, x_mask, g - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): - g = self.emb_g(sid).unsqueeze(-1) - x = phone + self.emb_pitch(pitch) - x = torch.transpose(x, 1, -1) - x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to( - phone.dtype - ) - o = self.dec((x * x_mask)[:, :, :max_len], x_mask, nsff0, g) - return o, x_mask, (None, None, None, None) - - -class DiscriminatorS(torch.nn.Module): - def __init__( - self, - hidden_channels: int, - filter_channels: int, - gin_channels: int, - n_heads: int, - n_layers: int, - kernel_size: int, - p_dropout: int, - ): - super(DiscriminatorS, self).__init__() - self.convs = WaveConv1D(2, hidden_channels, gin_channels, [10, 7, 7, 7, 5, 3, 3], [5, 4, 4, 4, 3, 2, 2], [1] * 7, hidden_channels // 2, False) - self.encoder = attentions.Encoder( - hidden_channels, filter_channels, gin_channels, n_heads, n_layers//2, kernel_size, p_dropout - ) - self.cross = weight_norm(torch.nn.Conv1d(gin_channels, hidden_channels, 1, 1)) - self.conv_post = weight_norm(torch.nn.Conv1d(hidden_channels, 1, 3, 1, padding=get_padding(5, 1))) - - def forward(self, x, g): - x = self.convs(x) - x_mask = torch.ones([x.shape[0], 1, x.shape[2]], device=x.device, dtype=x.dtype) - x = self.encoder(x, x_mask, g) - fmap = [x] - x = x + x * self.cross(g) - y = self.conv_post(x) - return y, fmap - - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, gin_channels, upsample_rates, final_dim=256, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - self.init_kernel_size = upsample_rates[-1] * 3 - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - N = len(upsample_rates) - self.init_conv = norm_f(Conv2d(1, final_dim // (2 ** (N - 1)), (self.init_kernel_size, 1), (upsample_rates[-1], 1))) - self.convs = nn.ModuleList() - for i, u in enumerate(upsample_rates[::-1][1:], start=1): - self.convs.append( - ConvNext2d( - final_dim // (2 ** (N - i)), - final_dim // (2 ** (N - i - 1)), - gin_channels, - (u*3, 1), - (u, 1), - 4, - r=2 - ) - ) - self.conv_post = weight_norm(Conv2d(final_dim, 1, (3, 1), (1, 1))) - - def forward(self, x, g): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - 
x = F.pad(x, (n_pad, 0), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - x = torch.flip(x, dims=[2]) - x = F.pad(x, [0, 0, 0, self.init_kernel_size - 1], mode="constant") - x = self.init_conv(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = torch.flip(x, dims=[2]) - - for i, l in enumerate(self.convs): - x = l(x, g) - if i >= 1: - fmap.append(x) - - x = F.pad(x, [0, 0, 2, 0], mode="constant") - x = self.conv_post(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, upsample_rates, gin_channels, periods=[2, 3, 5, 7, 11, 17], **kwargs): - super(MultiPeriodDiscriminator, self).__init__() - - # discs = [DiscriminatorS(hidden_channels, filter_channels, gin_channels, n_heads, n_layers, kernel_size, p_dropout)] - discs = [ - DiscriminatorP(i, gin_channels, upsample_rates, use_spectral_norm=False) for i in periods - ] - self.ups = np.prod(upsample_rates) - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat, g): - fmap_rs = [] - fmap_gs = [] - y_d_rs = [] - y_d_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y, g) - y_d_g, fmap_g = d(y_hat, g) - # for j in range(len(fmap_r)): - # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs diff --git a/server/voice_changer/RVC/inferencer/model_v3/modules.py b/server/voice_changer/RVC/inferencer/model_v3/modules.py deleted file mode 100644 index fb0db86d..00000000 --- a/server/voice_changer/RVC/inferencer/model_v3/modules.py +++ /dev/null @@ -1,626 +0,0 @@ -import math - -import torch -from torch import nn -from torch.nn import Conv1d, Conv2d -from torch.nn import functional as F -from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm - -from . import commons, modules -from .commons import get_padding, init_weights -from .transforms import piecewise_rational_quadratic_transform - -LRELU_SLOPE = 0.1 - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - - -class ConvReluNorm(nn.Module): - def __init__( - self, - in_channels, - hidden_channels, - out_channels, - kernel_size, - n_layers, - p_dropout, - ): - super().__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - assert n_layers > 1, "Number of layers should be larger than 0." 
- - self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append( - nn.Conv1d( - in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) - for _ in range(n_layers - 1): - self.conv_layers.append( - nn.Conv1d( - hidden_channels, - hidden_channels, - kernel_size, - padding=kernel_size // 2, - ) - ) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask): - x_org = x - for i in range(self.n_layers): - x = self.conv_layers[i](x * x_mask) - x = self.norm_layers[i](x) - x = self.relu_drop(x) - x = x_org + self.proj(x) - return x * x_mask - - -class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ - - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size**i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append( - nn.Conv1d( - channels, - channels, - kernel_size, - groups=channels, - dilation=dilation, - padding=padding, - ) - ) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask - - -class WN(torch.nn.Module): - def __init__( - self, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0, - p_dropout=0, - ): - super(WN, self).__init__() - assert kernel_size % 2 == 1 - self.hidden_channels = hidden_channels - self.kernel_size = (kernel_size,) - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - - if gin_channels != 0: - cond_layer = torch.nn.Conv1d( - gin_channels, 2 * hidden_channels * n_layers, 1 - ) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") - - for i in range(n_layers): - dilation = dilation_rate**i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d( - hidden_channels, - 2 * hidden_channels, - kernel_size, - dilation=dilation, - padding=padding, - ) - in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") - self.in_layers.append(in_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - 
n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - if g is not None: - g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] - else: - g_l = torch.zeros_like(x_in) - - acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) - acts = self.drop(acts) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:, : self.hidden_channels, :] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:, self.hidden_channels :, :] - else: - output = output + res_skip_acts - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) - - -class DilatedCausalConv1d(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, groups=1, dilation=1, bias=True): - super(DilatedCausalConv1d, self).__init__() - self.kernel_size = kernel_size - self.dilation = dilation - self.stride = stride - self.conv = weight_norm(nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups, dilation=dilation, bias=bias)) - init_weights(self.conv) - - def forward(self, x): - x = torch.flip(x, [2]) - x = F.pad(x, [0, (self.kernel_size - 1) * self.dilation], mode="constant", value=0.) - size = x.shape[2] // self.stride - x = self.conv(x)[:, :, :size] - x = torch.flip(x, [2]) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.conv) - - -class CausalConvTranspose1d(nn.Module): - """ - padding = 0, dilation = 1のとき - - Lout = (Lin - 1) * stride + kernel_rate * stride + output_padding - Lout = Lin * stride + (kernel_rate - 1) * stride + output_padding - output_paddingいらないね - """ - def __init__(self, in_channels, out_channels, kernel_rate=3, stride=1, groups=1): - super(CausalConvTranspose1d, self).__init__() - kernel_size = kernel_rate * stride - self.trim_size = (kernel_rate - 1) * stride - self.conv = weight_norm(nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups)) - - def forward(self, x): - x = self.conv(x) - return x[:, :, :-self.trim_size] - - def remove_weight_norm(self): - remove_weight_norm(self.conv) - - -class LoRALinear1d(nn.Module): - def __init__(self, in_channels, out_channels, info_channels, r): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.info_channels = info_channels - self.r = r - self.main_fc = weight_norm(nn.Conv1d(in_channels, out_channels, 1)) - self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) - self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) - nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) - nn.init.constant_(self.adapter_out.weight.data, 1e-6) - init_weights(self.main_fc) - self.adapter_in = weight_norm(self.adapter_in) - self.adapter_out = weight_norm(self.adapter_out) - - def forward(self, x, g): - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - x = self.main_fc(x) + torch.einsum("brl,brc->bcl", torch.einsum("bcl,bcr->brl", x, a_in), a_out) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.main_fc) - remove_weight_norm(self.adapter_in) - 
remove_weight_norm(self.adapter_out) - - -class LoRALinear2d(nn.Module): - def __init__(self, in_channels, out_channels, info_channels, r): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.info_channels = info_channels - self.r = r - self.main_fc = weight_norm(nn.Conv2d(in_channels, out_channels, (1, 1), (1, 1))) - self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) - self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) - nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) - nn.init.constant_(self.adapter_out.weight.data, 1e-6) - self.adapter_in = weight_norm(self.adapter_in) - self.adapter_out = weight_norm(self.adapter_out) - - def forward(self, x, g): - a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) - a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) - x = self.main_fc(x) + torch.einsum("brhw,brc->bchw", torch.einsum("bchw,bcr->brhw", x, a_in), a_out) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.main_fc) - remove_weight_norm(self.adapter_in) - remove_weight_norm(self.adapter_out) - - -class WaveConv1D(torch.nn.Module): - def __init__(self, in_channels, out_channels, gin_channels, kernel_sizes, strides, dilations, extend_ratio, r, use_spectral_norm=False): - super(WaveConv1D, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - inner_channels = int(in_channels * extend_ratio) - self.convs = [] - # self.norms = [] - self.convs.append(LoRALinear1d(in_channels, inner_channels, gin_channels, r)) - for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations), start=1): - self.convs.append(norm_f(Conv1d(inner_channels, inner_channels, k, s, dilation=d, groups=inner_channels, padding=get_padding(k, d)))) - if i < len(kernel_sizes): - self.convs.append(norm_f(Conv1d(inner_channels, inner_channels, 1, 1))) - else: - self.convs.append(norm_f(Conv1d(inner_channels, out_channels, 1, 1))) - self.convs = nn.ModuleList(self.convs) - - def forward(self, x, g, x_mask=None): - for i, l in enumerate(self.convs): - if i % 2: - x_ = l(x) - else: - x_ = l(x, g) - x = F.leaky_relu(x_, modules.LRELU_SLOPE) - if x_mask is not None: - x *= x_mask - return x - - def remove_weight_norm(self): - for i, c in enumerate(self.convs): - if i % 2: - remove_weight_norm(c) - else: - c.remove_weight_norm() - - -class MBConv2d(torch.nn.Module): - """ - Causal MBConv2D - """ - def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): - super(MBConv2d, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - inner_channels = int(in_channels * extend_ratio) - self.kernel_size = kernel_size - self.pre_pointwise = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) - self.depthwise = norm_f(Conv2d(inner_channels, inner_channels, kernel_size, stride, groups=inner_channels)) - self.post_pointwise = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) - - def forward(self, x, g): - x = self.pre_pointwise(x, g) - x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") - x = self.depthwise(x) - x = self.post_pointwise(x, g) - return x - - -class ConvNext2d(torch.nn.Module): - """ - Causal ConvNext Block - stride = 1 only - """ - def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): - super(ConvNext2d, self).__init__() - norm_f = weight_norm if use_spectral_norm == False 
else spectral_norm - inner_channels = int(in_channels * extend_ratio) - self.kernel_size = kernel_size - self.dwconv = norm_f(Conv2d(in_channels, in_channels, kernel_size, stride, groups=in_channels)) - self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) - self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) - self.act = nn.GELU() - self.norm = LayerNorm(in_channels) - - def forward(self, x, g): - x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") - x = self.dwconv(x) - x = self.norm(x) - x = self.pwconv1(x, g) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.pwconv2(x, g) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - return x - - def remove_weight_norm(self): - remove_weight_norm(self.dwconv) - - -class SqueezeExcitation1D(torch.nn.Module): - def __init__(self, input_channels, squeeze_channels, gin_channels, use_spectral_norm=False): - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - super(SqueezeExcitation1D, self).__init__() - self.fc1 = LoRALinear1d(input_channels, squeeze_channels, gin_channels, 2) - self.fc2 = LoRALinear1d(squeeze_channels, input_channels, gin_channels, 2) - - def _scale(self, x, x_mask, g): - x_length = torch.sum(x_mask, dim=2, keepdim=True) - x_length = torch.maximum(x_length, torch.ones_like(x_length)) - scale = torch.sum(x * x_mask, dim=2, keepdim=True) / x_length - scale = self.fc1(scale, g) - scale = F.leaky_relu(scale, modules.LRELU_SLOPE) - scale = self.fc2(scale, g) - return torch.sigmoid(scale) - - def forward(self, x, x_mask, g): - scale = self._scale(x, x_mask, g) - return scale * x - - def remove_weight_norm(self): - self.fc1.remove_weight_norm() - self.fc2.remove_weight_norm() - - -class ResBlock1(torch.nn.Module): - def __init__(self, in_channels, out_channels, gin_channels, kernel_sizes, strides, dilations, extend_ratio, r): - super(ResBlock1, self).__init__() - norm_f = weight_norm - inner_channels = int(in_channels * extend_ratio) - self.dconvs = nn.ModuleList() - self.pconvs = nn.ModuleList() - # self.ses = nn.ModuleList() - self.norms = nn.ModuleList() - self.init_conv = LoRALinear1d(in_channels, inner_channels, gin_channels, r) - for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations)): - self.norms.append(LayerNorm(inner_channels)) - self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, k, stride=s, dilation=d, groups=inner_channels)) - if i < len(kernel_sizes) - 1: - self.pconvs.append(LoRALinear1d(inner_channels, inner_channels, gin_channels, r)) - self.out_conv = LoRALinear1d(inner_channels, out_channels, gin_channels, r) - init_weights(self.init_conv) - init_weights(self.out_conv) - - def forward(self, x, x_mask, g): - x *= x_mask - x = self.init_conv(x, g) - for i in range(len(self.dconvs)): - x *= x_mask - x = self.norms[i](x) - x_ = self.dconvs[i](x) - x_ = F.leaky_relu(x_, modules.LRELU_SLOPE) - if i < len(self.dconvs) - 1: - x = x + self.pconvs[i](x_, g) - x = self.out_conv(x_, g) - return x - - def remove_weight_norm(self): - for c in self.dconvs: - c.remove_weight_norm() - for c in self.pconvs: - c.remove_weight_norm() - self.init_conv.remove_weight_norm() - self.out_conv.remove_weight_norm() - - -class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = torch.exp(x) * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, 
**kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels, 1)) - self.logs = nn.Parameter(torch.zeros(channels, 1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1, 2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x - - -class ResidualCouplingLayer(nn.Module): - def __init__( - self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False, - ): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN( - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=p_dropout, - gin_channels=gin_channels, - ) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels] * 2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1, 2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x - - def remove_weight_norm(self): - self.enc.remove_weight_norm() - - -class ConvFlow(nn.Module): - def __init__( - self, - in_channels, - filter_channels, - kernel_size, - n_layers, - num_bins=10, - tail_bound=5.0, - ): - super().__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 - - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) - self.proj = nn.Conv1d( - filter_channels, self.half_channels * (num_bins * 3 - 1), 1 - ) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, 1) - h = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask - - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
- - unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( - self.filter_channels - ) - unnormalized_derivatives = h[..., 2 * self.num_bins :] - - x1, logabsdet = piecewise_rational_quadratic_transform( - x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails="linear", - tail_bound=self.tail_bound, - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1, 2]) - if not reverse: - return x, logdet - else: - return x diff --git a/server/voice_changer/RVC/inferencer/model_v3/commons.py b/server/voice_changer/RVC/inferencer/voras_beta/commons.py similarity index 100% rename from server/voice_changer/RVC/inferencer/model_v3/commons.py rename to server/voice_changer/RVC/inferencer/voras_beta/commons.py diff --git a/server/voice_changer/RVC/inferencer/model_v3/config.py b/server/voice_changer/RVC/inferencer/voras_beta/config.py similarity index 77% rename from server/voice_changer/RVC/inferencer/model_v3/config.py rename to server/voice_changer/RVC/inferencer/voras_beta/config.py index 6a0dc817..ddfb4271 100644 --- a/server/voice_changer/RVC/inferencer/model_v3/config.py +++ b/server/voice_changer/RVC/inferencer/voras_beta/config.py @@ -32,27 +32,17 @@ class TrainConfigData(BaseModel): class TrainConfigModel(BaseModel): + emb_channels: int inter_channels: int - hidden_channels: int - filter_channels: int - n_heads: int n_layers: int - kernel_size: int - p_dropout: int - resblock: str - resblock_kernel_sizes: List[int] - resblock_dilation_sizes: List[List[int]] upsample_rates: List[int] - upsample_initial_channel: int - upsample_kernel_sizes: List[int] use_spectral_norm: bool gin_channels: int - emb_channels: int spk_embed_dim: int class TrainConfig(BaseModel): - version: Literal["v1", "v2"] = "v2" + version: Literal["voras"] = "voras" train: TrainConfigTrain data: TrainConfigData model: TrainConfigModel diff --git a/server/voice_changer/RVC/inferencer/voras_beta/models.py b/server/voice_changer/RVC/inferencer/voras_beta/models.py new file mode 100644 index 00000000..3168e590 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/voras_beta/models.py @@ -0,0 +1,238 @@ +import math +import os +import sys + +import numpy as np +import torch +from torch import nn +from torch.nn import Conv2d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from . 
import commons, modules +from .commons import get_padding +from .modules import (ConvNext2d, HarmonicEmbedder, IMDCTSymExpHead, + LoRALinear1d, SnakeFilter, WaveBlock) + +parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.append(parent_dir) + +sr2sr = { + "24k": 24000, + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + +class GeneratorVoras(torch.nn.Module): + def __init__( + self, + emb_channels, + inter_channels, + gin_channels, + n_layers, + sr, + hop_length, + ): + super(GeneratorVoras, self).__init__() + self.n_layers = n_layers + self.emb_pitch = HarmonicEmbedder(768, inter_channels, gin_channels, 16, 15) # # pitch 256 + self.plinear = LoRALinear1d(inter_channels, inter_channels, gin_channels, r=8) + self.glinear = weight_norm(nn.Conv1d(gin_channels, inter_channels, 1)) + self.resblocks = nn.ModuleList() + self.init_linear = LoRALinear1d(emb_channels, inter_channels, gin_channels, r=4) + for _ in range(self.n_layers): + self.resblocks.append(WaveBlock(inter_channels, gin_channels, [9] * 2, [1] * 2, [1, 9], 2, r=4)) + self.head = IMDCTSymExpHead(inter_channels, gin_channels, hop_length, padding="center", sample_rate=sr) + self.post = SnakeFilter(4, 8, 9, 2, eps=1e-5) + + def forward(self, x, pitchf, x_mask, g): + x = self.init_linear(x, g) + self.plinear(self.emb_pitch(pitchf, g), g) + self.glinear(g) + for i in range(self.n_layers): + x = self.resblocks[i](x, x_mask, g) + x = x * x_mask + x = self.head(x, g) + x = self.post(x) + return torch.tanh(x) + + def remove_weight_norm(self): + self.plinear.remove_weight_norm() + remove_weight_norm(self.glinear) + for l in self.resblocks: + l.remove_weight_norm() + self.init_linear.remove_weight_norm() + self.head.remove_weight_norm() + self.post.remove_weight_norm() + + def fix_speaker(self, g): + self.plinear.fix_speaker(g) + self.init_linear.fix_speaker(g) + for l in self.resblocks: + l.fix_speaker(g) + self.head.fix_speaker(g) + + def unfix_speaker(self, g): + self.plinear.unfix_speaker(g) + self.init_linear.unfix_speaker(g) + for l in self.resblocks: + l.unfix_speaker(g) + self.head.unfix_speaker(g) + + +class Synthesizer(nn.Module): + def __init__( + self, + segment_size, + n_fft, + hop_length, + inter_channels, + n_layers, + spk_embed_dim, + gin_channels, + emb_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.segment_size = segment_size + self.n_fft = n_fft + self.hop_length = hop_length + self.inter_channels = inter_channels + self.n_layers = n_layers + self.spk_embed_dim = spk_embed_dim + self.gin_channels = gin_channels + self.emb_channels = emb_channels + self.sr = sr + + self.dec = GeneratorVoras( + emb_channels, + inter_channels, + gin_channels, + n_layers, + sr, + hop_length + ) + + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print( + "gin_channels:", + gin_channels, + "self.spk_embed_dim:", + self.spk_embed_dim, + "emb_channels:", + emb_channels, + ) + self.speaker = None + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + + def change_speaker(self, sid: int): + if self.speaker is not None: + g = self.emb_g(torch.from_numpy(np.array(self.speaker))).unsqueeze(-1) + self.dec.unfix_speaker(g) + g = self.emb_g(torch.from_numpy(np.array(sid))).unsqueeze(-1) + self.dec.fix_speaker(g) + self.speaker = sid + + def forward( + self, phone, phone_lengths, pitch, pitchf, ds + ): + g = self.emb_g(ds).unsqueeze(-1) + x = torch.transpose(phone, 1, -1) + x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, 
x.size(2)), 1).to(phone.dtype) + x_slice, ids_slice = commons.rand_slice_segments( + x, phone_lengths, self.segment_size + ) + pitchf_slice = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + mask_slice = commons.slice_segments(x_mask, ids_slice, self.segment_size) + o = self.dec(x_slice, pitchf_slice, mask_slice, g) + return o, ids_slice, x_mask, g + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + x = torch.transpose(phone, 1, -1) + x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to(phone.dtype) + o = self.dec((x * x_mask)[:, :, :max_len], nsff0, x_mask, g) + return o, x_mask, (None, None, None, None) + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, gin_channels, upsample_rates, final_dim=256, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + self.init_kernel_size = upsample_rates[-1] * 3 + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + N = len(upsample_rates) + self.init_conv = norm_f(Conv2d(1, final_dim // (2 ** (N - 1)), (self.init_kernel_size, 1), (upsample_rates[-1], 1))) + self.convs = nn.ModuleList() + for i, u in enumerate(upsample_rates[::-1][1:], start=1): + self.convs.append( + ConvNext2d( + final_dim // (2 ** (N - i)), + final_dim // (2 ** (N - i - 1)), + gin_channels, + (u*3, 1), + (u, 1), + 4, + r=2 + i//2 + ) + ) + self.conv_post = weight_norm(Conv2d(final_dim, 1, (3, 1), (1, 1))) + + def forward(self, x, g): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (n_pad, 0), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + x = torch.flip(x, dims=[2]) + x = F.pad(x, [0, 0, 0, self.init_kernel_size - 1], mode="constant") + x = self.init_conv(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = torch.flip(x, dims=[2]) + fmap.append(x) + + for i, l in enumerate(self.convs): + x = l(x, g) + fmap.append(x) + + x = F.pad(x, [0, 0, 2, 0], mode="constant") + x = self.conv_post(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, upsample_rates, gin_channels, periods=[2, 3, 5, 7, 11, 17], **kwargs): + super(MultiPeriodDiscriminator, self).__init__() + + discs = [ + DiscriminatorP(i, gin_channels, upsample_rates, use_spectral_norm=False) for i in periods + ] + self.ups = np.prod(upsample_rates) + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat, g): + fmap_rs = [] + fmap_gs = [] + y_d_rs = [] + y_d_gs = [] + for d in self.discriminators: + y_d_r, fmap_r = d(y, g) + y_d_g, fmap_g = d(y_hat, g) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs diff --git a/server/voice_changer/RVC/inferencer/voras_beta/modules.py b/server/voice_changer/RVC/inferencer/voras_beta/modules.py new file mode 100644 index 00000000..f6659c68 --- /dev/null +++ b/server/voice_changer/RVC/inferencer/voras_beta/modules.py @@ -0,0 +1,496 @@ +import math + +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import Conv1d, Conv2d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm +from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz + +from . 
import commons, modules +from .commons import get_padding, init_weights +from .transforms import piecewise_rational_quadratic_transform + +LRELU_SLOPE = 0.1 + +class HarmonicEmbedder(nn.Module): + def __init__(self, num_embeddings, embedding_dim, gin_channels, num_head, num_harmonic=0, f0_min=50., f0_max=1100., device="cuda"): + super(HarmonicEmbedder, self).__init__() + self.embedding_dim = embedding_dim + self.num_head = num_head + self.num_harmonic = num_harmonic + + f0_mel_min = np.log(1 + f0_min / 700) + f0_mel_max = np.log(1 + f0_max * (1 + num_harmonic) / 700) + self.sequence = torch.from_numpy(np.linspace(f0_mel_min, f0_mel_max, num_embeddings-2)) + self.emb_layer = torch.nn.Embedding(num_embeddings, embedding_dim) + self.linear_q = Conv1d(gin_channels, num_head * (1 + num_harmonic), 1) + self.weight = None + + def forward(self, x, g): + b, l = x.size() + non_zero = (x != 0.).to(dtype=torch.long).unsqueeze(1) + mel = torch.log(1 + x / 700).unsqueeze(1) + harmonies = torch.arange(1 + self.num_harmonic, device=x.device, dtype=x.dtype).view(1, 1 + self.num_harmonic, 1) + 1. + ix = torch.searchsorted(self.sequence.to(x.device), mel * harmonies).to(x.device) + 1 + ix = ix * non_zero + emb = self.emb_layer(ix).transpose(1, 3).reshape(b, self.num_head, self.embedding_dim // self.num_head, 1 + self.num_harmonic, l) + if self.weight is None: + weight = torch.nn.functional.softmax(self.linear_q(g).reshape(b, self.num_head, 1, 1 + self.num_harmonic, 1), 3) + else: + weight = self.weight + res = torch.sum(emb * weight, dim=3).reshape(b, self.embedding_dim, l) + return res + + def fix_speaker(self, g): + self.weight = torch.nn.functional.softmax(self.linear_q(g).reshape(1, self.num_head, 1, 1 + self.num_harmonic, 1), 3) + + def unfix_speaker(self, g): + self.weight = None + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class DilatedCausalConv1d(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, groups=1, dilation=1, bias=True): + super(DilatedCausalConv1d, self).__init__() + self.kernel_size = kernel_size + self.dilation = dilation + self.stride = stride + self.conv = weight_norm(nn.Conv1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups, dilation=dilation, bias=bias)) + + def forward(self, x): + x = torch.flip(x, [2]) + x = F.pad(x, [0, (self.kernel_size - 1) * self.dilation], mode="constant", value=0.) 
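+
+        # Together with the flips above and below, right-padding by
+        # (kernel_size - 1) * dilation acts as left (causal) padding: the convolution
+        # output is trimmed to the expected number of frames and flipped back, so each
+        # output frame depends only on the current and past input samples.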
+ size = x.shape[2] // self.stride + x = self.conv(x)[:, :, :size] + x = torch.flip(x, [2]) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.conv) + + +class CausalConvTranspose1d(nn.Module): + """ + padding = 0, dilation = 1のとき + + Lout = (Lin - 1) * stride + kernel_rate * stride + output_padding + Lout = Lin * stride + (kernel_rate - 1) * stride + output_padding + output_paddingいらないね + """ + def __init__(self, in_channels, out_channels, kernel_rate=3, stride=1, groups=1): + super(CausalConvTranspose1d, self).__init__() + kernel_size = kernel_rate * stride + self.trim_size = (kernel_rate - 1) * stride + self.conv = weight_norm(nn.ConvTranspose1d(in_channels, out_channels, kernel_size, stride=stride, groups=groups)) + + def forward(self, x): + x = self.conv(x) + return x[:, :, :-self.trim_size] + + def remove_weight_norm(self): + remove_weight_norm(self.conv) + + +class LoRALinear1d(nn.Module): + def __init__(self, in_channels, out_channels, info_channels, r): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.info_channels = info_channels + self.r = r + self.main_fc = weight_norm(nn.Conv1d(in_channels, out_channels, 1)) + self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) + self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) + nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) + nn.init.constant_(self.adapter_out.weight.data, 1e-6) + self.adapter_in = weight_norm(self.adapter_in) + self.adapter_out = weight_norm(self.adapter_out) + self.speaker_fixed = False + + def forward(self, x, g): + x_ = self.main_fc(x) + if not self.speaker_fixed: + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + l = torch.einsum("brl,brc->bcl", torch.einsum("bcl,bcr->brl", x, a_in), a_out) + x_ = x_ + l + return x_ + + def remove_weight_norm(self): + remove_weight_norm(self.main_fc) + remove_weight_norm(self.adapter_in) + remove_weight_norm(self.adapter_out) + + def fix_speaker(self, g): + self.speaker_fixed = True + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2) + self.main_fc.weight.data.add_(weight) + + def unfix_speaker(self, g): + self.speaker_fixed = False + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2) + self.main_fc.weight.data.sub_(weight) + + +class LoRALinear2d(nn.Module): + def __init__(self, in_channels, out_channels, info_channels, r): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.info_channels = info_channels + self.r = r + self.main_fc = weight_norm(nn.Conv2d(in_channels, out_channels, (1, 1), (1, 1))) + self.adapter_in = nn.Conv1d(info_channels, in_channels * r, 1) + self.adapter_out = nn.Conv1d(info_channels, out_channels * r, 1) + nn.init.normal_(self.adapter_in.weight.data, 0, 0.01) + nn.init.constant_(self.adapter_out.weight.data, 1e-6) + self.adapter_in = weight_norm(self.adapter_in) + self.adapter_out = weight_norm(self.adapter_out) + self.speaker_fixed = False + + def forward(self, x, g): + x_ = self.main_fc(x) + if not self.speaker_fixed: + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + l = 
torch.einsum("brhw,brc->bchw", torch.einsum("bchw,bcr->brhw", x, a_in), a_out) + x_ = x_ + l + return x_ + + def remove_weight_norm(self): + remove_weight_norm(self.main_fc) + remove_weight_norm(self.adapter_in) + remove_weight_norm(self.adapter_out) + + def fix_speaker(self, g): + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2).unsqueeze(3) + self.main_fc.weight.data.add_(weight) + + def unfix_speaker(self, g): + a_in = self.adapter_in(g).view(-1, self.in_channels, self.r) + a_out = self.adapter_out(g).view(-1, self.r, self.out_channels) + weight = torch.einsum("bir,bro->oi", a_in, a_out).unsqueeze(2).unsqueeze(3) + self.main_fc.weight.data.sub_(weight) + + +class MBConv2d(torch.nn.Module): + """ + Causal MBConv2D + """ + def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): + super(MBConv2d, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + inner_channels = int(in_channels * extend_ratio) + self.kernel_size = kernel_size + self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) + self.dwconv = norm_f(Conv2d(inner_channels, inner_channels, kernel_size, stride, groups=inner_channels)) + self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) + self.pwnorm = LayerNorm(in_channels) + self.dwnorm = LayerNorm(inner_channels) + + def forward(self, x, g): + x = self.pwnorm(x) + x = self.pwconv1(x, g) + x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") + x = self.dwnorm(x) + x = self.dwconv(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.pwconv2(x, g) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + return x + +class ConvNext2d(torch.nn.Module): + """ + Causal ConvNext Block + stride = 1 only + """ + def __init__(self, in_channels, out_channels, gin_channels, kernel_size, stride, extend_ratio, r, use_spectral_norm=False): + super(ConvNext2d, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + inner_channels = int(in_channels * extend_ratio) + self.kernel_size = kernel_size + self.dwconv = norm_f(Conv2d(in_channels, in_channels, kernel_size, stride, groups=in_channels)) + self.pwconv1 = LoRALinear2d(in_channels, inner_channels, gin_channels, r=r) + self.pwconv2 = LoRALinear2d(inner_channels, out_channels, gin_channels, r=r) + self.act = nn.GELU() + self.norm = LayerNorm(in_channels) + + def forward(self, x, g): + x = F.pad(x, [0, 0, self.kernel_size[0] - 1, 0], mode="constant") + x = self.dwconv(x) + x = self.norm(x) + x = self.pwconv1(x, g) + x = self.act(x) + x = self.pwconv2(x, g) + x = self.act(x) + return x + + def remove_weight_norm(self): + remove_weight_norm(self.dwconv) + + +class WaveBlock(torch.nn.Module): + def __init__(self, inner_channels, gin_channels, kernel_sizes, strides, dilations, extend_rate, r): + super(WaveBlock, self).__init__() + norm_f = weight_norm + extend_channels = int(inner_channels * extend_rate) + self.dconvs = nn.ModuleList() + self.p1convs = nn.ModuleList() + self.p2convs = nn.ModuleList() + self.norms = nn.ModuleList() + self.act = nn.GELU() + + # self.ses = nn.ModuleList() + # self.norms = [] + for i, (k, s, d) in enumerate(zip(kernel_sizes, strides, dilations)): + self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, k, stride=s, dilation=d, groups=inner_channels)) + 
self.p1convs.append(LoRALinear1d(inner_channels, extend_channels, gin_channels, r)) + self.p2convs.append(LoRALinear1d(extend_channels, inner_channels, gin_channels, r)) + self.norms.append(LayerNorm(inner_channels)) + + def forward(self, x, x_mask, g): + x *= x_mask + for i in range(len(self.dconvs)): + residual = x.clone() + x = self.dconvs[i](x) + x = self.norms[i](x) + x *= x_mask + x = self.p1convs[i](x, g) + x = self.act(x) + x = self.p2convs[i](x, g) + x = residual + x + return x + + def remove_weight_norm(self): + for c in self.dconvs: + c.remove_weight_norm() + for c in self.p1convs: + c.remove_weight_norm() + for c in self.p2convs: + c.remove_weight_norm() + + def fix_speaker(self, g): + for c in self.p1convs: + c.fix_speaker(g) + for c in self.p2convs: + c.fix_speaker(g) + + def unfix_speaker(self, g): + for c in self.p1convs: + c.unfix_speaker(g) + for c in self.p2convs: + c.unfix_speaker(g) + + +class SnakeFilter(torch.nn.Module): + """ + Adaptive filter using snakebeta + """ + def __init__(self, channels, groups, kernel_size, num_layers, eps=1e-6): + super(SnakeFilter, self).__init__() + self.eps = eps + self.num_layers = num_layers + inner_channels = channels * groups + self.init_conv = DilatedCausalConv1d(1, inner_channels, kernel_size) + self.dconvs = torch.nn.ModuleList() + self.pconvs = torch.nn.ModuleList() + self.post_conv = DilatedCausalConv1d(inner_channels+1, 1, kernel_size, bias=False) + + for i in range(self.num_layers): + self.dconvs.append(DilatedCausalConv1d(inner_channels, inner_channels, kernel_size, stride=1, groups=inner_channels, dilation=kernel_size ** (i + 1))) + self.pconvs.append(weight_norm(Conv1d(inner_channels, inner_channels, 1, groups=groups))) + self.snake_alpha = torch.nn.Parameter(torch.zeros(inner_channels), requires_grad=True) + self.snake_beta = torch.nn.Parameter(torch.zeros(inner_channels), requires_grad=True) + + def forward(self, x): + y = x.clone() + x = self.init_conv(x) + for i in range(self.num_layers): + # snake activation + x = self.dconvs[i](x) + x = self.pconvs[i](x) + x = x + (1.0 / torch.clip(self.snake_beta.unsqueeze(0).unsqueeze(-1), min=self.eps)) * torch.pow(torch.sin(x * self.snake_alpha.unsqueeze(0).unsqueeze(-1)), 2) + x = torch.cat([x, y], 1) + x = self.post_conv(x) + return x + + def remove_weight_norm(self): + self.init_conv.remove_weight_norm() + for c in self.dconvs: + c.remove_weight_norm() + for c in self.pconvs: + remove_weight_norm(c) + self.post_conv.remove_weight_norm() + +""" +https://github.com/charactr-platform/vocos/blob/main/vocos/heads.py +""" +class FourierHead(nn.Module): + """Base class for inverse fourier modules.""" + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Args: + x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, + L is the sequence length, and H denotes the model dimension. + + Returns: + Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. + """ + raise NotImplementedError("Subclasses must implement the forward method.") + + +class IMDCT(nn.Module): + """ + Inverse Modified Discrete Cosine Transform (IMDCT) module. + + Args: + frame_len (int): Length of the MDCT frame. + padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". 
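+
+    Note:
+        In this implementation each synthesis window spans 2 * frame_len samples and
+        consecutive windows overlap by 50%, so frame_len effectively corresponds to
+        the hop length of the reconstructed audio.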
+ """ + + def __init__(self, frame_len: int, padding: str = "same"): + super().__init__() + if padding not in ["center", "same"]: + raise ValueError("Padding must be 'center' or 'same'.") + self.padding = padding + self.frame_len = frame_len * 2 + N = frame_len + n0 = (N + 1) / 2 + window = torch.from_numpy(scipy.signal.cosine(N * 2)).float() + self.register_buffer("window", window) + + pre_twiddle = torch.exp(1j * torch.pi * n0 * torch.arange(N * 2) / N) + post_twiddle = torch.exp(1j * torch.pi * (torch.arange(N * 2) + n0) / (N * 2)) + self.register_buffer("pre_twiddle", torch.view_as_real(pre_twiddle)) + self.register_buffer("post_twiddle", torch.view_as_real(post_twiddle)) + + def forward(self, X: torch.Tensor) -> torch.Tensor: + """ + Apply the Inverse Modified Discrete Cosine Transform (IMDCT) to the input MDCT coefficients. + + Args: + X (Tensor): Input MDCT coefficients of shape (B, N, L), where B is the batch size, + L is the number of frames, and N is the number of frequency bins. + + Returns: + Tensor: Reconstructed audio waveform of shape (B, T), where T is the length of the audio. + """ + X = X.transpose(1, 2) + B, L, N = X.shape + Y = torch.zeros((B, L, N * 2), dtype=X.dtype, device=X.device) + Y[..., :N] = X + Y[..., N:] = -1 * torch.conj(torch.flip(X, dims=(-1,))) + y = torch.fft.ifft(Y * torch.view_as_complex(self.pre_twiddle).expand(Y.shape), dim=-1) + y = torch.real(y * torch.view_as_complex(self.post_twiddle).expand(y.shape)) * np.sqrt(N) * np.sqrt(2) + result = y * self.window.expand(y.shape) + output_size = (1, (L + 1) * N) + audio = torch.nn.functional.fold( + result.transpose(1, 2), + output_size=output_size, + kernel_size=(1, self.frame_len), + stride=(1, self.frame_len // 2), + )[:, 0, 0, :] + + if self.padding == "center": + pad = self.frame_len // 2 + elif self.padding == "same": + pad = self.frame_len // 4 + else: + raise ValueError("Padding must be 'center' or 'same'.") + + audio = audio[:, pad:-pad] + return audio.unsqueeze(1) + + +class IMDCTSymExpHead(FourierHead): + """ + IMDCT Head module for predicting MDCT coefficients with symmetric exponential function + + Args: + dim (int): Hidden dimension of the model. + mdct_frame_len (int): Length of the MDCT frame. + padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same". + sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized + based on perceptual scaling. Defaults to None. + clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False. + """ + + def __init__( + self, dim: int, gin_channels: int, mdct_frame_len: int, padding: str = "same", sample_rate: int = 24000, + ): + super().__init__() + out_dim = mdct_frame_len + self.dconv = DilatedCausalConv1d(dim, dim, 5, 1, dim, 1) + self.pconv1 = LoRALinear1d(dim, dim * 2, gin_channels, 2) + self.pconv2 = LoRALinear1d(dim * 2, out_dim, gin_channels, 2) + self.act = torch.nn.GELU() + self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding) + + if sample_rate is not None: + # optionally init the last layer following mel-scale + m_max = _hz_to_mel(sample_rate // 2) + m_pts = torch.linspace(0, m_max, out_dim) + f_pts = _mel_to_hz(m_pts) + scale = 1 - (f_pts / f_pts.max()) + + with torch.no_grad(): + self.pconv2.main_fc.weight.mul_(scale.view(-1, 1, 1)) + + def forward(self, x: torch.Tensor, g: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the IMDCTSymExpHead module. 
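+
+        The hidden sequence is projected to MDCT coefficients by a dilated causal
+        convolution and two speaker-conditioned pointwise (LoRALinear1d) layers,
+        mapped through symexp, clipped, and inverted with the IMDCT to produce the
+        waveform.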
+ + Args: + x (Tensor): Input tensor of shape (B, L, H), where B is the batch size, + L is the sequence length, and H denotes the model dimension. + + Returns: + Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal. + """ + x = self.dconv(x) + x = self.pconv1(x, g) + x = self.act(x) + x = self.pconv2(x, g) + x = symexp(x) + x = torch.clip(x, min=-1e2, max=1e2) # safeguard to prevent excessively large magnitudes + audio = self.imdct(x) + return audio + + def remove_weight_norm(self): + self.dconv.remove_weight_norm() + self.pconv1.remove_weight_norm() + self.pconv2.remove_weight_norm() + + def fix_speaker(self, g): + self.pconv1.fix_speaker(g) + self.pconv2.fix_speaker(g) + + def unfix_speaker(self, g): + self.pconv1.unfix_speaker(g) + self.pconv2.unfix_speaker(g) + +def symexp(x: torch.Tensor) -> torch.Tensor: + return torch.sign(x) * (torch.exp(x.abs()) - 1) \ No newline at end of file diff --git a/server/voice_changer/RVC/inferencer/model_v3/transforms.py b/server/voice_changer/RVC/inferencer/voras_beta/transforms.py similarity index 100% rename from server/voice_changer/RVC/inferencer/model_v3/transforms.py rename to server/voice_changer/RVC/inferencer/voras_beta/transforms.py diff --git a/server/voice_changer/RVC/inferencer/model_v3/utils.py b/server/voice_changer/RVC/inferencer/voras_beta/utils.py similarity index 100% rename from server/voice_changer/RVC/inferencer/model_v3/utils.py rename to server/voice_changer/RVC/inferencer/voras_beta/utils.py diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py index 1d8fbd0c..4e24acb1 100644 --- a/server/voice_changer/RVC/pipeline/Pipeline.py +++ b/server/voice_changer/RVC/pipeline/Pipeline.py @@ -3,6 +3,7 @@ from typing import Any import math import torch import torch.nn.functional as F +from torch.cuda.amp import autocast from Exceptions import ( DeviceCannotSupportHalfPrecisionException, DeviceChangingException, @@ -118,10 +119,6 @@ class Pipeline(object): # tensor型調整 feats = audio_pad - if self.isHalf is True: - feats = feats.half() - else: - feats = feats.float() if feats.dim() == 2: # double channels feats = feats.mean(-1) assert feats.dim() == 1, feats.dim() @@ -129,19 +126,20 @@ class Pipeline(object): # embedding padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - try: - feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj) - if torch.isnan(feats).all(): - raise DeviceCannotSupportHalfPrecisionException() - except RuntimeError as e: - if "HALF" in e.__str__().upper(): - raise HalfPrecisionChangingException() - elif "same device" in e.__str__(): - raise DeviceChangingException() - else: - raise e - if protect < 0.5 and search_index: - feats0 = feats.clone() + with autocast(enabled=self.isHalf): + try: + feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj) + if torch.isnan(feats).all(): + raise DeviceCannotSupportHalfPrecisionException() + except RuntimeError as e: + if "HALF" in e.__str__().upper(): + raise HalfPrecisionChangingException() + elif "same device" in e.__str__(): + raise DeviceChangingException() + else: + raise e + if protect < 0.5 and search_index: + feats0 = feats.clone() # Index - feature抽出 # if self.index is not None and self.feature is not None and index_rate != 0: @@ -167,10 +165,8 @@ class Pipeline(object): # recover silient font npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]).astype("float32"), npy]) - if self.isHalf is True: - npy = 
npy.astype("float16") - feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) if protect < 0.5 and search_index: feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) @@ -207,14 +203,15 @@ class Pipeline(object): # 推論実行 try: with torch.no_grad(): - audio1 = ( - torch.clip( - self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), - -1.0, - 1.0, - ) - * 32767.5 - ).data.to(dtype=torch.int16) + with autocast(enabled=self.isHalf): + audio1 = ( + torch.clip( + self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), + -1.0, + 1.0, + ) + * 32767.5 + ).data.to(dtype=torch.int16) except RuntimeError as e: if "HALF" in e.__str__().upper(): print("11", e) From 4d67354ea6f4d358038a256da65213871e969d0b Mon Sep 17 00:00:00 2001 From: nadare <1na2da0re3@gmail.com> Date: Sat, 24 Jun 2023 21:01:50 +0900 Subject: [PATCH 4/4] resolve_conflict --- README.md | 2 -- server/voice_changer/RVC/inferencer/Inferencer.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index d6cb6a81..23b9c72e 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -これはVCClientでv3モデルを試すためのフォーク版です。それ以外の用途ではフォーク元を利用してください - ## VC Client [English](/README_en.md) diff --git a/server/voice_changer/RVC/inferencer/Inferencer.py b/server/voice_changer/RVC/inferencer/Inferencer.py index cf5913bf..3e5a62d8 100644 --- a/server/voice_changer/RVC/inferencer/Inferencer.py +++ b/server/voice_changer/RVC/inferencer/Inferencer.py @@ -8,7 +8,7 @@ from const import EnumInferenceTypes class Inferencer(Protocol): inferencerType: EnumInferenceTypes = EnumInferenceTypes.pyTorchRVC file: str - isHalf: bool = False + isHalf: bool = True gpu: int = 0 model: onnxruntime.InferenceSession | Any | None = None