WIP: integrate vcs to new gui 3

This commit is contained in:
wataru 2023-06-22 06:56:00 +09:00
parent fa7894de50
commit d83590dc35
10 changed files with 1552 additions and 7 deletions

View File

@ -20,7 +20,7 @@ import torch
import onnxruntime import onnxruntime
import pyworld as pw import pyworld as pw
from models import SynthesizerTrn # type:ignore from voice_changer.MMVCv15.models.models import SynthesizerTrn # type:ignore
from voice_changer.MMVCv15.client_modules import ( from voice_changer.MMVCv15.client_modules import (
convert_continuos_f0, convert_continuos_f0,
spectrogram_torch, spectrogram_torch,
@ -156,8 +156,7 @@ class MMVCv15:
def get_info(self): def get_info(self):
data = asdict(self.settings) data = asdict(self.settings)
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.settings.onnxModelFile != "" and self.settings.onnxModelFile is not None else [] data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
return data return data
def get_processing_sampling_rate(self): def get_processing_sampling_rate(self):
@ -231,10 +230,6 @@ class MMVCv15:
return [spec, f0, sid] return [spec, f0, sid]
def _onnx_inference(self, data): def _onnx_inference(self, data):
if self.settings.onnxModelFile == "" and self.settings.onnxModelFile is None:
print("[Voice Changer] No ONNX session.")
raise NoModeLoadedException("ONNX")
spec, f0, sid_src = data spec, f0, sid_src = data
spec = spec.unsqueeze(0) spec = spec.unsqueeze(0)
spec_lengths = torch.tensor([spec.size(2)]) spec_lengths = torch.tensor([spec.size(2)])

View File

@ -0,0 +1,27 @@
import torch
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise the weights of convolution-like modules in place.

    Meant to be used as a callback for ``nn.Module.apply``. A module counts
    as a convolution when its class name contains ``"Conv"``; any other
    module is left untouched.

    Args:
        m: Module to inspect.
        mean (float): Mean of the normal distribution.
        std (float): Standard deviation of the normal distribution.
    """
    if "Conv" in type(m).__name__:
        m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
    """Return the one-sided padding computed as ``(k * d - d) / 2``.

    Args:
        kernel_size (int): Convolution kernel size.
        dilation (int): Convolution dilation.

    Returns:
        int: The padding amount, truncated towards zero.
    """
    receptive_span = kernel_size * dilation - dilation
    return int(receptive_span / 2)
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Apply a gated activation to the sum of two tensors.

    The inputs are added, the result is split along dim 1 at
    ``n_channels[0]``, and the first part passed through ``tanh`` is
    multiplied by the second part passed through ``sigmoid``.

    Args:
        input_a (Tensor): First operand, indexed as ``[:, channel, :]``.
        input_b (Tensor): Second operand, added to ``input_a``.
        n_channels: Indexable whose first element is the split position.

    Returns:
        Tensor: ``tanh(first part) * sigmoid(second part)``.
    """
    split_at = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split_at, :])
    gate_part = torch.sigmoid(summed[:, split_at:, :])
    return tanh_part * gate_part
def sequence_mask(length, max_length=None):
    """Build a boolean mask marking the valid positions of each sequence.

    Args:
        length (Tensor): 1-D tensor of sequence lengths.
        max_length: Width of the mask; defaults to ``length.max()``.

    Returns:
        Tensor: Boolean tensor of shape ``(len(length), max_length)`` where
        entry ``[i, j]`` is True iff ``j < length[i]``.
    """
    width = length.max() if max_length is None else max_length
    positions = torch.arange(width, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]

View File

@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Reo Yoneyama (Nagoya University)
# MIT License (https://opensource.org/licenses/MIT)
"""Feature-related functions.
References:
- https://github.com/bigpon/QPPWG
- https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts
"""
import sys
from logging import getLogger
import numpy as np
import torch
from torch.nn.functional import interpolate
# A logger for this file
logger = getLogger(__name__)
def validate_length(xs, ys=None, hop_size=None):
    """Trim features (and optionally audios) to mutually consistent lengths.

    All features are cut to the shortest feature length. When audios are
    given, the frame count and sample count are additionally reconciled so
    that ``n_samples == n_frames * hop_size``.

    Args:
        xs (list of ndarray): Feature arrays, time on axis 0.
        ys (list of ndarray): Audio arrays, time on axis 0 (optional).
        hop_size (int): Upsampling factor; required when ``ys`` is given.

    Returns:
        list: The trimmed features, followed by the trimmed audios when
        ``ys`` is given.
    """
    n_frames = min(feat.shape[0] for feat in xs)
    if ys is None:
        return [feat[:n_frames] for feat in xs]

    n_samples = min(wav.shape[0] for wav in ys)
    # Too little audio for the frames: drop trailing frames first ...
    if n_samples < n_frames * hop_size:
        n_frames = n_samples // hop_size
    # ... then drop any audio beyond the last complete frame.
    if n_samples > n_frames * hop_size:
        n_samples = n_frames * hop_size

    trimmed_audio = [wav[:n_samples] for wav in ys]
    trimmed_feats = [feat[:n_frames] for feat in xs]
    return trimmed_feats + trimmed_audio
def dilated_factor(batch_f0, fs, dense_factor):
    """Compute pitch-dependent dilated factors.

    NOTE: ``batch_f0`` is modified in place. Every entry equal to 0 is
    overwritten with ``fs / dense_factor`` (which makes the factor for those
    entries 1), and the caller's tensor keeps that change after the call.

    Args:
        batch_f0 (Tensor): The F0 sequence; mutated as described above.
        fs (int): Sampling rate.
        dense_factor (int): The number of taps in one cycle.

    Returns:
        Tensor: ``fs / dense_factor / batch_f0`` with the shape of
        ``batch_f0``.
    """
    unvoiced_fill = fs / dense_factor
    batch_f0[batch_f0 == 0] = unvoiced_fill
    numerator = torch.ones_like(batch_f0) * fs / dense_factor
    return numerator / batch_f0
class SignalGenerator:
    """Input signal generator module.

    Turns a frame-level F0 tensor into sample-level input signals (sine,
    harmonic sines, Gaussian noise and/or a voiced/unvoiced mask). Each F0
    frame is stretched to ``hop_size`` samples with
    ``torch.nn.functional.interpolate``.
    """

    # Signal type names accepted in ``signal_types``.
    _SUPPORTED_SIGNAL_TYPES = ("noise", "sine", "sines", "uv")

    def __init__(
        self,
        sample_rate=24000,
        hop_size=120,
        sine_amp=0.1,
        noise_amp=0.003,
        signal_types=None,
    ):
        """Initialize SignalGenerator module.

        Args:
            sample_rate (int): Sampling rate.
            hop_size (int): Hop size of input F0.
            sine_amp (float): Sine amplitude for NSF-based sine generation.
            noise_amp (float): Noise amplitude for NSF-based sine generation.
            signal_types (list): List of input signal types for generator.
                Defaults to ``["sine", "noise"]``.

        Raises:
            ValueError: If an entry of ``signal_types`` is not one of
                "noise", "sine", "sines" or "uv".
        """
        if signal_types is None:
            # Build a fresh list per instance instead of sharing one default.
            signal_types = ["sine", "noise"]
        for signal_type in signal_types:
            if signal_type not in self._SUPPORTED_SIGNAL_TYPES:
                # A library must not terminate the host process (and certainly
                # not with a success status); report the bad argument instead.
                raise ValueError(f"{signal_type} is not supported type for generator input.")

        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.signal_types = signal_types
        self.sine_amp = sine_amp
        self.noise_amp = noise_amp

    @torch.no_grad()
    def __call__(self, f0, f0_scale=1.0):
        """Generate every configured signal and stack them along dim 1.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
            f0_scale (float): Factor the stacked signals are multiplied by.

        Returns:
            Tensor: Signals in ``signal_types`` order, concatenated on dim 1
            and multiplied by ``f0_scale``.
        """
        builders = {
            "noise": self.random_noise,
            "sine": self.sinusoid,
            "sines": self.sinusoids,
            "uv": self.vuv_binary,
        }
        signals = [builders[typ](f0) for typ in self.signal_types if typ in builders]
        input_batch = signals[0]
        for signal in signals[1:]:
            input_batch = torch.cat([input_batch, signal], axis=1)
        return input_batch * f0_scale

    def _upsampled_vuv(self, f0):
        """Return the voiced (f0 > 0) mask stretched to sample resolution."""
        n_frames = f0.size(2)
        return interpolate((f0 > 0) * torch.ones_like(f0), n_frames * self.hop_size)

    def _with_noise(self, signal, vuv, f0):
        """Add Gaussian noise to ``signal`` when ``noise_amp`` is positive.

        Unvoiced samples receive one third of the voiced noise amplitude.
        """
        if self.noise_amp > 0:
            batch, _, n_frames = f0.size()
            noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0
            noise = torch.randn((batch, 1, n_frames * self.hop_size), device=f0.device) * noise_amp
            signal = signal + noise
        return signal

    @torch.no_grad()
    def random_noise(self, f0):
        """Calculate noise signals.

        ``noise_amp`` is not applied here; the noise has unit variance.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).

        Returns:
            Tensor: Gaussian noise signals (B, 1, T).
        """
        batch, _, n_frames = f0.size()
        return torch.randn((batch, 1, n_frames * self.hop_size), device=f0.device)

    @torch.no_grad()
    def sinusoid(self, f0):
        """Calculate sine signals.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).

        Returns:
            Tensor: Sines generated following NSF (B, 1, T).
        """
        n_frames = f0.size(2)
        vuv = self._upsampled_vuv(f0)
        # Per-sample phase increment in cycles, wrapped to [0, 1).
        radious = (interpolate(f0, n_frames * self.hop_size) / self.sample_rate) % 1
        sine = vuv * torch.sin(torch.cumsum(radious, dim=2) * 2 * np.pi) * self.sine_amp
        return self._with_noise(sine, vuv, f0)

    @torch.no_grad()
    def sinusoids(self, f0):
        """Calculate the average of the first five harmonics.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).

        Returns:
            Tensor: Sines generated following NSF (B, 1, T).
        """
        frame_f0 = f0
        n_frames = frame_f0.size(2)
        vuv = self._upsampled_vuv(frame_f0)
        sample_f0 = interpolate(frame_f0, n_frames * self.hop_size)
        sines = torch.zeros_like(sample_f0, device=sample_f0.device)
        harmonics = 5  # currently only fixed number of harmonics is supported
        for i in range(harmonics):
            radious = (sample_f0 * (i + 1) / self.sample_rate) % 1
            sines += torch.sin(torch.cumsum(radious, dim=2) * 2 * np.pi)
        sines = self.sine_amp * sines * vuv / harmonics
        return self._with_noise(sines, vuv, frame_f0)

    @torch.no_grad()
    def vuv_binary(self, f0):
        """Calculate V/UV binary sequences.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).

        Returns:
            Tensor: V/UV binary sequences (B, 1, T).
        """
        return self._upsampled_vuv(f0)

View File

@ -0,0 +1,312 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Reo Yoneyama (Nagoya University)
# MIT License (https://opensource.org/licenses/MIT)
"""HiFiGAN and SiFiGAN Generator modules.
References:
- https://github.com/kan-bayashi/ParallelWaveGAN
- https://github.com/bigpon/QPPWG
- https://github.com/jik876/hifi-gan
"""
from logging import getLogger
import torch.nn as nn
from .residual_block import AdaptiveResidualBlock, Conv1d, ResidualBlock
# A logger for this file
logger = getLogger(__name__)
class SiFiGANGenerator(nn.Module):
    """SiFiGAN generator module.

    Holds two sub-networks in ``nn.ModuleDict`` containers: ``self.sn``
    (source-network) and ``self.fn`` (filter-network). ``forward`` runs the
    source-network first and feeds its hidden features into the
    filter-network.
    """

    def __init__(
        self,
        in_channels,
        out_channels=1,
        channels=512,
        kernel_size=7,
        upsample_scales=(5, 4, 3, 2),
        upsample_kernel_sizes=(10, 8, 6, 4),
        source_network_params={
            "resblock_kernel_size": 3,  # currently only 3 is supported.
            "resblock_dilations": [(1,), (1, 2), (1, 2, 4), (1, 2, 4, 8)],
            "use_additional_convs": True,
        },
        filter_network_params={
            "resblock_kernel_sizes": (3, 5, 7),
            "resblock_dilations": [(1, 3, 5), (1, 3, 5), (1, 3, 5)],
            "use_additional_convs": False,
        },
        share_upsamples=False,
        share_downsamples=False,
        bias=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
        use_weight_norm=True,
        requires_grad=True,
    ):
        """Initialize SiFiGANGenerator module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            channels (int): Number of hidden representation channels.
            kernel_size (int): Kernel size of initial and final conv layer.
            upsample_scales (list): List of upsampling scales.
            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
                Each entry must equal twice the matching upsampling scale.
            source_network_params (dict): Parameters for source-network.
            filter_network_params (dict): Parameters for filter-network.
            share_upsamples (bool): Whether to share up-sampling transposed CNNs.
            share_downsamples (bool): Whether to share down-sampling CNNs.
            bias (bool): Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): Activation function module name.
            nonlinear_activation_params (dict): Hyperparameters for activation function.
            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
            requires_grad (bool): If False, every parameter of this module
                is frozen (``requires_grad = False``) after construction.

        """
        super().__init__()
        # check hyperparameters are valid
        assert kernel_size % 2 == 1, "Kernel size must be odd number."
        assert len(upsample_scales) == len(upsample_kernel_sizes)

        # define modules
        self.num_upsamples = len(upsample_kernel_sizes)
        self.source_network_params = source_network_params
        self.filter_network_params = filter_network_params
        self.share_upsamples = share_upsamples
        self.share_downsamples = share_downsamples
        self.sn = nn.ModuleDict()
        self.fn = nn.ModuleDict()
        self.input_conv = Conv1d(
            in_channels,
            channels,
            kernel_size,
            bias=bias,
            padding=(kernel_size - 1) // 2,
        )
        self.sn["upsamples"] = nn.ModuleList()
        self.fn["upsamples"] = nn.ModuleList()
        self.sn["blocks"] = nn.ModuleList()
        self.fn["blocks"] = nn.ModuleList()
        # One stage per upsampling scale; the channel count halves at each stage.
        for i in range(len(upsample_kernel_sizes)):
            assert upsample_kernel_sizes[i] == 2 * upsample_scales[i]
            self.sn["upsamples"] += [
                nn.Sequential(
                    getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
                    nn.ConvTranspose1d(
                        channels // (2**i),
                        channels // (2 ** (i + 1)),
                        upsample_kernel_sizes[i],
                        upsample_scales[i],
                        padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
                        output_padding=upsample_scales[i] % 2,
                        bias=bias,
                    ),
                )
            ]
            # The filter-network only gets its own upsamplers when they are
            # not shared; otherwise fn["upsamples"] stays empty and forward()
            # uses sn["upsamples"].
            if not share_upsamples:
                self.fn["upsamples"] += [
                    nn.Sequential(
                        getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
                        nn.ConvTranspose1d(
                            channels // (2**i),
                            channels // (2 ** (i + 1)),
                            upsample_kernel_sizes[i],
                            upsample_scales[i],
                            padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
                            output_padding=upsample_scales[i] % 2,
                            bias=bias,
                        ),
                    )
                ]
            self.sn["blocks"] += [
                AdaptiveResidualBlock(
                    kernel_size=source_network_params["resblock_kernel_size"],
                    channels=channels // (2 ** (i + 1)),
                    dilations=source_network_params["resblock_dilations"][i],
                    bias=bias,
                    use_additional_convs=source_network_params["use_additional_convs"],
                    nonlinear_activation=nonlinear_activation,
                    nonlinear_activation_params=nonlinear_activation_params,
                )
            ]
            # fn["blocks"] is a flat list: stage i owns the entries
            # [i * num_blocks, (i + 1) * num_blocks), as indexed in forward().
            for j in range(len(filter_network_params["resblock_kernel_sizes"])):
                self.fn["blocks"] += [
                    ResidualBlock(
                        kernel_size=filter_network_params["resblock_kernel_sizes"][j],
                        channels=channels // (2 ** (i + 1)),
                        dilations=filter_network_params["resblock_dilations"][j],
                        bias=bias,
                        use_additional_convs=filter_network_params["use_additional_convs"],
                        nonlinear_activation=nonlinear_activation,
                        nonlinear_activation_params=nonlinear_activation_params,
                    )
                ]
        # NOTE: ``i`` below is the last index left over from the stage loop
        # above, so the output convs take the channel count of the final stage.
        self.sn["output_conv"] = nn.Sequential(
            nn.LeakyReLU(),
            nn.Conv1d(
                channels // (2 ** (i + 1)),
                out_channels,
                kernel_size,
                bias=bias,
                padding=(kernel_size - 1) // 2,
            ),
        )
        self.fn["output_conv"] = nn.Sequential(
            nn.LeakyReLU(),
            nn.Conv1d(
                channels // (2 ** (i + 1)),
                out_channels,
                kernel_size,
                bias=bias,
                padding=(kernel_size - 1) // 2,
            ),
            nn.Tanh(),
        )

        # sine embedding layers
        self.sn["emb"] = Conv1d(
            1,
            channels // (2 ** len(upsample_kernel_sizes)),
            kernel_size,
            bias=bias,
            padding=(kernel_size - 1) // 2,
        )
        # down-sampling CNNs
        # Built in reversed stage order and skipping stage 0, so the list has
        # num_upsamples - 1 entries.
        self.sn["downsamples"] = nn.ModuleList()
        for i in reversed(range(1, len(upsample_kernel_sizes))):
            self.sn["downsamples"] += [
                nn.Sequential(
                    nn.Conv1d(
                        channels // (2 ** (i + 1)),
                        channels // (2**i),
                        upsample_kernel_sizes[i],
                        upsample_scales[i],
                        padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0),
                        bias=bias,
                    ),
                    getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
                )
            ]
        # When down-samplers are shared, fn["downsamples"] is never created
        # and forward() uses sn["downsamples"] instead.
        if not share_downsamples:
            self.fn["downsamples"] = nn.ModuleList()
            for i in reversed(range(1, len(upsample_kernel_sizes))):
                self.fn["downsamples"] += [
                    nn.Sequential(
                        nn.Conv1d(
                            channels // (2 ** (i + 1)),
                            channels // (2**i),
                            upsample_kernel_sizes[i],
                            upsample_scales[i],
                            padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0),
                            bias=bias,
                        ),
                        getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
                    )
                ]

        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()

        # reset parameters
        # (runs whether or not weight norm was applied above)
        self.reset_parameters()

        # Freeze every parameter when the caller asked for a non-trainable module.
        if requires_grad is False:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, x, c, d, sid):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input sine signal (B, 1, T).
            c (Tensor): Input tensor (B, in_channels, T).
            d (List): F0-dependent dilation factors [(B, 1, T) x num_upsamples].
            sid: Accepted but not used anywhere in this method.

        Returns:
            Tensor: Output tensor (B, out_channels, T), produced by the
                filter-network output conv.
            Tensor: Output of the source-network output conv.

        """
        # currently, same input feature is input to each network
        c = self.input_conv(c)
        e = c

        # source-network forward
        # Embed the sine signal, then down-sample it step by step; embs ends
        # up ordered from finest to coarsest resolution.
        x = self.sn["emb"](x)
        embs = [x]
        for i in range(self.num_upsamples - 1):
            x = self.sn["downsamples"][i](x)
            embs += [x]
        for i in range(self.num_upsamples):
            # excitation generation network
            # embs[-i - 1] walks the embeddings from coarsest to finest.
            e = self.sn["upsamples"][i](e) + embs[-i - 1]
            e = self.sn["blocks"][i](e, d[i])
        e_ = self.sn["output_conv"](e)

        # filter-network forward
        # The source-network's hidden features (before its output conv) are
        # down-sampled the same way and injected into the filter-network.
        embs = [e]
        for i in range(self.num_upsamples - 1):
            if self.share_downsamples:
                e = self.sn["downsamples"][i](e)
            else:
                e = self.fn["downsamples"][i](e)
            embs += [e]
        num_blocks = len(self.filter_network_params["resblock_kernel_sizes"])
        for i in range(self.num_upsamples):
            # resonance filtering network
            if self.share_upsamples:
                c = self.sn["upsamples"][i](c) + embs[-i - 1]
            else:
                c = self.fn["upsamples"][i](c) + embs[-i - 1]
            # Average the outputs of this stage's residual blocks.
            cs = 0.0  # initialize
            for j in range(num_blocks):
                cs += self.fn["blocks"][i * num_blocks + j](c)
            c = cs / num_blocks
        c = self.fn["output_conv"](c)

        return c, e_

    def reset_parameters(self):
        """Reset parameters.

        Draws the weights of every ``nn.Conv1d`` / ``nn.ConvTranspose1d``
        sub-module from a normal distribution with mean 0.0 and std 0.01.

        This initialization follows the official implementation manner.
        https://github.com/jik876/hifi-gan/blob/master/models.py

        """

        def _reset_parameters(m):
            if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
                m.weight.data.normal_(0.0, 0.01)
                logger.debug(f"Reset parameters in {m}.")

        self.apply(_reset_parameters)

    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""

        def _remove_weight_norm(m):
            try:
                # The debug line is emitted before the removal is attempted,
                # so it also appears for modules that raise ValueError below.
                logger.debug(f"Weight norm is removed from {m}.")
                nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(_remove_weight_norm)

    def apply_weight_norm(self):
        """Apply weight normalization to every Conv1d / ConvTranspose1d layer."""

        def _apply_weight_norm(m):
            if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
                nn.utils.weight_norm(m)
                logger.debug(f"Weight norm is applied to {m}.")

        self.apply(_apply_weight_norm)

View File

@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
# Copyright 2020 Yi-Chiao Wu (Nagoya University)
# MIT License (https://opensource.org/licenses/MIT)
"""Indexing-related functions."""
import torch
from torch.nn import ConstantPad1d as pad1d
def pd_indexing(x, d, dilation, batch_index, ch_index):
    """Pitch-dependent indexing of past and future samples.

    For every time step ``t`` the feature map is read at ``t - d * dilation``
    (past) and ``t + d * dilation`` (future), with offsets rounded to the
    nearest integer. ``x`` is zero-padded just enough on the relevant side
    for the furthest offset to stay in range.

    Args:
        x (Tensor): Input feature map (B, C, T).
        d (Tensor): Input pitch-dependent dilated factors (B, 1, T).
        dilation (Int): Dilation size.
        batch_index (Tensor): Batch index
        ch_index (Tensor): Channel index

    Returns:
        Tensor: Past output tensor (B, out_channels, T)
        Tensor: Future output tensor (B, out_channels, T)

    """
    _, _, n_steps = d.size()
    offsets = d * dilation

    # Past samples: negative (from-the-end) positions shifted further back.
    past_pos = torch.arange(-n_steps, 0).float().to(x.device)
    past_pos = torch.add(-offsets, past_pos).round().long()
    pad_left = -(torch.min(past_pos) + n_steps)
    assert pad_left >= 0
    x_past = pad1d((pad_left, 0), 0)(x)

    # Future samples: forward positions shifted further ahead.
    future_pos = torch.arange(0, n_steps).float().to(x.device)
    future_pos = torch.add(offsets, future_pos).round().long()
    pad_right = torch.max(future_pos) - (n_steps - 1)
    assert pad_right >= 0
    x_future = pad1d((0, pad_right), 0)(x)

    past = x_past[(batch_index, ch_index, past_pos)]
    future = x_future[(batch_index, ch_index, future_pos)]
    return past, future
def index_initial(n_batch, n_ch, tensor=True):
    """Tensor batch and channel index initialization.

    Both results have the nested shape ``(n_batch, n_ch, 1)``: the batch
    index holds the batch number at every position and the channel index
    holds the channel number. Tensors are moved to CUDA when it is available.

    Args:
        n_batch (Int): Number of batch.
        n_ch (Int): Number of channel.
        tensor (bool): Return tensor or nested Python lists

    Returns:
        Tensor: Batch index
        Tensor: Channel index

    """
    # List repetition (rather than fresh inner lists) keeps the same object
    # sharing between rows as the plain-list result always had.
    batch_index = [[[b]] * n_ch for b in range(n_batch)]
    ch_index = [[[c] for c in range(n_ch)]] * n_batch

    if not tensor:
        return batch_index, ch_index

    batch_index = torch.tensor(batch_index)
    ch_index = torch.tensor(ch_index)
    if torch.cuda.is_available():
        batch_index = batch_index.cuda()
        ch_index = ch_index.cuda()
    return batch_index, ch_index

View File

@ -0,0 +1,438 @@
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
from .modules import ResidualCouplingLayer, Flip, WN, ResBlock1, ResBlock2, LRELU_SLOPE
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from .commons import init_weights, get_padding, sequence_mask
from .generator import SiFiGANGenerator
from .features import SignalGenerator, dilated_factor
class TextEncoder(nn.Module):
    """Project hidden features to the mean / log-scale statistics.

    A single 1x1 convolution maps ``hidden_channels`` to
    ``2 * out_channels``; the result is split into ``m`` and ``logs``.
    """

    def __init__(self, out_channels, hidden_channels, requires_grad=True):
        """Build the projection layer.

        Args:
            out_channels (int): Channels of each of ``m`` and ``logs``.
            hidden_channels (int): Channels of the input features.
            requires_grad (bool): If False, all parameters are frozen.
        """
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

        # Do not train the parameters.
        if requires_grad is False:
            for weight in self.parameters():
                weight.requires_grad = False

    def forward(self, x, x_lengths):
        """Return ``(x, m, logs, x_mask)`` for a batch of features.

        The input is cast to half precision and its dims 1 and -1 are
        swapped before the projection.

        Args:
            x (Tensor): Input features; transposed to [b, h, t] internally.
            x_lengths (Tensor): Valid length of each sequence.
        """
        x = x.half().transpose(1, -1)  # [b, h, t]
        valid = sequence_mask(x_lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)
        stats = self.proj(x) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)
        return x, m, logs, x_mask
class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.

    The stack can be run front-to-back (``reverse=False``) or back-to-front
    (``reverse=True``).
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0, requires_grad=True):
        """Create the flow stack.

        Args:
            channels (int): Passed to every coupling layer.
            hidden_channels (int): Passed to every coupling layer.
            kernel_size (int): Passed to every coupling layer.
            dilation_rate (int): Passed to every coupling layer.
            n_layers (int): Passed to every coupling layer.
            n_flows (int): Number of (coupling layer, flip) pairs.
            gin_channels (int): Passed to every coupling layer.
            requires_grad (bool): If False, all parameters are frozen.
        """
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)
            self.flows.append(coupling)
            self.flows.append(Flip())

        # Do not train the parameters.
        if requires_grad is False:
            for weight in self.parameters():
                weight.requires_grad = False

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows over ``x`` in forward or reverse order.

        Args:
            x (Tensor): Input tensor.
            x_mask (Tensor): Mask handed to every flow.
            g (Tensor): Optional conditioning handed to every flow.
            reverse (bool): Traverse the flows in reverse order when true.

        Returns:
            Tensor: The transformed ``x``.
        """
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            # In the forward direction each flow returns a pair; only the
            # first element is carried on.
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
class PosteriorEncoder(nn.Module):
    """Encode an input into a sampled latent ``z`` plus its statistics.

    Pipeline: 1x1 conv (``pre``) -> ``WN`` stack (``enc``) -> 1x1 conv
    (``proj``) producing mean and log-scale, from which ``z`` is sampled.
    """

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, requires_grad=True):
        """Build the encoder layers.

        Args:
            in_channels (int): Channels of the input.
            out_channels (int): Channels of the latent ``z``.
            hidden_channels (int): Channels inside the WN stack.
            kernel_size (int): Passed to ``WN``.
            dilation_rate (int): Passed to ``WN``.
            n_layers (int): Passed to ``WN``.
            gin_channels (int): Passed to ``WN``.
            requires_grad (bool): If False, all parameters are frozen.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

        # Do not train the parameters.
        if requires_grad is False:
            for weight in self.parameters():
                weight.requires_grad = False

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)``.

        Args:
            x (Tensor): Input, with time on dim 2.
            x_lengths (Tensor): Valid length of each sequence.
            g (Tensor): Optional conditioning passed to the WN stack.
        """
        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)
        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)
        # Sample: mean plus unit Gaussian noise scaled by exp(logs).
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
class Generator(torch.nn.Module):
    """HiFi-GAN style upsampling generator.

    A pre-conv is followed by weight-normalised transposed convolutions;
    after each upsampling step the outputs of ``num_kernels`` residual
    blocks are averaged. A final conv plus ``tanh`` produces one channel.
    """

    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, requires_grad=True):
        """Build the generator.

        Args:
            initial_channel (int): Channels of the input.
            resblock (str): ``"1"`` selects ``ResBlock1``; anything else
                selects ``ResBlock2``.
            resblock_kernel_sizes (list): Kernel size per residual block.
            resblock_dilation_sizes (list): Dilations per residual block.
            upsample_rates (list): Stride of each upsampling layer.
            upsample_initial_channel (int): Channels after the pre-conv;
                halved at every upsampling step.
            upsample_kernel_sizes (list): Kernel size of each upsampling layer.
            requires_grad (bool): If False, all parameters are frozen.
        """
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        block_cls = ResBlock1 if resblock == "1" else ResBlock2

        self.ups = nn.ModuleList()
        for level, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            ch_in = upsample_initial_channel // (2**level)
            ch_out = upsample_initial_channel // (2 ** (level + 1))
            up = ConvTranspose1d(ch_in, ch_out, kernel, rate, padding=(kernel - rate) // 2)
            self.ups.append(weight_norm(up))

        self.resblocks = nn.ModuleList()
        for level in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (level + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(block_cls(ch, k, d))

        # ``ch`` is the channel count of the last upsampling level.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if requires_grad is False:
            for weight in self.parameters():
                weight.requires_grad = False

    def forward(self, x, g=None):
        """Upsample ``x``; ``g`` is accepted but not used in this method."""
        x = self.conv_pre(x)
        for level in range(self.num_upsamples):
            x = self.ups[level](F.leaky_relu(x, LRELU_SLOPE))
            first = level * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                branch = self.resblocks[first + offset](x)
                if xs is None:
                    xs = branch
                else:
                    xs += branch
            x = xs / self.num_kernels
        x = self.conv_post(F.leaky_relu(x))
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight norm from the upsamplers and the residual blocks."""
        print("Removing weight norm...")
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the waveform by ``period`` and applies 2-D convs."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        """Build the convolution stack.

        Args:
            period (int): Width the time axis is folded into.
            kernel_size (int): Kernel height of the stacked convs.
            stride (int): Stride along the folded time axis (the last
                stacked conv uses stride 1).
            use_spectral_norm (bool): Weight norm is used only when this is
                exactly ``False``; otherwise spectral norm.
        """
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm

        pad = (get_padding(kernel_size, 1), 0)
        strided_channels = [(1, 32), (32, 128), (128, 512), (512, 1024)]
        layers = [norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad)) for c_in, c_out in strided_channels]
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened score, feature maps)`` for ``x`` of shape (b, c, t)."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of (grouped) 1-D convolutions."""

    def __init__(self, use_spectral_norm=False):
        """Build the convolution stack.

        Args:
            use_spectral_norm (bool): Weight norm is used only when this is
                exactly ``False``; otherwise spectral norm.
        """
        super().__init__()
        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm

        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            [norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad)) for c_in, c_out, kernel, stride, groups, pad in layer_specs]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened score, feature maps)`` for ``x``."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by a ``DiscriminatorP`` per period."""

    def __init__(self, use_spectral_norm=False):
        """Create the discriminators for periods 3, 5, 7, 11 and 13.

        Args:
            use_spectral_norm (bool): Forwarded to every sub-discriminator.
        """
        super().__init__()
        # periods = [2,3,5,7,11]
        periods = [3, 5, 7, 11, 13]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        for period in periods:
            members.append(DiscriminatorP(period, use_spectral_norm=use_spectral_norm))
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat, flag=True):
        """Score real and generated audio.

        Args:
            y (Tensor): Real audio; only used when ``flag`` is true.
            y_hat (Tensor): Generated audio.
            flag (bool): When true, return scores and feature maps for both
                inputs. When false, only the scores for ``y_hat`` are
                computed, under ``torch.no_grad()``.

        Returns:
            ``(y_d_rs, y_d_gs, fmap_rs, fmap_gs)`` when ``flag`` is true,
            otherwise the list ``y_d_gs``.
        """
        if not flag:
            y_d_gs = []
            with torch.no_grad():
                for disc in self.discriminators:
                    score_g, _ = disc(y_hat)
                    y_d_gs.append(score_g)
            return y_d_gs

        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for disc in self.discriminators:
            score_r, feats_r = disc(y)
            score_g, feats_g = disc(y_hat)
            y_d_rs.append(score_r)
            y_d_gs.append(score_g)
            fmap_rs.append(feats_r)
            fmap_gs.append(feats_g)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class SynthesizerTrn(nn.Module):
"""
Synthesizer for Training
"""
def __init__(
self,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_flow,
dec_out_channels=1,
dec_kernel_size=7,
n_speakers=0,
gin_channels=0,
requires_grad_pe=True,
requires_grad_flow=True,
requires_grad_text_enc=True,
requires_grad_dec=True,
requires_grad_emb_g=True,
sample_rate=24000,
hop_size=128,
sine_amp=0.1,
noise_amp=0.003,
signal_types=["sine"],
dense_factors=[0.5, 1, 4, 8],
upsample_scales=[8, 4, 2, 2],
):
super().__init__()
self.spec_channels = spec_channels
self.hidden_channels = hidden_channels
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.dec_out_channels = dec_out_channels
self.dec_kernel_size = dec_kernel_size
self.n_speakers = n_speakers
self.gin_channels = gin_channels
self.requires_grad_pe = requires_grad_pe
self.requires_grad_flow = requires_grad_flow
self.requires_grad_text_enc = requires_grad_text_enc
self.requires_grad_dec = requires_grad_dec
self.requires_grad_emb_g = requires_grad_emb_g
self.sample_rate = sample_rate
self.hop_size = hop_size
self.sine_amp = sine_amp
self.noise_amp = noise_amp
self.signal_types = signal_types
self.dense_factors = dense_factors
self.upsample_scales = upsample_scales
self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels, requires_grad=requires_grad_pe)
self.enc_p = TextEncoder(inter_channels, hidden_channels, requires_grad=requires_grad_text_enc)
self.dec = SiFiGANGenerator(in_channels=inter_channels, out_channels=dec_out_channels, channels=upsample_initial_channel, kernel_size=dec_kernel_size, upsample_scales=upsample_rates, upsample_kernel_sizes=upsample_kernel_sizes, requires_grad=requires_grad_dec)
self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, n_flows=n_flow, gin_channels=gin_channels, requires_grad=requires_grad_flow)
self.signal_generator = SignalGenerator(sample_rate=sample_rate, hop_size=hop_size, noise_amp=noise_amp, signal_types=signal_types)
if n_speakers > 1:
self.emb_g = nn.Embedding(n_speakers, gin_channels)
self.emb_g.requires_grad = requires_grad_emb_g
def forward(self, x, x_lengths, y, y_lengths, f0, slice_id, sid=None, target_ids=None):
pass
# sin, d = self.make_sin_d(f0)
# x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
# # target sid 作成
# target_sids = self.make_random_target_sids(target_ids, sid)
# if self.n_speakers > 0:
# g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
# tgt_g = self.emb_g(target_sids).unsqueeze(-1) # [b, h, 1]
# else:
# g = None
# # PE
# z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
# # Flow
# z_p = self.flow(z, y_mask, g=g)
# # VC
# tgt_z = self.flow(z_p, y_mask, g=tgt_g, reverse=True)
# # アライメントの作成
# liner_alignment = F.one_hot(torch.arange(0, x.shape[2] + 2)).cuda()
# liner_alignment = torch.stack([liner_alignment for _ in range(x.shape[0])], axis=0)
# liner_alignment = F.interpolate(liner_alignment.float(), size=(z.shape[2]), mode="linear", align_corners=True)
# liner_alignment = liner_alignment[:, 1:-1, :]
# # TextEncとPEのshape合わせ
# m_p = torch.matmul(m_p, liner_alignment)
# logs_p = torch.matmul(logs_p, liner_alignment)
# # slice
# z_slice = slice_segments(z, slice_id, self.segment_size)
# # targetのslice
# tgt_z_slice = slice_segments(tgt_z, slice_id, self.segment_size)
# # Dec
# o = self.dec(sin, z_slice, d, sid=g)
# tgt_o = self.dec(sin, tgt_z_slice, d, sid=tgt_g)
# return (o, tgt_o), slice_id, x_mask, y_mask, ((z, z_p, m_p), logs_p, m_q, logs_q)
def make_sin_d(self, f0):
# f0 から sin と d を作成
# f0 : [b, 1, t]
# sin : [b, 1, t]
# d : [4][b, 1, t]
prod_upsample_scales = np.cumprod(self.upsample_scales)
dfs_batch = []
for df, us in zip(self.dense_factors, prod_upsample_scales):
dilated_tensor = dilated_factor(f0, self.sample_rate, df)
# result += [torch.repeat_interleave(dilated_tensor, us, dim=1)]
result = [torch.stack([dilated_tensor for _ in range(us)], -1).reshape(dilated_tensor.shape[0], -1)]
dfs_batch.append(torch.cat(result, dim=0).unsqueeze(1))
in_batch = self.signal_generator(f0)
return in_batch, dfs_batch
def make_random_target_sids(self, target_ids, sid):
# target_sids は target_ids をランダムで埋める
target_sids = torch.zeros_like(sid)
for i in range(len(target_sids)):
source_id = sid[i]
deleted_target_ids = target_ids[target_ids != source_id] # source_id と target_id が同じにならないよう sid と同じものを削除
if len(deleted_target_ids) >= 1:
target_sids[i] = deleted_target_ids[torch.randint(len(deleted_target_ids), (1,))]
else:
# target_id 候補が無いときは仕方ないので sid を使う
target_sids[i] = source_id
return target_sids
def voice_conversion(self, y, y_lengths, f0, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
sin, d = self.make_sin_d(f0)
g_src = self.emb_g(sid_src).unsqueeze(-1)
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
z_p = self.flow(z, y_mask, g=g_src)
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
# print("VC", sin.device, d[0].device, g_tgt.device)
o_hat = self.dec(sin, z_hat * y_mask, d, sid=g_tgt)
return o_hat[0]
def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
g_src = self.emb_g(sid_src).unsqueeze(-1)
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
o_hat = self.dec(z * y_mask, g=g_tgt)
return o_hat, y_mask, (z)
def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
g_src = self.emb_g(sid_src).unsqueeze(-1)
# g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
o_hat = self.dec(z * y_mask, g=g_src)
return o_hat, y_mask, (z)
def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
g_src = self.emb_g(sid_src).unsqueeze(-1)
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
z_p = self.flow(z, y_mask, g=g_src)
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
z_p_hat = self.flow(z_hat, y_mask, g=g_tgt)
z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True)
o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt)
return o_hat, y_mask, (z, z_p, z_hat)
def save_synthesizer(self, path):
enc_q = self.enc_q.state_dict()
dec = self.dec.state_dict()
emb_g = self.emb_g.state_dict()
torch.save({"enc_q": enc_q, "dec": dec, "emb_g": emb_g}, path)
def load_synthesizer(self, path):
    """Restore encoder, decoder and speaker-embedding weights from ``path``.

    Expects a ``torch.save`` archive holding a mapping with the keys
    ``"enc_q"``, ``"dec"`` and ``"emb_g"``. All three entries are looked up
    before any sub-module is modified, so a missing key leaves the model
    untouched.

    Args:
        path: Checkpoint location handed to ``torch.load``.

    Raises:
        KeyError: If one of the three expected entries is missing.
    """
    # SECURITY NOTE: torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    checkpoint = torch.load(path, map_location="cpu")
    # Resolve every entry first so a malformed file fails before any weights change.
    states = {name: checkpoint[name] for name in ("enc_q", "dec", "emb_g")}
    self.enc_q.load_state_dict(states["enc_q"])
    self.dec.load_state_dict(states["dec"])
    self.emb_g.load_state_dict(states["emb_g"])

View File

@ -0,0 +1,186 @@
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import Conv1d
from torch.nn.utils import weight_norm, remove_weight_norm
from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
# Negative slope passed to F.leaky_relu in the ResBlock1 / ResBlock2 forward passes.
LRELU_SLOPE = 0.1
class WN(torch.nn.Module):
    """Stack of weight-normalized dilated 1-D convolutions with gated activations.

    Layer ``i`` uses dilation ``dilation_rate ** i``. Every layer except the
    last produces both a residual update for the running input and a skip
    contribution; the last layer contributes to the skip sum only.

    Args:
        hidden_channels: Channel count of the input, the residual path and the output.
        kernel_size: Odd kernel size of the dilated convolutions.
        dilation_rate: Base of the per-layer dilation.
        n_layers: Number of layers in the stack.
        gin_channels: Channels of the optional conditioning input ``g``;
            ``0`` means no conditioning layer is created.
        p_dropout: Dropout probability applied to the gated activations.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        # Stored as a one-element tuple (kept exactly as the original code did).
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One 1x1 convolution yields the conditioning slices for all layers at once.
            self.cond_layer = torch.nn.utils.weight_norm(
                torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
                name="weight",
            )

        final_index = n_layers - 1
        for i in range(n_layers):
            dilation = dilation_rate**i
            padding = int((kernel_size * dilation - dilation) / 2)
            self.in_layers.append(
                torch.nn.utils.weight_norm(
                    torch.nn.Conv1d(
                        hidden_channels,
                        2 * hidden_channels,
                        kernel_size,
                        dilation=dilation,
                        padding=padding,
                    ),
                    name="weight",
                )
            )

            # The last layer has no residual branch, so it needs half the channels.
            res_skip_channels = hidden_channels if i == final_index else 2 * hidden_channels
            self.res_skip_layers.append(
                torch.nn.utils.weight_norm(
                    torch.nn.Conv1d(hidden_channels, res_skip_channels, 1),
                    name="weight",
                )
            )

    def forward(self, x, x_mask, g=None, **kwargs):
        """Apply the stack and return the masked sum of skip contributions.

        Args:
            x: Input tensor with ``hidden_channels`` channels on dim 1.
            x_mask: Mask multiplied into the residual path and the output.
            g: Optional conditioning input for ``self.cond_layer``.
            **kwargs: Accepted and ignored.

        Returns:
            Tensor shaped like ``x``.
        """
        hidden = self.hidden_channels
        final_index = self.n_layers - 1
        skip_sum = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([hidden])

        if g is not None:
            g = self.cond_layer(g)

        for i, (in_layer, res_skip_layer) in enumerate(zip(self.in_layers, self.res_skip_layers)):
            x_in = in_layer(x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                start = i * 2 * hidden
                g_l = g[:, start : start + 2 * hidden, :]

            acts = self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))
            res_skip_acts = res_skip_layer(acts)

            if i == final_index:
                skip_sum = skip_sum + res_skip_acts
            else:
                # First half updates the residual path, second half feeds the skip sum.
                x = (x + res_skip_acts[:, :hidden, :]) * x_mask
                skip_sum = skip_sum + res_skip_acts[:, hidden:, :]
        return skip_sum * x_mask

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution in the stack."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for group in (self.in_layers, self.res_skip_layers):
            for layer in group:
                torch.nn.utils.remove_weight_norm(layer)
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs.

    Each pair applies leaky-ReLU before both convolutions and adds the result
    back onto its input. All convolutions are weight-normalized and keep the
    channel count.

    Args:
        channels: Channel count of input and output.
        kernel_size: Kernel size shared by all convolutions.
        dilation: Dilations of the three first-stage convolutions; the
            second-stage convolutions always use dilation 1.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()

        def make_conv(rate):
            # Weight-normalized, length-preserving convolution with the given dilation.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        self.convs1 = nn.ModuleList([make_conv(dilation[k]) for k in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Run the three residual pairs.

        Args:
            x: Input tensor.
            x_mask: Optional mask multiplied in before each convolution and
                onto the final output.

        Returns:
            Tensor shaped like ``x``.
        """
        masked = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            h = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                h = h * x_mask
            h = F.leaky_relu(dilated_conv(h), LRELU_SLOPE)
            if masked:
                h = h * x_mask
            x = plain_conv(h) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from all six convolutions."""
        for group in (self.convs1, self.convs2):
            for conv in group:
                remove_weight_norm(conv)
class ResBlock2(torch.nn.Module):
    """Residual block of two weight-normalized dilated convolutions.

    Each convolution is preceded by leaky-ReLU and its output is added back
    onto its input.

    Args:
        channels: Channel count of input and output.
        kernel_size: Kernel size shared by both convolutions.
        dilation: Dilations of the two convolutions.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[k],
                        padding=get_padding(kernel_size, dilation[k]),
                    )
                )
                for k in range(2)
            ]
        )
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Run both residual convolutions.

        Args:
            x: Input tensor.
            x_mask: Optional mask multiplied in before each convolution and
                onto the final output.

        Returns:
            Tensor shaped like ``x``.
        """
        masked = x_mask is not None
        for conv in self.convs:
            h = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                h = h * x_mask
            x = conv(h) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)
class Flip(nn.Module):
    """Flow step that reverses the order of dimension 1."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Flip ``x`` along dim 1.

        Args:
            x: Input tensor.
            *args: Accepted and ignored.
            reverse: When true, only the flipped tensor is returned.
            **kwargs: Accepted and ignored.

        Returns:
            ``(flipped, logdet)`` in the forward direction, where ``logdet``
            is a zero vector with one entry per item along dim 0; just
            ``flipped`` when ``reverse`` is true.
        """
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer whose shift/scale come from a ``WN`` stack.

    The input is split in two halves along dim 1. The first half passes
    through unchanged and drives ``pre -> WN -> post`` to produce a shift
    ``m`` (and, unless ``mean_only``, a log-scale ``logs``) that transforms
    the second half.

    Args:
        channels: Total channel count; must be even.
        hidden_channels: Width of the internal ``WN`` stack.
        kernel_size: Kernel size forwarded to ``WN``.
        dilation_rate: Dilation base forwarded to ``WN``.
        n_layers: Layer count forwarded to ``WN``.
        p_dropout: Dropout probability forwarded to ``WN``.
        gin_channels: Conditioning channels forwarded to ``WN``.
        mean_only: When true, only a shift is predicted and ``logs`` is zero.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        # Zeroed output projection: the freshly built layer predicts m = 0 and logs = 0.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the coupling transform or its inverse.

        Args:
            x: Input tensor with ``channels`` channels on dim 1.
            x_mask: Mask multiplied into the intermediate and transformed values.
            g: Optional conditioning input forwarded to the ``WN`` stack.
            reverse: When true, apply the inverse transform.

        Returns:
            ``(x, logdet)`` in the forward direction, with ``logdet`` the sum
            of ``logs`` over dims 1 and 2; just ``x`` when ``reverse`` is true.
        """
        half = self.half_channels
        x0, x1 = torch.split(x, [half] * 2, 1)
        hidden = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
        stats = self.post(hidden) * x_mask

        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, [half] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        x = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return x, logdet

View File

@ -0,0 +1 @@
The modules in this folder are taken from https://github.com/isletennos/MMVC_Client.git at commit 461cb231b57cbb17243110eaac8435d9cca24a26.

View File

@ -0,0 +1,257 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Reo Yoneyama (Nagoya University)
# MIT License (https://opensource.org/licenses/MIT)
"""Residual block modules.
References:
- https://github.com/kan-bayashi/ParallelWaveGAN
- https://github.com/bigpon/QPPWG
- https://github.com/r9y9/wavenet_vocoder
"""
from logging import getLogger
import torch
import torch.nn as nn
from .snake import Snake
from .index import index_initial, pd_indexing
# A logger for this file
logger = getLogger(__name__)
class Conv1d(nn.Conv1d):
    """``nn.Conv1d`` with Kaiming-normal weights and zero bias."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``nn.Conv1d``."""
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        """Draw weights from a Kaiming-normal distribution (ReLU gain) and zero the bias."""
        nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
        if self.bias is None:
            return
        nn.init.constant_(self.bias, 0.0)
class Conv1d1x1(Conv1d):
    """Pointwise (kernel size 1) variant of the customized ``Conv1d``."""

    def __init__(self, in_channels, out_channels, bias=True):
        """Build a 1x1 convolution.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bias (bool): Whether to add a bias parameter.
        """
        pointwise = {"kernel_size": 1, "padding": 0, "dilation": 1}
        super().__init__(in_channels, out_channels, bias=bias, **pointwise)
class Conv2d(nn.Conv2d):
    """``nn.Conv2d`` with fan-out Kaiming-normal weights and zero bias."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``nn.Conv2d``."""
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        """Draw weights from a fan-out Kaiming-normal distribution (ReLU gain) and zero the bias."""
        nn.init.kaiming_normal_(self.weight, mode="fan_out", nonlinearity="relu")
        if self.bias is None:
            return
        nn.init.constant_(self.bias, 0.0)
class Conv2d1x1(Conv2d):
    """Pointwise (kernel size 1) variant of the customized ``Conv2d``."""

    def __init__(self, in_channels, out_channels, bias=True):
        """Build a 1x1 convolution.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bias (bool): Whether to add a bias parameter.
        """
        pointwise = {"kernel_size": 1, "padding": 0, "dilation": 1}
        super().__init__(in_channels, out_channels, bias=bias, **pointwise)
class ResidualBlock(nn.Module):
    """Residual block module in HiFiGAN."""

    def __init__(
        self,
        kernel_size=3,
        channels=512,
        dilations=(1, 3, 5),
        bias=True,
        use_additional_convs=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
    ):
        """Build one (activation, dilated conv) stage per dilation factor.

        Args:
            kernel_size (int): Odd kernel size of the convolutions.
            channels (int): Channel count of every convolution.
            dilations (List[int]): Dilation factor of each stage.
            bias (bool): Whether the convolutions carry a bias parameter.
            use_additional_convs (bool): Whether each stage is followed by a
                second (activation, dilation-1 conv) pair.
            nonlinear_activation (str): ``"Snake"`` or the name of an
                activation class in ``torch.nn``.
            nonlinear_activation_params (dict): Keyword arguments for the
                activation constructor. Only read, never modified.
        """
        super().__init__()
        self.use_additional_convs = use_additional_convs
        self.convs1 = nn.ModuleList()
        if use_additional_convs:
            self.convs2 = nn.ModuleList()
        assert kernel_size % 2 == 1, "Kernel size must be odd number."

        def make_activation():
            # A fresh activation instance per convolution (Snake has learnable state).
            if nonlinear_activation == "Snake":
                return Snake(channels, **nonlinear_activation_params)
            return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)

        half_kernel = (kernel_size - 1) // 2
        for dilation in dilations:
            dilated_stage = nn.Sequential(
                make_activation(),
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    dilation=dilation,
                    bias=bias,
                    padding=half_kernel * dilation,
                ),
            )
            self.convs1.append(dilated_stage)

            if not use_additional_convs:
                continue
            plain_stage = nn.Sequential(
                make_activation(),
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    dilation=1,
                    bias=bias,
                    padding=half_kernel,
                ),
            )
            self.convs2.append(plain_stage)

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, channels, T).

        Returns:
            Tensor: Output tensor (B, channels, T).
        """
        for idx, stage in enumerate(self.convs1):
            residual = stage(x)
            if self.use_additional_convs:
                residual = self.convs2[idx](residual)
            x = residual + x
        return x
class AdaptiveResidualBlock(nn.Module):
    """Residual block that combines each frame with two frames picked by ``pd_indexing``.

    Every stage applies an activation, gathers two extra views of the
    activated signal through ``pd_indexing`` (driven by ``d`` and the stage's
    dilation), and sums three 1x1 convolutions -- one per view -- before the
    residual addition.
    """

    def __init__(
        self,
        kernel_size=3,
        channels=512,
        dilations=(1, 2, 4),
        bias=True,
        use_additional_convs=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
    ):
        """Initialize AdaptiveResidualBlock module.

        Args:
            kernel_size (int): Kernel size of the additional convolutions;
                only ``3`` is accepted.
            channels (int): Number of channels for every convolution.
            dilations (List[int]): Dilation factor handed to ``pd_indexing``
                in each stage.
            bias (bool): Whether to add bias parameter in convolution layers.
            use_additional_convs (bool): Whether each stage ends with an extra
                (activation, dilation-1 conv) pair.
            nonlinear_activation (str): ``"Snake"`` or the name of an
                activation class in ``torch.nn``.
            nonlinear_activation_params (dict): Keyword arguments for the
                activation constructor. Only read, never modified.
        """
        super().__init__()
        self.use_additional_convs = use_additional_convs
        assert kernel_size == 3, "Currently only kernel_size = 3 is supported."
        self.channels = channels
        self.dilations = dilations
        self.nonlinears = nn.ModuleList()
        self.convsC = nn.ModuleList()
        self.convsP = nn.ModuleList()
        self.convsF = nn.ModuleList()
        if use_additional_convs:
            self.convsA = nn.ModuleList()

        def make_activation():
            # A fresh activation instance per use (Snake has learnable state).
            if nonlinear_activation == "Snake":
                return Snake(channels, **nonlinear_activation_params)
            return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)

        for _ in dilations:
            self.nonlinears.append(make_activation())
            # One pointwise convolution per view consumed in forward().
            self.convsC.append(Conv1d1x1(channels, channels, bias=bias))
            self.convsP.append(Conv1d1x1(channels, channels, bias=bias))
            self.convsF.append(Conv1d1x1(channels, channels, bias=bias))
            if not use_additional_convs:
                continue
            self.convsA.append(
                nn.Sequential(
                    make_activation(),
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        dilation=1,
                        bias=bias,
                        padding=(kernel_size - 1) // 2,
                    ),
                )
            )

    def forward(self, x, d):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, channels, T).
            d (Tensor): Input pitch-dependent dilated factors (B, 1, T).

        Returns:
            Tensor: Output tensor (B, channels, T).
        """
        # Index helpers are built once per call and reused by every stage.
        batch_index, ch_index = index_initial(x.size(0), self.channels, tensor=False)
        batch_index = torch.tensor(batch_index).to(x.device)
        ch_index = torch.tensor(ch_index).to(x.device)

        for stage, dilation in enumerate(self.dilations):
            activated = self.nonlinears[stage](x)
            xP, xF = pd_indexing(activated, d, dilation, batch_index, ch_index)
            mixed = self.convsC[stage](activated) + self.convsP[stage](xP) + self.convsF[stage](xF)
            if self.use_additional_convs:
                mixed = self.convsA[stage](mixed)
            x = mixed + x
        return x

View File

@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Reo Yoneyama (Nagoya University)
# MIT License (https://opensource.org/licenses/MIT)
"""Snake Activation Function Module.
References:
- Neural Networks Fail to Learn Periodic Functions and How to Fix It
https://arxiv.org/pdf/2006.08195.pdf
- BigVGAN: A Universal Neural Vocoder with Large-Scale Training
https://arxiv.org/pdf/2206.04658.pdf
"""
import torch
import torch.nn as nn
class Snake(nn.Module):
    """Snake activation: ``x + sin(alpha * x) ** 2 / alpha`` with a learnable per-channel ``alpha``."""

    def __init__(self, channels, init=50):
        """Initialize Snake module.

        Args:
            channels (int): Number of feature channels.
            init (float): Initial value of the learnable parameter alpha.
                According to the original paper, 5 ~ 50 would be
                suitable for periodic data (i.e. voices).
        """
        super().__init__()
        # Shape (1, channels, 1) so alpha broadcasts over (B, channels, T) inputs.
        self.alpha = nn.Parameter(init * torch.ones(1, channels, 1))

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, channels, T).

        Returns:
            Tensor: Output tensor (B, channels, T).
        """
        alpha = self.alpha
        periodic = torch.sin(alpha * x) ** 2 / alpha
        return x + periodic