mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-01-23 21:45:00 +03:00
WIP: integrate vcs to new gui 3
This commit is contained in:
parent
fa7894de50
commit
d83590dc35
@ -20,7 +20,7 @@ import torch
|
|||||||
import onnxruntime
|
import onnxruntime
|
||||||
import pyworld as pw
|
import pyworld as pw
|
||||||
|
|
||||||
from models import SynthesizerTrn # type:ignore
|
from voice_changer.MMVCv15.models.models import SynthesizerTrn # type:ignore
|
||||||
from voice_changer.MMVCv15.client_modules import (
|
from voice_changer.MMVCv15.client_modules import (
|
||||||
convert_continuos_f0,
|
convert_continuos_f0,
|
||||||
spectrogram_torch,
|
spectrogram_torch,
|
||||||
@ -156,8 +156,7 @@ class MMVCv15:
|
|||||||
def get_info(self):
|
def get_info(self):
|
||||||
data = asdict(self.settings)
|
data = asdict(self.settings)
|
||||||
|
|
||||||
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.settings.onnxModelFile != "" and self.settings.onnxModelFile is not None else []
|
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def get_processing_sampling_rate(self):
|
def get_processing_sampling_rate(self):
|
||||||
@ -231,10 +230,6 @@ class MMVCv15:
|
|||||||
return [spec, f0, sid]
|
return [spec, f0, sid]
|
||||||
|
|
||||||
def _onnx_inference(self, data):
|
def _onnx_inference(self, data):
|
||||||
if self.settings.onnxModelFile == "" and self.settings.onnxModelFile is None:
|
|
||||||
print("[Voice Changer] No ONNX session.")
|
|
||||||
raise NoModeLoadedException("ONNX")
|
|
||||||
|
|
||||||
spec, f0, sid_src = data
|
spec, f0, sid_src = data
|
||||||
spec = spec.unsqueeze(0)
|
spec = spec.unsqueeze(0)
|
||||||
spec_lengths = torch.tensor([spec.size(2)])
|
spec_lengths = torch.tensor([spec.size(2)])
|
||||||
|
27
server/voice_changer/MMVCv15/models/commons.py
Normal file
27
server/voice_changer/MMVCv15/models/commons.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw the weights of convolution-like modules from a normal distribution.

    Modules whose class name does not contain ``"Conv"`` are left untouched,
    which makes the function usable as a ``Module.apply`` callback.

    Args:
        m: Module to inspect; it must expose ``weight`` when its class name
            contains ``"Conv"``.
        mean (float): Mean of the normal distribution.
        std (float): Standard deviation of the normal distribution.
    """
    if "Conv" not in m.__class__.__name__:
        return
    m.weight.data.normal_(mean, std)
|
||||||
|
|
||||||
|
|
||||||
|
def get_padding(kernel_size, dilation=1):
    """Return the padding computed as ``int((kernel_size * dilation - dilation) / 2)``.

    Args:
        kernel_size (int): Convolution kernel size.
        dilation (int): Convolution dilation.

    Returns:
        int: Half of the dilated kernel extent, truncated towards zero.
    """
    dilated_extent = kernel_size * dilation - dilation
    return int(dilated_extent / 2)
|
||||||
|
|
||||||
|
|
||||||
|
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: ``tanh`` of one channel half times ``sigmoid`` of the other.

    Args:
        input_a (Tensor): First operand; indexed as a 3-D tensor along dim 1.
        input_b (Tensor): Second operand, added element-wise to ``input_a``.
        n_channels: Indexable whose first element is the channel index at
            which the sum is split into the tanh part and the sigmoid part.

    Returns:
        Tensor: ``tanh(sum[:, :n]) * sigmoid(sum[:, n:])`` with ``n = n_channels[0]``.
    """
    split_at = n_channels[0]
    summed = input_a + input_b
    tanh_half = torch.tanh(summed[:, :split_at, :])
    gate_half = torch.sigmoid(summed[:, split_at:, :])
    return tanh_half * gate_half
|
||||||
|
|
||||||
|
|
||||||
|
def sequence_mask(length, max_length=None):
    """Build a boolean mask that is True for positions below each length.

    Args:
        length (Tensor): 1-D tensor of sequence lengths.
        max_length: Number of mask columns; defaults to ``length.max()``.

    Returns:
        Tensor: Boolean tensor of shape ``(len(length), max_length)``.
    """
    n_columns = length.max() if max_length is None else max_length
    positions = torch.arange(n_columns, dtype=length.dtype, device=length.device)
    return positions.unsqueeze(0) < length.unsqueeze(1)
|
200
server/voice_changer/MMVCv15/models/features.py
Normal file
200
server/voice_changer/MMVCv15/models/features.py
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2022 Reo Yoneyama (Nagoya University)
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
"""Feature-related functions.
|
||||||
|
|
||||||
|
References:
|
||||||
|
- https://github.com/bigpon/QPPWG
|
||||||
|
- https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from logging import getLogger
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from torch.nn.functional import interpolate
|
||||||
|
|
||||||
|
# A logger for this file
|
||||||
|
logger = getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def validate_length(xs, ys=None, hop_size=None):
    """Trim feature (and optionally audio) arrays to mutually consistent lengths.

    All entries of ``xs`` are cut to the shortest first-axis length. When
    ``ys`` is given, features and audio are additionally trimmed so that the
    audio length equals ``feature_length * hop_size``.

    Args:
        xs (list): Arrays of features, sliced along their first axis.
        ys (list, optional): Arrays of audio, sliced along their first axis.
        hop_size (int, optional): Upsampling factor; needed when ``ys`` is given.

    Returns:
        list: The trimmed ``xs``, followed by the trimmed ``ys`` when provided.
    """
    feat_len = min([x.shape[0] for x in xs])

    # Features only: nothing to reconcile against audio.
    if ys is None:
        return [x[:feat_len] for x in xs]

    audio_len = min([y.shape[0] for y in ys])
    if audio_len < feat_len * hop_size:
        # Audio is too short for the features: drop trailing frames.
        feat_len = audio_len // hop_size
    if audio_len > feat_len * hop_size:
        # Audio is too long for the features: drop trailing samples.
        audio_len = feat_len * hop_size

    trimmed_audio = [y[:audio_len] for y in ys]
    trimmed_feats = [x[:feat_len] for x in xs]
    return trimmed_feats + trimmed_audio
|
||||||
|
|
||||||
|
|
||||||
|
def dilated_factor(batch_f0, fs, dense_factor):
    """Compute pitch-dependent dilated factors ``fs / dense_factor / f0``.

    NOTE: ``batch_f0`` is modified in place. Every zero entry is overwritten
    with ``fs / dense_factor`` before the division, so those positions yield a
    factor of 1 and the caller's tensor no longer contains zeros afterwards.

    Args:
        batch_f0 (Tensor): F0 values; must be a torch tensor because
            ``torch.ones_like`` is applied to it.
        fs (int): Sampling rate.
        dense_factor (int): Number of taps in one cycle.

    Returns:
        Tensor: Dilated factors with the same shape as ``batch_f0``.
    """
    # Replace zero F0 entries so the division below is well defined.
    zero_f0 = batch_f0 == 0
    batch_f0[zero_f0] = fs / dense_factor

    base = torch.ones_like(batch_f0) * fs / dense_factor
    return base / batch_f0
|
||||||
|
|
||||||
|
|
||||||
|
class SignalGenerator:
    """Input signal generator module.

    Turns a frame-level F0 tensor into sample-level input signals (sine,
    harmonic sines, Gaussian noise and/or a V/UV indicator) and concatenates
    them along the channel dimension.
    """

    # Signal type names accepted by the constructor.
    _SUPPORTED_TYPES = ("noise", "sine", "sines", "uv")

    def __init__(
        self,
        sample_rate=24000,
        hop_size=120,
        sine_amp=0.1,
        noise_amp=0.003,
        signal_types=None,
    ):
        """Initialize SignalGenerator module.

        Args:
            sample_rate (int): Sampling rate.
            hop_size (int): Hop size of input F0.
            sine_amp (float): Sine amplitude for NSF-based sine generation.
            noise_amp (float): Noise amplitude for NSF-based sine generation.
            signal_types (list): List of input signal types for generator.
                Defaults to ``["sine", "noise"]`` when omitted.

        Raises:
            ValueError: If an entry of ``signal_types`` is not one of
                ``"noise"``, ``"sine"``, ``"sines"`` or ``"uv"``.

        """
        # A fresh list per instance: a shared mutable default would let one
        # instance's modifications leak into every later default instance.
        if signal_types is None:
            signal_types = ["sine", "noise"]
        else:
            signal_types = list(signal_types)

        for signal_type in signal_types:
            if signal_type not in self._SUPPORTED_TYPES:
                # Raise instead of exiting the interpreter so the host
                # application can report or handle the configuration error.
                raise ValueError(f"{signal_type} is not supported type for generator input.")

        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.signal_types = signal_types
        self.sine_amp = sine_amp
        self.noise_amp = noise_amp

    @torch.no_grad()
    def __call__(self, f0, f0_scale=1.0):
        """Generate and concatenate all configured input signals.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
            f0_scale (float): Multiplier applied to the concatenated signals.

        Returns:
            Tensor: Signals concatenated along dim 1, in ``signal_types`` order.

        """
        builders = {
            "noise": self.random_noise,
            "sine": self.sinusoid,
            "sines": self.sinusoids,
            "uv": self.vuv_binary,
        }
        signals = [builders[typ](f0) for typ in self.signal_types if typ in builders]

        input_batch = signals[0]
        if len(signals) > 1:
            input_batch = torch.cat(signals, dim=1)

        return input_batch * f0_scale

    def _mix_noise(self, signal, vuv):
        """Add Gaussian noise to ``signal``; unvoiced samples get a third of the amplitude."""
        if self.noise_amp > 0:
            noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0
            noise = torch.randn(tuple(signal.size()), device=signal.device) * noise_amp
            signal = signal + noise
        return signal

    @torch.no_grad()
    def random_noise(self, f0):
        """Calculate noise signals.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).

        Returns:
            Tensor: Gaussian noise signals (B, 1, T); ``noise_amp`` is not
                applied here.

        """
        B, _, T = f0.size()
        return torch.randn((B, 1, T * self.hop_size), device=f0.device)

    @torch.no_grad()
    def sinusoid(self, f0):
        """Calculate sine signals.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).

        Returns:
            Tensor: Sines generated following NSF (B, 1, T).

        """
        _, _, T = f0.size()
        n_samples = T * self.hop_size
        vuv = interpolate((f0 > 0) * torch.ones_like(f0), n_samples)
        # Per-sample phase increment in cycles, wrapped to [0, 1).
        phase_step = (interpolate(f0, n_samples) / self.sample_rate) % 1
        sine = vuv * torch.sin(torch.cumsum(phase_step, dim=2) * 2 * np.pi) * self.sine_amp
        return self._mix_noise(sine, vuv)

    @torch.no_grad()
    def sinusoids(self, f0):
        """Calculate sines.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).

        Returns:
            Tensor: Average of harmonic sines generated following NSF (B, 1, T).

        """
        _, _, T = f0.size()
        n_samples = T * self.hop_size
        vuv = interpolate((f0 > 0) * torch.ones_like(f0), n_samples)
        f0 = interpolate(f0, n_samples)
        sines = torch.zeros_like(f0, device=f0.device)
        harmonics = 5  # currently only fixed number of harmonics is supported
        for i in range(harmonics):
            phase_step = (f0 * (i + 1) / self.sample_rate) % 1
            sines += torch.sin(torch.cumsum(phase_step, dim=2) * 2 * np.pi)
        sines = self.sine_amp * sines * vuv / harmonics
        return self._mix_noise(sines, vuv)

    @torch.no_grad()
    def vuv_binary(self, f0):
        """Calculate V/UV binary sequences.

        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).

        Returns:
            Tensor: V/UV binary sequences (B, 1, T); 1 where ``f0 > 0``.

        """
        _, _, T = f0.size()
        return interpolate((f0 > 0) * torch.ones_like(f0), T * self.hop_size)
|
312
server/voice_changer/MMVCv15/models/generator.py
Normal file
312
server/voice_changer/MMVCv15/models/generator.py
Normal file
@ -0,0 +1,312 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2022 Reo Yoneyama (Nagoya University)
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
"""HiFiGAN and SiFiGAN Generator modules.
|
||||||
|
|
||||||
|
References:
|
||||||
|
- https://github.com/kan-bayashi/ParallelWaveGAN
|
||||||
|
- https://github.com/bigpon/QPPWG
|
||||||
|
- https://github.com/jik876/hifi-gan
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from logging import getLogger
|
||||||
|
|
||||||
|
import torch.nn as nn
|
||||||
|
from .residual_block import AdaptiveResidualBlock, Conv1d, ResidualBlock
|
||||||
|
|
||||||
|
# A logger for this file
|
||||||
|
logger = getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SiFiGANGenerator(nn.Module):
    """SiFiGAN generator module.

    Holds two sub-networks in ``nn.ModuleDict`` containers: ``sn`` (source /
    excitation network) and ``fn`` (filter network).
    """

    def __init__(
        self,
        in_channels,
        out_channels=1,
        channels=512,
        kernel_size=7,
        upsample_scales=(5, 4, 3, 2),
        upsample_kernel_sizes=(10, 8, 6, 4),
        source_network_params={
            "resblock_kernel_size": 3,  # currently only 3 is supported.
            "resblock_dilations": [(1,), (1, 2), (1, 2, 4), (1, 2, 4, 8)],
            "use_additional_convs": True,
        },
        filter_network_params={
            "resblock_kernel_sizes": (3, 5, 7),
            "resblock_dilations": [(1, 3, 5), (1, 3, 5), (1, 3, 5)],
            "use_additional_convs": False,
        },
        share_upsamples=False,
        share_downsamples=False,
        bias=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
        use_weight_norm=True,
        requires_grad=True,
    ):
        """Initialize SiFiGANGenerator module.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            channels (int): Number of hidden representation channels.
            kernel_size (int): Kernel size of initial and final conv layer.
            upsample_scales (list): List of upsampling scales.
            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
            source_network_params (dict): Parameters for source-network.
            filter_network_params (dict): Parameters for filter-network.
            share_upsamples (bool): Whether to share up-sampling transposed CNNs.
            share_downsamples (bool): Whether to share down-sampling CNNs.
            bias (bool): Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): Activation function module name.
            nonlinear_activation_params (dict): Hyperparameters for activation function.
            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
            requires_grad (bool): When exactly ``False``, every parameter is frozen.

        """
        super().__init__()
        # check hyperparameters are valid
        assert kernel_size % 2 == 1, "Kernel size must be odd number."
        assert len(upsample_scales) == len(upsample_kernel_sizes)

        n_stages = len(upsample_kernel_sizes)
        same_padding = (kernel_size - 1) // 2
        n_filter_blocks = len(filter_network_params["resblock_kernel_sizes"])

        def make_activation():
            # Configurable activation, looked up by name on torch.nn.
            return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)

        def make_upsample(stage):
            # Activation followed by a transposed conv that halves the channels.
            scale = upsample_scales[stage]
            return nn.Sequential(
                make_activation(),
                nn.ConvTranspose1d(
                    channels // (2**stage),
                    channels // (2 ** (stage + 1)),
                    upsample_kernel_sizes[stage],
                    scale,
                    padding=scale // 2 + scale % 2,
                    output_padding=scale % 2,
                    bias=bias,
                ),
            )

        def make_downsample(stage):
            # Strided conv that doubles the channels, followed by the activation.
            scale = upsample_scales[stage]
            ksize = upsample_kernel_sizes[stage]
            return nn.Sequential(
                nn.Conv1d(
                    channels // (2 ** (stage + 1)),
                    channels // (2**stage),
                    ksize,
                    scale,
                    padding=scale - (ksize % 2 == 0),
                    bias=bias,
                ),
                make_activation(),
            )

        # define modules
        self.num_upsamples = n_stages
        self.source_network_params = source_network_params
        self.filter_network_params = filter_network_params
        self.share_upsamples = share_upsamples
        self.share_downsamples = share_downsamples
        self.sn = nn.ModuleDict()
        self.fn = nn.ModuleDict()
        self.input_conv = Conv1d(
            in_channels,
            channels,
            kernel_size,
            bias=bias,
            padding=same_padding,
        )
        self.sn["upsamples"] = nn.ModuleList()
        self.fn["upsamples"] = nn.ModuleList()
        self.sn["blocks"] = nn.ModuleList()
        self.fn["blocks"] = nn.ModuleList()
        for i in range(n_stages):
            assert upsample_kernel_sizes[i] == 2 * upsample_scales[i]
            stage_channels = channels // (2 ** (i + 1))
            self.sn["upsamples"].append(make_upsample(i))
            if not share_upsamples:
                self.fn["upsamples"].append(make_upsample(i))
            self.sn["blocks"].append(
                AdaptiveResidualBlock(
                    kernel_size=source_network_params["resblock_kernel_size"],
                    channels=stage_channels,
                    dilations=source_network_params["resblock_dilations"][i],
                    bias=bias,
                    use_additional_convs=source_network_params["use_additional_convs"],
                    nonlinear_activation=nonlinear_activation,
                    nonlinear_activation_params=nonlinear_activation_params,
                )
            )
            for j in range(n_filter_blocks):
                self.fn["blocks"].append(
                    ResidualBlock(
                        kernel_size=filter_network_params["resblock_kernel_sizes"][j],
                        channels=stage_channels,
                        dilations=filter_network_params["resblock_dilations"][j],
                        bias=bias,
                        use_additional_convs=filter_network_params["use_additional_convs"],
                        nonlinear_activation=nonlinear_activation,
                        nonlinear_activation_params=nonlinear_activation_params,
                    )
                )

        # `i` is the index of the last upsampling stage here, so this is the
        # channel count at full output resolution.
        last_channels = channels // (2 ** (i + 1))
        # Both output heads use a default-constructed LeakyReLU rather than
        # the configurable activation.
        self.sn["output_conv"] = nn.Sequential(
            nn.LeakyReLU(),
            nn.Conv1d(
                last_channels,
                out_channels,
                kernel_size,
                bias=bias,
                padding=same_padding,
            ),
        )
        self.fn["output_conv"] = nn.Sequential(
            nn.LeakyReLU(),
            nn.Conv1d(
                last_channels,
                out_channels,
                kernel_size,
                bias=bias,
                padding=same_padding,
            ),
            nn.Tanh(),
        )

        # sine embedding layers
        self.sn["emb"] = Conv1d(
            1,
            channels // (2**n_stages),
            kernel_size,
            bias=bias,
            padding=same_padding,
        )
        # down-sampling CNNs, built from the finest stage towards the coarsest
        self.sn["downsamples"] = nn.ModuleList()
        for i in reversed(range(1, n_stages)):
            self.sn["downsamples"].append(make_downsample(i))
        if not share_downsamples:
            self.fn["downsamples"] = nn.ModuleList()
            for i in reversed(range(1, n_stages)):
                self.fn["downsamples"].append(make_downsample(i))

        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()

        # reset parameters
        self.reset_parameters()

        if requires_grad is False:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, x, c, d, sid):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input sine signal (B, 1, T).
            c (Tensor): Input tensor (B, in_channels, T).
            d (List): F0-dependent dilation factors [(B, 1, T) x num_upsamples].
            sid: Not used by this method.

        Returns:
            Tensor: Filter-network output tensor (B, out_channels, T).
            Tensor: Source-network output tensor.

        """
        n_stages = self.num_upsamples

        # currently, same input feature is input to each network
        c = self.input_conv(c)
        e = c

        # source-network forward
        x = self.sn["emb"](x)
        sine_skips = [x]
        for i in range(n_stages - 1):
            x = self.sn["downsamples"][i](x)
            sine_skips.append(x)
        for i in range(n_stages):
            # excitation generation network; skips are consumed coarsest-first
            e = self.sn["upsamples"][i](e) + sine_skips[-i - 1]
            e = self.sn["blocks"][i](e, d[i])
        e_ = self.sn["output_conv"](e)

        # filter-network forward
        downsamples = self.sn["downsamples"] if self.share_downsamples else self.fn["downsamples"]
        excitation_skips = [e]
        for i in range(n_stages - 1):
            e = downsamples[i](e)
            excitation_skips.append(e)

        upsamples = self.sn["upsamples"] if self.share_upsamples else self.fn["upsamples"]
        num_blocks = len(self.filter_network_params["resblock_kernel_sizes"])
        for i in range(n_stages):
            # resonance filtering network
            c = upsamples[i](c) + excitation_skips[-i - 1]
            block_sum = 0.0  # initialize
            for j in range(num_blocks):
                block_sum += self.fn["blocks"][i * num_blocks + j](c)
            c = block_sum / num_blocks
        c = self.fn["output_conv"](c)

        return c, e_

    def reset_parameters(self):
        """Reset parameters.

        This initialization follows the official implementation manner.
        https://github.com/jik876/hifi-gan/blob/master/models.py

        """

        def _reset_parameters(m):
            if not isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
                return
            m.weight.data.normal_(0.0, 0.01)
            logger.debug(f"Reset parameters in {m}.")

        self.apply(_reset_parameters)

    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""

        def _remove_weight_norm(m):
            try:
                logger.debug(f"Weight norm is removed from {m}.")
                nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return

        self.apply(_remove_weight_norm)

    def apply_weight_norm(self):
        """Apply weight normalization module from all of the layers."""

        def _apply_weight_norm(m):
            if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
                nn.utils.weight_norm(m)
                logger.debug(f"Weight norm is applied to {m}.")

        self.apply(_apply_weight_norm)
|
82
server/voice_changer/MMVCv15/models/index.py
Normal file
82
server/voice_changer/MMVCv15/models/index.py
Normal file
@ -0,0 +1,82 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2020 Yi-Chiao Wu (Nagoya University)
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
"""Indexing-related functions."""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from torch.nn import ConstantPad1d as pad1d
|
||||||
|
|
||||||
|
|
||||||
|
def pd_indexing(x, d, dilation, batch_index, ch_index):
    """Pitch-dependent indexing of past and future samples.

    Args:
        x (Tensor): Input feature map (B, C, T).
        d (Tensor): Input pitch-dependent dilated factors (B, 1, T).
        dilation (Int): Dilation size.
        batch_index (Tensor): Batch index
        ch_index (Tensor): Channel index

    Returns:
        Tensor: Past output tensor (B, out_channels, T)
        Tensor: Future output tensor (B, out_channels, T)

    """
    _, _, n_frames = d.size()
    offsets = d * dilation

    # Past samples: positions are negative, i.e. counted back from the end of
    # the left-padded signal.
    past = torch.arange(-n_frames, 0).float().to(x.device)
    past = (past - offsets).round().long()
    pad_left = -(torch.min(past) + n_frames)
    assert pad_left >= 0
    # Zero-pad on the left so the most distant past position stays in range.
    x_past = pad1d((pad_left, 0), 0)(x)

    # Future samples: positions are counted from the start of the signal.
    future = torch.arange(0, n_frames).float().to(x.device)
    future = (future + offsets).round().long()
    pad_right = torch.max(future) - (n_frames - 1)
    assert pad_right >= 0
    # Zero-pad on the right so the most distant future position stays in range.
    x_future = pad1d((0, pad_right), 0)(x)

    return x_past[(batch_index, ch_index, past)], x_future[(batch_index, ch_index, future)]
|
||||||
|
|
||||||
|
|
||||||
|
def index_initial(n_batch, n_ch, tensor=True):
    """Tensor batch and channel index initialization.

    Args:
        n_batch (Int): Number of batch.
        n_ch (Int): Number of channel.
        tensor (bool): Return tensors when True, nested Python lists otherwise.

    Returns:
        Batch index of shape (n_batch, n_ch, 1): entry [b][c] is ``[b]``.
        Channel index of shape (n_batch, n_ch, 1): entry [b][c] is ``[c]``.
        Tensors are moved to CUDA when it is available.

    """
    batch_index = [[[b]] * n_ch for b in range(n_batch)]
    ch_index = [[[c] for c in range(n_ch)]] * n_batch

    if not tensor:
        return batch_index, ch_index

    batch_index = torch.tensor(batch_index)
    ch_index = torch.tensor(ch_index)
    if torch.cuda.is_available():
        batch_index = batch_index.cuda()
        ch_index = ch_index.cuda()
    return batch_index, ch_index
|
438
server/voice_changer/MMVCv15/models/models.py
Normal file
438
server/voice_changer/MMVCv15/models/models.py
Normal file
@ -0,0 +1,438 @@
|
|||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
from .modules import ResidualCouplingLayer, Flip, WN, ResBlock1, ResBlock2, LRELU_SLOPE
|
||||||
|
|
||||||
|
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
|
||||||
|
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||||
|
from .commons import init_weights, get_padding, sequence_mask
|
||||||
|
from .generator import SiFiGANGenerator
|
||||||
|
from .features import SignalGenerator, dilated_factor
|
||||||
|
|
||||||
|
|
||||||
|
class TextEncoder(nn.Module):
    """Project a hidden sequence to mean / log-scale statistics with a 1x1 conv."""

    def __init__(self, out_channels, hidden_channels, requires_grad=True):
        """Create the projection layer.

        Args:
            out_channels (int): Channels of each of the two returned statistics.
            hidden_channels (int): Channels of the input sequence.
            requires_grad (bool): When exactly ``False``, every parameter is frozen.
        """
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

        # Do not train the parameters.
        if requires_grad is False:
            for frozen in self.parameters():
                frozen.requires_grad = False

    def forward(self, x, x_lengths):
        """Compute masked statistics for ``x``.

        Args:
            x (Tensor): Input sequence; cast to half precision and transposed
                so that dim 1 and the last dim are swapped.
            x_lengths (Tensor): Valid length of each sequence.

        Returns:
            tuple: ``(x, m, logs, x_mask)`` — the transposed input, the two
            halves of the projection output, and the length mask.
        """
        # NOTE(review): the unconditional .half() cast presumably expects
        # fp16 / autocast execution — confirm against the caller.
        x = torch.transpose(x.half(), 1, -1)  # [b, h, t]
        valid = sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(valid, 1).to(x.dtype)

        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask
|
||||||
|
|
||||||
|
|
||||||
|
class ResidualCouplingBlock(nn.Module):
    """Stack of ``ResidualCouplingLayer`` flows, each followed by a ``Flip``."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0, requires_grad=True):
        """Build ``n_flows`` coupling-layer / flip pairs.

        Args:
            channels (int): Passed to every coupling layer.
            hidden_channels (int): Passed to every coupling layer.
            kernel_size (int): Passed to every coupling layer.
            dilation_rate (int): Passed to every coupling layer.
            n_layers (int): Passed to every coupling layer.
            n_flows (int): Number of coupling-layer / flip pairs.
            gin_channels (int): Passed to every coupling layer.
            requires_grad (bool): When exactly ``False``, every parameter is frozen.
        """
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)
            self.flows.append(coupling)
            self.flows.append(Flip())

        # Do not train the parameters.
        if requires_grad is False:
            for frozen in self.parameters():
                frozen.requires_grad = False

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows forward, or in reversed order when ``reverse`` is set.

        Args:
            x (Tensor): Input passed through the flows.
            x_mask (Tensor): Mask forwarded to every flow.
            g: Optional conditioning forwarded to every flow.
            reverse (bool): Apply the flows in reversed order with ``reverse=True``.

        Returns:
            Tensor: Output of the last flow applied.
        """
        if reverse:
            # In reverse mode each flow returns the tensor only.
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x

        # In forward mode each flow returns a pair; the second item is dropped.
        for flow in self.flows:
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
|
||||||
|
|
||||||
|
|
||||||
|
class PosteriorEncoder(nn.Module):
    """Encode an input sequence into a sampled latent and its statistics."""

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, requires_grad=True):
        """Create the pre-conv, the ``WN`` encoder and the projection conv.

        Args:
            in_channels (int): Channels of the input sequence.
            out_channels (int): Channels of the latent.
            hidden_channels (int): Channels inside the encoder.
            kernel_size (int): Passed to ``WN``.
            dilation_rate (int): Passed to ``WN``.
            n_layers (int): Passed to ``WN``.
            gin_channels (int): Passed to ``WN``.
            requires_grad (bool): When exactly ``False``, every parameter is frozen.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

        # Do not train the parameters.
        if requires_grad is False:
            for frozen in self.parameters():
                frozen.requires_grad = False

    def forward(self, x, x_lengths, g=None):
        """Encode ``x`` and draw a latent sample.

        Args:
            x (Tensor): Input sequence with time on dim 2.
            x_lengths (Tensor): Valid length of each sequence.
            g: Optional conditioning forwarded to the encoder.

        Returns:
            tuple: ``(z, m, logs, x_mask)`` where
            ``z = (m + randn_like(m) * exp(logs)) * x_mask``.
        """
        valid = sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(valid, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)

        # Sample around the mean using exp(logs) as the scale.
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
|
||||||
|
|
||||||
|
|
||||||
|
class Generator(torch.nn.Module):
    """Upsampling generator: transposed convs, each followed by averaged residual blocks."""

    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, requires_grad=True):
        """Build the layers.

        Args:
            initial_channel (int): Channels of the input.
            resblock (str): ``"1"`` selects ``ResBlock1``; anything else ``ResBlock2``.
            resblock_kernel_sizes (list): Kernel size of each residual block per stage.
            resblock_dilation_sizes (list): Dilations of each residual block per stage.
            upsample_rates (list): Stride of each transposed conv.
            upsample_initial_channel (int): Channels after the first conv;
                halved by every upsampling stage.
            upsample_kernel_sizes (list): Kernel size of each transposed conv.
            requires_grad (bool): When exactly ``False``, every parameter is frozen.
        """
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        if resblock == "1":
            block_cls = ResBlock1
        else:
            block_cls = ResBlock2

        self.ups = nn.ModuleList()
        for i, (rate, ksize) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            up = ConvTranspose1d(upsample_initial_channel // (2**i), upsample_initial_channel // (2 ** (i + 1)), ksize, rate, padding=(ksize - rate) // 2)
            self.ups.append(weight_norm(up))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for ksize, dilations in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(block_cls(ch, ksize, dilations))

        # `ch` is the channel count of the last upsampling stage.
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if requires_grad is False:
            for frozen in self.parameters():
                frozen.requires_grad = False

    def forward(self, x, g=None):
        """Generate the output signal.

        Args:
            x (Tensor): Input features.
            g: Not used by this method.

        Returns:
            Tensor: Single-channel output passed through ``tanh``.
        """
        x = self.conv_pre(x)

        for i in range(self.num_upsamples):
            x = self.ups[i](F.leaky_relu(x, LRELU_SLOPE))
            # Average the outputs of this stage's residual blocks.
            first = i * self.num_kernels
            acc = None
            for j in range(self.num_kernels):
                branch = self.resblocks[first + j](x)
                if acc is None:
                    acc = branch
                else:
                    acc += branch
            x = acc / self.num_kernels

        x = self.conv_post(F.leaky_relu(x))
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalization from the upsampling layers and residual blocks."""
        print("Removing weight norm...")
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
|
||||||
|
|
||||||
|
|
||||||
|
class DiscriminatorP(torch.nn.Module):
    """Discriminator that folds the signal into a 2-D (frames x period) layout."""

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        """Build the convolution stack.

        Args:
            period (int): Width of the folded 2-D layout.
            kernel_size (int): Kernel height of the stacked convolutions.
            stride (int): Stride along the frame axis for the first four convolutions.
            use_spectral_norm (bool): Use weight norm only when this is exactly
                ``False``; otherwise spectral norm.
        """
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        if use_spectral_norm is False:
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        pad = (get_padding(kernel_size, 1), 0)
        widths = [1, 32, 128, 512, 1024]
        layers = [norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad)) for c_in, c_out in zip(widths, widths[1:])]
        # The last stacked conv keeps the channel count and uses stride 1.
        layers.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Score ``x`` and collect intermediate feature maps.

        Args:
            x (Tensor): Input of shape (b, c, t).

        Returns:
            tuple: ``(scores, fmap)`` — the flattened output of the final conv
            and the list of activations after every conv, final conv included.
        """
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
||||||
|
|
||||||
|
|
||||||
|
class DiscriminatorS(torch.nn.Module):
    """Discriminator built from a stack of (grouped) 1-D convolutions."""

    def __init__(self, use_spectral_norm=False):
        """Build the convolution stack.

        Args:
            use_spectral_norm: Use ``spectral_norm`` instead of ``weight_norm``
                unless this is exactly ``False``.
        """
        super().__init__()
        norm_f = spectral_norm if use_spectral_norm is not False else weight_norm

        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = (
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        )
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad))
                for c_in, c_out, kernel, stride, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened score, feature maps)`` for input ``x``."""
        feature_maps = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            feature_maps.append(x)
        x = self.conv_post(x)
        feature_maps.append(x)

        return torch.flatten(x, 1, -1), feature_maps
|
||||||
|
|
||||||
|
|
||||||
|
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bundle of one ``DiscriminatorS`` followed by one ``DiscriminatorP`` per period."""

    def __init__(self, use_spectral_norm=False):
        """Create the sub-discriminators; ``use_spectral_norm`` is forwarded to each."""
        super().__init__()
        # periods = [2,3,5,7,11]
        periods = [3, 5, 7, 11, 13]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(DiscriminatorP(p, use_spectral_norm=use_spectral_norm) for p in periods)
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat, flag=True):
        """Score real and generated inputs with every sub-discriminator.

        Args:
            y: Real input.
            y_hat: Generated input.
            flag: When truthy, score both inputs and also return feature maps.
                When falsy, score only ``y_hat`` under ``torch.no_grad()``.

        Returns:
            ``(y_d_rs, y_d_gs, fmap_rs, fmap_gs)`` when ``flag`` is truthy,
            otherwise the list of scores for ``y_hat``.
        """
        if not flag:
            generated_scores = []
            with torch.no_grad():
                for disc in self.discriminators:
                    score, _ = disc(y_hat)
                    generated_scores.append(score)
            return generated_scores

        real_scores, generated_scores = [], []
        real_fmaps, generated_fmaps = [], []
        for disc in self.discriminators:
            score_real, fmap_real = disc(y)
            score_generated, fmap_generated = disc(y_hat)
            real_scores.append(score_real)
            generated_scores.append(score_generated)
            real_fmaps.append(fmap_real)
            generated_fmaps.append(fmap_generated)

        return real_scores, generated_scores, real_fmaps, generated_fmaps
|
||||||
|
|
||||||
|
|
||||||
|
class SynthesizerTrn(nn.Module):
    """Synthesizer for training.

    Wires together a posterior encoder (``enc_q``), a text encoder (``enc_p``),
    a flow (``flow``), a decoder (``dec``) and a signal generator, plus a
    speaker embedding (``emb_g``) when ``n_speakers > 1``.

    NOTE(review): the conversion methods assert ``n_speakers > 0`` while
    ``emb_g`` is only created for ``n_speakers > 1``; with exactly one speaker
    the assert passes and the ``emb_g`` lookup fails.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        n_flow,
        dec_out_channels=1,
        dec_kernel_size=7,
        n_speakers=0,
        gin_channels=0,
        requires_grad_pe=True,
        requires_grad_flow=True,
        requires_grad_text_enc=True,
        requires_grad_dec=True,
        requires_grad_emb_g=True,
        sample_rate=24000,
        hop_size=128,
        sine_amp=0.1,
        noise_amp=0.003,
        signal_types=["sine"],
        dense_factors=[0.5, 1, 4, 8],
        upsample_scales=[8, 4, 2, 2],
    ):
        """Store the hyper-parameters and build the sub-modules.

        The ``requires_grad_*`` flags are forwarded to the matching sub-module
        (``requires_grad_emb_g`` is applied to the speaker embedding here).
        ``signal_types``, ``dense_factors`` and ``upsample_scales`` are copied
        so an instance never aliases the shared default lists.
        """
        super().__init__()
        self.spec_channels = spec_channels
        self.hidden_channels = hidden_channels
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.dec_out_channels = dec_out_channels
        self.dec_kernel_size = dec_kernel_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.requires_grad_pe = requires_grad_pe
        self.requires_grad_flow = requires_grad_flow
        self.requires_grad_text_enc = requires_grad_text_enc
        self.requires_grad_dec = requires_grad_dec
        self.requires_grad_emb_g = requires_grad_emb_g
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        self.sine_amp = sine_amp
        self.noise_amp = noise_amp
        # Defensive copies: the defaults are module-level shared list objects.
        self.signal_types = list(signal_types)
        self.dense_factors = list(dense_factors)
        self.upsample_scales = list(upsample_scales)

        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels, requires_grad=requires_grad_pe)
        self.enc_p = TextEncoder(inter_channels, hidden_channels, requires_grad=requires_grad_text_enc)
        self.dec = SiFiGANGenerator(in_channels=inter_channels, out_channels=dec_out_channels, channels=upsample_initial_channel, kernel_size=dec_kernel_size, upsample_scales=upsample_rates, upsample_kernel_sizes=upsample_kernel_sizes, requires_grad=requires_grad_dec)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, n_flows=n_flow, gin_channels=gin_channels, requires_grad=requires_grad_flow)
        self.signal_generator = SignalGenerator(sample_rate=sample_rate, hop_size=hop_size, noise_amp=noise_amp, signal_types=self.signal_types)

        if n_speakers > 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)
            # Module.requires_grad_ switches the embedding's parameters. The
            # previous plain ``emb_g.requires_grad = ...`` assignment only set
            # an attribute on the module object and froze nothing.
            self.emb_g.requires_grad_(requires_grad_emb_g)

    def forward(self, x, x_lengths, y, y_lengths, f0, slice_id, sid=None, target_ids=None):
        """Intentionally a no-op in this copy (returns ``None``).

        The training-time implementation is kept below, commented out, for reference.
        """
        pass
        # sin, d = self.make_sin_d(f0)

        # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        # # build the target sids
        # target_sids = self.make_random_target_sids(target_ids, sid)

        # if self.n_speakers > 0:
        #     g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        #     tgt_g = self.emb_g(target_sids).unsqueeze(-1)  # [b, h, 1]
        # else:
        #     g = None

        # # PE
        # z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        # # Flow
        # z_p = self.flow(z, y_mask, g=g)
        # # VC
        # tgt_z = self.flow(z_p, y_mask, g=tgt_g, reverse=True)
        # # build the alignment
        # liner_alignment = F.one_hot(torch.arange(0, x.shape[2] + 2)).cuda()
        # liner_alignment = torch.stack([liner_alignment for _ in range(x.shape[0])], axis=0)
        # liner_alignment = F.interpolate(liner_alignment.float(), size=(z.shape[2]), mode="linear", align_corners=True)
        # liner_alignment = liner_alignment[:, 1:-1, :]
        # # match the shapes of the TextEnc and PE outputs
        # m_p = torch.matmul(m_p, liner_alignment)
        # logs_p = torch.matmul(logs_p, liner_alignment)

        # # slice
        # z_slice = slice_segments(z, slice_id, self.segment_size)
        # # slice of the target
        # tgt_z_slice = slice_segments(tgt_z, slice_id, self.segment_size)
        # # Dec
        # o = self.dec(sin, z_slice, d, sid=g)
        # tgt_o = self.dec(sin, tgt_z_slice, d, sid=tgt_g)

        # return (o, tgt_o), slice_id, x_mask, y_mask, ((z, z_p, m_p), logs_p, m_q, logs_q)

    def make_sin_d(self, f0):
        """Create the source signal and the per-stage dilation factors from ``f0``.

        Shapes as noted by the original author:
            f0  : [b, 1, t]
            sin : [b, 1, t]
            d   : [4][b, 1, t]

        Returns:
            ``(in_batch, dfs_batch)``: the ``signal_generator`` output and one
            tensor per ``dense_factors`` entry.
        """
        prod_upsample_scales = np.cumprod(self.upsample_scales)
        dfs_batch = []
        for df, us in zip(self.dense_factors, prod_upsample_scales):
            dilated_tensor = dilated_factor(f0, self.sample_rate, df)
            # Repeat every element ``us`` times: stack ``us`` copies on a new
            # last axis, then flatten everything after the batch axis. The
            # original author left a repeat_interleave variant commented out.
            repeated = torch.stack([dilated_tensor for _ in range(us)], -1).reshape(dilated_tensor.shape[0], -1)
            dfs_batch.append(repeated.unsqueeze(1))
        in_batch = self.signal_generator(f0)

        return in_batch, dfs_batch

    def make_random_target_sids(self, target_ids, sid):
        """Pick, for each entry of ``sid``, a random id from ``target_ids`` that differs from it.

        When no differing candidate exists the source id itself is used.
        """
        target_sids = torch.zeros_like(sid)
        for i, source_id in enumerate(sid):
            # Drop entries equal to the source id so source and target differ.
            candidates = target_ids[target_ids != source_id]
            if len(candidates) >= 1:
                target_sids[i] = candidates[torch.randint(len(candidates), (1,))]
            else:
                # No candidate left: fall back to the source id.
                target_sids[i] = source_id
        return target_sids

    def voice_conversion(self, y, y_lengths, f0, sid_src, sid_tgt):
        """Convert ``y`` from speaker ``sid_src`` to ``sid_tgt``; returns the first item of the decoder output."""
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        sin, d = self.make_sin_d(f0)
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        # print("VC", sin.device, d[0].device, g_tgt.device)
        o_hat = self.dec(sin, z_hat * y_mask, d, sid=g_tgt)
        return o_hat[0]

    def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt):
        """Encode with the source speaker and decode with the target speaker (no flow).

        NOTE(review): ``self.dec`` is called here as ``dec(z, g=...)`` whereas
        ``voice_conversion`` calls ``dec(sin, z, d, sid=...)`` — confirm this
        path is still usable with the current decoder.
        """
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
        o_hat = self.dec(z * y_mask, g=g_tgt)
        return o_hat, y_mask, (z)

    def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt):
        """Encode and decode with the source speaker; ``sid_tgt`` is not used.

        NOTE(review): same decoder call-signature caveat as ``voice_ra_pa_db``.
        """
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        # g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
        o_hat = self.dec(z * y_mask, g=g_src)
        return o_hat, y_mask, (z)

    def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt):
        """Run source -> target -> source through the flow, then decode with the target speaker.

        NOTE(review): same decoder call-signature caveat as ``voice_ra_pa_db``.
        """
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        z_p_hat = self.flow(z_hat, y_mask, g=g_tgt)
        z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True)
        o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt)
        return o_hat, y_mask, (z, z_p, z_hat)

    def save_synthesizer(self, path):
        """Save the ``enc_q``, ``dec`` and ``emb_g`` state dicts to ``path``."""
        enc_q = self.enc_q.state_dict()
        dec = self.dec.state_dict()
        emb_g = self.emb_g.state_dict()
        torch.save({"enc_q": enc_q, "dec": dec, "emb_g": emb_g}, path)

    def load_synthesizer(self, path):
        """Load the ``enc_q``, ``dec`` and ``emb_g`` state dicts written by ``save_synthesizer``."""
        # SECURITY: torch.load unpickles the file; only load checkpoints from a trusted source.
        checkpoint = torch.load(path, map_location="cpu")
        self.enc_q.load_state_dict(checkpoint["enc_q"])
        self.dec.load_state_dict(checkpoint["dec"])
        self.emb_g.load_state_dict(checkpoint["emb_g"])
|
186
server/voice_changer/MMVCv15/models/modules.py
Normal file
186
server/voice_changer/MMVCv15/models/modules.py
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
import torch
|
||||||
|
from torch import nn
|
||||||
|
from torch.nn import functional as F
|
||||||
|
|
||||||
|
from torch.nn import Conv1d
|
||||||
|
from torch.nn.utils import weight_norm, remove_weight_norm
|
||||||
|
|
||||||
|
from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
|
||||||
|
|
||||||
|
|
||||||
|
# Negative slope passed to F.leaky_relu by the residual blocks in this module.
LRELU_SLOPE = 0.1
|
||||||
|
|
||||||
|
|
||||||
|
class WN(torch.nn.Module):
    """Stack of weight-normalized dilated convolutions with gated activations and optional conditioning."""

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        """Build ``n_layers`` dilated layers plus their residual/skip projections.

        Args:
            hidden_channels: Channel count of the input and the output.
            kernel_size: Odd kernel size of the dilated convolutions.
            dilation_rate: Layer ``i`` uses dilation ``dilation_rate**i``.
            n_layers: Number of layers.
            gin_channels: Channels of the conditioning input; 0 disables conditioning.
            p_dropout: Dropout probability applied to the gated activations.
        """
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        # Stored as a one-element tuple, exactly as in the original code.
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = nn.ModuleList()
        self.res_skip_layers = nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # A single 1x1 conv produces the conditioning for all layers at once.
            self.cond_layer = weight_norm(Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1), name="weight")

        last = n_layers - 1
        for i in range(n_layers):
            dilation = dilation_rate**i
            padding = int((kernel_size * dilation - dilation) / 2)
            self.in_layers.append(
                weight_norm(
                    Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding),
                    name="weight",
                )
            )

            # last one is not necessary: the final layer emits only the skip half.
            res_skip_channels = hidden_channels if i == last else 2 * hidden_channels
            self.res_skip_layers.append(weight_norm(Conv1d(hidden_channels, res_skip_channels, 1), name="weight"))

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of the skip outputs of all layers.

        Args:
            x: Input tensor.
            x_mask: Mask multiplied into the running state and the result.
            g: Optional conditioning tensor, projected by ``cond_layer``.
        """
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        width = 2 * self.hidden_channels
        final = self.n_layers - 1
        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                # Slice out this layer's share of the conditioning channels.
                start = i * width
                g_l = g[:, start : start + width, :]

            acts = self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))

            res_skip_acts = self.res_skip_layers[i](acts)
            if i == final:
                output = output + res_skip_acts
            else:
                # First half updates the residual stream, second half is the skip.
                x = (x + res_skip_acts[:, : self.hidden_channels, :]) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels :, :]
        return output * x_mask

    def remove_weight_norm(self):
        """Remove weight normalization from every layer of this module."""
        if self.gin_channels != 0:
            remove_weight_norm(self.cond_layer)
        for layer in self.in_layers:
            remove_weight_norm(layer)
        for layer in self.res_skip_layers:
            remove_weight_norm(layer)
|
||||||
|
|
||||||
|
|
||||||
|
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs, all weight-normalized."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        """Create the two convolution lists; ``dilation`` must provide three rates."""
        super().__init__()

        def make_conv(rate):
            return weight_norm(
                Conv1d(channels, channels, kernel_size, 1, dilation=rate, padding=get_padding(kernel_size, rate))
            )

        self.convs1 = nn.ModuleList([make_conv(dilation[i]) for i in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; ``x_mask`` (if given) is applied before each conv and to the result."""
        masked = x_mask is not None
        for dilated, plain in zip(self.convs1, self.convs2):
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            branch = F.leaky_relu(dilated(branch), LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            x = plain(branch) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Remove weight normalization from both convolution lists."""
        for conv in self.convs1:
            remove_weight_norm(conv)
        for conv in self.convs2:
            remove_weight_norm(conv)
|
||||||
|
|
||||||
|
|
||||||
|
class ResBlock2(torch.nn.Module):
    """Residual block of two weight-normalized dilated convolutions."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        """Create the convolution list; ``dilation`` must provide two rates."""
        super().__init__()
        self.convs = nn.ModuleList(
            [
                weight_norm(
                    Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        1,
                        dilation=dilation[i],
                        padding=get_padding(kernel_size, dilation[i]),
                    )
                )
                for i in range(2)
            ]
        )
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions; ``x_mask`` (if given) is applied before each conv and to the result."""
        masked = x_mask is not None
        for conv in self.convs:
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if masked:
                branch = branch * x_mask
            x = conv(branch) + x
        if masked:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Remove weight normalization from every convolution."""
        for conv in self.convs:
            remove_weight_norm(conv)
|
||||||
|
|
||||||
|
|
||||||
|
class Flip(nn.Module):
    """Flow step that reverses the order of dimension 1."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Flip ``x`` along dimension 1.

        Returns:
            The flipped tensor when ``reverse`` is truthy; otherwise
            ``(flipped, logdet)`` where ``logdet`` is a zero vector with one
            entry per batch item, matching ``x`` in dtype and device.
        """
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
|
||||||
|
|
||||||
|
|
||||||
|
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer: the first half of the channels parameterizes a transform of the second half."""

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
        """Build the pre-projection, the ``WN`` encoder and the zero-initialized post-projection.

        Args:
            channels: Total channel count; must be even.
            hidden_channels: Width of the internal ``WN`` encoder.
            kernel_size, dilation_rate, n_layers, p_dropout, gin_channels:
                Forwarded to ``WN``.
            mean_only: When truthy only a mean is predicted and the log-scale is zero.
        """
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        # One output group (mean) when mean_only, otherwise two (mean and log-scale).
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        # Zero-initialized so the layer starts out as the identity transform.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the coupling transform, or its inverse when ``reverse`` is truthy.

        Returns:
            ``(x, logdet)`` in the forward direction, just ``x`` in reverse.
        """
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        hidden = self.enc(self.pre(x0) * x_mask, x_mask, g=g)
        stats = self.post(hidden) * x_mask

        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        return torch.cat([x0, x1], 1), torch.sum(logs, [1, 2])
|
1
server/voice_changer/MMVCv15/models/readme.txt
Normal file
1
server/voice_changer/MMVCv15/models/readme.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
The modules in this folder are taken from https://github.com/isletennos/MMVC_Client.git at commit 461cb231b57cbb17243110eaac8435d9cca24a26.
|
257
server/voice_changer/MMVCv15/models/residual_block.py
Normal file
257
server/voice_changer/MMVCv15/models/residual_block.py
Normal file
@ -0,0 +1,257 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2022 Reo Yoneyama (Nagoya University)
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
"""Residual block modules.
|
||||||
|
|
||||||
|
References:
|
||||||
|
- https://github.com/kan-bayashi/ParallelWaveGAN
|
||||||
|
- https://github.com/bigpon/QPPWG
|
||||||
|
- https://github.com/r9y9/wavenet_vocoder
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from logging import getLogger
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from .snake import Snake
|
||||||
|
from .index import index_initial, pd_indexing
|
||||||
|
|
||||||
|
# A logger for this file
|
||||||
|
logger = getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Conv1d(nn.Conv1d):
    """``nn.Conv1d`` whose parameters are reset with Kaiming-normal weights and a zero bias."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``nn.Conv1d``."""
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        """Draw the weight from a Kaiming-normal (ReLU gain) and zero the bias when present."""
        nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
        if self.bias is None:
            return
        nn.init.zeros_(self.bias)
|
||||||
|
|
||||||
|
|
||||||
|
class Conv1d1x1(Conv1d):
    """Point-wise (kernel size 1) ``Conv1d`` using the customized initialization."""

    def __init__(self, in_channels, out_channels, bias=True):
        """Create a kernel-size-1 convolution with no padding and no dilation."""
        super().__init__(
            in_channels,
            out_channels,
            kernel_size=1,
            padding=0,
            dilation=1,
            bias=bias,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class Conv2d(nn.Conv2d):
    """``nn.Conv2d`` whose parameters are reset with fan-out Kaiming-normal weights and a zero bias."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``nn.Conv2d``."""
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        """Draw the weight from a fan-out Kaiming-normal (ReLU gain) and zero the bias when present."""
        nn.init.kaiming_normal_(self.weight, mode="fan_out", nonlinearity="relu")
        if self.bias is None:
            return
        nn.init.zeros_(self.bias)
|
||||||
|
|
||||||
|
|
||||||
|
class Conv2d1x1(Conv2d):
    """Point-wise (kernel size 1) ``Conv2d`` using the customized initialization."""

    def __init__(self, in_channels, out_channels, bias=True):
        """Create a kernel-size-1 convolution with no padding and no dilation."""
        super().__init__(
            in_channels,
            out_channels,
            kernel_size=1,
            padding=0,
            dilation=1,
            bias=bias,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class ResidualBlock(nn.Module):
    """Residual block module in HiFiGAN."""

    def __init__(
        self,
        kernel_size=3,
        channels=512,
        dilations=(1, 3, 5),
        bias=True,
        use_additional_convs=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
    ):
        """Initialize ResidualBlock module.

        Args:
            kernel_size (int): Kernel size of the convolution layers; must be odd.
            channels (int): Number of channels of the convolution layers.
            dilations (List[int]): One dilated convolution is created per entry.
            bias (bool): Whether the convolution layers carry a bias.
            use_additional_convs (bool): Whether each dilated convolution is
                followed by an extra non-dilated one.
            nonlinear_activation (str): ``"Snake"`` or the name of a ``torch.nn``
                activation class.
            nonlinear_activation_params (dict): Keyword arguments for the activation.

        """
        super().__init__()
        self.use_additional_convs = use_additional_convs
        self.convs1 = nn.ModuleList()
        if use_additional_convs:
            self.convs2 = nn.ModuleList()
        assert kernel_size % 2 == 1, "Kernel size must be odd number."

        def make_activation():
            # A fresh activation instance for every convolution.
            if nonlinear_activation == "Snake":
                return Snake(channels, **nonlinear_activation_params)
            return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)

        half_span = (kernel_size - 1) // 2
        for dilation in dilations:
            dilated = nn.Sequential(
                make_activation(),
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    dilation=dilation,
                    bias=bias,
                    padding=half_span * dilation,
                ),
            )
            self.convs1.append(dilated)
            if use_additional_convs:
                plain = nn.Sequential(
                    make_activation(),
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        dilation=1,
                        bias=bias,
                        padding=half_span,
                    ),
                )
                self.convs2.append(plain)

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, channels, T).

        Returns:
            Tensor: Output tensor (B, channels, T).

        """
        for idx, dilated in enumerate(self.convs1):
            branch = dilated(x)
            if self.use_additional_convs:
                branch = self.convs2[idx](branch)
            x = branch + x
        return x
|
||||||
|
|
||||||
|
|
||||||
|
class AdaptiveResidualBlock(nn.Module):
    """Residual block whose taps are gathered with pitch-dependent dilated indexing."""

    def __init__(
        self,
        kernel_size=3,
        channels=512,
        dilations=(1, 2, 4),
        bias=True,
        use_additional_convs=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
    ):
        """Initialize AdaptiveResidualBlock module.

        Args:
            kernel_size (int): Kernel size; only 3 is supported.
            channels (int): Number of channels of the convolution layers.
            dilations (List[int]): One adaptive layer is created per entry.
            bias (bool): Whether the convolution layers carry a bias.
            use_additional_convs (bool): Whether each adaptive layer is followed
                by an extra non-dilated convolution.
            nonlinear_activation (str): ``"Snake"`` or the name of a ``torch.nn``
                activation class.
            nonlinear_activation_params (dict): Keyword arguments for the activation.

        """
        super().__init__()
        self.use_additional_convs = use_additional_convs
        assert kernel_size == 3, "Currently only kernel_size = 3 is supported."
        self.channels = channels
        self.dilations = dilations
        self.nonlinears = nn.ModuleList()
        self.convsC = nn.ModuleList()
        self.convsP = nn.ModuleList()
        self.convsF = nn.ModuleList()
        if use_additional_convs:
            self.convsA = nn.ModuleList()

        def make_activation():
            # A fresh activation instance for every use.
            if nonlinear_activation == "Snake":
                return Snake(channels, **nonlinear_activation_params)
            return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)

        for _ in dilations:
            self.nonlinears.append(make_activation())
            # Three 1x1 projections per layer; their outputs are summed in forward().
            self.convsC.append(Conv1d1x1(channels, channels, bias=bias))
            self.convsP.append(Conv1d1x1(channels, channels, bias=bias))
            self.convsF.append(Conv1d1x1(channels, channels, bias=bias))
            if use_additional_convs:
                extra = nn.Sequential(
                    make_activation(),
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        dilation=1,
                        bias=bias,
                        padding=(kernel_size - 1) // 2,
                    ),
                )
                self.convsA.append(extra)

    def forward(self, x, d):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, channels, T).
            d (Tensor): Input pitch-dependent dilated factors (B, 1, T).

        Returns:
            Tensor: Output tensor (B, channels, T).

        """
        batch_index, ch_index = index_initial(x.size(0), self.channels, tensor=False)
        batch_index = torch.tensor(batch_index).to(x.device)
        ch_index = torch.tensor(ch_index).to(x.device)

        for i, dilation in enumerate(self.dilations):
            activated = self.nonlinears[i](x)
            # NOTE(review): xP / xF are presumably the past / future taps picked
            # by pd_indexing — confirm against the index module.
            xP, xF = pd_indexing(activated, d, dilation, batch_index, ch_index)
            branch = self.convsC[i](activated) + self.convsP[i](xP) + self.convsF[i](xF)
            if self.use_additional_convs:
                branch = self.convsA[i](branch)
            x = branch + x
        return x
|
47
server/voice_changer/MMVCv15/models/snake.py
Normal file
47
server/voice_changer/MMVCv15/models/snake.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2022 Reo Yoneyama (Nagoya University)
|
||||||
|
# MIT License (https://opensource.org/licenses/MIT)
|
||||||
|
|
||||||
|
"""Snake Activation Function Module.
|
||||||
|
|
||||||
|
References:
|
||||||
|
- Neural Networks Fail to Learn Periodic Functions and How to Fix It
|
||||||
|
https://arxiv.org/pdf/2006.08195.pdf
|
||||||
|
- BigVGAN: A Universal Neural Vocoder with Large-Scale Training
|
||||||
|
https://arxiv.org/pdf/2206.04658.pdf
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
|
||||||
|
class Snake(nn.Module):
    """Snake activation: ``x + sin(alpha * x)**2 / alpha`` with a learnable per-channel ``alpha``."""

    def __init__(self, channels, init=50):
        """Initialize Snake module.

        Args:
            channels (int): Number of feature channels.
            init (float): Initial value of the learnable parameter alpha.
                According to the original paper, 5 ~ 50 would be
                suitable for periodic data (i.e. voices).

        """
        super().__init__()
        # One alpha per channel, broadcast over the batch and time axes.
        self.alpha = nn.Parameter(init * torch.ones(1, channels, 1))

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input noise signal (B, channels, T).

        Returns:
            Tensor: Output tensor (B, channels, T).

        """
        periodic = torch.sin(self.alpha * x) ** 2
        return x + periodic / self.alpha
|
Loading…
Reference in New Issue
Block a user