WIP: integrate vcs to new gui 3

2025-01-23 21:45:00 +03:00 · 2023-06-22 06:56:00 +09:00 · 2023-06-22 06:56:00 +09:00 · d83590dc35
commit d83590dc35
parent fa7894de50
10 changed files with 1552 additions and 7 deletions
--- a/server/voice_changer/MMVCv15/MMVCv15.py
+++ b/server/voice_changer/MMVCv15/MMVCv15.py
@ -20,7 +20,7 @@ import torch
 import onnxruntime
 import pyworld as pw
-from models import SynthesizerTrn  # type:ignore
+from voice_changer.MMVCv15.models.models import SynthesizerTrn  # type:ignore
 from voice_changer.MMVCv15.client_modules import (
    convert_continuos_f0,
    spectrogram_torch,
@ -156,8 +156,7 @@ class MMVCv15:
    def get_info(self):
        data = asdict(self.settings)
-        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.settings.onnxModelFile != "" and self.settings.onnxModelFile is not None else []
+        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
        return data
    def get_processing_sampling_rate(self):
@ -231,10 +230,6 @@ class MMVCv15:
        return [spec, f0, sid]
    def _onnx_inference(self, data):
        if self.settings.onnxModelFile == "" and self.settings.onnxModelFile is None:
            print("[Voice Changer] No ONNX session.")
            raise NoModeLoadedException("ONNX")
        spec, f0, sid_src = data
        spec = spec.unsqueeze(0)
        spec_lengths = torch.tensor([spec.size(2)])
--- a/server/voice_changer/MMVCv15/models/commons.py
+++ b/server/voice_changer/MMVCv15/models/commons.py
@ -0,0 +1,27 @@
 import torch
 def init_weights(m, mean=0.0, std=0.01):
    """Redraw the weights of convolution-like modules in place.
    Written to be passed to ``Module.apply``: a module whose class name
    contains "Conv" gets ``weight`` resampled from a normal distribution,
    any other module is left untouched.
    Args:
        m: Module to (possibly) re-initialize.
        mean (float): Mean of the normal distribution.
        std (float): Standard deviation of the normal distribution.
    """
    if "Conv" not in m.__class__.__name__:
        return
    m.weight.data.normal_(mean, std)
 def get_padding(kernel_size, dilation=1):
    """Return ``(kernel_size * dilation - dilation) / 2`` truncated to an int.
    Args:
        kernel_size (int): Convolution kernel size.
        dilation (int): Convolution dilation.
    Returns:
        int: Padding amount derived from the dilated kernel extent.
    """
    dilated_extent = kernel_size * dilation - dilation
    return int(dilated_extent / 2)
 def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Add two tensors and apply a tanh/sigmoid gate along the channel axis.
    Args:
        input_a (Tensor): First addend, indexed as ``[:, channel, :]``.
        input_b (Tensor): Second addend, same layout as ``input_a``.
        n_channels: Indexable whose first element is the number of leading
            channels routed through tanh; the remaining channels go
            through sigmoid.
    Returns:
        Tensor: ``tanh(first part) * sigmoid(second part)``.
    """
    split_at = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split_at, :])
    sigmoid_part = torch.sigmoid(summed[:, split_at:, :])
    return tanh_part * sigmoid_part
 def sequence_mask(length, max_length=None):
    """Build a boolean mask that is True for positions below each length.
    Args:
        length (Tensor): Sequence lengths.
        max_length: Number of mask positions; defaults to ``length.max()``.
    Returns:
        Tensor: Boolean mask where entry ``[b, t]`` is ``t < length[b]``.
    """
    n_positions = length.max() if max_length is None else max_length
    positions = torch.arange(n_positions, dtype=length.dtype, device=length.device)
    return positions[None, :] < length[:, None]
--- a/server/voice_changer/MMVCv15/models/features.py
+++ b/server/voice_changer/MMVCv15/models/features.py
@ -0,0 +1,200 @@
 # -*- coding: utf-8 -*-
 # Copyright 2022 Reo Yoneyama (Nagoya University)
 #  MIT License (https://opensource.org/licenses/MIT)
 """Feature-related functions.
 References:
    - https://github.com/bigpon/QPPWG
    - https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts
 """
 import sys
 from logging import getLogger
 import numpy as np
 import torch
 from torch.nn.functional import interpolate
 # A logger for this file
 logger = getLogger(__name__)
 def validate_length(xs, ys=None, hop_size=None):
    """Trim features (and optionally audios) to mutually consistent lengths.
    All features are cut to the shortest feature length. When audios are
    given, the feature length is additionally limited to what the shortest
    audio can cover and the audios are cut to ``frames * hop_size``.
    Args:
        xs (list): Feature arrays, trimmed along axis 0.
        ys (list): Audio arrays, trimmed along axis 0 (optional).
        hop_size (int): Upsampling factor; required when ``ys`` is given.
    Returns:
        list: Trimmed features, followed by the trimmed audios if given.
    """
    n_frames = min(feat.shape[0] for feat in xs)
    if ys is None:
        return [feat[:n_frames] for feat in xs]
    n_samples = min(wav.shape[0] for wav in ys)
    # audio too short for the features: drop trailing frames
    if n_samples < n_frames * hop_size:
        n_frames = n_samples // hop_size
    # audio longer than the frames cover: drop trailing samples
    if n_samples > n_frames * hop_size:
        n_samples = n_frames * hop_size
    trimmed_feats = [feat[:n_frames] for feat in xs]
    trimmed_audio = [wav[:n_samples] for wav in ys]
    return trimmed_feats + trimmed_audio
 def dilated_factor(batch_f0, fs, dense_factor):
    """Compute pitch-dependent dilated factors from an F0 sequence.
    Zero entries of ``batch_f0`` are overwritten IN PLACE with
    ``fs / dense_factor`` before dividing, so the caller's tensor is
    modified and those positions get a factor of (approximately) 1.
    Args:
        batch_f0 (Tensor): F0 values; mutated in place.
        fs (int): Sampling rate.
        dense_factor (number): The number of taps in one cycle.
    Returns:
        Tensor: ``fs / dense_factor / batch_f0``, same shape as ``batch_f0``.
    """
    zero_f0 = batch_f0 == 0
    batch_f0[zero_f0] = fs / dense_factor
    numerator = torch.ones_like(batch_f0) * fs / dense_factor
    return numerator / batch_f0
 class SignalGenerator:
    """Input signal generator module.
    Turns a frame-level F0 tensor into sample-level input signals by
    upsampling it to ``T * hop_size`` samples with ``interpolate`` and
    concatenating the requested signal types along the channel axis.
    """
    # Supported signal type name -> name of the method that produces it.
    _GENERATORS = {
        "noise": "random_noise",
        "sine": "sinusoid",
        "sines": "sinusoids",
        "uv": "vuv_binary",
    }
    def __init__(
        self,
        sample_rate=24000,
        hop_size=120,
        sine_amp=0.1,
        noise_amp=0.003,
        signal_types=None,
    ):
        """Initialize SignalGenerator.
        Args:
            sample_rate (int): Sampling rate.
            hop_size (int): Hop size of input F0.
            sine_amp (float): Sine amplitude for NSF-based sine generation.
            noise_amp (float): Noise amplitude for NSF-based sine generation.
            signal_types (list): List of input signal types for generator.
                Defaults to ``["sine", "noise"]``.
        Raises:
            ValueError: If a signal type is not one of
                "noise", "sine", "sines" or "uv".
        """
        if signal_types is None:
            signal_types = ["sine", "noise"]
        supported = tuple(self._GENERATORS)
        for signal_type in signal_types:
            if signal_type not in supported:
                # Raise instead of terminating the whole process with exit code 0.
                raise ValueError(f"{signal_type} is not supported type for generator input.")
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        # Private copy so instances never share (or mutate) the caller's list.
        self.signal_types = list(signal_types)
        self.sine_amp = sine_amp
        self.noise_amp = noise_amp
    @torch.no_grad()
    def __call__(self, f0, f0_scale=1.0):
        """Generate all configured signals and stack them on the channel axis.
        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
            f0_scale (float): Factor the concatenated signals are multiplied by.
        Returns:
            Tensor: Signals in ``signal_types`` order (B, n_signals, T).
        """
        signals = []
        for typ in self.signal_types:
            method_name = self._GENERATORS.get(typ)
            if method_name is not None:
                signals.append(getattr(self, method_name)(f0))
        return torch.cat(signals, dim=1) * f0_scale
    def _voiced_mask(self, f0):
        """Upsample the voiced flag (f0 > 0) as a float tensor to sample level."""
        n_samples = f0.size(2) * self.hop_size
        return interpolate((f0 > 0) * torch.ones_like(f0), n_samples)
    def _additive_noise(self, vuv):
        """Draw Gaussian noise scaled by noise_amp (voiced) or noise_amp / 3 (unvoiced)."""
        noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0
        return torch.randn((vuv.size(0), 1, vuv.size(2)), device=vuv.device) * noise_amp
    @torch.no_grad()
    def random_noise(self, f0):
        """Calculate noise signals.
        The noise is unit Gaussian; ``noise_amp`` is not applied here.
        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
        Returns:
            Tensor: Gaussian noise signals (B, 1, T).
        """
        B, _, T = f0.size()
        return torch.randn((B, 1, T * self.hop_size), device=f0.device)
    @torch.no_grad()
    def sinusoid(self, f0):
        """Calculate sine signals.
        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
        Returns:
            Tensor: Sines generated following NSF (B, 1, T).
        """
        _, _, T = f0.size()
        vuv = self._voiced_mask(f0)
        # per-sample phase increment in cycles, wrapped to [0, 1)
        phase_step = (interpolate(f0, T * self.hop_size) / self.sample_rate) % 1
        sine = vuv * torch.sin(torch.cumsum(phase_step, dim=2) * 2 * np.pi) * self.sine_amp
        if self.noise_amp > 0:
            sine = sine + self._additive_noise(vuv)
        return sine
    @torch.no_grad()
    def sinusoids(self, f0):
        """Calculate sines.
        Averages the first five harmonics of the F0.
        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
        Returns:
            Tensor: Sines generated following NSF (B, 1, T).
        """
        _, _, T = f0.size()
        vuv = self._voiced_mask(f0)
        f0 = interpolate(f0, T * self.hop_size)
        sines = torch.zeros_like(f0, device=f0.device)
        harmonics = 5  # currently only fixed number of harmonics is supported
        for i in range(harmonics):
            phase_step = (f0 * (i + 1) / self.sample_rate) % 1
            sines += torch.sin(torch.cumsum(phase_step, dim=2) * 2 * np.pi)
        sines = self.sine_amp * sines * vuv / harmonics
        if self.noise_amp > 0:
            sines = sines + self._additive_noise(vuv)
        return sines
    @torch.no_grad()
    def vuv_binary(self, f0):
        """Calculate V/UV binary sequences.
        Args:
            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
        Returns:
            Tensor: V/UV binary sequences (B, 1, T).
        """
        return self._voiced_mask(f0)
--- a/server/voice_changer/MMVCv15/models/generator.py
+++ b/server/voice_changer/MMVCv15/models/generator.py
@ -0,0 +1,312 @@
 # -*- coding: utf-8 -*-
 # Copyright 2022 Reo Yoneyama (Nagoya University)
 #  MIT License (https://opensource.org/licenses/MIT)
 """HiFiGAN and SiFiGAN Generator modules.
 References:
    - https://github.com/kan-bayashi/ParallelWaveGAN
    - https://github.com/bigpon/QPPWG
    - https://github.com/jik876/hifi-gan
 """
 from logging import getLogger
 import torch.nn as nn
 from .residual_block import AdaptiveResidualBlock, Conv1d, ResidualBlock
 # A logger for this file
 logger = getLogger(__name__)
 class SiFiGANGenerator(nn.Module):
    """SiFiGAN generator module.
    Holds two sub-networks in ``nn.ModuleDict`` containers: ``self.sn``
    (source network, built from ``AdaptiveResidualBlock``) and ``self.fn``
    (filter network, built from ``ResidualBlock``). ``forward`` returns the
    filter-network output together with the source-network output.
    """
    def __init__(
        self,
        in_channels,
        out_channels=1,
        channels=512,
        kernel_size=7,
        upsample_scales=(5, 4, 3, 2),
        upsample_kernel_sizes=(10, 8, 6, 4),
        source_network_params={
            "resblock_kernel_size": 3,  # currently only 3 is supported.
            "resblock_dilations": [(1,), (1, 2), (1, 2, 4), (1, 2, 4, 8)],
            "use_additional_convs": True,
        },
        filter_network_params={
            "resblock_kernel_sizes": (3, 5, 7),
            "resblock_dilations": [(1, 3, 5), (1, 3, 5), (1, 3, 5)],
            "use_additional_convs": False,
        },
        share_upsamples=False,
        share_downsamples=False,
        bias=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
        use_weight_norm=True,
        requires_grad=True,
    ):
        """Initialize SiFiGANGenerator module.
        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            channels (int): Number of hidden representation channels.
            kernel_size (int): Kernel size of initial and final conv layer.
            upsample_scales (list): List of upsampling scales.
            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
                Each entry must be twice the matching upsampling scale.
            source_network_params (dict): Parameters for source-network.
            filter_network_params (dict): Parameters for filter-network.
            share_upsamples (bool): Whether to share up-sampling transposed CNNs.
            share_downsamples (bool): Whether to share down-sampling CNNs.
            bias (bool): Whether to add bias parameter in convolution layers.
            nonlinear_activation (str): Activation function module name.
            nonlinear_activation_params (dict): Hyperparameters for activation function.
            use_weight_norm (bool): Whether to use weight norm.
                If set to true, it will be applied to all of the conv layers.
            requires_grad (bool): If False, every parameter of this module
                is frozen (``requires_grad = False``).
        Note:
            The dict defaults are evaluated once and shared by all instances;
            ``source_network_params`` and ``filter_network_params`` are stored
            on ``self`` as passed, so treat them as read-only.
        """
        super().__init__()
        # check hyperparameters are valid
        assert kernel_size % 2 == 1, "Kernel size must be odd number."
        assert len(upsample_scales) == len(upsample_kernel_sizes)
        # define modules
        self.num_upsamples = len(upsample_kernel_sizes)
        self.source_network_params = source_network_params
        self.filter_network_params = filter_network_params
        self.share_upsamples = share_upsamples
        self.share_downsamples = share_downsamples
        self.sn = nn.ModuleDict()
        self.fn = nn.ModuleDict()
        self.input_conv = Conv1d(
            in_channels,
            channels,
            kernel_size,
            bias=bias,
            padding=(kernel_size - 1) // 2,
        )
        self.sn["upsamples"] = nn.ModuleList()
        self.fn["upsamples"] = nn.ModuleList()
        self.sn["blocks"] = nn.ModuleList()
        self.fn["blocks"] = nn.ModuleList()
        # one up-sampling stage per entry; the channel count halves at every stage
        for i in range(len(upsample_kernel_sizes)):
            assert upsample_kernel_sizes[i] == 2 * upsample_scales[i]
            self.sn["upsamples"] += [
                nn.Sequential(
                    getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
                    nn.ConvTranspose1d(
                        channels // (2**i),
                        channels // (2 ** (i + 1)),
                        upsample_kernel_sizes[i],
                        upsample_scales[i],
                        padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
                        output_padding=upsample_scales[i] % 2,
                        bias=bias,
                    ),
                )
            ]
            # the filter network only gets its own up-sampling layers when they are not shared
            if not share_upsamples:
                self.fn["upsamples"] += [
                    nn.Sequential(
                        getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
                        nn.ConvTranspose1d(
                            channels // (2**i),
                            channels // (2 ** (i + 1)),
                            upsample_kernel_sizes[i],
                            upsample_scales[i],
                            padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
                            output_padding=upsample_scales[i] % 2,
                            bias=bias,
                        ),
                    )
                ]
            # source network: one adaptive block per stage
            self.sn["blocks"] += [
                AdaptiveResidualBlock(
                    kernel_size=source_network_params["resblock_kernel_size"],
                    channels=channels // (2 ** (i + 1)),
                    dilations=source_network_params["resblock_dilations"][i],
                    bias=bias,
                    use_additional_convs=source_network_params["use_additional_convs"],
                    nonlinear_activation=nonlinear_activation,
                    nonlinear_activation_params=nonlinear_activation_params,
                )
            ]
            # filter network: one block per kernel size per stage, stored flat
            # (forward indexes them as i * num_blocks + j)
            for j in range(len(filter_network_params["resblock_kernel_sizes"])):
                self.fn["blocks"] += [
                    ResidualBlock(
                        kernel_size=filter_network_params["resblock_kernel_sizes"][j],
                        channels=channels // (2 ** (i + 1)),
                        dilations=filter_network_params["resblock_dilations"][j],
                        bias=bias,
                        use_additional_convs=filter_network_params["use_additional_convs"],
                        nonlinear_activation=nonlinear_activation,
                        nonlinear_activation_params=nonlinear_activation_params,
                    )
                ]
        # `i` still holds the last stage index from the loop above, so both
        # output convs take the channel count of the final stage.
        # NOTE(review): nn.LeakyReLU() here uses its default slope rather than
        # nonlinear_activation_params -- confirm this is intended.
        self.sn["output_conv"] = nn.Sequential(
            nn.LeakyReLU(),
            nn.Conv1d(
                channels // (2 ** (i + 1)),
                out_channels,
                kernel_size,
                bias=bias,
                padding=(kernel_size - 1) // 2,
            ),
        )
        self.fn["output_conv"] = nn.Sequential(
            nn.LeakyReLU(),
            nn.Conv1d(
                channels // (2 ** (i + 1)),
                out_channels,
                kernel_size,
                bias=bias,
                padding=(kernel_size - 1) // 2,
            ),
            nn.Tanh(),
        )
        # sine embedding layers
        self.sn["emb"] = Conv1d(
            1,
            channels // (2 ** len(upsample_kernel_sizes)),
            kernel_size,
            bias=bias,
            padding=(kernel_size - 1) // 2,
        )
        # down-sampling CNNs
        # built from the last stage down to stage 1, so there is one fewer
        # down-sampling layer than there are up-sampling stages
        self.sn["downsamples"] = nn.ModuleList()
        for i in reversed(range(1, len(upsample_kernel_sizes))):
            self.sn["downsamples"] += [
                nn.Sequential(
                    nn.Conv1d(
                        channels // (2 ** (i + 1)),
                        channels // (2**i),
                        upsample_kernel_sizes[i],
                        upsample_scales[i],
                        padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0),
                        bias=bias,
                    ),
                    getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
                )
            ]
        # separate down-sampling layers for the filter network unless shared
        if not share_downsamples:
            self.fn["downsamples"] = nn.ModuleList()
            for i in reversed(range(1, len(upsample_kernel_sizes))):
                self.fn["downsamples"] += [
                    nn.Sequential(
                        nn.Conv1d(
                            channels // (2 ** (i + 1)),
                            channels // (2**i),
                            upsample_kernel_sizes[i],
                            upsample_scales[i],
                            padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0),
                            bias=bias,
                        ),
                        getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
                    )
                ]
        # apply weight norm
        if use_weight_norm:
            self.apply_weight_norm()
        # reset parameters
        self.reset_parameters()
        # freeze every parameter when training of this module is disabled
        if requires_grad is False:
            for param in self.parameters():
                param.requires_grad = False
    def forward(self, x, c, d, sid):
        """Calculate forward propagation.
        Args:
            x (Tensor): Input sine signal (B, 1, T).
            c (Tensor): Input tensor (B, in_channels, T).
            d (List): F0-dependent dilation factors [(B, 1, T) x num_upsamples].
            sid: Accepted but not used by this method.
        Returns:
            Tensor: Output tensor of the filter network (B, out_channels, T).
            Tensor: Output tensor of the source network.
        """
        # currently, same input feature is input to each network
        c = self.input_conv(c)
        e = c
        # source-network forward
        # embed the sine signal, then down-sample it once per remaining stage
        x = self.sn["emb"](x)
        embs = [x]
        for i in range(self.num_upsamples - 1):
            x = self.sn["downsamples"][i](x)
            embs += [x]
        for i in range(self.num_upsamples):
            # excitation generation network
            # embs[-i - 1] walks the embeddings from the most down-sampled one back up
            e = self.sn["upsamples"][i](e) + embs[-i - 1]
            e = self.sn["blocks"][i](e, d[i])
        e_ = self.sn["output_conv"](e)
        # filter-network forward
        # the source-network features are down-sampled again and injected into each filter stage
        embs = [e]
        for i in range(self.num_upsamples - 1):
            if self.share_downsamples:
                e = self.sn["downsamples"][i](e)
            else:
                e = self.fn["downsamples"][i](e)
            embs += [e]
        num_blocks = len(self.filter_network_params["resblock_kernel_sizes"])
        for i in range(self.num_upsamples):
            # resonance filtering network
            if self.share_upsamples:
                c = self.sn["upsamples"][i](c) + embs[-i - 1]
            else:
                c = self.fn["upsamples"][i](c) + embs[-i - 1]
            # average the outputs of this stage's residual blocks
            cs = 0.0  # initialize
            for j in range(num_blocks):
                cs += self.fn["blocks"][i * num_blocks + j](c)
            c = cs / num_blocks
        c = self.fn["output_conv"](c)
        return c, e_
    def reset_parameters(self):
        """Reset parameters.
        This initialization follows the official implementation manner.
        https://github.com/jik876/hifi-gan/blob/master/models.py
        Every ``nn.Conv1d`` / ``nn.ConvTranspose1d`` weight is redrawn from
        N(0, 0.01). ``__init__`` calls this after ``apply_weight_norm``.
        """
        def _reset_parameters(m):
            if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
                m.weight.data.normal_(0.0, 0.01)
                logger.debug(f"Reset parameters in {m}.")
        self.apply(_reset_parameters)
    def remove_weight_norm(self):
        """Remove weight normalization module from all of the layers."""
        def _remove_weight_norm(m):
            try:
                # the message is emitted before the removal is attempted, so it is
                # also logged for modules that turn out to have no weight norm
                logger.debug(f"Weight norm is removed from {m}.")
                nn.utils.remove_weight_norm(m)
            except ValueError:  # this module didn't have weight norm
                return
        self.apply(_remove_weight_norm)
    def apply_weight_norm(self):
        """Apply weight normalization to all Conv1d / ConvTranspose1d layers."""
        def _apply_weight_norm(m):
            if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
                nn.utils.weight_norm(m)
                logger.debug(f"Weight norm is applied to {m}.")
        self.apply(_apply_weight_norm)
--- a/server/voice_changer/MMVCv15/models/index.py
+++ b/server/voice_changer/MMVCv15/models/index.py
@ -0,0 +1,82 @@
 # -*- coding: utf-8 -*-
 # Copyright 2020 Yi-Chiao Wu (Nagoya University)
 #  MIT License (https://opensource.org/licenses/MIT)
 """Indexing-related functions."""
 import torch
 from torch.nn import ConstantPad1d as pad1d
 def pd_indexing(x, d, dilation, batch_index, ch_index):
    """Pitch-dependent indexing of past and future samples.
    For every time step the sample ``round(d * dilation)`` steps behind
    (past) and ahead (future) is gathered; ``x`` is zero-padded on the
    matching side so that every index stays in range.
    Args:
        x (Tensor): Input feature map (B, C, T).
        d (Tensor): Input pitch-dependent dilated factors (B, 1, T).
        dilation (Int): Dilation size.
        batch_index (Tensor): Batch index
        ch_index (Tensor): Channel index
    Returns:
        Tensor: Past output tensor (B, out_channels, T)
        Tensor: Future output tensor (B, out_channels, T)
    """
    _, _, n_steps = d.size()
    offsets = d * dilation
    # past samples: negative positions count back from the end of the padded tensor
    past_pos = torch.arange(-n_steps, 0).float().to(x.device)
    past_idx = (past_pos - offsets).round().long()
    pad_left = -(past_idx.min() + n_steps)
    assert pad_left >= 0
    x_past = pad1d((pad_left, 0), 0)(x)
    # future samples: positions count from the start, padding goes on the right
    future_pos = torch.arange(0, n_steps).float().to(x.device)
    future_idx = (future_pos + offsets).round().long()
    pad_right = future_idx.max() - (n_steps - 1)
    assert pad_right >= 0
    x_future = pad1d((0, pad_right), 0)(x)
    past = x_past[(batch_index, ch_index, past_idx)]
    future = x_future[(batch_index, ch_index, future_idx)]
    return past, future
 def index_initial(n_batch, n_ch, tensor=True):
    """Tensor batch and channel index initialization.
    Both results have the nested shape (n_batch, n_ch, 1): the batch index
    holds the batch number at every position, the channel index holds the
    channel number.
    Args:
        n_batch (Int): Number of batch.
        n_ch (Int): Number of channel.
        tensor (bool): Return tensors if true, nested Python lists otherwise.
            Tensors are moved to CUDA whenever CUDA is available.
    Returns:
        Tensor: Batch index
        Tensor: Channel index
    """
    # `*` repeats references to the same inner lists rather than copying them
    batch_index = [[[b]] * n_ch for b in range(n_batch)]
    ch_index = [[[c] for c in range(n_ch)]] * n_batch
    if not tensor:
        return batch_index, ch_index
    batch_index = torch.tensor(batch_index)
    ch_index = torch.tensor(ch_index)
    if torch.cuda.is_available():
        batch_index, ch_index = batch_index.cuda(), ch_index.cuda()
    return batch_index, ch_index
--- a/server/voice_changer/MMVCv15/models/models.py
+++ b/server/voice_changer/MMVCv15/models/models.py
@ -0,0 +1,438 @@
 import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional as F
 from .modules import ResidualCouplingLayer, Flip, WN, ResBlock1, ResBlock2, LRELU_SLOPE
 from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from .commons import init_weights, get_padding, sequence_mask
 from .generator import SiFiGANGenerator
 from .features import SignalGenerator, dilated_factor
 class TextEncoder(nn.Module):
    """Project hidden features to mean / log-scale statistics with a 1x1 conv."""
    def __init__(self, out_channels, hidden_channels, requires_grad=True):
        """
        Args:
            out_channels (int): Channels of each of the two returned statistics.
            hidden_channels (int): Channels of the input features.
            requires_grad (bool): If False, all parameters are frozen.
        """
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
        # Freeze the parameters so they are not trained
        if requires_grad is False:
            self.requires_grad_(False)
    def forward(self, x, x_lengths):
        """
        Args:
            x (Tensor): Input features; cast to half precision and transposed
                (dim 1 <-> last dim) to [b, h, t].
            x_lengths (Tensor): Valid length of each sequence.
        Returns:
            tuple: ``(x, m, logs, x_mask)`` - transposed input, the two halves
            of the masked projection, and the length mask.
        """
        x = x.half().transpose(1, -1)  # [b, h, t]
        valid = sequence_mask(x_lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)
        stats = self.proj(x) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)
        return x, m, logs, x_mask
 class ResidualCouplingBlock(nn.Module):
    """Stack of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.
    The flows run in construction order by default and in reverse order
    when ``reverse`` is set.
    """
    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0, requires_grad=True):
        """Store the hyperparameters and build the flow stack.
        Args:
            requires_grad (bool): If False, all parameters are frozen.
            (The remaining arguments are stored and forwarded to
            ``ResidualCouplingLayer``.)
        """
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels
        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)
            self.flows.extend([coupling, Flip()])
        # Freeze the parameters so they are not trained
        if requires_grad is False:
            self.requires_grad_(False)
    def forward(self, x, x_mask, g=None, reverse=False):
        """Run ``x`` through every flow.
        Args:
            x (Tensor): Input tensor.
            x_mask (Tensor): Mask passed to every flow.
            g: Optional conditioning passed to every flow.
            reverse (bool): Apply the flows in reverse order and mode.
        Returns:
            Tensor: The transformed ``x``.
        """
        if reverse:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
            return x
        for flow in self.flows:
            # in forward mode each flow also returns a second value, which is discarded
            x, _ = flow(x, x_mask, g=g, reverse=reverse)
        return x
 class PosteriorEncoder(nn.Module):
    """Encode a feature sequence into a sampled latent ``z`` and its statistics."""
    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, requires_grad=True):
        """Store the hyperparameters and build pre-conv, WN encoder and projection.
        Args:
            requires_grad (bool): If False, all parameters are frozen.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
        # Freeze the parameters so they are not trained
        if requires_grad is False:
            self.requires_grad_(False)
    def forward(self, x, x_lengths, g=None):
        """
        Args:
            x (Tensor): Input features, time on dim 2.
            x_lengths (Tensor): Valid length of each sequence.
            g: Optional conditioning passed to the WN encoder.
        Returns:
            tuple: ``(z, m, logs, x_mask)`` where
            ``z = (m + noise * exp(logs)) * x_mask`` with Gaussian noise.
        """
        valid = sequence_mask(x_lengths, x.size(2))
        x_mask = valid.unsqueeze(1).to(x.dtype)
        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask
 class Generator(torch.nn.Module):
    """Upsampling generator: transposed-conv stages, each followed by averaged residual blocks."""
    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, requires_grad=True):
        """
        Args:
            initial_channel (int): Channels of the input.
            resblock (str): "1" selects ResBlock1, anything else ResBlock2.
            resblock_kernel_sizes (list): Kernel size per residual block.
            resblock_dilation_sizes (list): Dilations per residual block.
            upsample_rates (list): Stride of each upsampling stage.
            upsample_initial_channel (int): Channels before the first stage;
                halved at every stage.
            upsample_kernel_sizes (list): Kernel size of each upsampling stage.
            requires_grad (bool): If False, all parameters are frozen.
        """
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        block_cls = ResBlock1 if resblock == "1" else ResBlock2
        self.ups = nn.ModuleList()
        for stage, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            ch_in = upsample_initial_channel // (2**stage)
            ch_out = upsample_initial_channel // (2 ** (stage + 1))
            upsample = ConvTranspose1d(ch_in, ch_out, kernel, rate, padding=(kernel - rate) // 2)
            self.ups.append(weight_norm(upsample))
        self.resblocks = nn.ModuleList()
        for stage in range(1, len(self.ups) + 1):
            ch = upsample_initial_channel // (2**stage)
            for kernel, dilations in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(block_cls(ch, kernel, dilations))
        # `ch` is the channel count of the last stage
        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)
        if requires_grad is False:
            self.requires_grad_(False)
    def forward(self, x, g=None):
        """
        Args:
            x (Tensor): Input features.
            g: Accepted but not used by this method.
        Returns:
            Tensor: tanh-bounded output of the final convolution.
        """
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = self.ups[i](F.leaky_relu(x, LRELU_SLOPE))
            first_block = i * self.num_kernels
            xs = None
            for j in range(self.num_kernels):
                block_out = self.resblocks[first_block + j](x)
                if xs is None:
                    xs = block_out
                else:
                    xs += block_out
            # average over the residual blocks of this stage
            x = xs / self.num_kernels
        x = self.conv_post(F.leaky_relu(x))
        return torch.tanh(x)
    def remove_weight_norm(self):
        """Strip weight normalization from the upsampling layers and residual blocks."""
        print("Removing weight norm...")
        for upsample in self.ups:
            remove_weight_norm(upsample)
        for block in self.resblocks:
            block.remove_weight_norm()
 class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the waveform into (T / period, period) and applies 2-D convs."""
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        """
        Args:
            period (int): Period the waveform is folded by.
            kernel_size (int): Kernel size along the folded time axis.
            stride (int): Stride of the first four convolutions.
            use_spectral_norm: ``False`` selects weight norm, anything else
                spectral norm.
        """
        super().__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        kernel = (kernel_size, 1)
        padding = (get_padding(kernel_size, 1), 0)
        strided_channels = [(1, 32), (32, 128), (128, 512), (512, 1024)]
        layers = [norm_f(Conv2d(ch_in, ch_out, kernel, (stride, 1), padding=padding)) for ch_in, ch_out in strided_channels]
        # the last layer keeps the resolution (stride 1)
        layers.append(norm_f(Conv2d(1024, 1024, kernel, 1, padding=padding)))
        self.convs = nn.ModuleList(layers)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
    def forward(self, x):
        """
        Args:
            x (Tensor): Waveform batch of shape (b, c, t).
        Returns:
            tuple: flattened score tensor and the list of feature maps
            (one per conv layer plus the post conv).
        """
        fmap = []
        # 1d to 2d
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)
        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
 class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of (grouped) 1-D convolutions over the waveform."""
    def __init__(self, use_spectral_norm=False):
        """
        Args:
            use_spectral_norm: ``False`` selects weight norm, anything else
                spectral norm.
        """
        super().__init__()
        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            [norm_f(Conv1d(ch_in, ch_out, kernel, stride, groups=groups, padding=pad)) for ch_in, ch_out, kernel, stride, groups, pad in layer_specs]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
    def forward(self, x):
        """
        Args:
            x (Tensor): Waveform batch.
        Returns:
            tuple: flattened score tensor and the list of feature maps
            (one per conv layer plus the post conv).
        """
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
 class MultiPeriodDiscriminator(torch.nn.Module):
    """One scale discriminator followed by period discriminators for periods 3, 5, 7, 11, 13."""
    def __init__(self, use_spectral_norm=False):
        """
        Args:
            use_spectral_norm: Forwarded to every sub-discriminator.
        """
        super().__init__()
        # periods = [2,3,5,7,11]
        periods = [3, 5, 7, 11, 13]
        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs.extend(DiscriminatorP(period, use_spectral_norm=use_spectral_norm) for period in periods)
        self.discriminators = nn.ModuleList(discs)
    def forward(self, y, y_hat, flag=True):
        """Score real and generated audio with every sub-discriminator.
        Args:
            y (Tensor): Real audio.
            y_hat (Tensor): Generated audio.
            flag (bool): If true, score both inputs and return the feature
                maps too; otherwise score only ``y_hat`` under ``no_grad``.
        Returns:
            tuple: ``(y_d_rs, y_d_gs, fmap_rs, fmap_gs)`` when ``flag`` is
            true, else the list of scores for ``y_hat``.
        """
        if not flag:
            generated_scores = []
            with torch.no_grad():
                for disc in self.discriminators:
                    score, _ = disc(y_hat)
                    generated_scores.append(score)
            return generated_scores
        real_scores, generated_scores = [], []
        real_fmaps, generated_fmaps = [], []
        for disc in self.discriminators:
            real_score, real_fmap = disc(y)
            generated_score, generated_fmap = disc(y_hat)
            real_scores.append(real_score)
            generated_scores.append(generated_score)
            real_fmaps.append(real_fmap)
            generated_fmaps.append(generated_fmap)
        return real_scores, generated_scores, real_fmaps, generated_fmaps
 class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Wires together a posterior encoder (``enc_q``), a text encoder (``enc_p``),
    a ``SiFiGANGenerator`` decoder (``dec``), a residual-coupling flow
    (``flow``) and a ``SignalGenerator`` (``signal_generator``). A speaker
    embedding ``emb_g`` is created only when ``n_speakers > 1``.
    """

    def __init__(
        self,
        spec_channels,
        segment_size,
        inter_channels,
        hidden_channels,
        upsample_rates,
        upsample_initial_channel,
        upsample_kernel_sizes,
        n_flow,
        dec_out_channels=1,
        dec_kernel_size=7,
        n_speakers=0,
        gin_channels=0,
        requires_grad_pe=True,
        requires_grad_flow=True,
        requires_grad_text_enc=True,
        requires_grad_dec=True,
        requires_grad_emb_g=True,
        sample_rate=24000,
        hop_size=128,
        sine_amp=0.1,
        noise_amp=0.003,
        signal_types=None,
        dense_factors=None,
        upsample_scales=None,
    ):
        """Store the hyper-parameters and build the sub-modules.

        ``signal_types``, ``dense_factors`` and ``upsample_scales`` default to
        ``["sine"]``, ``[0.5, 1, 4, 8]`` and ``[8, 4, 2, 2]`` respectively; the
        lists are created per instance so instances never share one object.
        The ``requires_grad_*`` flags are forwarded to the matching sub-module
        (or applied to the speaker embedding's parameters).
        """
        super().__init__()
        if signal_types is None:
            signal_types = ["sine"]
        if dense_factors is None:
            dense_factors = [0.5, 1, 4, 8]
        if upsample_scales is None:
            upsample_scales = [8, 4, 2, 2]

        self.spec_channels = spec_channels
        self.hidden_channels = hidden_channels
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.dec_out_channels = dec_out_channels
        self.dec_kernel_size = dec_kernel_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels
        self.requires_grad_pe = requires_grad_pe
        self.requires_grad_flow = requires_grad_flow
        self.requires_grad_text_enc = requires_grad_text_enc
        self.requires_grad_dec = requires_grad_dec
        self.requires_grad_emb_g = requires_grad_emb_g
        self.sample_rate = sample_rate
        self.hop_size = hop_size
        # NOTE(review): sine_amp is stored but not forwarded to SignalGenerator
        # below (only noise_amp is) - confirm this is intended.
        self.sine_amp = sine_amp
        self.noise_amp = noise_amp
        self.signal_types = signal_types
        self.dense_factors = dense_factors
        self.upsample_scales = upsample_scales

        self.enc_q = PosteriorEncoder(
            spec_channels,
            inter_channels,
            hidden_channels,
            5,
            1,
            16,
            gin_channels=gin_channels,
            requires_grad=requires_grad_pe,
        )
        self.enc_p = TextEncoder(inter_channels, hidden_channels, requires_grad=requires_grad_text_enc)
        self.dec = SiFiGANGenerator(
            in_channels=inter_channels,
            out_channels=dec_out_channels,
            channels=upsample_initial_channel,
            kernel_size=dec_kernel_size,
            upsample_scales=upsample_rates,
            upsample_kernel_sizes=upsample_kernel_sizes,
            requires_grad=requires_grad_dec,
        )
        self.flow = ResidualCouplingBlock(
            inter_channels,
            hidden_channels,
            5,
            1,
            4,
            n_flows=n_flow,
            gin_channels=gin_channels,
            requires_grad=requires_grad_flow,
        )
        self.signal_generator = SignalGenerator(
            sample_rate=sample_rate,
            hop_size=hop_size,
            noise_amp=noise_amp,
            signal_types=signal_types,
        )

        if n_speakers > 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)
            # Assigning ``module.requires_grad = flag`` only creates a plain
            # Python attribute on the module and leaves its parameters
            # trainable; ``requires_grad_`` applies the flag to the parameters.
            self.emb_g.requires_grad_(requires_grad_emb_g)

    def forward(self, x, x_lengths, y, y_lengths, f0, slice_id, sid=None, target_ids=None):
        """Training forward pass; disabled in this copy, so it returns ``None``.

        The original implementation is kept below as a commented-out reference.
        """
        pass
        # sin, d = self.make_sin_d(f0)
        # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        # # build the target sid
        # target_sids = self.make_random_target_sids(target_ids, sid)
        # if self.n_speakers > 0:
        #     g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        #     tgt_g = self.emb_g(target_sids).unsqueeze(-1)  # [b, h, 1]
        # else:
        #     g = None
        # # PE
        # z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        # # Flow
        # z_p = self.flow(z, y_mask, g=g)
        # # VC
        # tgt_z = self.flow(z_p, y_mask, g=tgt_g, reverse=True)
        # # build the alignment
        # liner_alignment = F.one_hot(torch.arange(0, x.shape[2] + 2)).cuda()
        # liner_alignment = torch.stack([liner_alignment for _ in range(x.shape[0])], axis=0)
        # liner_alignment = F.interpolate(liner_alignment.float(), size=(z.shape[2]), mode="linear", align_corners=True)
        # liner_alignment = liner_alignment[:, 1:-1, :]
        # # match the shapes of TextEnc and PE
        # m_p = torch.matmul(m_p, liner_alignment)
        # logs_p = torch.matmul(logs_p, liner_alignment)
        # # slice
        # z_slice = slice_segments(z, slice_id, self.segment_size)
        # # slice of the target
        # tgt_z_slice = slice_segments(tgt_z, slice_id, self.segment_size)
        # # Dec
        # o = self.dec(sin, z_slice, d, sid=g)
        # tgt_o = self.dec(sin, tgt_z_slice, d, sid=tgt_g)
        # return (o, tgt_o), slice_id, x_mask, y_mask, ((z, z_p, m_p), logs_p, m_q, logs_q)

    def make_sin_d(self, f0):
        """Build the decoder's source signal and dilated factors from ``f0``.

        Shapes as stated by the original author (not verifiable from this
        method alone):
            f0  : [b, 1, t]
            sin : [b, 1, t]
            d   : [4][b, 1, t]

        Returns:
            tuple: ``(in_batch, dfs_batch)`` - the ``signal_generator`` output
            for ``f0`` and a list with one tensor per entry of
            ``dense_factors``.
        """
        prod_upsample_scales = np.cumprod(self.upsample_scales)
        dfs_batch = []
        for df, us in zip(self.dense_factors, prod_upsample_scales):
            dilated_tensor = dilated_factor(f0, self.sample_rate, df)
            # Repeat every value ``us`` times: stack ``us`` copies on a new last
            # axis, then fold that axis away by flattening everything after the
            # batch dimension.
            repeated = torch.stack([dilated_tensor for _ in range(us)], -1).reshape(dilated_tensor.shape[0], -1)
            dfs_batch.append(repeated.unsqueeze(1))
        in_batch = self.signal_generator(f0)
        return in_batch, dfs_batch

    def make_random_target_sids(self, target_ids, sid):
        """Pick, for every entry of ``sid``, a random id from ``target_ids``.

        Args:
            target_ids (Tensor): Candidate target speaker ids.
            sid (Tensor): Source speaker ids.

        Returns:
            Tensor: Same shape and dtype as ``sid``. Each entry is a random
            element of ``target_ids`` different from the matching source id,
            or the source id itself when no other candidate exists.
        """
        target_sids = torch.zeros_like(sid)
        for i, source_id in enumerate(sid):
            # Drop the entries equal to the source id so that source and target never coincide.
            candidates = target_ids[target_ids != source_id]
            if len(candidates) >= 1:
                target_sids[i] = candidates[torch.randint(len(candidates), (1,))]
            else:
                # No candidate is left, so fall back to the source id.
                target_sids[i] = source_id
        return target_sids

    def voice_conversion(self, y, y_lengths, f0, sid_src, sid_tgt):
        """Convert ``y`` from speaker ``sid_src`` to speaker ``sid_tgt``.

        Encodes ``y`` with the source embedding, maps it through the flow
        forward (source) and in reverse (target), then decodes with the
        source signal and dilated factors derived from ``f0``.

        Returns:
            The first element (index 0) of the decoder output.

        Raises:
            AssertionError: If ``n_speakers`` is not positive.
        """
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        sin, d = self.make_sin_d(f0)
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        # print("VC", sin.device, d[0].device, g_tgt.device)
        o_hat = self.dec(sin, z_hat * y_mask, d, sid=g_tgt)
        return o_hat[0]

    def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt):
        """Encode with the source embedding and decode with the target embedding (no flow).

        NOTE(review): ``self.dec`` is called here as ``dec(z, g=...)`` while
        ``voice_conversion`` calls it as ``dec(sin, z, d, sid=...)`` - confirm
        this method still matches the decoder's signature.

        Returns:
            tuple: ``(o_hat, y_mask, z)``; the third item is the tensor ``z``
            itself, not a 1-tuple.
        """
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
        o_hat = self.dec(z * y_mask, g=g_tgt)
        return o_hat, y_mask, (z)

    def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt):
        """Encode and decode with the source embedding; ``sid_tgt`` is unused.

        NOTE(review): ``self.dec`` is called here as ``dec(z, g=...)`` while
        ``voice_conversion`` calls it as ``dec(sin, z, d, sid=...)`` - confirm
        this method still matches the decoder's signature.

        Returns:
            tuple: ``(o_hat, y_mask, z)``; the third item is the tensor ``z``
            itself, not a 1-tuple.
        """
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        # g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
        o_hat = self.dec(z * y_mask, g=g_src)
        return o_hat, y_mask, (z)

    def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt):
        """Convert source -> target -> source through the flow, then decode.

        NOTE(review): ``self.dec`` is called here as ``dec(z, g=...)`` while
        ``voice_conversion`` calls it as ``dec(sin, z, d, sid=...)`` - confirm
        this method still matches the decoder's signature.

        Returns:
            tuple: ``(o_hat, y_mask, (z, z_p, z_hat))``.
        """
        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        z_p_hat = self.flow(z_hat, y_mask, g=g_tgt)
        z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True)
        o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt)
        return o_hat, y_mask, (z, z_p, z_hat)

    def save_synthesizer(self, path):
        """Save the state dicts of ``enc_q``, ``dec`` and ``emb_g`` to ``path``.

        The file holds a dict with the keys ``"enc_q"``, ``"dec"`` and ``"emb_g"``.
        """
        checkpoint = {
            "enc_q": self.enc_q.state_dict(),
            "dec": self.dec.state_dict(),
            "emb_g": self.emb_g.state_dict(),
        }
        torch.save(checkpoint, path)

    def load_synthesizer(self, path):
        """Load the state dicts written by ``save_synthesizer`` from ``path``.

        Tensors are mapped to the CPU while loading.
        """
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        checkpoint = torch.load(path, map_location="cpu")
        self.enc_q.load_state_dict(checkpoint["enc_q"])
        self.dec.load_state_dict(checkpoint["dec"])
        self.emb_g.load_state_dict(checkpoint["emb_g"])
--- a/server/voice_changer/MMVCv15/models/modules.py
+++ b/server/voice_changer/MMVCv15/models/modules.py
@ -0,0 +1,186 @@
 import torch
 from torch import nn
 from torch.nn import functional as F
 from torch.nn import Conv1d
 from torch.nn.utils import weight_norm, remove_weight_norm
 from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
 LRELU_SLOPE = 0.1
 class WN(torch.nn.Module):
    """Stack of dilated, weight-normalized 1-D convolutions with optional conditioning.

    Layer ``i`` uses dilation ``dilation_rate ** i``. Every layer's
    activation (from ``fused_add_tanh_sigmoid_multiply``) passes through a
    1x1 convolution that feeds both a residual update of the running input
    and a skip accumulator; the last layer feeds only the accumulator.

    Args:
        hidden_channels (int): Channel count of the input and of the output.
        kernel_size (int): Odd kernel size of the dilated convolutions.
        dilation_rate (int): Base of the per-layer dilation.
        n_layers (int): Number of layers.
        gin_channels (int): Channels of the conditioning input ``g``; ``0``
            means no conditioning layer is created.
        p_dropout (float): Dropout probability applied to the activations.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super().__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        # Stored as a 1-tuple (note the trailing comma).
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # A single 1x1 convolution produces the conditioning slices of all layers at once.
            self.cond_layer = torch.nn.utils.weight_norm(
                torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1),
                name="weight",
            )

        last_index = n_layers - 1
        for layer_index in range(n_layers):
            dilation = dilation_rate**layer_index
            padding = int(dilation * (kernel_size - 1) / 2)
            dilated_conv = torch.nn.Conv1d(
                hidden_channels,
                2 * hidden_channels,
                kernel_size,
                dilation=dilation,
                padding=padding,
            )
            self.in_layers.append(torch.nn.utils.weight_norm(dilated_conv, name="weight"))

            # last one is not necessary: the final layer only emits the skip half.
            if layer_index >= last_index:
                res_skip_channels = hidden_channels
            else:
                res_skip_channels = 2 * hidden_channels
            pointwise = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            self.res_skip_layers.append(torch.nn.utils.weight_norm(pointwise, name="weight"))

    def forward(self, x, x_mask, g=None, **kwargs):
        """Apply all layers and return the masked sum of the skip outputs.

        Args:
            x (Tensor): Input with ``hidden_channels`` channels on dimension 1.
            x_mask (Tensor): Mask multiplied into the running input and the output.
            g (Tensor, optional): Conditioning input passed through ``cond_layer``.

        Returns:
            Tensor: Accumulated skip output multiplied by ``x_mask``.
        """
        hidden = self.hidden_channels
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([hidden])
        if g is not None:
            g = self.cond_layer(g)

        last_index = self.n_layers - 1
        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                start = i * 2 * hidden
                g_l = g[:, start : start + 2 * hidden, :]

            acts = self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))
            res_skip_acts = self.res_skip_layers[i](acts)

            if i >= last_index:
                output = output + res_skip_acts
                continue
            # First half updates the running input, second half goes to the skip sum.
            x = (x + res_skip_acts[:, :hidden, :]) * x_mask
            output = output + res_skip_acts[:, hidden:, :]
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight normalization from the conditioning, input and res/skip layers."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for layer in [*self.in_layers, *self.res_skip_layers]:
            torch.nn.utils.remove_weight_norm(layer)
 class ResBlock1(torch.nn.Module):
    """Residual block with three pairs of weight-normalized convolutions.

    In each pair the first convolution uses one of the first three entries of
    ``dilation`` and the second always uses dilation 1.

    Args:
        channels (int): Input and output channels of every convolution.
        kernel_size (int): Kernel size of every convolution.
        dilation (Sequence[int]): Dilations of the first convolution of each
            pair; indices 0, 1 and 2 are read.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()

        def make_conv(rate):
            # Stride-1 convolution padded via get_padding for this dilation.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        self.convs1 = nn.ModuleList([make_conv(dilation[idx]) for idx in range(3)])
        self.convs1.apply(init_weights)
        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs.

        Args:
            x (Tensor): Input tensor.
            x_mask (Tensor, optional): When given, multiplied in before every
                convolution and into the final result.

        Returns:
            Tensor: Output of the last residual addition (masked if ``x_mask`` is given).
        """
        use_mask = x_mask is not None
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                branch = branch * x_mask
            branch = F.leaky_relu(dilated_conv(branch), LRELU_SLOPE)
            if use_mask:
                branch = branch * x_mask
            x = plain_conv(branch) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution of the block."""
        for conv in [*self.convs1, *self.convs2]:
            remove_weight_norm(conv)
 class ResBlock2(torch.nn.Module):
    """Residual block with two weight-normalized dilated convolutions.

    Args:
        channels (int): Input and output channels of every convolution.
        kernel_size (int): Kernel size of every convolution.
        dilation (Sequence[int]): Dilations of the two convolutions; indices
            0 and 1 are read.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()

        def make_conv(rate):
            # Stride-1 convolution padded via get_padding for this dilation.
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        self.convs = nn.ModuleList([make_conv(dilation[idx]) for idx in range(2)])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions.

        Args:
            x (Tensor): Input tensor.
            x_mask (Tensor, optional): When given, multiplied in before every
                convolution and into the final result.

        Returns:
            Tensor: Output of the last residual addition (masked if ``x_mask`` is given).
        """
        use_mask = x_mask is not None
        for conv in self.convs:
            branch = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                branch = branch * x_mask
            x = conv(branch) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)
 class Flip(nn.Module):
    """Reverse the order of the entries along dimension 1."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Flip ``x`` along dimension 1.

        Args:
            x (Tensor): Input tensor.
            reverse (bool): When truthy only the flipped tensor is returned.

        Returns:
            The flipped tensor when ``reverse`` is truthy; otherwise a tuple of
            the flipped tensor and an all-zero ``logdet`` with one entry per
            element of dimension 0, matching the input's dtype and device.
        """
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
 class ResidualCouplingLayer(nn.Module):
    """Coupling layer: the first half of the channels parameterizes an affine map of the second half.

    Args:
        channels (int): Number of input channels; must be even.
        hidden_channels (int): Channels used inside the ``WN`` encoder.
        kernel_size (int): Kernel size forwarded to ``WN``.
        dilation_rate (int): Dilation rate forwarded to ``WN``.
        n_layers (int): Number of ``WN`` layers.
        p_dropout (float): Dropout probability forwarded to ``WN``.
        gin_channels (int): Conditioning channels forwarded to ``WN``.
        mean_only (bool): When truthy only the shift is predicted and the log-scale is fixed at zero.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        half = channels // 2
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = half
        self.mean_only = mean_only

        self.pre = nn.Conv1d(half, hidden_channels, 1)
        self.enc = WN(
            hidden_channels,
            kernel_size,
            dilation_rate,
            n_layers,
            p_dropout=p_dropout,
            gin_channels=gin_channels,
        )
        # One output group (shift) when mean_only, two (shift and log-scale) otherwise.
        self.post = nn.Conv1d(hidden_channels, half * (2 - mean_only), 1)
        # Zero-initialized so the predicted statistics start at zero.
        for param in (self.post.weight, self.post.bias):
            param.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Transform the second half of ``x`` conditioned on the first half.

        Args:
            x (Tensor): Input with ``channels`` channels on dimension 1.
            x_mask (Tensor): Mask multiplied into the intermediate results.
            g (Tensor, optional): Conditioning input forwarded to ``WN``.
            reverse (bool): Apply the inverse map when truthy.

        Returns:
            ``(x, logdet)`` in the forward direction, where ``logdet`` is the
            sum of the log-scales over dimensions 1 and 2; only ``x`` when
            ``reverse`` is truthy.
        """
        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
        hidden = self.pre(x0) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.post(hidden) * x_mask

        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, [self.half_channels] * 2, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        merged = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return merged, logdet
--- a/server/voice_changer/MMVCv15/models/readme.txt
+++ b/server/voice_changer/MMVCv15/models/readme.txt
@ -0,0 +1 @@
 Modules in this folder are taken from https://github.com/isletennos/MMVC_Client.git at commit 461cb231b57cbb17243110eaac8435d9cca24a26.
--- a/server/voice_changer/MMVCv15/models/residual_block.py
+++ b/server/voice_changer/MMVCv15/models/residual_block.py
@ -0,0 +1,257 @@
 # -*- coding: utf-8 -*-
 # Copyright 2022 Reo Yoneyama (Nagoya University)
 #  MIT License (https://opensource.org/licenses/MIT)
 """Residual block modules.
 References:
    - https://github.com/kan-bayashi/ParallelWaveGAN
    - https://github.com/bigpon/QPPWG
    - https://github.com/r9y9/wavenet_vocoder
 """
 from logging import getLogger
 import torch
 import torch.nn as nn
 from .snake import Snake
 from .index import index_initial, pd_indexing
 # A logger for this file
 logger = getLogger(__name__)
 class Conv1d(nn.Conv1d):
    """``nn.Conv1d`` whose parameters are reset to Kaiming-normal weights and a zero bias."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``nn.Conv1d``."""
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        """Draw the weight with ``kaiming_normal_`` (ReLU gain) and zero the bias, if any."""
        nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
        if self.bias is None:
            return
        nn.init.constant_(self.bias, 0.0)
 class Conv1d1x1(Conv1d):
    """Kernel-size-1 variant of the customized ``Conv1d``."""

    def __init__(self, in_channels, out_channels, bias=True):
        """Build a 1x1 convolution.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bias (bool): Whether the convolution has a bias parameter.
        """
        super().__init__(
            in_channels,
            out_channels,
            kernel_size=1,
            padding=0,
            dilation=1,
            bias=bias,
        )
 class Conv2d(nn.Conv2d):
    """``nn.Conv2d`` whose parameters are reset to fan-out Kaiming-normal weights and a zero bias."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to ``nn.Conv2d``."""
        super().__init__(*args, **kwargs)

    def reset_parameters(self):
        """Draw the weight with ``kaiming_normal_`` (fan-out mode, ReLU gain) and zero the bias, if any."""
        nn.init.kaiming_normal_(self.weight, mode="fan_out", nonlinearity="relu")
        if self.bias is None:
            return
        nn.init.constant_(self.bias, 0.0)
 class Conv2d1x1(Conv2d):
    """Kernel-size-1 variant of the customized ``Conv2d``."""

    def __init__(self, in_channels, out_channels, bias=True):
        """Build a 1x1 convolution.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            bias (bool): Whether the convolution has a bias parameter.
        """
        super().__init__(
            in_channels,
            out_channels,
            kernel_size=1,
            padding=0,
            dilation=1,
            bias=bias,
        )
 class ResidualBlock(nn.Module):
    """Residual block module in HiFiGAN."""

    def __init__(
        self,
        kernel_size=3,
        channels=512,
        dilations=(1, 3, 5),
        bias=True,
        use_additional_convs=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
    ):
        """Build one activation + dilated convolution branch per dilation.

        Args:
            kernel_size (int): Odd kernel size of the convolution layers.
            channels (int): Input and output channels of every convolution.
            dilations (List[int]): One dilation factor per residual step.
            bias (bool): Whether the convolutions have a bias parameter.
            use_additional_convs (bool): Whether each step gets a second,
                non-dilated activation + convolution branch.
            nonlinear_activation (str): ``"Snake"`` or the name of an
                activation class in ``torch.nn``.
            nonlinear_activation_params (dict): Keyword arguments of the activation.
        """
        super().__init__()
        self.use_additional_convs = use_additional_convs
        self.convs1 = nn.ModuleList()
        if use_additional_convs:
            self.convs2 = nn.ModuleList()
        assert kernel_size % 2 == 1, "Kernel size must be odd number."

        def new_activation():
            # "Snake" is the project's own module; anything else is looked up in torch.nn.
            if nonlinear_activation == "Snake":
                return Snake(channels, **nonlinear_activation_params)
            return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)

        def new_branch(rate):
            # Activation followed by a length-preserving convolution with the given dilation.
            return nn.Sequential(
                new_activation(),
                nn.Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    dilation=rate,
                    bias=bias,
                    padding=(kernel_size - 1) // 2 * rate,
                ),
            )

        for dilation in dilations:
            self.convs1.append(new_branch(dilation))
            if use_additional_convs:
                self.convs2.append(new_branch(1))

    def forward(self, x):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, channels, T).

        Returns:
            Tensor: Output tensor (B, channels, T).
        """
        for idx, branch in enumerate(self.convs1):
            residual = branch(x)
            if self.use_additional_convs:
                residual = self.convs2[idx](residual)
            x = residual + x
        return x
 class AdaptiveResidualBlock(nn.Module):
    """Residual block whose convolution taps are gathered by ``pd_indexing`` from the factors ``d``."""

    def __init__(
        self,
        kernel_size=3,
        channels=512,
        dilations=(1, 2, 4),
        bias=True,
        use_additional_convs=True,
        nonlinear_activation="LeakyReLU",
        nonlinear_activation_params={"negative_slope": 0.1},
    ):
        """Initialize AdaptiveResidualBlock module.

        Args:
            kernel_size (int): Kernel size; only 3 is accepted.
            channels (int): Number of channels for convolution layer.
            dilations (Sequence[int]): One dilation per residual step; each is
                passed to ``pd_indexing`` in ``forward``.
            bias (bool): Whether to add bias parameter in convolution layers.
            use_additional_convs (bool): Whether each step ends with an extra
                activation + convolution branch.
            nonlinear_activation (str): Activation function module name.
            nonlinear_activation_params (dict): Hyperparameters for activation function.
        """
        super().__init__()
        self.use_additional_convs = use_additional_convs
        assert kernel_size == 3, "Currently only kernel_size = 3 is supported."
        self.channels = channels
        self.dilations = dilations
        self.nonlinears = nn.ModuleList()
        self.convsC = nn.ModuleList()
        self.convsP = nn.ModuleList()
        self.convsF = nn.ModuleList()
        if use_additional_convs:
            self.convsA = nn.ModuleList()

        def new_activation():
            # "Snake" is the project's own module; anything else is looked up in torch.nn.
            if nonlinear_activation == "Snake":
                return Snake(channels, **nonlinear_activation_params)
            return getattr(nn, nonlinear_activation)(**nonlinear_activation_params)

        for _ in dilations:
            self.nonlinears.append(new_activation())
            # Three 1x1 convolutions: one for the activation itself and one for
            # each of the two tensors returned by pd_indexing.
            self.convsC.append(Conv1d1x1(channels, channels, bias=bias))
            self.convsP.append(Conv1d1x1(channels, channels, bias=bias))
            self.convsF.append(Conv1d1x1(channels, channels, bias=bias))
            if not use_additional_convs:
                continue
            self.convsA.append(
                nn.Sequential(
                    new_activation(),
                    nn.Conv1d(
                        channels,
                        channels,
                        kernel_size,
                        dilation=1,
                        bias=bias,
                        padding=(kernel_size - 1) // 2,
                    ),
                )
            )

    def forward(self, x, d):
        """Calculate forward propagation.

        Args:
            x (Tensor): Input tensor (B, channels, T).
            d (Tensor): Input pitch-dependent dilated factors (B, 1, T).

        Returns:
            Tensor: Output tensor (B, channels, T).
        """
        batch_index, ch_index = index_initial(x.size(0), self.channels, tensor=False)
        batch_index = torch.tensor(batch_index).to(x.device)
        ch_index = torch.tensor(ch_index).to(x.device)

        for step, dilation in enumerate(self.dilations):
            activated = self.nonlinears[step](x)
            x_p, x_f = pd_indexing(activated, d, dilation, batch_index, ch_index)
            residual = self.convsC[step](activated) + self.convsP[step](x_p) + self.convsF[step](x_f)
            if self.use_additional_convs:
                residual = self.convsA[step](residual)
            x = residual + x
        return x
--- a/server/voice_changer/MMVCv15/models/snake.py
+++ b/server/voice_changer/MMVCv15/models/snake.py
@ -0,0 +1,47 @@
 # -*- coding: utf-8 -*-
 # Copyright 2022 Reo Yoneyama (Nagoya University)
 #  MIT License (https://opensource.org/licenses/MIT)
 """Snake Activation Function Module.
 References:
    - Neural Networks Fail to Learn Periodic Functions and How to Fix It
        https://arxiv.org/pdf/2006.08195.pdf
    - BigVGAN: A Universal Neural Vocoder with Large-Scale Training
        https://arxiv.org/pdf/2206.04658.pdf
 """
 import torch
 import torch.nn as nn
 class Snake(nn.Module):
    """Snake activation function module."""

    def __init__(self, channels, init=50):
        """Create the learnable per-channel parameter ``alpha``.

        Args:
            channels (int): Number of feature channels.
            init (float): Initial value of the learnable parameter alpha.
                          According to the original paper, 5 ~ 50 would be
                          suitable for periodic data (i.e. voices).
        """
        super().__init__()
        # Shape (1, channels, 1) so alpha broadcasts over the first and last dimensions.
        self.alpha = nn.Parameter(torch.ones(1, channels, 1) * init)

    def forward(self, x):
        """Compute ``x + sin(alpha * x) ** 2 / alpha``.

        Args:
            x (Tensor): Input tensor (B, channels, T).

        Returns:
            Tensor: Output tensor (B, channels, T).
        """
        alpha = self.alpha
        periodic = torch.sin(alpha * x) ** 2
        return x + periodic / alpha
		`@ -0,0 +1 @@`
							`modules in this folder from https://github.com/isletennos/MMVC_Client.git at 461cb231b57cbb17243110eaac8435d9cca24a26`