WIP: integrate vcs to new gui 3

2025-01-23 13:35:12 +03:00 · 2023-06-22 06:56:00 +09:00 · 2023-06-22 06:56:00 +09:00 · d83590dc35
commit d83590dc35
parent fa7894de50
10 changed files with 1552 additions and 7 deletions
--- a/server/voice_changer/MMVCv15/MMVCv15.py
+++ b/server/voice_changer/MMVCv15/MMVCv15.py
@ -20,7 +20,7 @@ import torch
 import onnxruntime
 import pyworld as pw

-from models import SynthesizerTrn  # type:ignore
+from voice_changer.MMVCv15.models.models import SynthesizerTrn  # type:ignore
 from voice_changer.MMVCv15.client_modules import (
    convert_continuos_f0,
    spectrogram_torch,
@ -156,8 +156,7 @@ class MMVCv15:
    def get_info(self):
        data = asdict(self.settings)

-        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.settings.onnxModelFile != "" and self.settings.onnxModelFile is not None else []
-
+        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
        return data

    def get_processing_sampling_rate(self):
@ -231,10 +230,6 @@ class MMVCv15:
        return [spec, f0, sid]

    def _onnx_inference(self, data):
-        if self.settings.onnxModelFile == "" and self.settings.onnxModelFile is None:
-            print("[Voice Changer] No ONNX session.")
-            raise NoModeLoadedException("ONNX")
-
        spec, f0, sid_src = data
        spec = spec.unsqueeze(0)
        spec_lengths = torch.tensor([spec.size(2)])
--- a/server/voice_changer/MMVCv15/models/commons.py
+++ b/server/voice_changer/MMVCv15/models/commons.py
@ -0,0 +1,27 @@
+import torch
+
+
+def init_weights(m, mean=0.0, std=0.01):
+    """Re-initialise the weight of a convolution-like module in place.
+
+    A module qualifies when its class name contains ``"Conv"``; its
+    ``weight`` is then refilled with samples drawn from a normal
+    distribution. Any other module is left untouched.
+
+    Args:
+        m: Module to inspect (and possibly re-initialise).
+        mean (float): Mean passed to ``normal_``.
+        std (float): Standard deviation passed to ``normal_``.
+    """
+    is_conv_like = "Conv" in m.__class__.__name__
+    if not is_conv_like:
+        return
+    m.weight.data.normal_(mean, std)
+
+
+def get_padding(kernel_size, dilation=1):
+    """Return ``int((kernel_size * dilation - dilation) / 2)``.
+
+    For an odd ``kernel_size`` this is the per-side padding that keeps the
+    length unchanged in a stride-1 convolution with the given dilation.
+
+    Args:
+        kernel_size (int): Convolution kernel size.
+        dilation (int): Convolution dilation.
+
+    Returns:
+        int: Padding, truncated towards zero.
+    """
+    dilated_span = kernel_size * dilation - dilation
+    return int(dilated_span / 2)
+
+
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    """Add two tensors and gate the sum with tanh * sigmoid along dim 1.
+
+    The first ``n_channels[0]`` entries of dim 1 of ``input_a + input_b`` go
+    through ``tanh``, the remaining entries go through ``sigmoid``, and the
+    two results are multiplied element-wise.
+
+    Args:
+        input_a (Tensor): First addend; indexed with three dimensions.
+        input_b (Tensor): Second addend, added to ``input_a``.
+        n_channels: Indexable whose element 0 is the split position on dim 1.
+
+    Returns:
+        Tensor: ``tanh(first part) * sigmoid(second part)``.
+    """
+    split_at = n_channels[0]
+    summed = input_a + input_b
+    return torch.tanh(summed[:, :split_at, :]) * torch.sigmoid(summed[:, split_at:, :])
+
+
+def sequence_mask(length, max_length=None):
+    """Build a boolean mask that is True at positions below each length.
+
+    Args:
+        length (Tensor): Lengths, one per row of the result.
+        max_length: Number of mask columns; defaults to ``length.max()``.
+
+    Returns:
+        Tensor: Mask where entry ``[b, t]`` is ``t < length[b]``.
+    """
+    n_columns = length.max() if max_length is None else max_length
+    positions = torch.arange(n_columns, dtype=length.dtype, device=length.device)
+    return positions[None, :] < length[:, None]
--- a/server/voice_changer/MMVCv15/models/features.py
+++ b/server/voice_changer/MMVCv15/models/features.py
@ -0,0 +1,200 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Reo Yoneyama (Nagoya University)
+#  MIT License (https://opensource.org/licenses/MIT)
+
+"""Feature-related functions.
+
+References:
+    - https://github.com/bigpon/QPPWG
+    - https://github.com/nii-yamagishilab/project-NN-Pytorch-scripts
+
+"""
+
+import sys
+from logging import getLogger
+
+import numpy as np
+import torch
+from torch.nn.functional import interpolate
+
+# A logger for this file
+logger = getLogger(__name__)
+
+
+def validate_length(xs, ys=None, hop_size=None):
+    """Trim feature arrays (and optionally audio arrays) to matching lengths.
+
+    All feature arrays are cut to the shortest feature length. When audio
+    arrays are given as well, the feature length is reduced further if the
+    shortest audio holds fewer than ``features * hop_size`` samples, and the
+    audio is cut to exactly ``features * hop_size`` samples.
+
+    Args:
+        xs (list): Arrays of features, trimmed along axis 0.
+        ys (list): Optional arrays of audio, trimmed along axis 0.
+        hop_size (int): Audio samples per feature frame; needed when ``ys``
+            is given.
+
+    Returns:
+        list: Trimmed features, followed by the trimmed audios when ``ys``
+            was given.
+
+    """
+    n_frames = min(x.shape[0] for x in xs)
+    if ys is None:
+        return [x[:n_frames] for x in xs]
+
+    n_samples = min(y.shape[0] for y in ys)
+    if n_samples < n_frames * hop_size:
+        # audio is too short for all frames: drop the trailing frames
+        n_frames = n_samples // hop_size
+    if n_samples > n_frames * hop_size:
+        # audio is longer than the frames cover: drop the trailing samples
+        n_samples = n_frames * hop_size
+    trimmed_ys = [y[:n_samples] for y in ys]
+    trimmed_xs = [x[:n_frames] for x in xs]
+    return trimmed_xs + trimmed_ys
+
+
+def dilated_factor(batch_f0, fs, dense_factor):
+    """Compute pitch-dependent dilated factors ``fs / dense_factor / f0``.
+
+    Note:
+        ``batch_f0`` is modified in place: every entry equal to zero is
+        overwritten with ``fs / dense_factor`` before the division, so the
+        factor at those positions is 1.
+
+    Args:
+        batch_f0 (Tensor): The f0 sequence; must work with
+            ``torch.ones_like`` and boolean-mask assignment.
+        fs (int): Sampling rate.
+        dense_factor (int): The number of taps in one cycle.
+
+    Return:
+        Tensor: The pitch-dependent dilated factors, same shape as
+            ``batch_f0``.
+
+    """
+    zero_f0 = batch_f0 == 0
+    batch_f0[zero_f0] = fs / dense_factor
+    numerator = torch.ones_like(batch_f0) * fs / dense_factor
+    return numerator / batch_f0
+
+
+class SignalGenerator:
+    """Input signal generator module.
+
+    Stretches a frame-level F0 contour by ``hop_size`` and derives from it
+    the signals requested in ``signal_types``. Calling the instance returns
+    those signals concatenated along dim 1, in ``signal_types`` order.
+    """
+
+    # Signal type names accepted by the constructor.
+    _SUPPORTED_TYPES = ("noise", "sine", "sines", "uv")
+
+    def __init__(
+        self,
+        sample_rate=24000,
+        hop_size=120,
+        sine_amp=0.1,
+        noise_amp=0.003,
+        signal_types=("sine", "noise"),
+    ):
+        """Initialize SignalGenerator module.
+
+        Args:
+            sample_rate (int): Sampling rate.
+            hop_size (int): Hop size of input F0.
+            sine_amp (float): Sine amplitude for NSF-based sine generation.
+            noise_amp (float): Noise amplitude for NSF-based sine generation.
+            signal_types (list): List of input signal types for generator.
+                Each entry must be "noise", "sine", "sines" or "uv".
+
+        Raises:
+            ValueError: If ``signal_types`` contains an unsupported name.
+
+        """
+        self.sample_rate = sample_rate
+        self.hop_size = hop_size
+        # Keep a private copy so instances never share the caller's (or the
+        # default) sequence object.
+        self.signal_types = list(signal_types)
+        self.sine_amp = sine_amp
+        self.noise_amp = noise_amp
+
+        for signal_type in self.signal_types:
+            if signal_type not in self._SUPPORTED_TYPES:
+                # Raise instead of terminating the whole process: this class
+                # is library code running inside a long-lived server.
+                raise ValueError(f"{signal_type} is not supported type for generator input.")
+
+    @torch.no_grad()
+    def __call__(self, f0, f0_scale=1.0):
+        """Generate and concatenate all configured signals.
+
+        Args:
+            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
+            f0_scale (float): Factor the concatenated signals are multiplied by.
+
+        Returns:
+            Tensor: Signals concatenated along dim 1, multiplied by ``f0_scale``.
+
+        """
+        builders = {
+            "noise": self.random_noise,
+            "sine": self.sinusoid,
+            "sines": self.sinusoids,
+            "uv": self.vuv_binary,
+        }
+        signals = [builders[typ](f0) for typ in self.signal_types if typ in builders]
+        return torch.cat(signals, dim=1) * f0_scale
+
+    def _voiced_mask(self, f0, n_samples):
+        """Return the ``f0 > 0`` indicator (as 0/1 in f0's dtype) resized to ``n_samples``."""
+        return interpolate((f0 > 0) * torch.ones_like(f0), n_samples)
+
+    def _add_noise(self, signal, vuv):
+        """Add Gaussian noise to ``signal`` when ``noise_amp`` is positive.
+
+        The noise scale is ``noise_amp`` where ``vuv`` is 1 and
+        ``noise_amp / 3`` where ``vuv`` is 0.
+        """
+        if self.noise_amp > 0:
+            noise_amp = vuv * self.noise_amp + (1.0 - vuv) * self.noise_amp / 3.0
+            noise = torch.randn((vuv.size(0), 1, vuv.size(2)), device=vuv.device) * noise_amp
+            signal = signal + noise
+        return signal
+
+    @torch.no_grad()
+    def random_noise(self, f0):
+        """Calculate noise signals.
+
+        Args:
+            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
+
+        Returns:
+            Tensor: Gaussian noise signals (B, 1, T).
+
+        """
+        B, _, T = f0.size()
+        return torch.randn((B, 1, T * self.hop_size), device=f0.device)
+
+    @torch.no_grad()
+    def sinusoid(self, f0):
+        """Calculate sine signals.
+
+        Args:
+            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
+
+        Returns:
+            Tensor: Sines generated following NSF (B, 1, T).
+
+        """
+        _, _, T = f0.size()
+        n_samples = T * self.hop_size
+        vuv = self._voiced_mask(f0, n_samples)
+        # per-sample phase increment in cycles, wrapped into [0, 1)
+        phase_step = (interpolate(f0, n_samples) / self.sample_rate) % 1
+        sine = vuv * torch.sin(torch.cumsum(phase_step, dim=2) * 2 * np.pi) * self.sine_amp
+        return self._add_noise(sine, vuv)
+
+    @torch.no_grad()
+    def sinusoids(self, f0):
+        """Calculate sines.
+
+        Args:
+            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
+
+        Returns:
+            Tensor: Sines generated following NSF (B, 1, T).
+
+        """
+        _, _, T = f0.size()
+        n_samples = T * self.hop_size
+        vuv = self._voiced_mask(f0, n_samples)
+        f0 = interpolate(f0, n_samples)
+        sines = torch.zeros_like(f0, device=f0.device)
+        harmonics = 5  # currently only fixed number of harmonics is supported
+        for i in range(harmonics):
+            phase_step = (f0 * (i + 1) / self.sample_rate) % 1
+            sines += torch.sin(torch.cumsum(phase_step, dim=2) * 2 * np.pi)
+        sines = self.sine_amp * sines * vuv / harmonics
+        return self._add_noise(sines, vuv)
+
+    @torch.no_grad()
+    def vuv_binary(self, f0):
+        """Calculate V/UV binary sequences.
+
+        Args:
+            f0 (Tensor): F0 tensor (B, 1, T // hop_size).
+
+        Returns:
+            Tensor: V/UV binary sequences (B, 1, T).
+
+        """
+        _, _, T = f0.size()
+        return self._voiced_mask(f0, T * self.hop_size)
--- a/server/voice_changer/MMVCv15/models/generator.py
+++ b/server/voice_changer/MMVCv15/models/generator.py
@ -0,0 +1,312 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Reo Yoneyama (Nagoya University)
+#  MIT License (https://opensource.org/licenses/MIT)
+
+"""HiFiGAN and SiFiGAN Generator modules.
+
+References:
+    - https://github.com/kan-bayashi/ParallelWaveGAN
+    - https://github.com/bigpon/QPPWG
+    - https://github.com/jik876/hifi-gan
+
+"""
+
+from logging import getLogger
+
+import torch.nn as nn
+from .residual_block import AdaptiveResidualBlock, Conv1d, ResidualBlock
+
+# A logger for this file
+logger = getLogger(__name__)
+
+
+class SiFiGANGenerator(nn.Module):
+    """SiFiGAN generator module.
+
+    The module keeps two sub-networks in ``nn.ModuleDict`` containers:
+    ``self.sn`` (configured by ``source_network_params``) and ``self.fn``
+    (configured by ``filter_network_params``). ``forward`` runs ``sn`` first
+    and feeds its last hidden tensor into ``fn``.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels=1,
+        channels=512,
+        kernel_size=7,
+        upsample_scales=(5, 4, 3, 2),
+        upsample_kernel_sizes=(10, 8, 6, 4),
+        source_network_params={
+            "resblock_kernel_size": 3,  # currently only 3 is supported.
+            "resblock_dilations": [(1,), (1, 2), (1, 2, 4), (1, 2, 4, 8)],
+            "use_additional_convs": True,
+        },
+        filter_network_params={
+            "resblock_kernel_sizes": (3, 5, 7),
+            "resblock_dilations": [(1, 3, 5), (1, 3, 5), (1, 3, 5)],
+            "use_additional_convs": False,
+        },
+        share_upsamples=False,
+        share_downsamples=False,
+        bias=True,
+        nonlinear_activation="LeakyReLU",
+        nonlinear_activation_params={"negative_slope": 0.1},
+        use_weight_norm=True,
+        requires_grad=True,
+    ):
+        """Initialize SiFiGANGenerator module.
+
+        Args:
+            in_channels (int): Number of input channels.
+            out_channels (int): Number of output channels.
+            channels (int): Number of hidden representation channels.
+            kernel_size (int): Kernel size of initial and final conv layer.
+                Must be odd.
+            upsample_scales (list): List of upsampling scales.
+            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
+                Each entry must equal twice the matching upsampling scale.
+            source_network_params (dict): Parameters for source-network.
+            filter_network_params (dict): Parameters for filter-network.
+            share_upsamples (bool): Whether to share up-sampling transposed CNNs.
+            share_downsamples (bool): Whether to share down-sampling CNNs.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+            use_weight_norm (bool): Whether to use weight norm.
+                If set to true, it will be applied to all of the conv layers.
+            requires_grad (bool): If False, ``requires_grad`` is switched off
+                on every parameter of this module after construction.
+
+        Note:
+            The dict defaults are shared between calls. They are stored on
+            the instance but never modified inside this class.
+
+        """
+        super().__init__()
+        # check hyperparameters are valid
+        assert kernel_size % 2 == 1, "Kernel size must be odd number."
+        assert len(upsample_scales) == len(upsample_kernel_sizes)
+
+        # define modules
+        self.num_upsamples = len(upsample_kernel_sizes)
+        self.source_network_params = source_network_params
+        self.filter_network_params = filter_network_params
+        self.share_upsamples = share_upsamples
+        self.share_downsamples = share_downsamples
+        self.sn = nn.ModuleDict()
+        self.fn = nn.ModuleDict()
+        self.input_conv = Conv1d(
+            in_channels,
+            channels,
+            kernel_size,
+            bias=bias,
+            padding=(kernel_size - 1) // 2,
+        )
+        self.sn["upsamples"] = nn.ModuleList()
+        self.fn["upsamples"] = nn.ModuleList()
+        self.sn["blocks"] = nn.ModuleList()
+        self.fn["blocks"] = nn.ModuleList()
+        # stage i halves the channel width: channels // 2**i -> channels // 2**(i + 1)
+        for i in range(len(upsample_kernel_sizes)):
+            # every transposed conv must use a kernel exactly twice its stride
+            assert upsample_kernel_sizes[i] == 2 * upsample_scales[i]
+            self.sn["upsamples"] += [
+                nn.Sequential(
+                    getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
+                    nn.ConvTranspose1d(
+                        channels // (2**i),
+                        channels // (2 ** (i + 1)),
+                        upsample_kernel_sizes[i],
+                        upsample_scales[i],
+                        padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
+                        output_padding=upsample_scales[i] % 2,
+                        bias=bias,
+                    ),
+                )
+            ]
+            # fn gets its own up-sampling stack unless it reuses sn's one
+            # (forward picks sn["upsamples"] when share_upsamples is set)
+            if not share_upsamples:
+                self.fn["upsamples"] += [
+                    nn.Sequential(
+                        getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
+                        nn.ConvTranspose1d(
+                            channels // (2**i),
+                            channels // (2 ** (i + 1)),
+                            upsample_kernel_sizes[i],
+                            upsample_scales[i],
+                            padding=upsample_scales[i] // 2 + upsample_scales[i] % 2,
+                            output_padding=upsample_scales[i] % 2,
+                            bias=bias,
+                        ),
+                    )
+                ]
+            # one adaptive block per stage for the source network
+            self.sn["blocks"] += [
+                AdaptiveResidualBlock(
+                    kernel_size=source_network_params["resblock_kernel_size"],
+                    channels=channels // (2 ** (i + 1)),
+                    dilations=source_network_params["resblock_dilations"][i],
+                    bias=bias,
+                    use_additional_convs=source_network_params["use_additional_convs"],
+                    nonlinear_activation=nonlinear_activation,
+                    nonlinear_activation_params=nonlinear_activation_params,
+                )
+            ]
+            # one block per kernel size for the filter network; they are stored
+            # flat, so forward reads stage i / kernel j at index i * num_blocks + j
+            for j in range(len(filter_network_params["resblock_kernel_sizes"])):
+                self.fn["blocks"] += [
+                    ResidualBlock(
+                        kernel_size=filter_network_params["resblock_kernel_sizes"][j],
+                        channels=channels // (2 ** (i + 1)),
+                        dilations=filter_network_params["resblock_dilations"][j],
+                        bias=bias,
+                        use_additional_convs=filter_network_params["use_additional_convs"],
+                        nonlinear_activation=nonlinear_activation,
+                        nonlinear_activation_params=nonlinear_activation_params,
+                    )
+                ]
+        # `i` still holds the last stage index from the loop above, so the
+        # input width of both output convs is channels // 2**num_upsamples.
+        # NOTE(review): these use nn.LeakyReLU() with its default arguments,
+        # not nonlinear_activation / nonlinear_activation_params - confirm
+        # that this is intended.
+        self.sn["output_conv"] = nn.Sequential(
+            nn.LeakyReLU(),
+            nn.Conv1d(
+                channels // (2 ** (i + 1)),
+                out_channels,
+                kernel_size,
+                bias=bias,
+                padding=(kernel_size - 1) // 2,
+            ),
+        )
+        self.fn["output_conv"] = nn.Sequential(
+            nn.LeakyReLU(),
+            nn.Conv1d(
+                channels // (2 ** (i + 1)),
+                out_channels,
+                kernel_size,
+                bias=bias,
+                padding=(kernel_size - 1) // 2,
+            ),
+            nn.Tanh(),
+        )
+
+        # sine embedding layers
+        self.sn["emb"] = Conv1d(
+            1,
+            channels // (2 ** len(upsample_kernel_sizes)),
+            kernel_size,
+            bias=bias,
+            padding=(kernel_size - 1) // 2,
+        )
+        # down-sampling CNNs
+        # built from the last stage down to stage 1 (stage 0 gets none); each
+        # one doubles the channel width: channels // 2**(i + 1) -> channels // 2**i
+        self.sn["downsamples"] = nn.ModuleList()
+        for i in reversed(range(1, len(upsample_kernel_sizes))):
+            self.sn["downsamples"] += [
+                nn.Sequential(
+                    nn.Conv1d(
+                        channels // (2 ** (i + 1)),
+                        channels // (2**i),
+                        upsample_kernel_sizes[i],
+                        upsample_scales[i],
+                        padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0),
+                        bias=bias,
+                    ),
+                    getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
+                )
+            ]
+        # fn["downsamples"] only exists when the down-samplers are not shared
+        if not share_downsamples:
+            self.fn["downsamples"] = nn.ModuleList()
+            for i in reversed(range(1, len(upsample_kernel_sizes))):
+                self.fn["downsamples"] += [
+                    nn.Sequential(
+                        nn.Conv1d(
+                            channels // (2 ** (i + 1)),
+                            channels // (2**i),
+                            upsample_kernel_sizes[i],
+                            upsample_scales[i],
+                            padding=upsample_scales[i] - (upsample_kernel_sizes[i] % 2 == 0),
+                            bias=bias,
+                        ),
+                        getattr(nn, nonlinear_activation)(**nonlinear_activation_params),
+                    )
+                ]
+
+        # apply weight norm
+        if use_weight_norm:
+            self.apply_weight_norm()
+
+        # reset parameters
+        # (runs after weight norm has been applied, when it is enabled)
+        self.reset_parameters()
+
+        # freeze every parameter when the caller asked for a non-trainable module
+        if requires_grad is False:
+            for param in self.parameters():
+                param.requires_grad = False
+
+    def forward(self, x, c, d, sid):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input sine signal (B, 1, T).
+            c (Tensor): Input tensor (B, in_channels, T).
+            d (List): F0-dependent dilation factors [(B, 1, T) x num_upsamples].
+            sid: Accepted for interface compatibility; it is not referenced
+                anywhere in this method.
+
+        Returns:
+            tuple: ``(c, e_)`` where ``c`` is the output of
+                ``fn["output_conv"]`` (which ends in ``Tanh``) and ``e_`` is
+                the output of ``sn["output_conv"]``.
+
+        """
+
+        # currently, same input feature is input to each network
+        c = self.input_conv(c)
+        e = c
+
+        # source-network forward
+        # embed the sine input, then down-sample it repeatedly; embs[0] is the
+        # un-down-sampled embedding and embs[-1] the most down-sampled one
+        x = self.sn["emb"](x)
+        embs = [x]
+        for i in range(self.num_upsamples - 1):
+            x = self.sn["downsamples"][i](x)
+            embs += [x]
+        for i in range(self.num_upsamples):
+            # excitation generation network
+            # embs is consumed from the end, i.e. most down-sampled first
+            e = self.sn["upsamples"][i](e) + embs[-i - 1]
+            e = self.sn["blocks"][i](e, d[i])
+        e_ = self.sn["output_conv"](e)
+
+        # filter-network forward
+        # the last hidden tensor of the source network (before its output
+        # conv) is down-sampled again and injected into the filter network
+        embs = [e]
+        for i in range(self.num_upsamples - 1):
+            if self.share_downsamples:
+                e = self.sn["downsamples"][i](e)
+            else:
+                e = self.fn["downsamples"][i](e)
+            embs += [e]
+        num_blocks = len(self.filter_network_params["resblock_kernel_sizes"])
+        for i in range(self.num_upsamples):
+            # resonance filtering network
+            if self.share_upsamples:
+                c = self.sn["upsamples"][i](c) + embs[-i - 1]
+            else:
+                c = self.fn["upsamples"][i](c) + embs[-i - 1]
+            # average the outputs of this stage's residual blocks
+            cs = 0.0  # initialize
+            for j in range(num_blocks):
+                cs += self.fn["blocks"][i * num_blocks + j](c)
+            c = cs / num_blocks
+        c = self.fn["output_conv"](c)
+
+        return c, e_
+
+    def reset_parameters(self):
+        """Reset parameters.
+
+        Fills the ``weight`` of every ``nn.Conv1d`` / ``nn.ConvTranspose1d``
+        sub-module with samples from a normal distribution (mean 0.0,
+        std 0.01).
+
+        This initialization follows the official implementation manner.
+        https://github.com/jik876/hifi-gan/blob/master/models.py
+
+        """
+
+        def _reset_parameters(m):
+            if isinstance(m, (nn.Conv1d, nn.ConvTranspose1d)):
+                m.weight.data.normal_(0.0, 0.01)
+                logger.debug(f"Reset parameters in {m}.")
+
+        self.apply(_reset_parameters)
+
+    def remove_weight_norm(self):
+        """Remove weight normalization module from all of the layers."""
+
+        def _remove_weight_norm(m):
+            try:
+                # the message is logged before the attempt, so it also appears
+                # for modules that turn out to have no weight norm
+                logger.debug(f"Weight norm is removed from {m}.")
+                nn.utils.remove_weight_norm(m)
+            except ValueError:  # this module didn't have weight norm
+                return
+
+        self.apply(_remove_weight_norm)
+
+    def apply_weight_norm(self):
+        """Apply weight normalization to every Conv1d / ConvTranspose1d layer."""
+
+        def _apply_weight_norm(m):
+            if isinstance(m, nn.Conv1d) or isinstance(m, nn.ConvTranspose1d):
+                nn.utils.weight_norm(m)
+                logger.debug(f"Weight norm is applied to {m}.")
+
+        self.apply(_apply_weight_norm)
--- a/server/voice_changer/MMVCv15/models/index.py
+++ b/server/voice_changer/MMVCv15/models/index.py
@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Yi-Chiao Wu (Nagoya University)
+#  MIT License (https://opensource.org/licenses/MIT)
+
+"""Indexing-related functions."""
+
+import torch
+from torch.nn import ConstantPad1d as pad1d
+
+
+def pd_indexing(x, d, dilation, batch_index, ch_index):
+    """Gather pitch-dependent past and future samples of a feature map.
+
+    For every time step the rounded offset ``d * dilation`` is subtracted
+    from (past) or added to (future) the step's own position. ``x`` is
+    zero-padded on the matching side by however far the positions reach
+    beyond its time axis, and the padded tensor is indexed at the shifted
+    positions.
+
+    Args:
+        x (Tensor): Input feature map (B, C, T).
+        d (Tensor): Input pitch-dependent dilated factors (B, 1, T).
+        dilation (Int): Dilation size.
+        batch_index (Tensor): Batch index
+        ch_index (Tensor): Channel index
+
+    Returns:
+        Tensor: Past output tensor (B, out_channels, T)
+        Tensor: Future output tensor (B, out_channels, T)
+
+    """
+    _, _, n_steps = d.size()
+    offsets = d * dilation
+
+    # past: positions are counted from the end (negative), so they stay valid
+    # after the tensor has been padded on the left
+    past_pos = torch.arange(-n_steps, 0).float().to(x.device)
+    past_pos = (past_pos - offsets).round().long()
+    pad_left = -(past_pos.min() + n_steps)
+    assert pad_left >= 0
+    padded_past = pad1d((pad_left, 0), 0)(x)
+
+    # future: positions are counted from the start, padding goes on the right
+    future_pos = torch.arange(0, n_steps).float().to(x.device)
+    future_pos = (future_pos + offsets).round().long()
+    pad_right = future_pos.max() - (n_steps - 1)
+    assert pad_right >= 0
+    padded_future = pad1d((0, pad_right), 0)(x)
+
+    past = padded_past[(batch_index, ch_index, past_pos)]
+    future = padded_future[(batch_index, ch_index, future_pos)]
+    return past, future
+
+
+def index_initial(n_batch, n_ch, tensor=True, device=None):
+    """Tensor batch and channel index initialization.
+
+    Builds two nested index structures of shape (n_batch, n_ch, 1):
+    ``batch_index[b][c] == [b]`` and ``ch_index[b][c] == [c]``.
+
+    Args:
+        n_batch (Int): Number of batch.
+        n_ch (Int): Number of channel.
+        tensor (bool): Return tensors when true, nested Python lists
+            otherwise.
+        device: Device the index tensors are moved to. With the default
+            ``None`` the tensors are moved to CUDA when
+            ``torch.cuda.is_available()`` and otherwise stay where
+            ``torch.tensor`` created them. Pass an explicit device when the
+            tensor to be indexed lives elsewhere (e.g. a CPU model on a
+            machine that has a GPU). Ignored when ``tensor`` is false.
+
+    Returns:
+        Tensor: Batch index
+        Tensor: Channel index
+
+    """
+    # Fresh inner lists everywhere, so the list form carries no aliasing.
+    batch_index = [[[b] for _ in range(n_ch)] for b in range(n_batch)]
+    ch_index = [[[c] for c in range(n_ch)] for _ in range(n_batch)]
+
+    if not tensor:
+        return batch_index, ch_index
+
+    batch_index = torch.tensor(batch_index)
+    ch_index = torch.tensor(ch_index)
+    if device is None and torch.cuda.is_available():
+        # historical default: prefer CUDA whenever it is present
+        device = "cuda"
+    if device is not None:
+        batch_index = batch_index.to(device)
+        ch_index = ch_index.to(device)
+    return batch_index, ch_index
--- a/server/voice_changer/MMVCv15/models/models.py
+++ b/server/voice_changer/MMVCv15/models/models.py
@ -0,0 +1,438 @@
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .modules import ResidualCouplingLayer, Flip, WN, ResBlock1, ResBlock2, LRELU_SLOPE
+
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from .commons import init_weights, get_padding, sequence_mask
+from .generator import SiFiGANGenerator
+from .features import SignalGenerator, dilated_factor
+
+
+class TextEncoder(nn.Module):
+    """Project a feature sequence to two masked statistics tensors.
+
+    A single 1x1 convolution maps ``hidden_channels`` to
+    ``2 * out_channels``; the result is masked by sequence length and split
+    into two halves (``m`` and ``logs``) along the channel axis.
+    """
+
+    def __init__(self, out_channels, hidden_channels, requires_grad=True):
+        """Create the projection layer.
+
+        Args:
+            out_channels (int): Channels of each returned statistics tensor.
+            hidden_channels (int): Channels of the input features.
+            requires_grad (bool): If False, all parameters are frozen.
+        """
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.proj = nn.Conv1d(hidden_channels, 2 * out_channels, 1)
+        # do not train the parameters
+        if requires_grad is False:
+            self.requires_grad_(False)
+
+    def forward(self, x, x_lengths):
+        """Encode ``x``.
+
+        Args:
+            x (Tensor): Features; cast to half precision and transposed to
+                [b, h, t] (dims 1 and -1 are swapped).
+            x_lengths (Tensor): Valid length of each sequence.
+
+        Returns:
+            tuple: ``(x, m, logs, x_mask)`` - the transposed half-precision
+                input, the two halves of the masked projection, and the mask.
+        """
+        hidden = x.half().transpose(1, -1)  # [b, h, t]
+        x_mask = sequence_mask(x_lengths, hidden.size(2)).unsqueeze(1).to(hidden.dtype)
+        stats = self.proj(hidden) * x_mask
+        m, logs = stats.split(self.out_channels, dim=1)
+        return hidden, m, logs, x_mask
+
+
+class ResidualCouplingBlock(nn.Module):
+    """Stack of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.
+
+    The forward direction applies the flows in order; the reverse direction
+    applies them in reversed order.
+    """
+
+    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0, requires_grad=True):
+        """Build the flow stack.
+
+        The first five arguments and ``gin_channels`` are stored and passed
+        on to every ``ResidualCouplingLayer`` (created with
+        ``mean_only=True``).
+
+        Args:
+            n_flows (int): Number of (coupling layer, flip) pairs.
+            requires_grad (bool): If False, all parameters are frozen.
+        """
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)
+            self.flows.extend([coupling, Flip()])
+
+        # do not train the parameters
+        if requires_grad is False:
+            self.requires_grad_(False)
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        """Run ``x`` through the flows.
+
+        Args:
+            x (Tensor): Input passed to the first flow.
+            x_mask (Tensor): Mask forwarded to every flow.
+            g: Optional conditioning forwarded to every flow.
+            reverse (bool): Apply the flows in reversed order with
+                ``reverse=True``.
+
+        Returns:
+            Tensor: Output of the last applied flow. In the forward
+                direction the second element returned by each flow is
+                discarded.
+        """
+        if reverse:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+            return x
+
+        for flow in self.flows:
+            x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+
+
+class PosteriorEncoder(nn.Module):
+    """Encode features into a sampled latent plus its statistics.
+
+    Pipeline: 1x1 conv (``pre``) -> ``WN`` encoder (``enc``) -> 1x1 conv
+    (``proj``) producing ``m`` and ``logs``, from which ``z`` is sampled as
+    ``m + noise * exp(logs)``.
+    """
+
+    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, requires_grad=True):
+        """Create the three sub-modules.
+
+        All arguments except ``requires_grad`` are stored as attributes.
+
+        Args:
+            requires_grad (bool): If False, all parameters are frozen.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
+        self.proj = nn.Conv1d(hidden_channels, 2 * out_channels, 1)
+
+        # do not train the parameters
+        if requires_grad is False:
+            self.requires_grad_(False)
+
+    def forward(self, x, x_lengths, g=None):
+        """Encode ``x`` and draw a latent sample.
+
+        Args:
+            x (Tensor): Input features; dim 2 is the masked (time) axis.
+            x_lengths (Tensor): Valid length of each sequence.
+            g: Optional conditioning forwarded to the ``WN`` encoder.
+
+        Returns:
+            tuple: ``(z, m, logs, x_mask)``.
+        """
+        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)
+        hidden = self.enc(self.pre(x) * x_mask, x_mask, g=g)
+        stats = self.proj(hidden) * x_mask
+        m, logs = stats.split(self.out_channels, dim=1)
+        noise = torch.randn_like(m)
+        z = (m + noise * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+
+
+class Generator(torch.nn.Module):
+    """Up-sampling generator: conv -> [upsample -> averaged resblocks] x N -> conv -> tanh."""
+
+    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, requires_grad=True):
+        """Build the layers.
+
+        Args:
+            initial_channel (int): Channels of the input.
+            resblock (str): ``"1"`` selects ``ResBlock1``; anything else
+                selects ``ResBlock2``.
+            resblock_kernel_sizes: Kernel size of each residual block of a stage.
+            resblock_dilation_sizes: Dilations of each residual block of a stage.
+            upsample_rates: Stride of each transposed convolution.
+            upsample_initial_channel (int): Channels after the first conv;
+                halved by every up-sampling stage.
+            upsample_kernel_sizes: Kernel size of each transposed convolution.
+            requires_grad (bool): If False, all parameters are frozen.
+        """
+        super().__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
+        block_cls = ResBlock1 if resblock == "1" else ResBlock2
+
+        self.ups = nn.ModuleList()
+        for stage, (rate, kernel) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            in_ch = upsample_initial_channel // (2**stage)
+            out_ch = upsample_initial_channel // (2 ** (stage + 1))
+            up = ConvTranspose1d(in_ch, out_ch, kernel, rate, padding=(kernel - rate) // 2)
+            self.ups.append(weight_norm(up))
+
+        # stored flat: block j of stage i sits at index i * num_kernels + j
+        self.resblocks = nn.ModuleList()
+        for stage in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (stage + 1))
+            for kernel, dilations in zip(resblock_kernel_sizes, resblock_dilation_sizes):
+                self.resblocks.append(block_cls(ch, kernel, dilations))
+
+        # `ch` is the channel width of the last stage
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+
+        if requires_grad is False:
+            self.requires_grad_(False)
+
+    def forward(self, x, g=None):
+        """Generate the output signal.
+
+        Args:
+            x (Tensor): Input to ``conv_pre``.
+            g: Unused in this method.
+
+        Returns:
+            Tensor: ``tanh`` of the final convolution output.
+        """
+        x = self.conv_pre(x)
+
+        for stage in range(self.num_upsamples):
+            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
+            first = stage * self.num_kernels
+            acc = None
+            for idx in range(first, first + self.num_kernels):
+                out = self.resblocks[idx](x)
+                if acc is None:
+                    acc = out
+                else:
+                    acc += out
+            # average over the residual blocks of this stage
+            x = acc / self.num_kernels
+
+        x = self.conv_post(F.leaky_relu(x))
+        return torch.tanh(x)
+
+    def remove_weight_norm(self):
+        """Remove weight norm from the up-sampling layers and every residual block."""
+        print("Removing weight norm...")
+        for up in self.ups:
+            remove_weight_norm(up)
+        for block in self.resblocks:
+            block.remove_weight_norm()
+
+
+class DiscriminatorP(torch.nn.Module):
+    """Discriminator that folds the signal by ``period`` and applies 2-D convs."""
+
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        """Build the convolution stack.
+
+        Args:
+            period (int): Width the time axis is folded into.
+            kernel_size (int): Kernel height of the stacked convolutions.
+            stride (int): Vertical stride of the first four convolutions.
+            use_spectral_norm (bool): ``False`` selects weight norm, anything
+                else spectral norm.
+        """
+        super().__init__()
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+
+        kernel = (kernel_size, 1)
+        pad = (get_padding(kernel_size, 1), 0)
+        widths = (1, 32, 128, 512, 1024)
+        layers = [norm_f(Conv2d(c_in, c_out, kernel, (stride, 1), padding=pad)) for c_in, c_out in zip(widths, widths[1:])]
+        # the last stacked conv keeps the width and uses stride 1
+        layers.append(norm_f(Conv2d(1024, 1024, kernel, 1, padding=pad)))
+        self.convs = nn.ModuleList(layers)
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+
+    def forward(self, x):
+        """Score ``x``.
+
+        Args:
+            x (Tensor): Input with shape (b, c, t).
+
+        Returns:
+            tuple: ``(scores, fmap)`` - the flattened final output and the
+                list of every intermediate activation (final output included).
+        """
+        fmap = []
+
+        # 1d to 2d
+        b, c, t = x.shape
+        remainder = t % self.period
+        if remainder != 0:  # pad first
+            n_pad = self.period - remainder
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+
+class DiscriminatorS(torch.nn.Module):
+    """Discriminator built from a stack of (grouped) 1-D convolutions."""
+
+    def __init__(self, use_spectral_norm=False):
+        """Build the convolution stack.
+
+        Args:
+            use_spectral_norm (bool): ``False`` selects weight norm, anything
+                else spectral norm.
+        """
+        super().__init__()
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+
+        # (in_channels, out_channels, kernel_size, stride, groups, padding)
+        layer_specs = (
+            (1, 16, 15, 1, 1, 7),
+            (16, 64, 41, 4, 4, 20),
+            (64, 256, 41, 4, 16, 20),
+            (256, 1024, 41, 4, 64, 20),
+            (1024, 1024, 41, 4, 256, 20),
+            (1024, 1024, 5, 1, 1, 2),
+        )
+        self.convs = nn.ModuleList()
+        for c_in, c_out, kernel, stride, groups, pad in layer_specs:
+            self.convs.append(norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad)))
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+
+    def forward(self, x):
+        """Score ``x``.
+
+        Args:
+            x (Tensor): Input to the first convolution.
+
+        Returns:
+            tuple: ``(scores, fmap)`` - the flattened final output and the
+                list of every intermediate activation (final output included).
+        """
+        fmap = []
+
+        for conv in self.convs:
+            x = F.leaky_relu(conv(x), LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+
+        return torch.flatten(x, 1, -1), fmap
+
+
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminator, self).__init__()
+        # periods = [2,3,5,7,11]
+        periods = [3, 5, 7, 11, 13]
+
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
+        self.discriminators = nn.ModuleList(discs)
+
+    def forward(self, y, y_hat, flag=True):
+        if flag:
+            y_d_rs = []
+            y_d_gs = []
+            fmap_rs = []
+            fmap_gs = []
+            for i, d in enumerate(self.discriminators):
+                y_d_r, fmap_r = d(y)
+                y_d_g, fmap_g = d(y_hat)
+                y_d_rs.append(y_d_r)
+                y_d_gs.append(y_d_g)
+                fmap_rs.append(fmap_r)
+                fmap_gs.append(fmap_g)
+
+            return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+        else:
+            y_d_gs = []
+            with torch.no_grad():
+                for i, d in enumerate(self.discriminators):
+                    y_d_g, _ = d(y_hat)
+                    y_d_gs.append(y_d_g)
+
+            return y_d_gs
+
+
+class SynthesizerTrn(nn.Module):
+    """
+    Synthesizer for Training
+    """
+
+    def __init__(
+        self,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        n_flow,
+        dec_out_channels=1,
+        dec_kernel_size=7,
+        n_speakers=0,
+        gin_channels=0,
+        requires_grad_pe=True,
+        requires_grad_flow=True,
+        requires_grad_text_enc=True,
+        requires_grad_dec=True,
+        requires_grad_emb_g=True,
+        sample_rate=24000,
+        hop_size=128,
+        sine_amp=0.1,
+        noise_amp=0.003,
+        signal_types=["sine"],
+        dense_factors=[0.5, 1, 4, 8],
+        upsample_scales=[8, 4, 2, 2],
+    ):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.hidden_channels = hidden_channels
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.dec_out_channels = dec_out_channels
+        self.dec_kernel_size = dec_kernel_size
+        self.n_speakers = n_speakers
+        self.gin_channels = gin_channels
+        self.requires_grad_pe = requires_grad_pe
+        self.requires_grad_flow = requires_grad_flow
+        self.requires_grad_text_enc = requires_grad_text_enc
+        self.requires_grad_dec = requires_grad_dec
+        self.requires_grad_emb_g = requires_grad_emb_g
+        self.sample_rate = sample_rate
+        self.hop_size = hop_size
+        self.sine_amp = sine_amp
+        self.noise_amp = noise_amp
+        self.signal_types = signal_types
+        self.dense_factors = dense_factors
+        self.upsample_scales = upsample_scales
+
+        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels, requires_grad=requires_grad_pe)
+        self.enc_p = TextEncoder(inter_channels, hidden_channels, requires_grad=requires_grad_text_enc)
+        self.dec = SiFiGANGenerator(in_channels=inter_channels, out_channels=dec_out_channels, channels=upsample_initial_channel, kernel_size=dec_kernel_size, upsample_scales=upsample_rates, upsample_kernel_sizes=upsample_kernel_sizes, requires_grad=requires_grad_dec)
+        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, n_flows=n_flow, gin_channels=gin_channels, requires_grad=requires_grad_flow)
+        self.signal_generator = SignalGenerator(sample_rate=sample_rate, hop_size=hop_size, noise_amp=noise_amp, signal_types=signal_types)
+
+        if n_speakers > 1:
+            self.emb_g = nn.Embedding(n_speakers, gin_channels)
+            self.emb_g.requires_grad = requires_grad_emb_g
+
+    def forward(self, x, x_lengths, y, y_lengths, f0, slice_id, sid=None, target_ids=None):
+        pass
+        # sin, d = self.make_sin_d(f0)
+
+        # x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
+        # # target sid 作成
+        # target_sids = self.make_random_target_sids(target_ids, sid)
+
+        # if self.n_speakers > 0:
+        #     g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+        #     tgt_g = self.emb_g(target_sids).unsqueeze(-1)  # [b, h, 1]
+        # else:
+        #     g = None
+
+        # # PE
+        # z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
+        # # Flow
+        # z_p = self.flow(z, y_mask, g=g)
+        # # VC
+        # tgt_z = self.flow(z_p, y_mask, g=tgt_g, reverse=True)
+        # # アライメントの作成
+        # liner_alignment = F.one_hot(torch.arange(0, x.shape[2] + 2)).cuda()
+        # liner_alignment = torch.stack([liner_alignment for _ in range(x.shape[0])], axis=0)
+        # liner_alignment = F.interpolate(liner_alignment.float(), size=(z.shape[2]), mode="linear", align_corners=True)
+        # liner_alignment = liner_alignment[:, 1:-1, :]
+        # # TextEncとPEのshape合わせ
+        # m_p = torch.matmul(m_p, liner_alignment)
+        # logs_p = torch.matmul(logs_p, liner_alignment)
+
+        # # slice
+        # z_slice = slice_segments(z, slice_id, self.segment_size)
+        # # targetのslice
+        # tgt_z_slice = slice_segments(tgt_z, slice_id, self.segment_size)
+        # # Dec
+        # o = self.dec(sin, z_slice, d, sid=g)
+        # tgt_o = self.dec(sin, tgt_z_slice, d, sid=tgt_g)
+
+        # return (o, tgt_o), slice_id, x_mask, y_mask, ((z, z_p, m_p), logs_p, m_q, logs_q)
+
+    def make_sin_d(self, f0):
+        # f0 から sin と d を作成
+        # f0 : [b, 1, t]
+        # sin : [b, 1, t]
+        # d : [4][b, 1, t]
+        prod_upsample_scales = np.cumprod(self.upsample_scales)
+        dfs_batch = []
+        for df, us in zip(self.dense_factors, prod_upsample_scales):
+            dilated_tensor = dilated_factor(f0, self.sample_rate, df)
+            # result += [torch.repeat_interleave(dilated_tensor, us, dim=1)]
+            result = [torch.stack([dilated_tensor for _ in range(us)], -1).reshape(dilated_tensor.shape[0], -1)]
+            dfs_batch.append(torch.cat(result, dim=0).unsqueeze(1))
+        in_batch = self.signal_generator(f0)
+
+        return in_batch, dfs_batch
+
+    def make_random_target_sids(self, target_ids, sid):
+        # target_sids は target_ids をランダムで埋める
+        target_sids = torch.zeros_like(sid)
+        for i in range(len(target_sids)):
+            source_id = sid[i]
+            deleted_target_ids = target_ids[target_ids != source_id]  # source_id と target_id が同じにならないよう sid と同じものを削除
+            if len(deleted_target_ids) >= 1:
+                target_sids[i] = deleted_target_ids[torch.randint(len(deleted_target_ids), (1,))]
+            else:
+                # target_id 候補が無いときは仕方ないので sid を使う
+                target_sids[i] = source_id
+        return target_sids
+
+    def voice_conversion(self, y, y_lengths, f0, sid_src, sid_tgt):
+        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
+        sin, d = self.make_sin_d(f0)
+        g_src = self.emb_g(sid_src).unsqueeze(-1)
+        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+        z, _, _, y_mask = self.enc_q(y, y_lengths, g=g_src)
+        z_p = self.flow(z, y_mask, g=g_src)
+        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
+        # print("VC", sin.device, d[0].device, g_tgt.device)
+        o_hat = self.dec(sin, z_hat * y_mask, d, sid=g_tgt)
+        return o_hat[0]
+
+    def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt):
+        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
+        g_src = self.emb_g(sid_src).unsqueeze(-1)
+        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
+        o_hat = self.dec(z * y_mask, g=g_tgt)
+        return o_hat, y_mask, (z)
+
+    def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt):
+        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
+        g_src = self.emb_g(sid_src).unsqueeze(-1)
+        # g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
+        o_hat = self.dec(z * y_mask, g=g_src)
+        return o_hat, y_mask, (z)
+
+    def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt):
+        assert self.n_speakers > 0, "n_speakers have to be larger than 0."
+        g_src = self.emb_g(sid_src).unsqueeze(-1)
+        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
+        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
+        z_p = self.flow(z, y_mask, g=g_src)
+        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
+        z_p_hat = self.flow(z_hat, y_mask, g=g_tgt)
+        z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True)
+        o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt)
+        return o_hat, y_mask, (z, z_p, z_hat)
+
+    def save_synthesizer(self, path):
+        enc_q = self.enc_q.state_dict()
+        dec = self.dec.state_dict()
+        emb_g = self.emb_g.state_dict()
+        torch.save({"enc_q": enc_q, "dec": dec, "emb_g": emb_g}, path)
+
+    def load_synthesizer(self, path):
+        dict = torch.load(path, map_location="cpu")
+        enc_q = dict["enc_q"]
+        dec = dict["dec"]
+        emb_g = dict["emb_g"]
+        self.enc_q.load_state_dict(enc_q)
+        self.dec.load_state_dict(dec)
+        self.emb_g.load_state_dict(emb_g)
--- a/server/voice_changer/MMVCv15/models/modules.py
+++ b/server/voice_changer/MMVCv15/models/modules.py
@ -0,0 +1,186 @@
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from torch.nn import Conv1d
+from torch.nn.utils import weight_norm, remove_weight_norm
+
+from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
+
+
+LRELU_SLOPE = 0.1
+
+
+class WN(torch.nn.Module):
+    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
+        super(WN, self).__init__()
+        assert kernel_size % 2 == 1
+        self.hidden_channels = hidden_channels
+        self.kernel_size = (kernel_size,)
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.p_dropout = p_dropout
+
+        self.in_layers = torch.nn.ModuleList()
+        self.res_skip_layers = torch.nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+
+        if gin_channels != 0:
+            cond_layer = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
+
+        for i in range(n_layers):
+            dilation = dilation_rate**i
+            padding = int((kernel_size * dilation - dilation) / 2)
+            in_layer = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=dilation, padding=padding)
+            in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
+            self.in_layers.append(in_layer)
+
+            # last one is not necessary
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+
+            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
+            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
+            self.res_skip_layers.append(res_skip_layer)
+
+    def forward(self, x, x_mask, g=None, **kwargs):
+        output = torch.zeros_like(x)
+        n_channels_tensor = torch.IntTensor([self.hidden_channels])
+
+        if g is not None:
+            g = self.cond_layer(g)
+
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
+            else:
+                g_l = torch.zeros_like(x_in)
+
+            acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
+            acts = self.drop(acts)
+
+            res_skip_acts = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                res_acts = res_skip_acts[:, : self.hidden_channels, :]
+                x = (x + res_acts) * x_mask
+                output = output + res_skip_acts[:, self.hidden_channels :, :]
+            else:
+                output = output + res_skip_acts
+        return output * x_mask
+
+    def remove_weight_norm(self):
+        if self.gin_channels != 0:
+            torch.nn.utils.remove_weight_norm(self.cond_layer)
+        for l in self.in_layers:
+            torch.nn.utils.remove_weight_norm(l)
+        for l in self.res_skip_layers:
+            torch.nn.utils.remove_weight_norm(l)
+
+
+class ResBlock1(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
+        super(ResBlock1, self).__init__()
+        self.convs1 = nn.ModuleList([weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1]))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], padding=get_padding(kernel_size, dilation[2])))])
+        self.convs1.apply(init_weights)
+
+        self.convs2 = nn.ModuleList([weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, padding=get_padding(kernel_size, 1)))])
+        self.convs2.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c2(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            remove_weight_norm(l)
+        for l in self.convs2:
+            remove_weight_norm(l)
+
+
+class ResBlock2(torch.nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
+        super(ResBlock2, self).__init__()
+        self.convs = nn.ModuleList([weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], padding=get_padding(kernel_size, dilation[0]))), weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], padding=get_padding(kernel_size, dilation[1])))])
+        self.convs.apply(init_weights)
+
+    def forward(self, x, x_mask=None):
+        for c in self.convs:
+            xt = F.leaky_relu(x, LRELU_SLOPE)
+            if x_mask is not None:
+                xt = xt * x_mask
+            xt = c(xt)
+            x = xt + x
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+
+    def remove_weight_norm(self):
+        for l in self.convs:
+            remove_weight_norm(l)
+
+
+class Flip(nn.Module):
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        if not reverse:
+            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
+            return x, logdet
+        else:
+            return x
+
+
+class ResidualCouplingLayer(nn.Module):
+    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
+        assert channels % 2 == 0, "channels should be divisible by 2"
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
+        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        if not self.mean_only:
+            m, logs = torch.split(stats, [self.half_channels] * 2, 1)
+        else:
+            m = stats
+            logs = torch.zeros_like(m)
+
+        if not reverse:
+            x1 = m + x1 * torch.exp(logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            logdet = torch.sum(logs, [1, 2])
+            return x, logdet
+        else:
+            x1 = (x1 - m) * torch.exp(-logs) * x_mask
+            x = torch.cat([x0, x1], 1)
+            return x
--- a/server/voice_changer/MMVCv15/models/readme.txt
+++ b/server/voice_changer/MMVCv15/models/readme.txt
@ -0,0 +1 @@
+The modules in this folder were copied from https://github.com/isletennos/MMVC_Client.git at commit 461cb231b57cbb17243110eaac8435d9cca24a26
--- a/server/voice_changer/MMVCv15/models/residual_block.py
+++ b/server/voice_changer/MMVCv15/models/residual_block.py
@ -0,0 +1,257 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Reo Yoneyama (Nagoya University)
+#  MIT License (https://opensource.org/licenses/MIT)
+
+"""Residual block modules.
+
+References:
+    - https://github.com/kan-bayashi/ParallelWaveGAN
+    - https://github.com/bigpon/QPPWG
+    - https://github.com/r9y9/wavenet_vocoder
+
+"""
+
+from logging import getLogger
+
+import torch
+import torch.nn as nn
+from .snake import Snake
+from .index import index_initial, pd_indexing
+
+# A logger for this file
+logger = getLogger(__name__)
+
+
+class Conv1d(nn.Conv1d):
+    """Conv1d module with customized initialization."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialize Conv1d module."""
+        super(Conv1d, self).__init__(*args, **kwargs)
+
+    def reset_parameters(self):
+        """Reset parameters."""
+        nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0.0)
+
+
+class Conv1d1x1(Conv1d):
+    """1x1 Conv1d with customized initialization."""
+
+    def __init__(self, in_channels, out_channels, bias=True):
+        """Initialize 1x1 Conv1d module."""
+        super(Conv1d1x1, self).__init__(in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias)
+
+
+class Conv2d(nn.Conv2d):
+    """Conv2d module with customized initialization."""
+
+    def __init__(self, *args, **kwargs):
+        """Initialize Conv2d module."""
+        super(Conv2d, self).__init__(*args, **kwargs)
+
+    def reset_parameters(self):
+        """Reset parameters."""
+        nn.init.kaiming_normal_(self.weight, mode="fan_out", nonlinearity="relu")
+        if self.bias is not None:
+            nn.init.constant_(self.bias, 0.0)
+
+
+class Conv2d1x1(Conv2d):
+    """1x1 Conv2d with customized initialization."""
+
+    def __init__(self, in_channels, out_channels, bias=True):
+        """Initialize 1x1 Conv2d module."""
+        super(Conv2d1x1, self).__init__(in_channels, out_channels, kernel_size=1, padding=0, dilation=1, bias=bias)
+
+
+class ResidualBlock(nn.Module):
+    """Residual block module in HiFiGAN."""
+
+    def __init__(
+        self,
+        kernel_size=3,
+        channels=512,
+        dilations=(1, 3, 5),
+        bias=True,
+        use_additional_convs=True,
+        nonlinear_activation="LeakyReLU",
+        nonlinear_activation_params={"negative_slope": 0.1},
+    ):
+        """Initialize ResidualBlock module.
+
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels for convolution layer.
+            dilations (List[int]): List of dilation factors.
+            use_additional_convs (bool): Whether to use additional convolution layers.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+
+        """
+        super().__init__()
+        self.use_additional_convs = use_additional_convs
+        self.convs1 = nn.ModuleList()
+        if use_additional_convs:
+            self.convs2 = nn.ModuleList()
+        assert kernel_size % 2 == 1, "Kernel size must be odd number."
+        for dilation in dilations:
+            if nonlinear_activation == "Snake":
+                nonlinear = Snake(channels, **nonlinear_activation_params)
+            else:
+                nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
+            self.convs1 += [
+                nn.Sequential(
+                    nonlinear,
+                    nn.Conv1d(
+                        channels,
+                        channels,
+                        kernel_size,
+                        dilation=dilation,
+                        bias=bias,
+                        padding=(kernel_size - 1) // 2 * dilation,
+                    ),
+                )
+            ]
+            if use_additional_convs:
+                if nonlinear_activation == "Snake":
+                    nonlinear = Snake(channels, **nonlinear_activation_params)
+                else:
+                    nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
+                self.convs2 += [
+                    nn.Sequential(
+                        nonlinear,
+                        nn.Conv1d(
+                            channels,
+                            channels,
+                            kernel_size,
+                            dilation=1,
+                            bias=bias,
+                            padding=(kernel_size - 1) // 2,
+                        ),
+                    )
+                ]
+
+    def forward(self, x):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input tensor (B, channels, T).
+
+        Returns:
+            Tensor: Output tensor (B, channels, T).
+
+        """
+        for idx in range(len(self.convs1)):
+            xt = self.convs1[idx](x)
+            if self.use_additional_convs:
+                xt = self.convs2[idx](xt)
+            x = xt + x
+        return x
+
+
+class AdaptiveResidualBlock(nn.Module):
+    """Residual block module in HiFiGAN."""
+
+    def __init__(
+        self,
+        kernel_size=3,
+        channels=512,
+        dilations=(1, 2, 4),
+        bias=True,
+        use_additional_convs=True,
+        nonlinear_activation="LeakyReLU",
+        nonlinear_activation_params={"negative_slope": 0.1},
+    ):
+        """Initialize ResidualBlock module.
+
+        Args:
+            kernel_size (int): Kernel size of dilation convolution layer.
+            channels (int): Number of channels for convolution layer.
+            bias (bool): Whether to add bias parameter in convolution layers.
+            nonlinear_activation (str): Activation function module name.
+            nonlinear_activation_params (dict): Hyperparameters for activation function.
+
+        """
+        super().__init__()
+        self.use_additional_convs = use_additional_convs
+        assert kernel_size == 3, "Currently only kernel_size = 3 is supported."
+        self.channels = channels
+        self.dilations = dilations
+        self.nonlinears = nn.ModuleList()
+        self.convsC = nn.ModuleList()
+        self.convsP = nn.ModuleList()
+        self.convsF = nn.ModuleList()
+        if use_additional_convs:
+            self.convsA = nn.ModuleList()
+        for _ in dilations:
+            if nonlinear_activation == "Snake":
+                self.nonlinears += [Snake(channels, **nonlinear_activation_params)]
+            else:
+                self.nonlinears += [getattr(nn, nonlinear_activation)(**nonlinear_activation_params)]
+            self.convsC += [
+                Conv1d1x1(
+                    channels,
+                    channels,
+                    bias=bias,
+                ),
+            ]
+            self.convsP += [
+                Conv1d1x1(
+                    channels,
+                    channels,
+                    bias=bias,
+                ),
+            ]
+            self.convsF += [
+                Conv1d1x1(
+                    channels,
+                    channels,
+                    bias=bias,
+                ),
+            ]
+            if use_additional_convs:
+                if nonlinear_activation == "Snake":
+                    nonlinear = Snake(channels, **nonlinear_activation_params)
+                else:
+                    nonlinear = getattr(nn, nonlinear_activation)(**nonlinear_activation_params)
+                self.convsA += [
+                    nn.Sequential(
+                        nonlinear,
+                        nn.Conv1d(
+                            channels,
+                            channels,
+                            kernel_size,
+                            dilation=1,
+                            bias=bias,
+                            padding=(kernel_size - 1) // 2,
+                        ),
+                    )
+                ]
+
+    def forward(self, x, d):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input tensor (B, channels, T).
+            d (Tensor): Input pitch-dependent dilated factors (B, 1, T).
+
+        Returns:
+            Tensor: Output tensor (B, channels, T).
+
+        """
+        batch_index, ch_index = index_initial(x.size(0), self.channels, tensor=False)
+        batch_index = torch.tensor(batch_index).to(x.device)
+        ch_index = torch.tensor(ch_index).to(x.device)
+
+        for i, dilation in enumerate(self.dilations):
+            xt = self.nonlinears[i](x)
+            xP, xF = pd_indexing(xt, d, dilation, batch_index, ch_index)
+            xt = self.convsC[i](xt) + self.convsP[i](xP) + self.convsF[i](xF)
+            if self.use_additional_convs:
+                xt = self.convsA[i](xt)
+            x = xt + x
+        return x
--- a/server/voice_changer/MMVCv15/models/snake.py
+++ b/server/voice_changer/MMVCv15/models/snake.py
@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Reo Yoneyama (Nagoya University)
+#  MIT License (https://opensource.org/licenses/MIT)
+
+"""Snake Activation Function Module.
+
+References:
+    - Neural Networks Fail to Learn Periodic Functions and How to Fix It
+        https://arxiv.org/pdf/2006.08195.pdf
+    - BigVGAN: A Universal Neural Vocoder with Large-Scale Training
+        https://arxiv.org/pdf/2206.04658.pdf
+
+"""
+
+import torch
+import torch.nn as nn
+
+
+class Snake(nn.Module):
+    """Snake activation function module."""
+
+    def __init__(self, channels, init=50):
+        """Initialize Snake module.
+
+        Args:
+            channels (int): Number of feature channels.
+            init (float): Initial value of the learnable parameter alpha.
+                          According to the original paper, 5 ~ 50 would be
+                          suitable for periodic data (i.e. voices).
+
+        """
+        super(Snake, self).__init__()
+        alpha = init * torch.ones(1, channels, 1)
+        self.alpha = nn.Parameter(alpha)
+
+    def forward(self, x):
+        """Calculate forward propagation.
+
+        Args:
+            x (Tensor): Input noise signal (B, channels, T).
+
+        Returns:
+            Tensor: Output tensor (B, channels, T).
+
+        """
+        return x + torch.sin(self.alpha * x) ** 2 / self.alpha
				`@ -0,0 +1 @@`
				`modules in this folder from https://github.com/isletennos/MMVC_Client.git at 461cb231b57cbb17243110eaac8435d9cca24a26`