mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-02-02 16:23:58 +03:00
WIP: integrate vcs to new gui 4
This commit is contained in:
parent
68b1c8953e
commit
4d31d2238d
@ -1,4 +1,3 @@
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
from typing import Union
|
||||
@ -42,7 +41,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(res)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] post_upload_file ex:", e)
|
||||
|
||||
def post_concat_uploaded_file(self, filename: str = Form(...), filenameChunkNum: int = Form(...)):
|
||||
try:
|
||||
@ -50,7 +49,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(res)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] post_concat_uploaded_file ex:", e)
|
||||
|
||||
def get_info(self):
|
||||
try:
|
||||
@ -58,7 +57,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] get_info ex:", e)
|
||||
|
||||
def get_performance(self):
|
||||
try:
|
||||
@ -66,7 +65,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] get_performance ex:", e)
|
||||
|
||||
def post_update_settings(self, key: str = Form(...), val: Union[int, str, float] = Form(...)):
|
||||
try:
|
||||
@ -75,8 +74,10 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print(sys.exc_info())
|
||||
print("[Voice Changer] post_update_settings ex:", e)
|
||||
import traceback
|
||||
|
||||
traceback.print_exc()
|
||||
|
||||
def post_load_model(
|
||||
self,
|
||||
@ -95,7 +96,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] post_load_model ex:", e)
|
||||
|
||||
def post_model_type(self, modelType: ModelType = Form(...)):
|
||||
try:
|
||||
@ -103,7 +104,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] post_model_type ex:", e)
|
||||
|
||||
def get_model_type(self):
|
||||
try:
|
||||
@ -111,7 +112,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] get_model_type ex:", e)
|
||||
|
||||
def get_onnx(self):
|
||||
try:
|
||||
@ -119,7 +120,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] get_onnx ex:", e)
|
||||
|
||||
def post_merge_models(self, request: str = Form(...)):
|
||||
try:
|
||||
@ -128,7 +129,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] post_merge_models ex:", e)
|
||||
|
||||
def post_update_model_default(self):
|
||||
try:
|
||||
@ -136,7 +137,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] post_update_model_default ex:", e)
|
||||
|
||||
def post_update_model_info(self, newData: str = Form(...)):
|
||||
try:
|
||||
@ -144,7 +145,7 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] post_update_model_info ex:", e)
|
||||
|
||||
def post_upload_model_assets(self, params: str = Form(...)):
|
||||
try:
|
||||
@ -152,4 +153,4 @@ class MMVC_Rest_Fileuploader:
|
||||
json_compatible_item_data = jsonable_encoder(info)
|
||||
return JSONResponse(content=json_compatible_item_data)
|
||||
except Exception as e:
|
||||
print("[Voice Changer] ex:", e)
|
||||
print("[Voice Changer] post_update_model_info ex:", e)
|
||||
|
@ -24,9 +24,10 @@ import onnxruntime
|
||||
|
||||
import pyworld as pw
|
||||
|
||||
from models import SynthesizerTrn # type:ignore
|
||||
import cluster # type:ignore
|
||||
import utils
|
||||
# from models import SynthesizerTrn # type:ignore
|
||||
from .models.models import SynthesizerTrn
|
||||
from .models.utils import interpolate_f0, get_hparams_from_file, load_checkpoint, repeat_expand_2d, get_hubert_content
|
||||
from .models.cluster import get_cluster_model, get_cluster_center_result
|
||||
from fairseq import checkpoint_utils
|
||||
import librosa
|
||||
|
||||
@ -91,13 +92,13 @@ class SoVitsSvc40:
|
||||
|
||||
def initialize(self):
|
||||
print("[Voice Changer] [so-vits-svc40] Initializing... ")
|
||||
self.hps = utils.get_hparams_from_file(self.slotInfo.configFile)
|
||||
self.hps = get_hparams_from_file(self.slotInfo.configFile)
|
||||
self.settings.speakers = self.hps.spk
|
||||
|
||||
# cluster
|
||||
try:
|
||||
if self.slotInfo.clusterFile is not None:
|
||||
self.cluster_model = cluster.get_cluster_model(self.slotInfo.clusterFile)
|
||||
self.cluster_model = get_cluster_model(self.slotInfo.clusterFile)
|
||||
else:
|
||||
self.cluster_model = None
|
||||
except Exception as e:
|
||||
@ -121,7 +122,7 @@ class SoVitsSvc40:
|
||||
)
|
||||
net_g.eval()
|
||||
self.net_g = net_g
|
||||
utils.load_checkpoint(self.slotInfo.modelFile, self.net_g, None)
|
||||
load_checkpoint(self.slotInfo.modelFile, self.net_g, None)
|
||||
|
||||
def getOnnxExecutionProvider(self):
|
||||
availableProviders = onnxruntime.get_available_providers()
|
||||
@ -192,7 +193,7 @@ class SoVitsSvc40:
|
||||
if wav_44k.shape[0] % self.hps.data.hop_length != 0:
|
||||
print(f" !!! !!! !!! wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}")
|
||||
|
||||
f0, uv = utils.interpolate_f0(f0)
|
||||
f0, uv = interpolate_f0(f0)
|
||||
f0 = torch.FloatTensor(f0)
|
||||
uv = torch.FloatTensor(uv)
|
||||
f0 = f0 * 2 ** (tran / 12)
|
||||
@ -224,12 +225,12 @@ class SoVitsSvc40:
|
||||
else:
|
||||
self.hubert_model = self.hubert_model.to(dev)
|
||||
wav16k_tensor = wav16k_tensor.to(dev)
|
||||
c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k_tensor)
|
||||
c = get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k_tensor)
|
||||
|
||||
uv = uv.to(dev)
|
||||
f0 = f0.to(dev)
|
||||
|
||||
c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
|
||||
c = repeat_expand_2d(c.squeeze(0), f0.shape[1])
|
||||
|
||||
if self.settings.clusterInferRatio != 0 and hasattr(self, "cluster_model") and self.cluster_model is not None:
|
||||
speaker = [key for key, value in self.settings.speakers.items() if value == self.settings.dstId]
|
||||
@ -237,7 +238,7 @@ class SoVitsSvc40:
|
||||
pass
|
||||
# print("not only one speaker found.", speaker)
|
||||
else:
|
||||
cluster_c = cluster.get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker[0]).T
|
||||
cluster_c = get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker[0]).T
|
||||
cluster_c = torch.FloatTensor(cluster_c).to(dev)
|
||||
c = c.to(dev)
|
||||
c = self.settings.clusterInferRatio * cluster_c + (1 - self.settings.clusterInferRatio) * c
|
||||
|
29
server/voice_changer/SoVitsSvc40/models/cluster/__init__.py
Normal file
29
server/voice_changer/SoVitsSvc40/models/cluster/__init__.py
Normal file
@ -0,0 +1,29 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
def get_cluster_model(ckpt_path):
    """Rebuild per-speaker k-means estimators from a checkpoint file.

    The file at ``ckpt_path`` is read with ``torch.load`` and must map each
    speaker key to a mapping holding ``n_features_in_``, ``_n_threads`` and
    ``cluster_centers_``.  One ``KMeans`` object is created per speaker and
    those three values are written straight into its instance dict, so the
    estimator can predict without being fitted again.

    Returns a dict mapping speaker key -> restored ``KMeans`` instance.
    """
    restored = {}
    for speaker, state in torch.load(ckpt_path).items():
        # NOTE(review): the first positional argument of KMeans is the number
        # of clusters; n_features_in_ is passed here as in the original code.
        # It looks harmless because cluster_centers_ is overwritten below -
        # confirm before changing.
        estimator = KMeans(state["n_features_in_"])
        vars(estimator).update(
            n_features_in_=state["n_features_in_"],
            _n_threads=state["_n_threads"],
            cluster_centers_=state["cluster_centers_"],
        )
        restored[speaker] = estimator
    return restored
|
||||
|
||||
def get_cluster_result(model, x, speaker):
    """Return the cluster class predicted for ``x`` by one speaker's model.

    model: mapping of speaker key -> estimator exposing ``predict``
    x: np.array [t, 256]
    speaker: key selecting the estimator inside ``model``
    """
    estimator = model[speaker]
    return estimator.predict(x)
|
||||
|
||||
def get_cluster_center_result(model, x, speaker):
    """Return the cluster centers that ``x`` is assigned to.

    x: np.array [t, 256]

    The speaker's estimator predicts a label for ``x`` and that prediction
    is used to index the estimator's ``cluster_centers_``.
    """
    estimator = model[speaker]
    labels = estimator.predict(x)
    return estimator.cluster_centers_[labels]
|
||||
|
||||
def get_center(model, x, speaker):
    """Look up entry ``x`` of ``cluster_centers_`` in ``speaker``'s model."""
    centers = model[speaker].cluster_centers_
    return centers[x]
|
@ -0,0 +1,81 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import torch
|
||||
import logging
|
||||
import argparse
|
||||
import numpy as np
|
||||
from sklearn.cluster import KMeans, MiniBatchKMeans
|
||||
import tqdm
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = logging.getLogger(__name__)
|
||||
import time
|
||||
|
||||
|
||||
def train_cluster(in_dir, n_clusters, use_minibatch=True, verbose=False):
    """Fit a k-means model on every ``*.soft.pt`` feature file in ``in_dir``.

    Args:
        in_dir: directory object searched with ``in_dir.glob("*.soft.pt")``
            (a ``pathlib.Path`` when called from the script entry point).
        n_clusters: number of clusters to fit.
        use_minibatch: fit ``MiniBatchKMeans`` (batch_size=4096, max_iter=80)
            when true, plain ``KMeans`` otherwise.
        verbose: forwarded to the scikit-learn estimator.

    Returns:
        dict with the fitted estimator's ``n_features_in_``, ``_n_threads``
        and ``cluster_centers_``.

    Raises:
        ValueError: if ``in_dir`` contains no ``*.soft.pt`` file.
    """
    logger.info(f"Loading features from {in_dir}")
    features = []
    for path in tqdm.tqdm(in_dir.glob("*.soft.pt")):
        # Each file is squeezed on dim 0 and transposed before stacking.
        # assumes the stored tensor is (1, dim, frames) - TODO confirm
        features.append(torch.load(path).squeeze(0).numpy().T)
        # print(features[-1].shape)

    # The original printed a counter that was never incremented (always 0);
    # report the real number of loaded feature files instead.
    nums = len(features)
    if nums == 0:
        # np.concatenate would raise an opaque ValueError on an empty list;
        # keep the exception type but say what is actually wrong.
        raise ValueError(f"no *.soft.pt feature files found in {in_dir}")

    features = np.concatenate(features, axis=0)
    print(nums, features.nbytes / 1024**2, "MB , shape:", features.shape, features.dtype)
    features = features.astype(np.float32)
    logger.info(f"Clustering features of shape: {features.shape}")
    t = time.time()
    if use_minibatch:
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, verbose=verbose, batch_size=4096, max_iter=80).fit(features)
    else:
        kmeans = KMeans(n_clusters=n_clusters, verbose=verbose).fit(features)
    print(time.time() - t, "s")

    # Export only the attributes needed to rebuild the estimator later.
    x = {
        "n_features_in_": kmeans.n_features_in_,
        "_n_threads": kmeans._n_threads,
        "cluster_centers_": kmeans.cluster_centers_,
    }
    print("end")

    return x
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: train one k-means model per speaker sub-directory
    # of --dataset and store all of them in a single checkpoint file under
    # --output.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", type=Path, default="./dataset/44k", help="path of training data directory")
    parser.add_argument("--output", type=Path, default="logs/44k", help="path of model output directory")
    args = parser.parse_args()

    dataset = args.dataset
    checkpoint_dir = args.output
    n_clusters = 10000

    ckpt = {}
    for spk in os.listdir(dataset):
        in_dir = dataset / spk
        # Only directories are treated as speakers; plain files are ignored.
        if not os.path.isdir(in_dir):
            continue
        print(f"train kmeans for {spk}...")
        x = train_cluster(in_dir, n_clusters, verbose=False)
        ckpt[spk] = x

    checkpoint_path = checkpoint_dir / f"kmeans_{n_clusters}.pt"
    # Make sure the output directory exists before saving.
    checkpoint_path.parent.mkdir(exist_ok=True, parents=True)
    torch.save(ckpt, checkpoint_path)
|
||||
|
||||
# import cluster
|
||||
# for spk in tqdm.tqdm(os.listdir("dataset")):
|
||||
# if os.path.isdir(f"dataset/{spk}"):
|
||||
# print(f"start kmeans inference for {spk}...")
|
||||
# for feature_path in tqdm.tqdm(glob(f"dataset/{spk}/*.discrete.npy", recursive=True)):
|
||||
# mel_path = feature_path.replace(".discrete.npy",".mel.npy")
|
||||
# mel_spectrogram = np.load(mel_path)
|
||||
# feature_len = mel_spectrogram.shape[-1]
|
||||
# c = np.load(feature_path)
|
||||
# c = utils.tools.repeat_expand_2d(torch.FloatTensor(c), feature_len).numpy()
|
||||
# feature = c.T
|
||||
# feature_class = cluster.get_cluster_result(feature, spk)
|
||||
# np.save(feature_path.replace(".discrete.npy", ".discrete_class.npy"), feature_class)
|
351
server/voice_changer/SoVitsSvc40/models/models.py
Normal file
351
server/voice_changer/SoVitsSvc40/models/models.py
Normal file
@ -0,0 +1,351 @@
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
# import modules.attentions as attentions
|
||||
# import modules.commons as commons
|
||||
# import modules.modules as modules
|
||||
|
||||
from torch.nn import Conv1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, spectral_norm
|
||||
|
||||
# import utils
|
||||
|
||||
# from modules.commons import init_weights, get_padding
|
||||
from .modules.commons import get_padding
|
||||
|
||||
# from vdecoder.hifigan.models import Generator
|
||||
from .vdecoder.hifigan.models import Generator
|
||||
from .utils import f0_to_coarse, normalize_f0
|
||||
from .modules.modules import ResidualCouplingLayer, Flip, WN, LRELU_SLOPE
|
||||
from .modules.commons import sequence_mask, rand_slice_segments_with_pitch
|
||||
from .modules.attentions import Encoder as attentionsEncoder
|
||||
from .modules.attentions import FFT
|
||||
|
||||
|
||||
class ResidualCouplingBlock(nn.Module):
    """Chain of ``n_flows`` (ResidualCouplingLayer, Flip) pairs.

    In the forward direction the flows run in registration order and the
    second value each flow returns is discarded.  With ``reverse=True`` the
    flows run in the opposite order and each flow's return value is used
    directly.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_flows=4, gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)
            # Every coupling layer is immediately followed by a Flip.
            self.flows.extend([coupling, Flip()])

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run ``x`` through the flow chain in the requested direction."""
        if reverse:
            for layer in reversed(self.flows):
                x = layer(x, x_mask, g=g, reverse=reverse)
            return x

        for layer in self.flows:
            x, _ = layer(x, x_mask, g=g, reverse=reverse)
        return x
|
||||
|
||||
|
||||
class Encoder(nn.Module):
    """Encoder built from a 1x1 conv, a ``WN`` stack and a 1x1 projection.

    ``forward`` returns a sample ``z`` drawn around the projected mean, the
    mean ``m``, the log-scale ``logs`` and the mask built from ``x_lengths``.
    """

    def __init__(self, in_channels, out_channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        # The projection emits mean and log-scale stacked on the channel axis.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Encode ``x`` and sample from the resulting distribution."""
        x_mask = sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)
        hidden = self.enc(self.pre(x) * x_mask, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)
        noise = torch.randn_like(m) * torch.exp(logs)
        z = (m + noise) * x_mask
        return z, m, logs, x_mask
|
||||
|
||||
|
||||
class TextEncoder(nn.Module):
    """Attention encoder conditioned on a coarse-f0 embedding.

    ``forward`` adds the embedded ``f0`` indices to ``x``, runs the attention
    encoder, projects to mean / log-scale and returns a sample whose noise
    term is multiplied by ``noice_scale``.
    """

    def __init__(self, out_channels, hidden_channels, kernel_size, n_layers, gin_channels=0, filter_channels=None, n_heads=None, p_dropout=None):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
        # 256 embedding rows: one per value accepted as an f0 index.
        self.f0_emb = nn.Embedding(256, hidden_channels)
        self.enc_ = attentionsEncoder(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)

    def forward(self, x, x_mask, f0=None, noice_scale=1):
        """Return ``(z, m, logs, x_mask)`` for the f0-conditioned input."""
        pitch_emb = self.f0_emb(f0).transpose(1, 2)
        hidden = self.enc_((x + pitch_emb) * x_mask, x_mask)
        stats = self.proj(hidden) * x_mask
        m, logs = stats.split(self.out_channels, dim=1)
        noise = torch.randn_like(m) * torch.exp(logs) * noice_scale
        z = (m + noise) * x_mask
        return z, m, logs, x_mask
|
||||
|
||||
|
||||
class DiscriminatorP(torch.nn.Module):
    """Period discriminator.

    The input of shape ``(b, c, t)`` is right-padded (reflect) to a multiple
    of ``period``, viewed as ``(b, c, t // period, period)`` and passed
    through a stack of normalised ``Conv2d`` layers.  ``forward`` returns the
    flattened final output together with every intermediate feature map.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        if use_spectral_norm == False:  # NOQA
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, stride); only the last layer uses a
        # plain stride of 1 instead of (stride, 1).
        layer_specs = [
            (1, 32, (stride, 1)),
            (32, 128, (stride, 1)),
            (128, 512, (stride, 1)),
            (512, 1024, (stride, 1)),
            (1024, 1024, 1),
        ]
        self.convs = nn.ModuleList(
            [
                norm_f(Conv2d(c_in, c_out, (kernel_size, 1), conv_stride, padding=(get_padding(kernel_size, 1), 0)))
                for c_in, c_out, conv_stride in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_output, feature_maps)`` for ``x``."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        remainder = t % self.period
        if remainder != 0:  # pad first
            n_pad = self.period - remainder
            x = F.pad(x, (0, n_pad), "reflect")
            t += n_pad
        x = x.view(b, c, t // self.period, self.period)

        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
||||
|
||||
|
||||
class DiscriminatorS(torch.nn.Module):
    """Discriminator made of a stack of normalised (grouped) ``Conv1d`` layers.

    ``forward`` returns the flattened final output together with the feature
    map produced after every layer.
    """

    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        if use_spectral_norm == False:  # NOQA
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = [
            (1, 16, 15, 1, 1, 7),
            (16, 64, 41, 4, 4, 20),
            (64, 256, 41, 4, 16, 20),
            (256, 1024, 41, 4, 64, 20),
            (1024, 1024, 41, 4, 256, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            [
                norm_f(Conv1d(c_in, c_out, kernel, step, groups=groups, padding=pad))
                for c_in, c_out, kernel, step, groups, pad in layer_specs
            ]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_output, feature_maps)`` for ``x``."""
        fmap = []

        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)

        return torch.flatten(x, 1, -1), fmap
|
||||
|
||||
|
||||
class MultiPeriodDiscriminator(torch.nn.Module):
    """One ``DiscriminatorS`` followed by ``DiscriminatorP`` for periods 2, 3, 5, 7, 11.

    ``forward`` runs every discriminator on both inputs and returns four
    parallel lists: outputs for ``y``, outputs for ``y_hat``, feature maps
    for ``y`` and feature maps for ``y_hat``.
    """

    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        members = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        members.extend(DiscriminatorP(period, use_spectral_norm=use_spectral_norm) for period in periods)
        self.discriminators = nn.ModuleList(members)

    def forward(self, y, y_hat):
        """Evaluate every discriminator on ``y`` and then on ``y_hat``."""
        y_d_rs, y_d_gs = [], []
        fmap_rs, fmap_gs = [], []
        for disc in self.discriminators:
            score_real, maps_real = disc(y)
            score_fake, maps_fake = disc(y_hat)
            y_d_rs.append(score_real)
            fmap_rs.append(maps_real)
            y_d_gs.append(score_fake)
            fmap_gs.append(maps_fake)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
class SpeakerEncoder(torch.nn.Module):
    """LSTM-based encoder producing one normalised embedding per input.

    ``forward`` feeds the input through the LSTM, projects the last layer's
    final hidden state, applies ReLU and divides by the row-wise norm.
    """

    def __init__(self, mel_n_channels=80, model_num_layers=3, model_hidden_size=256, model_embedding_size=256):
        super(SpeakerEncoder, self).__init__()
        self.lstm = nn.LSTM(mel_n_channels, model_hidden_size, model_num_layers, batch_first=True)
        self.linear = nn.Linear(model_hidden_size, model_embedding_size)
        self.relu = nn.ReLU()

    def forward(self, mels):
        """Return the normalised embedding for every sequence in ``mels``."""
        self.lstm.flatten_parameters()
        _, (hidden, _) = self.lstm(mels)
        activated = self.relu(self.linear(hidden[-1]))
        scale = torch.norm(activated, dim=1, keepdim=True)
        return activated / scale

    def compute_partial_slices(self, total_frames, partial_frames, partial_hop):
        """Return index tensors of length ``partial_frames`` spaced ``partial_hop`` apart."""
        starts = range(0, total_frames - partial_frames, partial_hop)
        return [torch.arange(start, start + partial_frames) for start in starts]

    def embed_utterance(self, mel, partial_frames=128, partial_hop=64):
        """Embed ``mel`` without tracking gradients.

        Inputs longer than ``partial_frames`` along dim 1 are cut into
        windows (plus the trailing window) whose embeddings are averaged;
        shorter inputs are embedded directly from their trailing window.
        """
        tail = mel[:, -partial_frames:]
        total = mel.size(1)

        if total <= partial_frames:
            with torch.no_grad():
                return self(tail)

        windows = [mel[:, idx] for idx in self.compute_partial_slices(total, partial_frames, partial_hop)]
        windows.append(tail)
        batch = torch.stack(tuple(windows), 0).squeeze(1)

        with torch.no_grad():
            partial_embeds = self(batch)
            embed = torch.mean(partial_embeds, axis=0).unsqueeze(0)
            # embed = embed / torch.linalg.norm(embed, 2)
        return embed
|
||||
|
||||
|
||||
class F0Decoder(nn.Module):
    """Decoder that maps hidden features plus a normalised f0 track to ``out_channels``.

    ``forward`` detaches ``x`` from the autograd graph, optionally adds a
    projected speaker embedding, adds the projected ``norm_f0``, and then
    applies ``prenet`` -> ``FFT`` decoder -> ``proj``, masking with
    ``x_mask`` after each stage.
    """

    def __init__(self, out_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, spk_channels=0):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.spk_channels = spk_channels

        self.prenet = nn.Conv1d(hidden_channels, hidden_channels, 3, padding=1)
        self.decoder = FFT(hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout)
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        self.f0_prenet = nn.Conv1d(1, hidden_channels, 3, padding=1)
        self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)

    def forward(self, x, norm_f0, x_mask, spk_emb=None):
        """Return the masked projection for ``x`` conditioned on ``norm_f0``.

        Args:
            x: hidden features; gradients are not propagated back into it.
            norm_f0: tensor passed through ``f0_prenet`` and added to ``x``.
            x_mask: mask multiplied in after every stage.
            spk_emb: optional tensor passed through ``cond`` and added to ``x``.
        """
        x = torch.detach(x)
        if spk_emb is not None:
            x = x + self.cond(spk_emb)
        # Out-of-place add on purpose: the detached tensor shares storage with
        # the caller's tensor, so the original ``x += ...`` overwrote the
        # caller's input whenever ``spk_emb`` was None.
        x = x + self.f0_prenet(norm_f0)
        x = self.prenet(x) * x_mask
        x = self.decoder(x * x_mask, x_mask)
        x = self.proj(x) * x_mask
        return x
|
||||
|
||||
|
||||
class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training

    Wires together the speaker / uv embeddings, the content pre-net, the
    prior encoder (``enc_p``), the posterior encoder (``enc_q``), the flow,
    the f0 decoder and the waveform generator (``dec``).  ``forward`` is the
    training path, ``infer`` the synthesis path.
    """

    def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels, ssl_dim, n_speakers, sampling_rate=44100, **kwargs):
        super().__init__()
        # Plain hyper-parameter bookkeeping.
        self.spec_channels = spec_channels
        self.segment_size = segment_size
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.gin_channels = gin_channels
        self.ssl_dim = ssl_dim

        # Sub-modules are created in a fixed order; keep it, because it
        # determines parameter registration and initialisation order.
        self.emb_g = nn.Embedding(n_speakers, gin_channels)
        self.pre = nn.Conv1d(ssl_dim, hidden_channels, kernel_size=5, padding=2)
        self.enc_p = TextEncoder(inter_channels, hidden_channels, filter_channels=filter_channels, n_heads=n_heads, n_layers=n_layers, kernel_size=kernel_size, p_dropout=p_dropout)

        # Settings handed to the waveform generator.
        hps = dict(
            sampling_rate=sampling_rate,
            inter_channels=inter_channels,
            resblock=resblock,
            resblock_kernel_sizes=resblock_kernel_sizes,
            resblock_dilation_sizes=resblock_dilation_sizes,
            upsample_rates=upsample_rates,
            upsample_initial_channel=upsample_initial_channel,
            upsample_kernel_sizes=upsample_kernel_sizes,
            gin_channels=gin_channels,
        )
        self.dec = Generator(h=hps)
        self.enc_q = Encoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
        self.f0_decoder = F0Decoder(1, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, spk_channels=gin_channels)
        self.emb_uv = nn.Embedding(2, hidden_channels)

    def forward(self, c, f0, uv, spec, g=None, c_lengths=None, spec_lengths=None):
        """Training pass.

        Returns ``(o, ids_slice, spec_mask, (z, z_p, m_p, logs_p, m_q, logs_q),
        pred_lf0, norm_lf0, lf0)``.
        """
        g = self.emb_g(g).transpose(1, 2)

        # ssl prenet
        x_mask = sequence_mask(c_lengths, c.size(2)).unsqueeze(1).to(c.dtype)
        content = self.pre(c) * x_mask
        x = content + self.emb_uv(uv.long()).transpose(1, 2)

        # f0 predict
        lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
        norm_lf0 = normalize_f0(lf0, x_mask, uv)
        pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)

        # encoder
        coarse_f0 = f0_to_coarse(f0)
        z_ptemp, m_p, logs_p, _ = self.enc_p(x, x_mask, f0=coarse_f0)
        z, m_q, logs_q, spec_mask = self.enc_q(spec, spec_lengths, g=g)

        # flow
        z_p = self.flow(z, spec_mask, g=g)
        z_slice, pitch_slice, ids_slice = rand_slice_segments_with_pitch(z, f0, spec_lengths, self.segment_size)

        # nsf decoder
        o = self.dec(z_slice, g=g, f0=pitch_slice)

        latents = (z, z_p, m_p, logs_p, m_q, logs_q)
        return o, ids_slice, spec_mask, latents, pred_lf0, norm_lf0, lf0

    def infer(self, c, f0, uv, g=None, noice_scale=0.35, predict_f0=False):
        """Synthesis pass; returns the generator output ``o``.

        When ``predict_f0`` is true, ``f0`` is replaced by the value
        reconstructed from the f0 decoder's prediction.
        """
        # Every batch item is treated as using the full length of ``c``.
        c_lengths = (torch.ones(c.size(0)) * c.size(-1)).to(c.device)
        g = self.emb_g(g).transpose(1, 2)
        x_mask = sequence_mask(c_lengths, c.size(2)).unsqueeze(1).to(c.dtype)
        content = self.pre(c) * x_mask
        x = content + self.emb_uv(uv.long()).transpose(1, 2)

        if predict_f0:
            lf0 = 2595.0 * torch.log10(1.0 + f0.unsqueeze(1) / 700.0) / 500
            norm_lf0 = normalize_f0(lf0, x_mask, uv, random_scale=False)
            pred_lf0 = self.f0_decoder(x, norm_lf0, x_mask, spk_emb=g)
            # Invert the log-scale mapping applied to f0 above.
            f0 = (700 * (torch.pow(10, pred_lf0 * 500 / 2595) - 1)).squeeze(1)

        coarse_f0 = f0_to_coarse(f0)
        z_p, m_p, logs_p, c_mask = self.enc_p(x, x_mask, f0=coarse_f0, noice_scale=noice_scale)
        z = self.flow(z_p, c_mask, g=g, reverse=True)
        return self.dec(z * c_mask, g=g, f0=f0)
|
342
server/voice_changer/SoVitsSvc40/models/modules/attentions.py
Normal file
342
server/voice_changer/SoVitsSvc40/models/modules/attentions.py
Normal file
@ -0,0 +1,342 @@
|
||||
import math
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
# import modules.commons as commons
|
||||
# import modules.modules as modules
|
||||
from .modules import LayerNorm
|
||||
from .commons import subsequent_mask, convert_pad_shape
|
||||
|
||||
|
||||
class FFT(nn.Module):
    """Stack of self-attention + feed-forward layers using a subsequent mask.

    Each of the ``n_layers`` layers applies self-attention, dropout, residual
    add and layer norm, then a causal FFN, dropout, residual add and layer
    norm.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0.0, proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        for _ in range(self.n_layers):
            # Modules are created layer by layer, in the order they are used.
            self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
            self.norm_layers_1.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """
        x: input features
        x_mask: mask multiplied into the input and the output
        """
        causal_mask = subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        x = x * x_mask
        for idx in range(self.n_layers):
            attended = self.drop(self.self_attn_layers[idx](x, x, causal_mask))
            x = self.norm_layers_0[idx](x + attended)

            transformed = self.drop(self.ffn_layers[idx](x, x_mask))
            x = self.norm_layers_1[idx](x + transformed)
        return x * x_mask
|
||||
|
||||
|
||||
class Encoder(nn.Module):
    """Stack of windowed self-attention + feed-forward layers.

    Each of the ``n_layers`` layers applies attention, dropout, residual add
    and layer norm, then an FFN, dropout, residual add and layer norm.  The
    attention mask is the outer product of ``x_mask`` with itself.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.0, window_size=4, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            # Modules are created layer by layer, in the order they are used.
            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Encode ``x``; the result is multiplied by ``x_mask`` before return."""
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for idx in range(self.n_layers):
            attended = self.drop(self.attn_layers[idx](x, x, attn_mask))
            x = self.norm_layers_1[idx](x + attended)

            transformed = self.drop(self.ffn_layers[idx](x, x_mask))
            x = self.norm_layers_2[idx](x + transformed)
        return x * x_mask
|
||||
|
||||
|
||||
class Decoder(nn.Module):
    """Stack of causal self-attention, encoder-decoder attention and causal FFN layers."""

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0.0, proximal_bias=False, proximal_init=True, **kwargs):
        """Build ``n_layers`` decoder layers; extra ``kwargs`` are accepted and ignored."""
        super().__init__()
        # Keep the hyper-parameters around for introspection.
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        # Per layer: self-attn -> norm -> enc/dec attn -> norm -> causal FFN -> norm.
        for _ in range(self.n_layers):
            self_attention = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)
            self.self_attn_layers.append(self_attention)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            cross_attention = MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)
            self.encdec_attn_layers.append(cross_attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        # Lower-triangular mask so each position only attends to itself and the past.
        causal_mask = subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        cross_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for layer_idx in range(self.n_layers):
            self_out = self.drop(self.self_attn_layers[layer_idx](x, x, causal_mask))
            x = self.norm_layers_0[layer_idx](x + self_out)

            cross_out = self.drop(self.encdec_attn_layers[layer_idx](x, h, cross_mask))
            x = self.norm_layers_1[layer_idx](x + cross_out)

            ffn_out = self.drop(self.ffn_layers[layer_idx](x, x_mask))
            x = self.norm_layers_2[layer_idx](x + ffn_out)
        return x * x_mask
|
||||
|
||||
|
||||
class MultiHeadAttention(nn.Module):
    """Multi-head attention whose projections are kernel-size-1 ``Conv1d`` layers.

    Optional behaviours, all selected through constructor arguments:
    learned relative-position embeddings (``window_size``), a proximal bias
    that favours nearby positions (``proximal_bias``) and a banded local
    attention mask (``block_length``). These three are asserted to be used
    with self-attention only (equal query and key lengths).
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0.0, window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        # Channels are split evenly across heads.
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Attention weights of the most recent forward pass.
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One embedding table shared by all heads, or one table per head.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            # 2 * window_size + 1 rows: one per relative offset in [-window_size, window_size].
            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start with the key projection equal to the query projection.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys and values) and project the result.

        The attention weights are also stored on ``self.attn``.
        """
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; returns ``(output [b, d, t_t], weights [b, n_h, t_t, t_s])``."""
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))  # type: ignore
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            # Add the relative-position logits to the content logits.
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            # Masked positions get a large negative score instead of -inf.
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                # Keep only a band of +/- block_length around the diagonal.
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            # Add the contribution of the relative-position value embeddings.
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Pad and/or slice the embedding table to ``2 * length - 1`` rows along dim 1."""
        max_relative_position = 2 * self.window_size + 1  # NOQA
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(relative_embeddings, convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1 :]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        # -log(1 + |i - j|): zero on the diagonal, increasingly negative with distance.
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
|
||||
|
||||
|
||||
class FFN(nn.Module):
    """Two-layer convolutional feed-forward block with masking and optional causal padding."""

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0.0, activation=None, causal=False):
        """Create the two convolutions and pick the padding strategy from ``causal``."""
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Causal padding pads only on the left; "same" padding pads both sides.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking before each conv and on output."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            # Sigmoid-based GELU approximation.
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden * x_mask))
        return projected * x_mask

    def _causal_padding(self, x):
        """Left-pad the last dimension by ``kernel_size - 1`` (no-op for kernel size 1)."""
        if self.kernel_size == 1:
            return x
        left, right = self.kernel_size - 1, 0
        return F.pad(x, convert_pad_shape([[0, 0], [0, 0], [left, right]]))

    def _same_padding(self, x):
        """Pad both sides of the last dimension so the conv keeps the length (no-op for kernel size 1)."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, convert_pad_shape([[0, 0], [0, 0], [left, right]]))
|
179
server/voice_changer/SoVitsSvc40/models/modules/commons.py
Normal file
179
server/voice_changer/SoVitsSvc40/models/modules/commons.py
Normal file
@ -0,0 +1,179 @@
|
||||
import math
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
def slice_pitch_segments(x, ids_str, segment_size=4):
    """Cut one ``segment_size``-long window per batch row out of a 2-D tensor.

    Row ``i`` of the result is ``x[i, ids_str[i] : ids_str[i] + segment_size]``.
    """
    segments = torch.zeros_like(x[:, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        segments[row] = x[row, start : start + segment_size]
    return segments
|
||||
|
||||
|
||||
def rand_slice_segments_with_pitch(x, pitch, x_lengths=None, segment_size=4):
    """Randomly crop aligned segments from ``x`` and ``pitch``.

    Returns ``(x_segments, pitch_segments, start_indices)``; the same random
    start index is used for both inputs within a batch row.
    """
    batch, _, total_len = x.size()
    lengths = total_len if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    # Uniform start offsets in [0, max_start), truncated to integers.
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(dtype=torch.long)
    x_segments = slice_segments(x, starts, segment_size)
    pitch_segments = slice_pitch_segments(pitch, starts, segment_size)
    return x_segments, pitch_segments, starts
|
||||
|
||||
|
||||
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise the weight of any module whose class name contains "Conv" from N(mean, std)."""
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
    """Return the per-side padding ``int((kernel_size * dilation - dilation) / 2)`` for a dilated conv."""
    dilated_span = kernel_size * dilation - dilation
    return int(dilated_span / 2)
|
||||
|
||||
|
||||
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[left, right]`` pairs into the last-dimension-first list ``F.pad`` expects."""
    return [amount for pair in reversed(pad_shape) for amount in pair]
|
||||
|
||||
|
||||
def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements of ``lst``."""
    total = 2 * len(lst) + 1
    # Even positions hold the separator, odd positions the original elements.
    return [item if pos % 2 == 0 else lst[pos // 2] for pos in range(total)]
|
||||
|
||||
|
||||
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)"""
    # Element-wise KL between Gaussians given as means and log standard deviations.
    kl = (logs_q - logs_p) - 0.5
    spread = torch.exp(2.0 * logs_p) + (m_p - m_q) ** 2
    kl += 0.5 * spread * torch.exp(-2.0 * logs_q)
    return kl
|
||||
|
||||
|
||||
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Keep the uniform draw inside [1e-5, 0.99999) so both logs stay finite.
    u = torch.rand(shape) * 0.99998 + 0.00001
    inner = -torch.log(u)
    return -torch.log(inner)
|
||||
|
||||
|
||||
def rand_gumbel_like(x):
    """Sample Gumbel noise with the same size, dtype and device as ``x``."""
    return rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
|
||||
|
||||
|
||||
def slice_segments(x, ids_str, segment_size=4):
    """Cut one ``segment_size``-long window along the last axis of a 3-D tensor per batch row.

    Row ``i`` of the result is ``x[i, :, ids_str[i] : ids_str[i] + segment_size]``.
    """
    segments = torch.zeros_like(x[:, :, :segment_size])
    for row in range(x.size(0)):
        start = ids_str[row]
        segments[row] = x[row, :, start : start + segment_size]
    return segments
|
||||
|
||||
|
||||
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Randomly crop a ``segment_size`` window per batch row; returns ``(segments, start_indices)``."""
    batch, _, total_len = x.size()
    lengths = total_len if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    # Uniform start offsets in [0, max_start), truncated to integers.
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(dtype=torch.long)
    return slice_segments(x, starts, segment_size), starts
|
||||
|
||||
|
||||
def rand_spec_segments(x, x_lengths=None, segment_size=4):
    """Randomly crop a ``segment_size`` window per batch row; returns ``(segments, start_indices)``.

    Unlike ``rand_slice_segments`` the start-index bound is
    ``x_lengths - segment_size`` (no ``+ 1``).
    """
    batch, _, total_len = x.size()
    lengths = total_len if x_lengths is None else x_lengths
    max_start = lengths - segment_size
    starts = (torch.rand([batch]).to(device=x.device) * max_start).to(dtype=torch.long)
    return slice_segments(x, starts, segment_size), starts
|
||||
|
||||
|
||||
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    The first ``channels // 2`` channels are sines and the next
    ``channels // 2`` are cosines, with timescales spaced geometrically from
    ``min_timescale`` to ``max_timescale``. An odd ``channels`` gets one
    trailing zero channel.

    Args:
        length: number of time steps.
        channels: number of output channels.
        min_timescale: smallest timescale.
        max_timescale: largest timescale.

    Returns:
        A float tensor of shape ``[1, channels, length]``.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # With a single timescale there is no spacing to compute; clamp the
    # denominator so channels in (2, 3) no longer divide by zero.
    log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * torch.exp(torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    # Pad one zero channel when ``channels`` is odd.
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
|
||||
|
||||
|
||||
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the sinusoidal timing signal to ``x`` (a 3-D tensor unpacked as batch, channels, length)."""
    _, n_channels, n_steps = x.size()
    timing = get_timing_signal_1d(n_steps, n_channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
|
||||
|
||||
|
||||
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the sinusoidal timing signal to ``x`` along ``axis``."""
    _, n_channels, n_steps = x.size()
    timing = get_timing_signal_1d(n_steps, n_channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
|
||||
|
||||
|
||||
def subsequent_mask(length):
    """Return a ``[1, 1, length, length]`` lower-triangular mask of ones."""
    lower = torch.ones(length, length).tril()
    return lower[None, None]
|
||||
|
||||
|
||||
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: tanh of the first ``n_channels[0]`` channels times sigmoid of the rest, applied to ``input_a + input_b``."""
    split_at = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split_at, :])
    sigmoid_part = torch.sigmoid(summed[:, split_at:, :])
    return tanh_part * sigmoid_part
|
||||
|
||||
|
||||
def shift_1d(x):
    """Shift a 3-D tensor one step to the right along the last axis, filling with zero."""
    # Same pad list convert_pad_shape([[0, 0], [0, 0], [1, 0]]) produces.
    left_padded = F.pad(x, [1, 0, 0, 0, 0, 0])
    return left_padded[:, :, :-1]
|
||||
|
||||
|
||||
def sequence_mask(length, max_length=None):
    """Return a boolean ``[batch, max_length]`` mask that is True where position < length."""
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions.unsqueeze(0) < length.unsqueeze(1)
|
||||
|
||||
|
||||
def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    batch, _, t_y, t_x = mask.shape
    # Cumulative durations mark where each input step's span ends.
    end_positions = torch.cumsum(duration, -1).view(batch * t_x)
    path = sequence_mask(end_positions, t_y).to(mask.dtype).view(batch, t_x, t_y)
    # Subtract the previous step's cumulative mask to keep only this step's own span.
    previous = F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path - previous
    return path.unsqueeze(1).transpose(2, 3) * mask
|
||||
|
||||
|
||||
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise to ``[-clip_value, clip_value]`` and return the pre-clip total norm.

    ``parameters`` may be a single tensor or an iterable; entries without a
    gradient are ignored. With ``clip_value=None`` only the norm is computed.
    """
    candidates = [parameters] if isinstance(parameters, torch.Tensor) else parameters
    with_grad = [p for p in candidates if p.grad is not None]
    norm_type = float(norm_type)
    bound = None if clip_value is None else float(clip_value)

    accumulated = 0
    for param in with_grad:
        # The norm is measured before this parameter's gradient is clamped.
        accumulated += param.grad.data.norm(norm_type).item() ** norm_type
        if bound is not None:
            param.grad.data.clamp_(min=-bound, max=bound)
    return accumulated ** (1.0 / norm_type)
|
317
server/voice_changer/SoVitsSvc40/models/modules/crepe.py
Normal file
317
server/voice_changer/SoVitsSvc40/models/modules/crepe.py
Normal file
@ -0,0 +1,317 @@
|
||||
from typing import Optional, Union
|
||||
|
||||
try:
|
||||
from typing import Literal
|
||||
except Exception:
|
||||
from typing_extensions import Literal # type: ignore
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchcrepe
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
import scipy
|
||||
|
||||
# from:https://github.com/fishaudio/fish-diffusion
|
||||
|
||||
|
||||
def repeat_expand(content: Union[torch.Tensor, np.ndarray], target_len: int, mode: str = "nearest"):
    """Repeat content to target length.
    This is a wrapper of torch.nn.functional.interpolate.

    Args:
        content (torch.Tensor | np.ndarray): 1-D, 2-D or 3-D input; the last axis is resized.
        target_len (int): target length
        mode (str, optional): interpolation mode. Defaults to "nearest".

    Returns:
        The resized content, with the same number of dimensions and the same
        container type (tensor or ndarray) as the input.
    """

    ndim = content.ndim

    # interpolate needs a 3-D input: add the missing leading axes.
    if content.ndim == 1:
        content = content[None, None]
    elif content.ndim == 2:
        content = content[None]

    assert content.ndim == 3

    is_np = isinstance(content, np.ndarray)
    if is_np:
        content = torch.from_numpy(content)

    results = torch.nn.functional.interpolate(content, size=target_len, mode=mode)

    if is_np:
        results = results.numpy()

    # Strip the axes added above so the output rank matches the input rank.
    if ndim == 1:
        return results[0, 0]
    elif ndim == 2:
        return results[0]
    # 3-D input: previously fell through and returned None.
    return results
|
||||
|
||||
|
||||
class BasePitchExtractor:
    """Shared configuration and f0 post-processing for pitch extractors.

    Subclasses are expected to implement ``__call__``; this base raises
    ``NotImplementedError`` there.
    """

    def __init__(
        self,
        hop_length: int = 512,
        f0_min: float = 50.0,
        f0_max: float = 1100.0,
        keep_zeros: bool = True,
    ):
        """Base pitch extractor.

        Args:
            hop_length (int, optional): Hop length. Defaults to 512.
            f0_min (float, optional): Minimum f0. Defaults to 50.0.
            f0_max (float, optional): Maximum f0. Defaults to 1100.0.
            keep_zeros (bool, optional): Whether keep zeros in pitch. Defaults to True.
        """

        self.hop_length = hop_length
        self.f0_min = f0_min
        self.f0_max = f0_max
        self.keep_zeros = keep_zeros

    def __call__(self, x, sampling_rate=44100, pad_to=None):
        raise NotImplementedError("BasePitchExtractor is not callable.")

    def post_process(self, x, sampling_rate, f0, pad_to):
        """Resize ``f0`` to ``pad_to`` frames and optionally interpolate over zero frames.

        The return value depends on the path taken:
          * ``pad_to is None`` -> ``f0`` unchanged (as a tensor).
          * ``keep_zeros`` is true -> the resized ``f0`` tensor.
          * otherwise -> a tuple ``(f0, vuv)``; both are torch tensors when at
            most one frame is non-zero, and numpy arrays in the general case.

        ``x`` is only used for its ``device``.
        """
        if isinstance(f0, np.ndarray):
            f0 = torch.from_numpy(f0).float().to(x.device)

        if pad_to is None:
            return f0

        f0 = repeat_expand(f0, pad_to)

        if self.keep_zeros:
            return f0

        # Voiced/unvoiced flags: 1 where f0 > 0, else 0.
        vuv_vector = torch.zeros_like(f0)
        vuv_vector[f0 > 0.0] = 1.0
        vuv_vector[f0 <= 0.0] = 0.0

        # Remove zero frequencies, then interpolate linearly.
        nzindex = torch.nonzero(f0).squeeze()
        f0 = torch.index_select(f0, dim=0, index=nzindex).cpu().numpy()
        # Timestamps (seconds) of the non-zero frames and of every output frame.
        time_org = self.hop_length / sampling_rate * nzindex.cpu().numpy()
        time_frame = np.arange(pad_to) * self.hop_length / sampling_rate

        # No non-zero frame at all: return all-zero f0 and all-zero flags.
        if f0.shape[0] <= 0:
            return torch.zeros(pad_to, dtype=torch.float, device=x.device), torch.zeros(pad_to, dtype=torch.float, device=x.device)

        # A single non-zero frame: fill every frame with that value and flag all frames 1.
        if f0.shape[0] == 1:
            return torch.ones(pad_to, dtype=torch.float, device=x.device) * f0[0], torch.ones(pad_to, dtype=torch.float, device=x.device)

        # Could probably be rewritten with torch?
        f0 = np.interp(time_frame, time_org, f0, left=f0[0], right=f0[-1])
        vuv_vector = vuv_vector.cpu().numpy()
        vuv_vector = np.ceil(scipy.ndimage.zoom(vuv_vector, pad_to / len(vuv_vector), order=0))

        return f0, vuv_vector
|
||||
|
||||
|
||||
class MaskedAvgPool1d(nn.Module):
    """Mean pooling over a sliding 1-D window that ignores masked (or NaN) values."""

    def __init__(self, kernel_size: int, stride: Optional[int] = None, padding: Optional[int] = 0):
        """An implementation of mean pooling that supports masked values.

        Args:
            kernel_size (int): The size of the mean pooling window.
            stride (int, optional): The stride of the mean pooling window. Defaults to None,
                in which case ``kernel_size`` is used.
            padding (int, optional): The padding of the mean pooling window. Defaults to 0.
        """

        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size
        self.padding = padding

    def forward(self, x, mask=None):
        """Average the valid values in each window.

        Args:
            x: 2-D or 3-D floating-point tensor; a 2-D input gets a channel axis
                inserted at dim 1, which is removed again on return.
            mask: boolean tensor with the shape of ``x`` (after the channel axis is
                added); True marks valid values. Defaults to ``~isnan(x)``.

        Returns:
            Pooled tensor of the same dtype as ``x``. Outputs equal to zero
            (including windows without any valid value) are replaced by NaN.
        """
        ndim = x.dim()
        if ndim == 2:
            x = x.unsqueeze(1)

        assert x.dim() == 3, "Input tensor must have 2 or 3 dimensions (batch_size, channels, width)"

        # Apply the mask by setting masked elements to zero, or make NaNs zero
        if mask is None:
            mask = ~torch.isnan(x)

        # Ensure mask has the same shape as the input tensor
        assert x.shape == mask.shape, "Input tensor and mask must have the same shape"

        masked_x = torch.where(mask, x, torch.zeros_like(x))
        # One all-ones kernel per channel (grouped conv). It follows the input
        # dtype so float64/half inputs no longer hit a dtype mismatch in conv1d.
        ones_kernel = torch.ones(x.size(1), 1, self.kernel_size, device=x.device, dtype=x.dtype)

        # Perform sum pooling
        sum_pooled = nn.functional.conv1d(
            masked_x,
            ones_kernel,
            stride=self.stride,
            padding=self.padding,
            groups=x.size(1),
        )

        # Count the non-masked (valid) elements in each pooling window
        valid_count = nn.functional.conv1d(
            mask.to(dtype=x.dtype),
            ones_kernel,
            stride=self.stride,
            padding=self.padding,
            groups=x.size(1),
        )
        valid_count = valid_count.clamp(min=1)  # Avoid division by zero

        # Perform masked average pooling
        avg_pooled = sum_pooled / valid_count

        # Fill zero values with NaNs
        avg_pooled[avg_pooled == 0] = float("nan")

        if ndim == 2:
            return avg_pooled.squeeze(1)

        return avg_pooled
|
||||
|
||||
|
||||
class MaskedMedianPool1d(nn.Module):
    """Median pooling over a sliding 1-D window that ignores masked (or NaN) values."""

    def __init__(self, kernel_size: int, stride: Optional[int] = None, padding: Optional[int] = 0):
        """An implementation of median pooling that supports masked values.

        This implementation is inspired by the median pooling implementation in
        https://gist.github.com/rwightman/f2d3849281624be7c0f11c85c87c1598

        Args:
            kernel_size (int): The size of the median pooling window.
            stride (int, optional): The stride of the median pooling window. Defaults to None.
            padding (int, optional): The padding of the median pooling window. Defaults to 0.
        """

        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride or kernel_size
        self.padding = padding

    def forward(self, x, mask=None):
        """Return the lower median of the valid values in each window (NaN when a window has none)."""
        input_rank = x.dim()
        if input_rank == 2:
            x = x.unsqueeze(1)

        assert x.dim() == 3, "Input tensor must have 2 or 3 dimensions (batch_size, channels, width)"

        if mask is None:
            mask = ~torch.isnan(x)

        assert x.shape == mask.shape, "Input tensor and mask must have the same shape"

        # Zero out invalid values, then pad: reflect for values, zeros (invalid) for the mask.
        edge_pad = (self.padding, self.padding)
        values = F.pad(torch.where(mask, x, torch.zeros_like(x)), edge_pad, mode="reflect")
        valid = F.pad(mask.float(), edge_pad, mode="constant", value=0)

        # Slide the window along the last axis.
        values = values.unfold(2, self.kernel_size, self.stride)
        valid = valid.unfold(2, self.kernel_size, self.stride)

        values = values.contiguous().view(values.size()[:3] + (-1,))
        valid = valid.contiguous().view(valid.size()[:3] + (-1,)).to(values.device)

        # Push invalid entries to +inf so they sort after every valid value.
        inf_fill = torch.FloatTensor([float("inf")]).to(values.device)
        ordered, _ = torch.sort(torch.where(valid.bool(), values, inf_fill), dim=-1)

        # Index of the median among the valid entries of each window.
        n_valid = valid.sum(dim=-1)
        median_pos = torch.div((n_valid - 1), 2, rounding_mode="trunc").clamp(min=0)

        median_pooled = ordered.gather(-1, median_pos.unsqueeze(-1).long()).squeeze(-1)

        # Windows without any valid value picked +inf: report them as NaN.
        median_pooled[torch.isinf(median_pooled)] = float("nan")

        if input_rank == 2:
            return median_pooled.squeeze(1)
        return median_pooled
|
||||
|
||||
|
||||
class CrepePitchExtractor(BasePitchExtractor):
    """Pitch extractor backed by ``torchcrepe``, with periodicity filtering and silence gating."""

    def __init__(
        self,
        hop_length: int = 512,
        f0_min: float = 50.0,
        f0_max: float = 1100.0,
        threshold: float = 0.05,
        keep_zeros: bool = False,
        device=None,
        model: Literal["full", "tiny"] = "full",
        use_fast_filters: bool = True,
    ):
        """Configure the extractor.

        Args:
            hop_length: hop length in samples, used for prediction and silence gating.
            f0_min: minimum f0 passed to torchcrepe.
            f0_max: maximum f0 passed to torchcrepe.
            threshold: periodicity threshold applied via ``torchcrepe.threshold.At``.
            keep_zeros: forwarded to ``BasePitchExtractor``.
            device: device to run on; defaults to CUDA when available, else CPU.
            model: torchcrepe model capacity.
            use_fast_filters: use the masked pooling filters defined in this
                module instead of ``torchcrepe.filter``.
        """
        # The base class stores hop_length, f0_min, f0_max and keep_zeros.
        super().__init__(hop_length, f0_min, f0_max, keep_zeros)

        self.threshold = threshold
        self.model = model
        self.use_fast_filters = use_fast_filters
        if device is None:
            self.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self.dev = torch.device(device)
        if self.use_fast_filters:
            # Use the resolved device; the raw ``device`` argument may be None.
            self.median_filter = MaskedMedianPool1d(3, 1, 1).to(self.dev)
            self.mean_filter = MaskedAvgPool1d(3, 1, 1).to(self.dev)

    def __call__(self, x, sampling_rate=44100, pad_to=None):
        """Extract pitch using crepe.

        Args:
            x (torch.Tensor): Audio signal, shape (1, T).
            sampling_rate (int, optional): Sampling rate. Defaults to 44100.
            pad_to (int, optional): Pad to length. Defaults to None.

        Returns:
            Whatever ``post_process`` returns for the extracted f0 (a pitch
            tensor, or an ``(f0, vuv)`` pair when ``keep_zeros`` is false and
            ``pad_to`` is given).
        """

        assert x.ndim == 2, f"Expected 2D tensor, got {x.ndim}D tensor."
        assert x.shape[0] == 1, f"Expected 1 channel, got {x.shape[0]} channels."

        x = x.to(self.dev)
        f0, pd = torchcrepe.predict(
            x,
            sampling_rate,
            self.hop_length,
            self.f0_min,
            self.f0_max,
            pad=True,
            model=self.model,
            batch_size=1024,
            device=x.device,
            return_periodicity=True,
        )

        # Filter, remove silence, set uv threshold, refer to the original warehouse readme
        if self.use_fast_filters:
            pd = self.median_filter(pd)
        else:
            pd = torchcrepe.filter.median(pd, 3)

        # The silence gate must use the same hop as the prediction so that the
        # loudness frames line up with the periodicity frames (was hard-coded 512).
        pd = torchcrepe.threshold.Silence(-60.0)(pd, x, sampling_rate, self.hop_length)
        f0 = torchcrepe.threshold.At(self.threshold)(f0, pd)

        if self.use_fast_filters:
            f0 = self.mean_filter(f0)
        else:
            f0 = torchcrepe.filter.mean(f0, 3)

        # Frames rejected by the thresholds are NaN; report them as 0 and drop the batch axis.
        f0 = torch.where(torch.isnan(f0), torch.full_like(f0, 0), f0)[0]

        return self.post_process(x, sampling_rate, f0, pad_to)
|
94
server/voice_changer/SoVitsSvc40/models/modules/enhancer.py
Normal file
94
server/voice_changer/SoVitsSvc40/models/modules/enhancer.py
Normal file
@ -0,0 +1,94 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
# from vdecoder.nsf_hifigan.nvSTFT import STFT
|
||||
# from vdecoder.nsf_hifigan.models import load_model
|
||||
from ..vdecoder.nsf_hifigan.nvSTFT import STFT
|
||||
from ..vdecoder.nsf_hifigan.models import load_model
|
||||
from torchaudio.transforms import Resample
|
||||
|
||||
|
||||
class Enhancer:
    """Wraps a vocoder-based enhancer and handles the resampling around it."""

    def __init__(self, enhancer_type, enhancer_ckpt, device=None):
        """Load the enhancer model.

        Args:
            enhancer_type: only "nsf-hifigan" is supported.
            enhancer_ckpt: checkpoint path passed to the enhancer.
            device: device to run on; defaults to CUDA when available, else CPU.

        Raises:
            ValueError: if ``enhancer_type`` is unknown.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = device

        if enhancer_type == "nsf-hifigan":
            self.enhancer = NsfHifiGAN(enhancer_ckpt, device=self.device)
        else:
            raise ValueError(f" [x] Unknown enhancer: {enhancer_type}")

        # Cache of Resample modules keyed by "<orig_rate><new_rate>".
        self.resample_kernel = {}
        self.enhancer_sample_rate = self.enhancer.sample_rate()
        self.enhancer_hop_size = self.enhancer.hop_size()

    def _get_resampler(self, orig_rate, new_rate):
        """Return the cached Resample module for this rate pair, creating it on first use."""
        key_str = str(orig_rate) + str(new_rate)
        if key_str not in self.resample_kernel:
            self.resample_kernel[key_str] = Resample(orig_rate, new_rate, lowpass_filter_width=128).to(self.device)
        return self.resample_kernel[key_str]

    def enhance(self, audio, sample_rate, f0, hop_size, adaptive_key=0, silence_front=0):  # 1, T  # 1, n_frames, 1
        """Enhance ``audio`` guided by ``f0``.

        Args:
            audio: tensor of shape (1, T).
            sample_rate: sample rate of ``audio``.
            f0: tensor of shape (1, n_frames, 1); it is not modified.
            hop_size: hop size (in samples of ``audio``) between ``f0`` frames.
            adaptive_key: key shift in semitones used to pick the intermediate sample rate.
            silence_front: seconds at the start to skip; they are zero-padded back afterwards.

        Returns:
            ``(enhanced_audio, enhancer_sample_rate)``.
        """
        # enhancer start time
        start_frame = int(silence_front * sample_rate / hop_size)
        real_silence_front = start_frame * hop_size / sample_rate
        audio = audio[:, int(np.round(real_silence_front * sample_rate)) :]
        f0 = f0[:, start_frame:, :]

        # adaptive parameters
        adaptive_factor = 2 ** (-adaptive_key / 12)
        adaptive_sample_rate = 100 * int(np.round(self.enhancer_sample_rate / adaptive_factor / 100))
        real_factor = self.enhancer_sample_rate / adaptive_sample_rate

        # resample the ddsp output
        if sample_rate == adaptive_sample_rate:
            audio_res = audio
        else:
            audio_res = self._get_resampler(sample_rate, adaptive_sample_rate)(audio)

        n_frames = int(audio_res.size(-1) // self.enhancer_hop_size + 1)

        # resample f0
        # Multiply out of place: for CPU tensors the numpy array shares memory
        # with the caller's f0, so an in-place "*=" would silently rescale it.
        f0_np = f0.squeeze(0).squeeze(-1).cpu().numpy() * real_factor
        time_org = (hop_size / sample_rate) * np.arange(len(f0_np)) / real_factor
        time_frame = (self.enhancer_hop_size / self.enhancer_sample_rate) * np.arange(n_frames)
        f0_res = np.interp(time_frame, time_org, f0_np, left=f0_np[0], right=f0_np[-1])
        f0_res = torch.from_numpy(f0_res).unsqueeze(0).float().to(self.device)  # 1, n_frames

        # enhance
        enhanced_audio, enhancer_sample_rate = self.enhancer(audio_res, f0_res)

        # resample the enhanced output
        # Only needed when the rates differ (the old "adaptive_factor != 0"
        # test was always true; equal rates made Resample a pass-through).
        if adaptive_sample_rate != enhancer_sample_rate:
            enhanced_audio = self._get_resampler(adaptive_sample_rate, enhancer_sample_rate)(enhanced_audio)

        # pad the silence frames
        if start_frame > 0:
            enhanced_audio = F.pad(enhanced_audio, (int(np.round(enhancer_sample_rate * real_silence_front)), 0))

        return enhanced_audio, enhancer_sample_rate
|
||||
|
||||
|
||||
class NsfHifiGAN(torch.nn.Module):
    """Wrapper around a pretrained vocoder loaded from ``model_path``.

    ``load_model`` supplies the generator network (``self.model``) and its
    hyper-parameter object (``self.h``); both come from outside this class.
    """

    def __init__(self, model_path, device=None):
        """Load the checkpoint at ``model_path`` onto ``device``.

        When ``device`` is None, CUDA is used if available, otherwise the CPU.
        """
        super().__init__()
        self.device = device if device is not None else ("cuda" if torch.cuda.is_available() else "cpu")
        print("| Load HifiGAN: ", model_path)
        self.model, self.h = load_model(model_path, device=self.device)

    def sample_rate(self):
        """Return the sampling rate stored in the loaded hyper-parameters."""
        return self.h.sampling_rate

    def hop_size(self):
        """Return the hop size stored in the loaded hyper-parameters."""
        return self.h.hop_size

    def forward(self, audio, f0):
        """Re-synthesise ``audio`` from its mel spectrogram and ``f0``.

        Returns:
            ``(enhanced_audio, sampling_rate)`` where ``enhanced_audio`` has
            been flattened to 1-D with ``view(-1)``.
        """
        hp = self.h
        stft = STFT(hp.sampling_rate, hp.num_mels, hp.n_fft, hp.win_size, hp.hop_size, hp.fmin, hp.fmax)
        with torch.no_grad():
            mel = stft.get_mel(audio)
            # Trim f0 so it has no more frames than the mel spectrogram.
            n_frames = mel.size(-1)
            enhanced_audio = self.model(mel, f0[:, :n_frames]).view(-1)
        return enhanced_audio, hp.sampling_rate
|
58
server/voice_changer/SoVitsSvc40/models/modules/losses.py
Normal file
58
server/voice_changer/SoVitsSvc40/models/modules/losses.py
Normal file
@ -0,0 +1,58 @@
|
||||
import torch
|
||||
|
||||
|
||||
def feature_loss(fmap_r, fmap_g):
    """Feature-matching loss between real and generated feature maps.

    Args:
        fmap_r: iterable of iterables of tensors (features of real input).
        fmap_g: same structure for the generated input.

    Returns:
        Twice the sum of the mean absolute differences of all paired
        tensors (the int ``0`` when there is nothing to compare).
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real, gen in zip(real_maps, gen_maps):
            # Real features are detached so no gradient flows through them.
            diff = real.float().detach() - gen.float()
            total = total + torch.mean(torch.abs(diff))
    return total * 2
|
||||
|
||||
|
||||
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss.

    For each (real, generated) output pair, adds ``mean((1 - real)^2)`` and
    ``mean(generated^2)`` to the total.

    Returns:
        ``(loss, r_losses, g_losses)``: the summed loss, plus the per-pair
        real and generated terms as Python floats.
    """
    total = 0
    real_terms, fake_terms = [], []
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out.float()) ** 2)
        fake_term = torch.mean(fake_out.float() ** 2)
        total = total + (real_term + fake_term)
        real_terms.append(real_term.item())
        fake_terms.append(fake_term.item())
    return total, real_terms, fake_terms
|
||||
|
||||
|
||||
def generator_loss(disc_outputs):
    """Least-squares generator loss.

    Returns:
        ``(loss, gen_losses)``: the sum of ``mean((1 - d)^2)`` over all
        discriminator outputs, and the list of the individual tensor terms.
    """
    gen_losses = [torch.mean((1 - out.float()) ** 2) for out in disc_outputs]
    total = 0
    for term in gen_losses:
        total = total + term
    return total, gen_losses
|
||||
|
||||
|
||||
def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """Masked KL term averaged over the mask.

    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]

    All inputs are cast to float32. The result is
    ``sum(kl * z_mask) / sum(z_mask)``.
    """
    z_p, logs_q, m_p, logs_p, z_mask = (t.float() for t in (z_p, logs_q, m_p, logs_p, z_mask))

    kl = logs_p - logs_q - 0.5
    kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
    masked_total = torch.sum(kl * z_mask)
    return masked_total / torch.sum(z_mask)
|
@ -0,0 +1,99 @@
|
||||
import torch
|
||||
import torch.utils.data
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
|
||||
MAX_WAV_VALUE = 32768.0
|
||||
|
||||
|
||||
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Log-compress ``x`` after clamping it from below.

    PARAMS
    ------
    C: compression factor
    clip_val: lower bound applied to ``x`` before taking the logarithm
    """
    clamped = torch.clamp(x, min=clip_val)
    return torch.log(clamped * C)
|
||||
|
||||
|
||||
def dynamic_range_decompression_torch(x, C=1):
    """Invert the log compression: ``exp(x) / C``.

    PARAMS
    ------
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C
|
||||
|
||||
|
||||
def spectral_normalize_torch(magnitudes):
    """Apply ``dynamic_range_compression_torch`` with its default settings."""
    return dynamic_range_compression_torch(magnitudes)
|
||||
|
||||
|
||||
def spectral_de_normalize_torch(magnitudes):
    """Apply ``dynamic_range_decompression_torch`` with its default settings."""
    return dynamic_range_decompression_torch(magnitudes)
|
||||
|
||||
|
||||
mel_basis = {}
|
||||
hann_window = {}
|
||||
|
||||
|
||||
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    """Compute a linear magnitude spectrogram with ``torch.stft``.

    Args:
        y: waveform tensor; a message is printed when values fall outside
            [-1, 1].
        n_fft: FFT size.
        sampling_rate: accepted for signature compatibility; not used here.
        hop_size: hop length between frames.
        win_size: Hann window length.
        center: forwarded to ``torch.stft``.

    Returns:
        ``sqrt(re^2 + im^2 + 1e-6)`` of the one-sided STFT.
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global hann_window
    # Windows are cached per (win_size, dtype, device) in the module-level dict.
    wnsize_dtype_device = str(win_size) + "_" + str(y.dtype) + "_" + str(y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    # Reflect-pad both ends by (n_fft - hop_size) / 2 samples before framing.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    # return_complex=False is deprecated; request the complex result and view
    # it as (..., 2) real/imag pairs, which recovers the old output layout.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec
|
||||
|
||||
|
||||
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    """Project a linear spectrogram onto a mel filterbank and log-compress it.

    The filterbank is built with ``librosa_mel_fn`` and cached in the
    module-level ``mel_basis`` dict. The cache key covers every parameter the
    filterbank depends on; keying on ``fmax`` alone would hand back a stale
    basis when e.g. ``sampling_rate`` or ``num_mels`` changes between calls.

    Returns:
        ``spectral_normalize_torch(mel_basis @ spec)``.
    """
    global mel_basis
    basis_key = "_".join(str(part) for part in (sampling_rate, n_fft, num_mels, fmin, fmax, spec.dtype, spec.device))
    if basis_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[basis_key] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)
    return spec
|
||||
|
||||
|
||||
def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-mel spectrogram directly from a waveform.

    Steps: reflect-pad, STFT with a cached Hann window, magnitude, mel
    projection with a cached filterbank, then ``spectral_normalize_torch``.
    A message is printed when ``y`` has values outside [-1, 1].

    The filterbank cache key covers every parameter the filterbank depends
    on; keying on ``fmax`` alone would reuse a stale basis across
    configurations.
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + "_" + str(y.device)
    basis_key = "_".join(str(part) for part in (sampling_rate, n_fft, num_mels, fmin, fmax)) + "_" + dtype_device
    wnsize_dtype_device = str(win_size) + "_" + dtype_device
    if basis_key not in mel_basis:
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
        mel_basis[basis_key] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    # return_complex=False is deprecated; view_as_real recovers the same
    # (..., 2) real/imag layout from the complex result.
    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[basis_key], spec)
    spec = spectral_normalize_torch(spec)

    return spec
|
303
server/voice_changer/SoVitsSvc40/models/modules/modules.py
Normal file
303
server/voice_changer/SoVitsSvc40/models/modules/modules.py
Normal file
@ -0,0 +1,303 @@
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from torch.nn import Conv1d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm
|
||||
|
||||
from .commons import init_weights, get_padding, fused_add_tanh_sigmoid_multiply
|
||||
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
||||
class LayerNorm(nn.Module):
    """Layer normalisation over dimension 1 of the input.

    Dimension 1 is swapped with the last dimension, ``F.layer_norm`` is
    applied over ``channels`` with the learnable ``gamma``/``beta``, and the
    dimensions are swapped back.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(channels_last, (self.channels,), self.gamma, self.beta, self.eps)
        return normed.transpose(1, -1)
|
||||
|
||||
|
||||
class ConvReluNorm(nn.Module):
    """Stack of Conv1d -> LayerNorm -> ReLU/Dropout layers with a residual projection.

    The final 1x1 projection is zero-initialised, so a freshly constructed
    module returns ``x * x_mask``.

    Args:
        in_channels: channels of the input (and of the residual path).
        hidden_channels: channels inside the stack.
        out_channels: channels produced by the final projection; it is added
            to the input, so it must be compatible with ``in_channels``.
        kernel_size: kernel size of every convolution in the stack.
        n_layers: number of conv/norm layers; must be greater than 1.
        p_dropout: dropout probability after each ReLU.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        # The message now matches the condition actually enforced.
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
        for _ in range(n_layers - 1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        # Zero init makes the residual branch contribute nothing at first.
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        """Return ``(x + proj(stack(x))) * x_mask``; the mask is applied before every conv."""
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask
|
||||
|
||||
|
||||
class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution

    Layer ``i`` uses a depth-wise convolution with dilation ``kernel_size**i``
    followed by a 1x1 convolution; each is followed by LayerNorm and GELU.
    The layer output is added back to its input.
    """

    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for layer_idx in range(n_layers):
            rate = kernel_size**layer_idx
            # Padding that keeps the time length unchanged for this dilation.
            pad = (kernel_size * rate - rate) // 2
            depthwise = nn.Conv1d(channels, channels, kernel_size, groups=channels, dilation=rate, padding=pad)
            self.convs_sep.append(depthwise)
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        """Run the stack; ``g``, when given, is added to ``x`` first. Output is masked."""
        if g is not None:
            x = x + g
        stages = zip(self.convs_sep, self.norms_1, self.convs_1x1, self.norms_2)
        for depthwise, norm_a, pointwise, norm_b in stages:
            y = F.gelu(norm_a(depthwise(x * x_mask)))
            y = F.gelu(norm_b(pointwise(y)))
            x = x + self.drop(y)
        return x * x_mask
|
||||
|
||||
|
||||
class WN(torch.nn.Module):
    """Stack of weight-normalised dilated convolutions with gated activations.

    Each layer yields ``2 * hidden_channels`` features that, together with an
    optional conditioning slice, go through ``fused_add_tanh_sigmoid_multiply``.
    Every layer but the last splits its 1x1 output into a residual half and a
    skip half; the last layer only produces a skip output.

    Note: ``self.kernel_size`` is stored as a 1-tuple ``(kernel_size,)``.
    """

    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = (kernel_size,)
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            # One conv produces the conditioning for all layers at once.
            cond_conv = torch.nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_conv, name="weight")

        final_idx = n_layers - 1
        for layer_idx in range(n_layers):
            rate = dilation_rate**layer_idx
            pad = int((kernel_size * rate - rate) / 2)
            gated_conv = torch.nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, dilation=rate, padding=pad)
            self.in_layers.append(torch.nn.utils.weight_norm(gated_conv, name="weight"))

            # last one is not necessary: it has no residual half
            out_width = 2 * hidden_channels if layer_idx < final_idx else hidden_channels
            mix_conv = torch.nn.Conv1d(hidden_channels, out_width, 1)
            self.res_skip_layers.append(torch.nn.utils.weight_norm(mix_conv, name="weight"))

    def forward(self, x, x_mask, g=None, **kwargs):
        """Return the masked sum of all skip outputs.

        ``g``, when given, is passed through ``cond_layer`` and sliced per layer.
        """
        width = self.hidden_channels
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([width])

        if g is not None:
            g = self.cond_layer(g)

        final_idx = self.n_layers - 1
        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is None:
                g_l = torch.zeros_like(x_in)
            else:
                start = i * 2 * width
                g_l = g[:, start : start + 2 * width, :]

            acts = self.drop(fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor))

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < final_idx:
                x = (x + res_skip_acts[:, :width, :]) * x_mask
                output = output + res_skip_acts[:, width:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        """Strip weight normalisation from the conditioning, gated and 1x1 layers."""
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for layer in [*self.in_layers, *self.res_skip_layers]:
            torch.nn.utils.remove_weight_norm(layer)
|
||||
|
||||
|
||||
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs.

    The first conv of each pair uses ``dilation[0..2]``; the second always
    uses dilation 1. All convolutions are weight-normalised and initialised
    with ``init_weights``.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()

        def make_conv(rate):
            # Stride-1 conv, padded via get_padding for this dilation.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate, padding=get_padding(kernel_size, rate)))

        self.convs1 = nn.ModuleList([make_conv(dilation[idx]) for idx in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply the three residual pairs; the mask, when given, is applied before each conv and at the end."""
        use_mask = x_mask is not None
        for dilated, plain in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                xt = xt * x_mask
            xt = F.leaky_relu(dilated(xt), LRELU_SLOPE)
            if use_mask:
                xt = xt * x_mask
            x = plain(xt) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution in the block."""
        for conv in [*self.convs1, *self.convs2]:
            remove_weight_norm(conv)
|
||||
|
||||
|
||||
class ResBlock2(torch.nn.Module):
    """Residual block of two weight-normalised dilated convolutions.

    The convolutions use ``dilation[0]`` and ``dilation[1]`` and are
    initialised with ``init_weights``.
    """

    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()

        def make_conv(rate):
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate, padding=get_padding(kernel_size, rate)))

        self.convs = nn.ModuleList([make_conv(dilation[idx]) for idx in range(2)])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        """Apply both residual convolutions; the mask, when given, is applied before each conv and at the end."""
        use_mask = x_mask is not None
        for conv in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if use_mask:
                xt = xt * x_mask
            x = conv(xt) + x
        if use_mask:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from both convolutions."""
        for conv in self.convs:
            remove_weight_norm(conv)
|
||||
|
||||
|
||||
class Log(nn.Module):
    """Element-wise log flow.

    Forward returns ``(log(clamp_min(x, 1e-5)) * x_mask, logdet)`` with
    ``logdet = sum(-y)`` over dims 1 and 2; reverse returns ``exp(x) * x_mask``.
    """

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if reverse:
            return torch.exp(x) * x_mask
        y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
        return y, torch.sum(-y, [1, 2])
|
||||
|
||||
|
||||
class Flip(nn.Module):
    """Reverse the order of dimension 1.

    Forward returns ``(flipped, logdet)`` with an all-zero ``logdet`` of
    length ``x.size(0)``; reverse returns only the flipped tensor.
    """

    def forward(self, x, *args, reverse=False, **kwargs):
        flipped = torch.flip(x, [1])
        if reverse:
            return flipped
        logdet = torch.zeros(flipped.size(0)).to(dtype=flipped.dtype, device=flipped.device)
        return flipped, logdet
|
||||
|
||||
|
||||
class ElementwiseAffine(nn.Module):
    """Per-channel affine flow ``y = m + exp(logs) * x``.

    ``m`` and ``logs`` are learnable ``(channels, 1)`` parameters, both
    initialised to zero.
    """

    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        """Forward returns ``(y, logdet)``; reverse returns the inverted tensor only."""
        if reverse:
            return (x - self.m) * torch.exp(-self.logs) * x_mask
        y = (self.m + torch.exp(self.logs) * x) * x_mask
        logdet = torch.sum(self.logs * x_mask, [1, 2])
        return y, logdet
|
||||
|
||||
|
||||
class ResidualCouplingLayer(nn.Module):
    """Affine coupling layer: the first half of the channels conditions the second half.

    ``x0`` goes through ``pre`` -> ``WN`` -> ``post`` to predict a mean (and a
    log-scale unless ``mean_only``) that transforms ``x1``. ``post`` is
    zero-initialised.
    """

    def __init__(self, channels, hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=0, gin_channels=0, mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        # Output width is half_channels when mean_only, else twice that (mean + log-scale).
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Forward returns ``(x, logdet)``; reverse returns the inverted ``x`` only."""
        halves = [self.half_channels] * 2
        x0, x1 = torch.split(x, halves, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if self.mean_only:
            m = stats
            logs = torch.zeros_like(m)
        else:
            m, logs = torch.split(stats, halves, 1)

        if reverse:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            return torch.cat([x0, x1], 1)

        x1 = m + x1 * torch.exp(logs) * x_mask
        x = torch.cat([x0, x1], 1)
        logdet = torch.sum(logs, [1, 2])
        return x, logdet
|
527
server/voice_changer/SoVitsSvc40/models/utils.py
Normal file
527
server/voice_changer/SoVitsSvc40/models/utils.py
Normal file
@ -0,0 +1,527 @@
|
||||
import os
|
||||
import glob
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
import subprocess
|
||||
import warnings
|
||||
import functools
|
||||
|
||||
import numpy as np
|
||||
from scipy.io.wavfile import read
|
||||
import torch
|
||||
|
||||
MATPLOTLIB_FLAG = False
|
||||
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
|
||||
logger = logging
|
||||
|
||||
f0_bin = 256
|
||||
f0_max = 1100.0
|
||||
f0_min = 50.0
|
||||
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
|
||||
f0_mel_max = 1127 * np.log(1 + f0_max / 700)
|
||||
|
||||
|
||||
# def normalize_f0(f0, random_scale=True):
|
||||
# f0_norm = f0.clone() # create a copy of the input Tensor
|
||||
# batch_size, _, frame_length = f0_norm.shape
|
||||
# for i in range(batch_size):
|
||||
# means = torch.mean(f0_norm[i, 0, :])
|
||||
# if random_scale:
|
||||
# factor = random.uniform(0.8, 1.2)
|
||||
# else:
|
||||
# factor = 1
|
||||
# f0_norm[i, 0, :] = (f0_norm[i, 0, :] - means) * factor
|
||||
# return f0_norm
|
||||
# def normalize_f0(f0, random_scale=True):
|
||||
# means = torch.mean(f0[:, 0, :], dim=1, keepdim=True)
|
||||
# if random_scale:
|
||||
# factor = torch.Tensor(f0.shape[0],1).uniform_(0.8, 1.2).to(f0.device)
|
||||
# else:
|
||||
# factor = torch.ones(f0.shape[0], 1, 1).to(f0.device)
|
||||
# f0_norm = (f0 - means.unsqueeze(-1)) * factor.unsqueeze(-1)
|
||||
# return f0_norm
|
||||
|
||||
|
||||
def deprecated(func):
    """Decorator that marks a function as deprecated.

    Each call to the wrapped function emits a ``DeprecationWarning`` that
    names it, then forwards all arguments to the original.
    """

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        warnings.simplefilter("always", DeprecationWarning)  # turn off filter
        message = f"Call to deprecated function {func.__name__}."
        warnings.warn(message, category=DeprecationWarning, stacklevel=2)
        warnings.simplefilter("default", DeprecationWarning)  # reset filter
        return func(*args, **kwargs)

    return wrapper
|
||||
|
||||
|
||||
def normalize_f0(f0, x_mask, uv, random_scale=True):
    """Subtract the ``uv``-weighted mean from ``f0`` and optionally rescale.

    Args:
        f0: indexed as ``f0[:, 0, :]``.
        x_mask: multiplied into the result.
        uv: weights for the mean (summed over dim 1).
        random_scale: when True, each batch item is scaled by a factor drawn
            uniformly from [0.8, 1.2]; otherwise the factor is 1.

    Note:
        The process exits via ``exit(0)`` if the normalised tensor contains NaN.
    """
    # Mean of f0 weighted by uv; rows with zero weight get a dummy divisor.
    weight_sum = torch.sum(uv, dim=1, keepdim=True)
    weight_sum[weight_sum == 0] = 9999
    means = torch.sum(f0[:, 0, :] * uv, dim=1, keepdim=True) / weight_sum

    batch = f0.shape[0]
    if random_scale:
        factor = torch.Tensor(batch, 1).uniform_(0.8, 1.2).to(f0.device)
    else:
        factor = torch.ones(batch, 1).to(f0.device)

    centered = f0 - means.unsqueeze(-1)
    f0_norm = centered * factor.unsqueeze(-1)
    if torch.isnan(f0_norm).any():
        exit(0)
    return f0_norm * x_mask
|
||||
|
||||
|
||||
def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512, device=None):
    """Extract f0 and a voiced/unvoiced track with ``CrepePitchExtractor``.

    Args:
        wav_numpy: 1-D waveform. ``.float()`` is called on it, so despite the
            name it presumably has to be a torch tensor (confirm with callers).
        p_len: target number of frames; defaults to ``len // hop_length`` and
            must be within 4 frames of that value when given.
        sampling_rate: forwarded to the extractor call.
        hop_length: hop size in samples.
        device: forwarded to ``CrepePitchExtractor``.

    Returns:
        ``(f0, uv)`` as returned by the extractor.
    """
    from .modules.crepe import CrepePitchExtractor

    expected_frames = wav_numpy.shape[0] // hop_length
    if p_len is None:
        p_len = expected_frames
    else:
        assert abs(p_len - expected_frames) < 4, "pad length error"

    extractor = CrepePitchExtractor(hop_length=hop_length, f0_min=50, f0_max=1100, device=device)
    f0, uv = extractor(wav_numpy[None, :].float(), sampling_rate, pad_to=p_len)
    return f0, uv
|
||||
|
||||
|
||||
def plot_data_to_numpy(x, y):
    """Plot ``x`` and ``y`` on one axes and return the rendered figure as an RGB array.

    On first use the matplotlib backend is switched to ``"Agg"`` and the
    ``matplotlib`` logger is set to WARNING; ``MATPLOTLIB_FLAG`` records that
    this has been done.

    Returns:
        ``uint8`` array of shape ``(height, width, 3)``.
    """
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib

        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger("matplotlib")
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
    plt.plot(x)
    plt.plot(y)
    plt.tight_layout()

    fig.canvas.draw()
    # np.fromstring's binary mode (sep="") is deprecated; frombuffer is the
    # documented replacement. copy() keeps the result writable, as before.
    # NOTE(review): tostring_rgb is itself deprecated in recent matplotlib - confirm the pinned version.
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8).copy()
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data
|
||||
|
||||
|
||||
def interpolate_f0(f0):
    """
    Interpolate F0: fill frames whose value is <= 0 and return a voiced/unvoiced mask.

    Returns a tuple ``(ip_data[:, 0], vuv_vector[:, 0])``: the filled F0 track
    and a float32 vector that is 1.0 where the input was > 0 and 0.0 elsewhere.
    """

    # Column view of the input, shape (f0.size, 1).
    data = np.reshape(f0, (f0.size, 1))

    # The voiced/unvoiced mask is computed before any filling happens.
    vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
    vuv_vector[data > 0.0] = 1.0
    vuv_vector[data <= 0.0] = 0.0

    # NOTE: this is the same array object as `data`, not a copy, so every write
    # to ip_data below is visible through `data` on later iterations.
    # NOTE(review): np.reshape may return a view, in which case the caller's f0 is modified too - confirm.
    ip_data = data

    frame_number = data.size
    last_value = 0.0
    for i in range(frame_number):
        if data[i] <= 0.0:
            # Find the next frame j > i with a positive value; j stays at
            # i + 1 when the range below is empty (i is the last frame).
            j = i + 1
            for j in range(i + 1, frame_number):
                if data[j] > 0.0:
                    break
            if j < frame_number - 1:
                if last_value > 0.0:
                    # Linear ramp from the previous frame's value towards data[j].
                    step = (data[j] - data[i - 1]) / float(j - i)
                    for k in range(i, j):
                        ip_data[k] = data[i - 1] + step * (k - i + 1)
                else:
                    # No positive value seen yet: back-fill with data[j].
                    for k in range(i, j):
                        ip_data[k] = data[j]
            else:
                # j reached the last frame (or beyond): fill the rest with last_value.
                for k in range(i, frame_number):
                    ip_data[k] = last_value
        else:
            ip_data[i] = data[i]  # there may be an unnecessary copy here
            last_value = data[i]

    return ip_data[:, 0], vuv_vector[:, 0]
|
||||
|
||||
|
||||
def compute_f0_parselmouth(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
    """Estimate f0 with parselmouth's autocorrelation pitch tracker.

    Args:
        wav_numpy: 1-D waveform.
        p_len: target number of frames; defaults to ``len // hop_length`` and
            must be within 4 frames of that value when given.
        sampling_rate: sampling rate of the waveform.
        hop_length: hop size in samples.

    Returns:
        The ``"frequency"`` track, zero-padded on both sides to ``p_len``
        frames when it is shorter.
    """
    import parselmouth

    expected_frames = wav_numpy.shape[0] // hop_length
    if p_len is None:
        p_len = expected_frames
    else:
        assert abs(p_len - expected_frames) < 4, "pad length error"

    time_step = hop_length / sampling_rate * 1000  # milliseconds
    sound = parselmouth.Sound(wav_numpy, sampling_rate)
    pitch = sound.to_pitch_ac(time_step=time_step / 1000, voicing_threshold=0.6, pitch_floor=50, pitch_ceiling=1100)
    f0 = pitch.selected_array["frequency"]

    # Split the missing frames between front and back (front gets the extra one).
    missing = p_len - len(f0)
    pad_front = (missing + 1) // 2
    pad_back = missing - pad_front
    if pad_front > 0 or pad_back > 0:
        f0 = np.pad(f0, [[pad_front, pad_back]], mode="constant")
    return f0
|
||||
|
||||
|
||||
def resize_f0(x, target_len):
    """Resample an f0 sequence to ``target_len`` points by linear interpolation.

    Values below 0.001 are replaced with NaN before interpolating, and NaNs in
    the result are converted back to 0. The input is copied, not modified.
    """
    source = np.array(x)
    source[source < 0.001] = np.nan
    n_src = len(source)
    query = np.arange(0, n_src * target_len, n_src) / target_len
    resampled = np.interp(query, np.arange(0, n_src), source)
    return np.nan_to_num(resampled)
|
||||
|
||||
|
||||
def compute_f0_dio(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
    """Estimate f0 with pyworld (``dio`` refined by ``stonemask``).

    The track is rounded to one decimal place and resized to ``p_len`` frames
    (default ``len(wav_numpy) // hop_length``) with ``resize_f0``.
    """
    import pyworld

    if p_len is None:
        p_len = wav_numpy.shape[0] // hop_length

    frame_period_ms = 1000 * hop_length / sampling_rate
    f0, t = pyworld.dio(wav_numpy.astype(np.double), fs=sampling_rate, f0_ceil=800, frame_period=frame_period_ms)
    f0 = pyworld.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate)
    # Round every frame to one decimal place, in place.
    f0[:] = [round(pitch, 1) for pitch in f0]
    return resize_f0(f0, p_len)
|
||||
|
||||
|
||||
def f0_to_coarse(f0):
    """Quantise f0 (Hz) into integer mel-scale bins in ``[1, f0_bin - 1]``.

    Accepts a torch tensor or a numpy array and returns the same kind of
    container with an integer dtype. Values whose mel value is not positive
    (e.g. f0 == 0) map to bin 1.

    Uses the module-level constants ``f0_bin``, ``f0_mel_min`` and
    ``f0_mel_max``.
    """
    is_torch = isinstance(f0, torch.Tensor)
    f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
    # Rescale positive mel values linearly; values at or below 0 are left for the clamp below.
    f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / (f0_mel_max - f0_mel_min) + 1

    f0_mel[f0_mel <= 1] = 1
    f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1
    # np.int was only an alias of the builtin int and has been removed from
    # NumPy (1.24+); the builtin yields the same dtype.
    f0_coarse = (f0_mel + 0.5).long() if is_torch else np.rint(f0_mel).astype(int)
    assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, (f0_coarse.max(), f0_coarse.min())
    return f0_coarse
|
||||
|
||||
|
||||
def get_hubert_model():
    """Load the HuBERT checkpoint at a fixed relative path via fairseq.

    Returns:
        The first model of the loaded ensemble, switched to eval mode.
    """
    vec_path = "hubert/checkpoint_best_legacy_500.pt"
    print("load model(s) from {}".format(vec_path))
    from fairseq import checkpoint_utils

    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([vec_path], suffix="")
    hubert = models[0]
    hubert.eval()
    return hubert
|
||||
|
||||
|
||||
def get_hubert_content(hmodel, wav_16k_tensor):
    """Extract projected layer-9 features from ``hmodel`` for one waveform.

    Args:
        hmodel: object exposing ``extract_features(source=..., padding_mask=...,
            output_layer=...)`` and ``final_proj``.
        wav_16k_tensor: 1-D waveform, or 2-D which is averaged over the last
            dimension first.

    Returns:
        ``final_proj`` of the first element returned by ``extract_features``,
        with dimensions 1 and 2 swapped.
    """
    feats = wav_16k_tensor
    if feats.dim() == 2:  # double channels
        feats = feats.mean(-1)
    assert feats.dim() == 1, feats.dim()
    feats = feats.view(1, -1)

    device = wav_16k_tensor.device
    # Nothing is padded: an all-False mask of the same shape as the input.
    padding_mask = torch.zeros(feats.shape, dtype=torch.bool)
    with torch.no_grad():
        logits = hmodel.extract_features(
            source=feats.to(device),
            padding_mask=padding_mask.to(device),
            output_layer=9,  # layer 9
        )
        feats = hmodel.final_proj(logits[0])
    return feats.transpose(1, 2)
|
||||
|
||||
|
||||
def get_content(cmodel, y):
    """Run ``cmodel.extract_features`` on ``y`` (dim 1 squeezed) without gradients.

    Returns:
        The first element of the result with dimensions 1 and 2 swapped.
    """
    with torch.no_grad():
        features = cmodel.extract_features(y.squeeze(1))[0]
    return features.transpose(1, 2)
|
||||
|
||||
|
||||
def load_checkpoint(checkpoint_path, model, optimizer=None, skip_optimizer=False):
    """Load model (and optionally optimizer) state from ``checkpoint_path``.

    Parameters present in the model but missing from the checkpoint, or
    stored with a different shape, keep their current values and are
    reported; loading continues.

    Args:
        checkpoint_path: path of a file written by ``torch.save`` containing
            the keys ``"model"``, ``"iteration"``, ``"learning_rate"`` and
            ``"optimizer"``.
        model: target module; when it has a ``module`` attribute the state is
            loaded into that attribute instead.
        optimizer: restored only when given, ``skip_optimizer`` is False and
            the checkpoint holds an optimizer state.
        skip_optimizer: set True to leave the optimizer untouched.

    Returns:
        ``(model, optimizer, learning_rate, iteration)``.
    """
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    iteration = checkpoint_dict["iteration"]
    learning_rate = checkpoint_dict["learning_rate"]
    if optimizer is not None and not skip_optimizer and checkpoint_dict["optimizer"] is not None:
        optimizer.load_state_dict(checkpoint_dict["optimizer"])
    saved_state_dict = checkpoint_dict["model"]

    target = model.module if hasattr(model, "module") else model
    state_dict = target.state_dict()

    new_state_dict = {}
    for k, v in state_dict.items():
        # Explicit checks replace the former bare `except:` around an assert,
        # which swallowed every error (including KeyboardInterrupt) and lost
        # the shape check under `python -O`.
        if k not in saved_state_dict:
            print("error, %s is not in the checkpoint" % k)
            logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
        elif getattr(saved_state_dict[k], "shape", None) != v.shape:
            print("error, %s has a mismatched shape in the checkpoint" % k)
            logger.info("%s has a mismatched shape in the checkpoint" % k)
            new_state_dict[k] = v
        else:
            new_state_dict[k] = saved_state_dict[k]

    target.load_state_dict(new_state_dict)
    print("load ")
    logger.info("Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration
|
||||
|
||||
|
||||
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
    """Write model and optimizer state to ``checkpoint_path`` with ``torch.save``.

    When ``model`` has a ``module`` attribute, that attribute's state dict is
    saved instead of the model's own.
    """
    logger.info("Saving model and optimizer state at iteration {} to {}".format(iteration, checkpoint_path))
    net = model.module if hasattr(model, "module") else model
    payload = {
        "model": net.state_dict(),
        "iteration": iteration,
        "optimizer": optimizer.state_dict(),
        "learning_rate": learning_rate,
    }
    torch.save(payload, checkpoint_path)
|
||||
|
||||
|
||||
def clean_checkpoints(path_to_models="logs/44k/", n_ckpts_to_keep=2, sort_by_time=True):
    """Freeing up space by deleting saved ckpts

    Arguments:
    path_to_models    --  Path to the model directory
    n_ckpts_to_keep   --  Number of ckpts to keep, excluding G_0.pth and D_0.pth
    sort_by_time      --  True -> chronologically delete ckpts
                          False -> lexicographically delete ckpts

    Files starting with "G" and "D" are handled as two separate groups; in
    each group the last ``n_ckpts_to_keep`` files (by the chosen order) are
    kept. Note that ``n_ckpts_to_keep=0`` deletes nothing, because the slice
    ``[:-0]`` is empty.
    """
    # Raw string: "\d" and "\." are invalid escapes in an ordinary literal.
    ckpt_pattern = re.compile(r"._(\d+)\.pth")
    ckpts_files = [f for f in os.listdir(path_to_models) if os.path.isfile(os.path.join(path_to_models, f))]

    def name_key(fname):
        # Numeric step taken from names such as "G_1200.pth".
        return int(ckpt_pattern.match(fname).group(1))

    def time_key(fname):
        return os.path.getmtime(os.path.join(path_to_models, fname))

    sort_key = time_key if sort_by_time else name_key

    def removable(prefix):
        # Everything in this group except *_0.pth and the newest n_ckpts_to_keep files.
        candidates = [f for f in ckpts_files if f.startswith(prefix) and not f.endswith("_0.pth")]
        return sorted(candidates, key=sort_key)[:-n_ckpts_to_keep]

    to_del = [os.path.join(path_to_models, fn) for fn in (removable("G") + removable("D"))]
    for fn in to_del:
        os.remove(fn)
        logger.info(f".. Free up space by deleting ckpt {fn}")
|
||||
|
||||
|
||||
def summarize(writer, global_step, scalars=None, histograms=None, images=None, audios=None, audio_sampling_rate=22050):
    """Forward tagged values to ``writer`` at ``global_step``.

    Args:
        writer: object exposing ``add_scalar``, ``add_histogram``,
            ``add_image`` and ``add_audio``.
        global_step: step passed to every call.
        scalars, histograms, images, audios: mappings of tag -> value;
            ``None`` (the default) is treated as empty.
        audio_sampling_rate: sampling rate passed to ``add_audio``.

    Images are written with ``dataformats="HWC"``.
    """
    # None defaults replace the former shared mutable `{}` defaults.
    for k, v in (scalars or {}).items():
        writer.add_scalar(k, v, global_step)
    for k, v in (histograms or {}).items():
        writer.add_histogram(k, v, global_step)
    for k, v in (images or {}).items():
        writer.add_image(k, v, global_step, dataformats="HWC")
    for k, v in (audios or {}).items():
        writer.add_audio(k, v, global_step, audio_sampling_rate)
|
||||
|
||||
|
||||
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    """Return the path matching ``regex`` in ``dir_path`` with the largest digit key.

    ``regex`` is used as a glob pattern. Paths are ordered by the integer
    formed from all digits in the full path; the chosen path is printed
    before it is returned.
    """

    def digits_as_int(path):
        return int("".join(ch for ch in path if ch.isdigit()))

    candidates = sorted(glob.glob(os.path.join(dir_path, regex)), key=digits_as_int)
    newest = candidates[-1]
    print(newest)
    return newest
|
||||
|
||||
|
||||
def plot_spectrogram_to_numpy(spectrogram):
    """Render ``spectrogram`` with ``imshow`` plus a colorbar and return it as an RGB array.

    On first use the matplotlib backend is switched to ``"Agg"`` and the
    ``matplotlib`` logger is set to WARNING; ``MATPLOTLIB_FLAG`` records that
    this has been done.

    Returns:
        ``uint8`` array of shape ``(height, width, 3)``.
    """
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib

        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger("matplotlib")
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    # np.fromstring's binary mode (sep="") is deprecated; frombuffer is the
    # documented replacement. copy() keeps the result writable, as before.
    # NOTE(review): tostring_rgb is itself deprecated in recent matplotlib - confirm the pinned version.
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8).copy()
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data
|
||||
|
||||
|
||||
def plot_alignment_to_numpy(alignment, info=None):
    """Render the transposed ``alignment`` matrix and return it as an RGB array.

    Args:
        alignment: 2-D array; it is transposed before plotting.
        info: optional text appended below the x-axis label.

    On first use the matplotlib backend is switched to ``"Agg"`` and the
    ``matplotlib`` logger is set to WARNING; ``MATPLOTLIB_FLAG`` records that
    this has been done.

    Returns:
        ``uint8`` array of shape ``(height, width, 3)``.
    """
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib

        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger("matplotlib")
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment.transpose(), aspect="auto", origin="lower", interpolation="none")
    fig.colorbar(im, ax=ax)
    xlabel = "Decoder timestep"
    if info is not None:
        xlabel += "\n\n" + info
    plt.xlabel(xlabel)
    plt.ylabel("Encoder timestep")
    plt.tight_layout()

    fig.canvas.draw()
    # np.fromstring's binary mode (sep="") is deprecated; frombuffer is the
    # documented replacement. copy() keeps the result writable, as before.
    # NOTE(review): tostring_rgb is itself deprecated in recent matplotlib - confirm the pinned version.
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8).copy()
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data
|
||||
|
||||
|
||||
def load_wav_to_torch(full_path):
    """Read the audio file at ``full_path`` and return ``(float32 tensor, sampling_rate)``.

    Samples are cast to float32 without any rescaling.
    """
    rate, samples = read(full_path)
    as_float32 = samples.astype(np.float32)
    return torch.FloatTensor(as_float32), rate
|
||||
|
||||
|
||||
def load_filepaths_and_text(filename, split="|"):
    """Read a UTF-8 text file and return one list of fields per line.

    Each line is stripped of surrounding whitespace and split on ``split``.
    """
    rows = []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(split)
            rows.append(fields)
    return rows
|
||||
|
||||
|
||||
def get_hparams(init=True):
    """Build hyper-parameters from the command line and a JSON config file.

    Parses ``-c/--config`` (default ``./configs/base.json``) and the required
    ``-m/--model`` from ``sys.argv``. The model directory is ``./logs/<model>``
    and is created if missing.

    Args:
        init: When true, copy the config given on the command line into
            ``<model_dir>/config.json`` and use it. Otherwise read the copy
            already stored in the model directory.

    Returns:
        ``HParams`` built from the JSON config, with ``model_dir`` attached.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", type=str, default="./configs/base.json", help="JSON file for configuration")
    parser.add_argument("-m", "--model", type=str, required=True, help="Model name")

    args = parser.parse_args()
    model_dir = os.path.join("./logs", args.model)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(model_dir, exist_ok=True)

    config_path = args.config
    config_save_path = os.path.join(model_dir, "config.json")
    # JSON is read and written as UTF-8 explicitly so the result does not
    # depend on the platform's locale encoding.
    if init:
        with open(config_path, "r", encoding="utf-8") as f:
            data = f.read()
        with open(config_save_path, "w", encoding="utf-8") as f:
            f.write(data)
    else:
        with open(config_save_path, "r", encoding="utf-8") as f:
            data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams
|
||||
|
||||
|
||||
def get_hparams_from_dir(model_dir):
    """Load ``<model_dir>/config.json`` into ``HParams`` and attach ``model_dir``.

    Args:
        model_dir: Directory containing ``config.json``.

    Returns:
        ``HParams`` built from the JSON content, with ``model_dir`` set.
    """
    config_save_path = os.path.join(model_dir, "config.json")
    # Read as UTF-8 explicitly instead of relying on the locale encoding.
    with open(config_save_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams
|
||||
|
||||
|
||||
def get_hparams_from_file(config_path):
    """Load the JSON file at ``config_path`` into ``HParams``.

    Args:
        config_path: Path of a JSON configuration file.

    Returns:
        ``HParams`` built from the JSON content.
    """
    # Read as UTF-8 explicitly instead of relying on the locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)

    hparams = HParams(**config)
    return hparams
|
||||
|
||||
|
||||
def check_git_hash(model_dir):
    """Compare the current git commit hash with the one stored in ``model_dir``.

    If the directory containing this source file has no ``.git`` entry, a
    warning is logged and nothing else happens. Otherwise the output of
    ``git rev-parse HEAD`` is compared with ``<model_dir>/githash``: a mismatch
    is logged as a warning, and a missing file is created with the current hash.

    Args:
        model_dir: Directory holding (or receiving) the ``githash`` file.
    """
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
        # logger.warning replaces the deprecated logger.warn alias.
        logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(source_dir))
        return

    # NOTE(review): the command runs in the current working directory, not in
    # source_dir - confirm that this is intended.
    cur_hash = subprocess.getoutput("git rev-parse HEAD")

    path = os.path.join(model_dir, "githash")
    if os.path.exists(path):
        # Context managers close the file handles the original left open.
        with open(path) as f:
            saved_hash = f.read()
        if saved_hash != cur_hash:
            logger.warning("git hash values are different. {}(saved) != {}(current)".format(saved_hash[:8], cur_hash[:8]))
    else:
        with open(path, "w") as f:
            f.write(cur_hash)
|
||||
|
||||
|
||||
def get_logger(model_dir, filename="train.log"):
    """Create or fetch the logger for ``model_dir`` that writes to a file there.

    The logger is named after the last path component of ``model_dir``, set to
    DEBUG, and stored in the module-level ``logger``. ``model_dir`` is created
    if missing.

    Args:
        model_dir: Directory that receives the log file.
        filename: Name of the log file inside ``model_dir``.

    Returns:
        The configured ``logging.Logger``.
    """
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(model_dir, exist_ok=True)

    # logging.getLogger returns the same object for the same name, so adding a
    # handler on every call would write each record several times. Attach the
    # file handler only if this file is not already a target.
    log_path = os.path.abspath(os.path.join(model_dir, filename))
    already_attached = any(isinstance(existing, logging.FileHandler) and existing.baseFilename == log_path for existing in logger.handlers)
    if not already_attached:
        h = logging.FileHandler(log_path)
        h.setLevel(logging.DEBUG)
        h.setFormatter(formatter)
        logger.addHandler(h)
    return logger
|
||||
|
||||
|
||||
def repeat_expand_2d(content, target_len):
    """Stretch ``content`` along its last axis to ``target_len`` columns by repetition.

    ``content`` is indexed as ``[h, t]``. Source column ``k`` fills the output
    columns before the boundary ``(k + 1) * target_len / t``. The result is a
    float tensor of shape ``[h, target_len]`` on ``content``'s device.
    """
    # content : [h, t]
    src_len = content.shape[-1]
    stretched = torch.zeros([content.shape[0], target_len], dtype=torch.float).to(content.device)
    boundaries = torch.arange(src_len + 1) * target_len / src_len

    src_idx = 0
    for dst_idx in range(target_len):
        # Advance to the next source column once its boundary is reached.
        if not dst_idx < boundaries[src_idx + 1]:
            src_idx += 1
        stretched[:, dst_idx] = content[:, src_idx]

    return stretched
|
||||
|
||||
|
||||
class HParams:
    """Attribute-style container for (possibly nested) hyper-parameters.

    Keyword arguments become attributes; values that are dictionaries are
    converted recursively into ``HParams``. Items are also reachable with
    ``obj[key]``, and the dict-like helpers ``keys``, ``items``, ``values``,
    ``len()`` and ``in`` operate on the instance attributes.
    """

    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            # isinstance also converts dict subclasses (e.g. OrderedDict),
            # which the former ``type(v) == dict`` test silently skipped.
            if isinstance(v, dict):
                v = HParams(**v)
            self[k] = v

    def keys(self):
        """Return the attribute names."""
        return self.__dict__.keys()

    def items(self):
        """Return ``(name, value)`` pairs of the attributes."""
        return self.__dict__.items()

    def values(self):
        """Return the attribute values."""
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()
|
@ -0,0 +1,15 @@
|
||||
import os
|
||||
import shutil
|
||||
|
||||
|
||||
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the dict itself as the attribute namespace, so ``d.key`` and
        # ``d["key"]`` refer to the same storage.
        self.__dict__ = self
|
||||
|
||||
|
||||
def build_env(config, config_name, path):
    """Copy the file ``config`` to ``path/config_name``, creating ``path`` if needed.

    Nothing happens when ``config`` already equals the destination path.
    """
    destination = os.path.join(path, config_name)
    if config == destination:
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, destination)
|
@ -0,0 +1,475 @@
|
||||
import os
|
||||
import json
|
||||
from .env import AttrDict
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn as nn
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from .utils import init_weights, get_padding
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
||||
def load_model(model_path, device="cuda"):
    """Load a ``Generator`` checkpoint together with its ``config.json``.

    The configuration is read from ``config.json`` in the same directory as
    ``model_path``. The generator is built from it, moved to ``device``, loaded
    from the checkpoint's ``"generator"`` entry, switched to eval mode, and has
    its weight normalisation removed.

    Args:
        model_path: Path of the checkpoint file.
        device: Device for the model and for mapping checkpoint tensors.

    Returns:
        Tuple ``(generator, h)`` where ``h`` is the config as an ``AttrDict``.
    """
    config_file = os.path.join(os.path.split(model_path)[0], "config.json")
    with open(config_file) as f:
        data = f.read()

    # The config is also published as the module-level ``h``, as before.
    global h
    json_config = json.loads(data)
    h = AttrDict(json_config)

    generator = Generator(h).to(device)

    # map_location makes the load honour ``device``. Without it a checkpoint
    # saved from CUDA cannot be loaded on a CPU-only host.
    cp_dict = torch.load(model_path, map_location=device)
    generator.load_state_dict(cp_dict["generator"])
    generator.eval()
    generator.remove_weight_norm()
    del cp_dict
    return generator, h
|
||||
|
||||
|
||||
class ResBlock1(torch.nn.Module):
    """Residual block of three (dilated conv, plain conv) pairs with skip connections.

    Every convolution is a weight-normalised ``Conv1d`` keeping ``channels``
    channels. The first conv of each pair uses one of the first three entries
    of ``dilation``; the second uses dilation 1.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.h = h

        def _conv(rate):
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate, padding=get_padding(kernel_size, rate)))

        self.convs1 = nn.ModuleList([_conv(dilation[idx]) for idx in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for dilated, plain in zip(self.convs1, self.convs2):
            residual = x
            out = dilated(F.leaky_relu(x, LRELU_SLOPE))
            out = plain(F.leaky_relu(out, LRELU_SLOPE))
            x = out + residual
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution of the block."""
        for group in (self.convs1, self.convs2):
            for layer in group:
                remove_weight_norm(layer)
|
||||
|
||||
|
||||
class ResBlock2(torch.nn.Module):
    """Residual block of two dilated, weight-normalised ``Conv1d`` layers.

    Each layer keeps ``channels`` channels and uses one of the first two
    entries of ``dilation``; its output is added back to its input.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.h = h

        def _conv(rate):
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=rate, padding=get_padding(kernel_size, rate)))

        self.convs = nn.ModuleList([_conv(dilation[idx]) for idx in range(2)])
        self.convs.apply(init_weights)

    def forward(self, x):
        for conv in self.convs:
            residual = x
            out = conv(F.leaky_relu(x, LRELU_SLOPE))
            x = out + residual
        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from every convolution of the block."""
        for layer in self.convs:
            remove_weight_norm(layer)
|
||||
|
||||
|
||||
def padDiff(x):
    """Return the forward difference of ``x`` along its second-to-last axis.

    The result has one element fewer along that axis: entry ``i`` is
    ``x[..., i + 1, :] - x[..., i, :]``.
    """
    # Drop the first row and append a zero row, i.e. shift by one step.
    shifted = F.pad(x, (0, 0, -1, 1), "constant", 0)
    delta = shifted - x
    # The last row compares against the appended zeros, so trim it.
    return F.pad(delta, (0, 0, 0, -1), "constant", 0)
|
||||
|
||||
|
||||
class SineGen(torch.nn.Module):
    """Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
        segment is always sin(np.pi) or cos(0)
    """

    def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0, flag_for_pulse=False):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold
        self.flag_for_pulse = flag_for_pulse

    def _f02uv(self, f0):
        """Return a float32 mask that is 1 where ``f0 > voiced_threshold``, else 0."""
        # generate uv signal
        uv = (f0 > self.voiced_threshold).type(torch.float32)
        return uv

    def _f02sine(self, f0_values):
        """f0_values: (batchsize, length, dim)
        where dim indicates fundamental tone and overtones
        """
        # convert to F0 in rad. The integer part n can be ignored
        # because 2 * np.pi * n doesn't affect phase
        rad_values = (f0_values / self.sampling_rate) % 1

        # initial phase noise (no noise for fundamental component)
        rand_ini = torch.rand(f0_values.shape[0], f0_values.shape[2], device=f0_values.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini

        # instantaneous phase sine[t] = sin(2*pi \sum_i=1 ^{t} rad)
        if not self.flag_for_pulse:
            # for normal case

            # To prevent torch.cumsum numerical overflow,
            # it is necessary to add -1 whenever \sum_k=1^n rad_value_k > 1.
            # Buffer tmp_over_one_idx indicates the time step to add -1.
            # This will not change F0 of sine because (x-1) * 2*pi = x * 2*pi
            tmp_over_one = torch.cumsum(rad_values, 1) % 1
            tmp_over_one_idx = (padDiff(tmp_over_one)) < 0
            cumsum_shift = torch.zeros_like(rad_values)
            cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0

            sines = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
        else:
            # If necessary, make sure that the first time step of every
            # voiced segments is sin(pi) or cos(0)
            # This is used for pulse-train generation

            # identify the last time step in unvoiced segments
            uv = self._f02uv(f0_values)
            uv_1 = torch.roll(uv, shifts=-1, dims=1)
            uv_1[:, -1, :] = 1
            u_loc = (uv < 1) * (uv_1 > 0)

            # get the instantaneous phase
            tmp_cumsum = torch.cumsum(rad_values, dim=1)
            # different batch needs to be processed differently
            for idx in range(f0_values.shape[0]):
                temp_sum = tmp_cumsum[idx, u_loc[idx, :, 0], :]
                temp_sum[1:, :] = temp_sum[1:, :] - temp_sum[0:-1, :]
                # stores the accumulation of i.phase within
                # each voiced segments
                tmp_cumsum[idx, :, :] = 0
                tmp_cumsum[idx, u_loc[idx, :, 0], :] = temp_sum

            # rad_values - tmp_cumsum: remove the accumulation of i.phase
            # within the previous voiced segment.
            i_phase = torch.cumsum(rad_values - tmp_cumsum, dim=1)

            # get the sines
            sines = torch.cos(i_phase * 2 * np.pi)
        return sines

    def forward(self, f0):
        """sine_tensor, uv, noise = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
                  f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        output noise: the additive noise, same shape as sine_tensor
        """
        with torch.no_grad():
            # The unused ``f0_buf`` zero tensor formerly allocated here on
            # every call has been removed.

            # fundamental component and its overtones: f0 * [1, 2, ..., dim]
            fn = torch.multiply(f0, torch.FloatTensor([[range(1, self.harmonic_num + 2)]]).to(f0.device))

            # generate sine waveforms
            sine_waves = self._f02sine(fn) * self.sine_amp

            # generate uv signal
            uv = self._f02uv(f0)

            # noise: for unvoiced should be similar to sine_amp
            #        std = self.sine_amp/3 -> max value ~ self.sine_amp
            # .       for voiced regions is self.noise_std
            noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
            noise = noise_amp * torch.randn_like(sine_waves)

            # first: set the unvoiced part to 0 by uv
            # then: additive noise
            sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
|
||||
|
||||
|
||||
class SourceModuleHnNSF(torch.nn.Module):
    """Source module for hn-NSF: a merged harmonic excitation plus a noise source.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0 (default: 0)
    sine_amp: amplitude of the sine source signal (default: 0.1)
    add_noise_std: std of the additive Gaussian noise (default: 0.003);
        the noise amplitude in unvoiced regions is decided by sine_amp
    voiced_threshod: threshold used to set U/V given F0 (default: 0)

    Sine_source, noise_source, uv = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length, 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0):
        super().__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # Generator of the fundamental and its overtones.
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod)

        # Projection that merges all harmonics into a single excitation channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x):
        """Return ``(sine_merge, noise, uv)`` for the sampled F0 ``x``."""
        # Harmonic branch: generate the sines, then merge them through linear + tanh.
        harmonics, uv, _ = self.l_sin_gen(x)
        projected = self.l_linear(harmonics)
        sine_merge = self.l_tanh(projected)

        # Noise branch: Gaussian noise shaped like uv, scaled by sine_amp / 3.
        noise = torch.randn_like(uv) * self.sine_amp / 3
        return sine_merge, noise, uv
|
||||
|
||||
|
||||
class Generator(torch.nn.Module):
    """Upsampling generator driven by input features, an F0 track and a conditioning tensor.

    Configuration keys read from ``h``: ``resblock``, ``resblock_kernel_sizes``,
    ``resblock_dilation_sizes``, ``upsample_rates``, ``upsample_kernel_sizes``,
    ``upsample_initial_channel``, ``inter_channels``, ``gin_channels`` and
    ``sampling_rate``.
    """

    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h

        self.num_kernels = len(h["resblock_kernel_sizes"])
        self.num_upsamples = len(h["upsample_rates"])
        # int() turns the numpy scalar from np.prod into a plain Python int.
        self.f0_upsamp = torch.nn.Upsample(scale_factor=int(np.prod(h["upsample_rates"])))
        self.m_source = SourceModuleHnNSF(sampling_rate=h["sampling_rate"], harmonic_num=8)
        self.noise_convs = nn.ModuleList()
        self.conv_pre = weight_norm(Conv1d(h["inter_channels"], h["upsample_initial_channel"], 7, 1, padding=3))
        resblock = ResBlock1 if h["resblock"] == "1" else ResBlock2
        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h["upsample_rates"], h["upsample_kernel_sizes"])):
            # Each upsampling stage halves the channel count.
            c_cur = h["upsample_initial_channel"] // (2 ** (i + 1))
            self.ups.append(weight_norm(ConvTranspose1d(h["upsample_initial_channel"] // (2**i), c_cur, k, u, padding=(k - u) // 2)))
            if i + 1 < len(h["upsample_rates"]):
                # Stride the source down by the remaining upsampling factor,
                # so it matches this stage's time resolution.
                stride_f0 = int(np.prod(h["upsample_rates"][i + 1 :]))
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h["upsample_initial_channel"] // (2 ** (i + 1))
            for k, d in zip(h["resblock_kernel_sizes"], h["resblock_dilation_sizes"]):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.cond = nn.Conv1d(h["gin_channels"], h["upsample_initial_channel"], 1)

    def forward(self, x, f0, g=None):
        """Generate the output signal.

        Args:
            x: Input features fed to ``conv_pre``.
            f0: F0 track; upsampled by the product of ``upsample_rates``.
            g: Optional conditioning tensor fed to ``cond``. When ``None`` the
                conditioning term is skipped.

        Returns:
            Tensor produced by ``conv_post`` followed by ``tanh``.
        """
        f0 = self.f0_upsamp(f0[:, None]).transpose(1, 2)  # bs,n,t
        # Only the harmonic source is used; the noise source and uv are discarded.
        har_source, _, _ = self.m_source(f0)
        har_source = har_source.transpose(1, 2)
        x = self.conv_pre(x)
        # ``g`` defaults to None, so guard the conditioning conv against it.
        if g is not None:
            x = x + self.cond(g)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            # Average the outputs of this stage's parallel residual blocks.
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        # NOTE: the final activation uses leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        """Strip weight normalisation from all upsampling, residual, pre and post layers."""
        print("Removing weight norm...")
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
|
||||
|
||||
|
||||
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the 1-D input into 2-D by ``period`` and applies 2-D convs.

    Args:
        period: Width of the folded 2-D representation.
        kernel_size: Kernel height of the stacked convolutions.
        stride: Stride (along the height) of the first four convolutions.
        use_spectral_norm: Selects spectral normalisation instead of weight normalisation.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm  # NOQA
        # Padding now follows ``kernel_size``. It used to be hard-coded for a
        # kernel of 5 (get_padding(5, 1) and 2); the default gives the same value.
        pad = get_padding(kernel_size, 1)
        self.convs = nn.ModuleList(
            [
                norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(pad, 0))),
                norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(pad, 0))),
                norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(pad, 0))),
                norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(pad, 0))),
                norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(pad, 0))),
            ]
        )
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_score, fmap)``; ``fmap`` lists every layer's activation."""
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            # Reflect-pad so the length becomes a multiple of the period.
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
|
||||
|
||||
|
||||
class MultiPeriodDiscriminator(torch.nn.Module):
    """Bank of ``DiscriminatorP`` modules, one per period (default 2, 3, 5, 7, 11)."""

    def __init__(self, periods=None):
        super().__init__()
        if periods is None:
            periods = [2, 3, 5, 7, 11]
        self.periods = periods
        self.discriminators = nn.ModuleList([DiscriminatorP(period) for period in self.periods])

    def forward(self, y, y_hat):
        """Run every discriminator on ``y`` and ``y_hat``.

        Returns four lists, one entry per discriminator: scores for ``y``,
        scores for ``y_hat``, feature maps for ``y``, feature maps for ``y_hat``.
        """
        real_scores, fake_scores = [], []
        real_fmaps, fake_fmaps = [], []
        for disc in self.discriminators:
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            real_scores.append(score_r)
            real_fmaps.append(maps_r)
            fake_scores.append(score_g)
            fake_fmaps.append(maps_g)

        return real_scores, fake_scores, real_fmaps, fake_fmaps
|
||||
|
||||
|
||||
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of normalised 1-D convolutions over the input signal."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        if use_spectral_norm == False:  # NOQA
            norm_f = weight_norm
        else:
            norm_f = spectral_norm

        # (in_channels, out_channels, kernel_size, stride, groups, padding)
        layer_specs = [
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList([norm_f(Conv1d(c_in, c_out, kernel, stride, groups=groups, padding=pad)) for c_in, c_out, kernel, stride, groups, pad in layer_specs])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_score, fmap)``; ``fmap`` lists every layer's activation."""
        fmap = []
        for conv in self.convs:
            x = F.leaky_relu(conv(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        score = torch.flatten(x, 1, -1)

        return score, fmap
|
||||
|
||||
|
||||
class MultiScaleDiscriminator(torch.nn.Module):
    """Three ``DiscriminatorS`` modules; inputs are average-pooled before the 2nd and 3rd.

    The first discriminator uses spectral normalisation; the other two use the
    default normalisation.
    """

    def __init__(self):
        super().__init__()
        first = DiscriminatorS(use_spectral_norm=True)
        second = DiscriminatorS()
        third = DiscriminatorS()
        self.discriminators = nn.ModuleList([first, second, third])
        self.meanpools = nn.ModuleList([AvgPool1d(4, 2, padding=2) for _ in range(2)])

    def forward(self, y, y_hat):
        """Run every discriminator on ``y`` and ``y_hat`` at progressively pooled scales.

        Returns four lists, one entry per discriminator: scores for ``y``,
        scores for ``y_hat``, feature maps for ``y``, feature maps for ``y_hat``.
        """
        real_scores, fake_scores = [], []
        real_fmaps, fake_fmaps = [], []
        for idx, disc in enumerate(self.discriminators):
            if idx > 0:
                # Each later discriminator sees a further pooled signal.
                pool = self.meanpools[idx - 1]
                y = pool(y)
                y_hat = pool(y_hat)
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            real_scores.append(score_r)
            real_fmaps.append(maps_r)
            fake_scores.append(score_g)
            fake_fmaps.append(maps_g)

        return real_scores, fake_scores, real_fmaps, fake_fmaps
|
||||
|
||||
|
||||
def feature_loss(fmap_r, fmap_g):
    """Return twice the summed mean absolute difference between paired feature maps.

    ``fmap_r`` and ``fmap_g`` are parallel lists (one entry per discriminator)
    of lists of tensors (one per layer).
    """
    total = 0
    for real_maps, fake_maps in zip(fmap_r, fmap_g):
        for real, fake in zip(real_maps, fake_maps):
            total = total + (real - fake).abs().mean()

    return total * 2
|
||||
|
||||
|
||||
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss.

    For each pair the real term is ``mean((1 - real) ** 2)`` and the generated
    term is ``mean(generated ** 2)``.

    Returns:
        ``(total_loss, real_terms, generated_terms)``; the two lists hold
        Python floats, one per discriminator.
    """
    total = 0
    real_terms = []
    fake_terms = []
    for real_out, fake_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out) ** 2)
        fake_term = torch.mean(fake_out**2)
        total = total + (real_term + fake_term)
        real_terms.append(real_term.item())
        fake_terms.append(fake_term.item())

    return total, real_terms, fake_terms
|
||||
|
||||
|
||||
def generator_loss(disc_outputs):
    """Least-squares generator loss.

    Returns:
        ``(total_loss, per_output_losses)`` where each term is
        ``mean((1 - output) ** 2)`` and the list holds tensors.
    """
    total = 0
    per_output = []
    for fake_out in disc_outputs:
        term = torch.mean((1 - fake_out) ** 2)
        per_output.append(term)
        total = total + term

    return total, per_output
|
@ -0,0 +1,114 @@
|
||||
import os
|
||||
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
import torch
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
import librosa
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
import soundfile as sf
|
||||
|
||||
|
||||
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
    """Load the first channel of an audio file as a normalised float tensor.

    Args:
        full_path: Path handed to ``soundfile.read``.
        target_sr: When given and different from the file's rate, the audio is
            resampled to this rate.
        return_empty_on_exception: When true, a failed read or inf/NaN samples
            produce ``([], rate)`` instead of raising or continuing; ``rate``
            falls back to ``target_sr`` and then to 32000.

    Returns:
        ``(samples, sampling_rate)``.
    """
    sampling_rate = None
    try:
        samples, sampling_rate = sf.read(full_path, always_2d=True)
    except Exception as ex:
        print(f"'{full_path}' failed to load.\nException:")
        print(ex)
        if not return_empty_on_exception:
            raise Exception(ex)
        return [], sampling_rate or target_sr or 32000

    if len(samples.shape) > 1:
        # Keep only the first channel.
        samples = samples[:, 0]
        assert len(samples) > 2  # check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)

    if np.issubdtype(samples.dtype, np.integer):
        # Integer audio: scale by the magnitude of the type's minimum value.
        scale = -np.iinfo(samples.dtype).min
    else:
        # Float audio: guess the original range from the observed peak.
        # Data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32.
        peak = max(np.amax(samples), -np.amin(samples))
        if peak > (2**15):
            scale = (2**31) + 1
        elif peak > 1.01:
            scale = (2**15) + 1
        else:
            scale = 1.0

    samples = torch.FloatTensor(samples.astype(np.float32)) / scale

    # Resampling crashes on inf/NaN input, so optionally bail out with an empty result.
    has_bad_values = (torch.isinf(samples) | torch.isnan(samples)).any()
    if has_bad_values and return_empty_on_exception:
        return [], sampling_rate or target_sr or 32000
    if target_sr is not None and sampling_rate != target_sr:
        resampled = librosa.core.resample(samples.numpy(), orig_sr=sampling_rate, target_sr=target_sr)
        samples = torch.from_numpy(resampled)
        sampling_rate = target_sr

    return samples, sampling_rate
|
||||
|
||||
|
||||
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Return ``log(max(x, clip_val) * C)`` element-wise (NumPy)."""
    floored = np.clip(x, a_min=clip_val, a_max=None)
    return np.log(floored * C)
|
||||
|
||||
|
||||
def dynamic_range_decompression(x, C=1):
    """Return ``exp(x) / C`` element-wise (NumPy)."""
    expanded = np.exp(x)
    return expanded / C
|
||||
|
||||
|
||||
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Return ``log(max(x, clip_val) * C)`` element-wise (PyTorch)."""
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)
|
||||
|
||||
|
||||
def dynamic_range_decompression_torch(x, C=1):
    """Return ``exp(x) / C`` element-wise (PyTorch)."""
    expanded = torch.exp(x)
    return expanded / C
|
||||
|
||||
|
||||
class STFT:
    """Log-mel spectrogram extractor with cached mel filterbank and Hann window.

    Args:
        sr: Sampling rate the audio is loaded at and the filterbank is built for.
        n_mels: Number of mel bands.
        n_fft: FFT size.
        win_size: Window length.
        hop_length: Hop between frames.
        fmin: Lowest mel filterbank frequency.
        fmax: Highest mel filterbank frequency.
        clip_val: Floor applied before taking the logarithm.
    """

    def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
        self.target_sr = sr

        self.n_mels = n_mels
        self.n_fft = n_fft
        self.win_size = win_size
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.clip_val = clip_val
        # Caches: mel basis keyed by "<fmax>_<device>", window keyed by "<device>".
        self.mel_basis = {}
        self.hann_window = {}

    def get_mel(self, y, center=False):
        """Return the log-mel spectrogram of ``y``.

        Args:
            y: Audio tensor; ``unsqueeze(1)`` / ``squeeze(1)`` are applied around
                the reflect padding (assumes shape (batch, samples) - TODO confirm).
            center: Forwarded to ``torch.stft``.

        Returns:
            Tensor of log-compressed mel magnitudes.
        """
        sampling_rate = self.target_sr
        n_mels = self.n_mels
        n_fft = self.n_fft
        win_size = self.win_size
        hop_length = self.hop_length
        fmin = self.fmin
        fmax = self.fmax
        clip_val = self.clip_val

        if torch.min(y) < -1.0:
            print("min value is ", torch.min(y))
        if torch.max(y) > 1.0:
            print("max value is ", torch.max(y))

        # Drop cached tensors if the analysis parameters were changed on the
        # instance since they were built.
        signature = (sampling_rate, n_mels, n_fft, win_size, fmin, fmax)
        if getattr(self, "_cache_signature", None) != signature:
            self.mel_basis.clear()
            self.hann_window.clear()
            self._cache_signature = signature

        # Look up the same keys that are stored. The former test
        # ``fmax not in self.mel_basis`` never matched, so the filterbank and
        # window were rebuilt on every call.
        mel_key = str(fmax) + "_" + str(y.device)
        window_key = str(y.device)
        if mel_key not in self.mel_basis:
            mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
            self.mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
        if window_key not in self.hann_window:
            self.hann_window[window_key] = torch.hann_window(self.win_size).to(y.device)

        y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_length) / 2), int((n_fft - hop_length) / 2)), mode="reflect")
        y = y.squeeze(1)

        # return_complex=True is required for real input by recent PyTorch.
        # view_as_real restores the (..., 2) real/imaginary layout used below.
        spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size, window=self.hann_window[window_key], center=center, pad_mode="reflect", normalized=False, onesided=True, return_complex=True)
        spec = torch.view_as_real(spec)
        # Magnitude, with a small epsilon under the square root.
        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
        spec = torch.matmul(self.mel_basis[mel_key], spec)
        spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
        return spec

    def __call__(self, audiopath):
        """Load ``audiopath`` at ``target_sr`` and return its log-mel spectrogram."""
        audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
        spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
        return spect
|
||||
|
||||
|
||||
stft = STFT()
|
@ -0,0 +1,66 @@
|
||||
import glob
|
||||
import os
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
|
||||
# matplotlib.use("Agg")
|
||||
import matplotlib.pylab as plt
|
||||
|
||||
|
||||
def plot_spectrogram(spectrogram):
    """Draw ``spectrogram`` with a colour bar and return the (already closed) figure."""
    figure, axes = plt.subplots(figsize=(10, 2))
    image = axes.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(image, ax=axes)

    # Render once, then close the figure.
    figure.canvas.draw()
    plt.close()

    return figure
|
||||
|
||||
|
||||
def init_weights(m, mean=0.0, std=0.01):
    """Re-initialise ``m.weight`` from N(mean, std) when the class name of ``m`` contains "Conv"."""
    if "Conv" in type(m).__name__:
        m.weight.data.normal_(mean, std)
|
||||
|
||||
|
||||
def apply_weight_norm(m):
    """Apply ``weight_norm`` to ``m`` when the class name of ``m`` contains "Conv"."""
    if "Conv" in type(m).__name__:
        weight_norm(m)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
    """Return ``int((kernel_size * dilation - dilation) / 2)``, the padding used for the convolutions."""
    span = kernel_size * dilation - dilation
    return int(span / 2)
|
||||
|
||||
|
||||
def load_checkpoint(filepath, device):
    """Load the checkpoint at ``filepath`` with ``torch.load``, mapping tensors to ``device``.

    Asserts that the file exists and prints progress messages.
    """
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    loaded = torch.load(filepath, map_location=device)
    print("Complete.")
    return loaded
|
||||
|
||||
|
||||
def save_checkpoint(filepath, obj):
    """Serialise ``obj`` to ``filepath`` with ``torch.save``, printing progress messages."""
    start_message = "Saving checkpoint to {}".format(filepath)
    print(start_message)
    torch.save(obj, filepath)
    print("Complete.")
|
||||
|
||||
|
||||
def del_old_checkpoints(cp_dir, prefix, n_models=2):
    """Delete all but the last ``n_models`` checkpoints named ``prefix`` + 8 characters.

    Candidates are ordered by sorting their paths. Each deleted file is
    truncated before being unlinked.
    """
    checkpoints = sorted(glob.glob(os.path.join(cp_dir, prefix + "????????")))  # sort by iter
    if len(checkpoints) <= n_models:
        return
    for stale in checkpoints[:-n_models]:
        # Empty the file first (it is moved to trash when using Colab).
        with open(stale, "w"):
            pass
        os.unlink(stale)
|
||||
|
||||
|
||||
def scan_checkpoint(cp_dir, prefix):
    """Return the lexicographically last path named ``prefix`` + 8 characters in ``cp_dir``.

    Returns ``None`` when no file matches.
    """
    matches = glob.glob(os.path.join(cp_dir, prefix + "????????"))
    if not matches:
        return None
    return max(matches)
|
@ -0,0 +1,15 @@
|
||||
import os
|
||||
import shutil
|
||||
|
||||
|
||||
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the dict itself as the attribute namespace, so ``d.key`` and
        # ``d["key"]`` refer to the same storage.
        self.__dict__ = self
|
||||
|
||||
|
||||
def build_env(config, config_name, path):
    """Copy the file ``config`` to ``path/config_name``, creating ``path`` if needed.

    Nothing happens when ``config`` already equals the destination path.
    """
    destination = os.path.join(path, config_name)
    if config == destination:
        return
    os.makedirs(path, exist_ok=True)
    shutil.copyfile(config, destination)
|
@ -0,0 +1,406 @@
|
||||
import os
|
||||
import json
|
||||
from .env import AttrDict
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn as nn
|
||||
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
||||
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
||||
from .utils import init_weights, get_padding
|
||||
|
||||
LRELU_SLOPE = 0.1
|
||||
|
||||
|
||||
def load_model(model_path, device="cuda"):
    """Build a ``Generator`` from a checkpoint and its sibling ``config.json``.

    The config is read from the directory that contains ``model_path``. The
    generator weights come from the ``"generator"`` entry of the checkpoint.
    The returned module is in eval mode with weight normalization removed.

    Args:
        model_path: Path to the checkpoint file.
        device: Device the model and checkpoint tensors are mapped to.

    Returns:
        ``(generator, h)`` where ``h`` is the config as an ``AttrDict``.
    """
    model_dir = os.path.dirname(model_path)
    with open(os.path.join(model_dir, "config.json")) as f:
        h = AttrDict(json.load(f))

    generator = Generator(h).to(device)

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    state = torch.load(model_path, map_location=device)
    generator.load_state_dict(state["generator"])
    generator.eval()
    generator.remove_weight_norm()
    del state
    return generator, h
|
||||
|
||||
|
||||
class ResBlock1(torch.nn.Module):
    """Residual block made of three (dilated conv, undilated conv) pairs.

    Each pair is applied as ``x = conv2(lrelu(conv1(lrelu(x)))) + x``.
    Only the first three entries of ``dilation`` are used.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()
        self.h = h

        def make_conv(d):
            # Padding from get_padding is chosen for the given kernel and dilation.
            return weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=get_padding(kernel_size, d)))

        self.convs1 = nn.ModuleList([make_conv(dilation[i]) for i in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for dilated, plain in zip(self.convs1, self.convs2):
            residual = x
            x = dilated(F.leaky_relu(x, LRELU_SLOPE))
            x = plain(F.leaky_relu(x, LRELU_SLOPE))
            x = x + residual
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution in the block."""
        for layer in list(self.convs1) + list(self.convs2):
            remove_weight_norm(layer)
|
||||
|
||||
|
||||
class ResBlock2(torch.nn.Module):
    """Residual block made of two dilated convolutions.

    Each convolution is applied as ``x = conv(lrelu(x)) + x``. Only the first
    two entries of ``dilation`` are used.
    """

    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super().__init__()
        self.h = h
        layers = []
        for i in range(2):
            d = dilation[i]
            layers.append(weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=d, padding=get_padding(kernel_size, d))))
        self.convs = nn.ModuleList(layers)
        self.convs.apply(init_weights)

    def forward(self, x):
        for conv in self.convs:
            x = conv(F.leaky_relu(x, LRELU_SLOPE)) + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution in the block."""
        for layer in self.convs:
            remove_weight_norm(layer)
|
||||
|
||||
|
||||
class SineGen(torch.nn.Module):
    """Sine-wave excitation generator driven by an F0 contour.

    SineGen(samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003,
            voiced_threshold=0)

    samp_rate: sampling rate in Hz
    harmonic_num: number of overtones generated on top of the fundamental
    sine_amp: amplitude of the sine waveforms
    noise_std: scale of the Gaussian noise added where the signal is voiced
    voiced_threshold: F0 values above this are classified as voiced
    """

    def __init__(self, samp_rate, harmonic_num=0, sine_amp=0.1, noise_std=0.003, voiced_threshold=0):
        super().__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        # One channel for the fundamental plus one per overtone.
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        """Return a mask shaped like ``f0``: 1 where f0 > threshold, else 0."""
        voiced = f0 > self.voiced_threshold
        return torch.ones_like(f0) * voiced

    @torch.no_grad()
    def forward(self, f0, upp):
        """sine_waves, uv, noise = forward(f0, upp)

        f0: F0 contour; a trailing axis is appended here and the time axis is
            dim 1, so a (batch, frames) tensor is expected. Unvoiced frames
            should be 0.
        upp: upsampling factor from frames to samples.

        sine_waves: (batch, frames * upp, dim) harmonics, masked by uv, plus noise
        uv: (batch, frames * upp, 1) voiced/unvoiced mask
        noise: the noise term that was added, same shape as sine_waves
        """
        f0 = f0.unsqueeze(-1)
        harmonic_idx = torch.arange(1, self.dim + 1, device=f0.device).reshape((1, 1, -1))
        fn = f0 * harmonic_idx
        # Per-frame phase increment in cycles. Taking "% 1" here means the
        # per-harmonic products cannot be optimized in post-processing.
        rad_values = (fn / self.sampling_rate) % 1
        # Random initial phase for the overtones; the fundamental starts at 0.
        rand_ini = torch.rand(fn.shape[0], fn.shape[2], device=fn.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
        # Anything that is not float32 is treated as half precision.
        out_dtype = torch.float32 if rad_values.dtype is torch.float32 else torch.float16
        # Accumulate in float64. No "% 1" here: it would prevent the later
        # cumsum from being optimized.
        phase = torch.cumsum(rad_values.double(), 1).to(out_dtype)
        phase *= upp
        phase = F.interpolate(phase.transpose(2, 1), scale_factor=upp, mode="linear", align_corners=True).transpose(2, 1)
        rad_values = F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode="nearest").transpose(2, 1)
        phase %= 1
        # Positions where the wrapped phase decreased get a -1 correction.
        wrapped = (phase[:, 1:, :] - phase[:, :-1, :]) < 0
        cumsum_shift = torch.zeros_like(rad_values)
        cumsum_shift[:, 1:, :] = wrapped * -1.0
        total_phase = torch.cumsum(rad_values.double() + cumsum_shift.double(), dim=1)
        sine_waves = torch.sin(total_phase * 2 * np.pi).to(out_dtype)
        sine_waves = sine_waves * self.sine_amp

        uv = self._f02uv(f0)
        uv = F.interpolate(uv.transpose(2, 1), scale_factor=upp, mode="nearest").transpose(2, 1)
        # Voiced samples use noise_std; unvoiced samples use sine_amp / 3.
        noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
        noise = noise_amp * torch.randn_like(sine_waves)
        sine_waves = sine_waves * uv + noise
        return sine_waves, uv, noise
|
||||
|
||||
|
||||
class SourceModuleHnNSF(torch.nn.Module):
    """Source module for hn-NSF: merges sine harmonics into one excitation.

    SourceModuleHnNSF(sampling_rate, harmonic_num=0, sine_amp=0.1,
                      add_noise_std=0.003, voiced_threshod=0)

    sampling_rate: sampling rate in Hz
    harmonic_num: number of harmonics above F0
    sine_amp: amplitude of the sine source signal
    add_noise_std: scale of the additive Gaussian noise
    voiced_threshod: F0 threshold for the voiced/unvoiced decision
        (the parameter name is kept as spelled for compatibility)

    forward(x, upp) returns only the merged sine source; the mask and noise
    produced by the sine generator are discarded.
    """

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1, add_noise_std=0.003, voiced_threshod=0):
        super().__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # Produces one sine channel per harmonic.
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod)

        # Collapses the harmonic channels into a single excitation channel.
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp):
        harmonics, _uv, _noise = self.l_sin_gen(x, upp)
        merged = self.l_linear(harmonics)
        return self.l_tanh(merged)
|
||||
|
||||
|
||||
class Generator(torch.nn.Module):
    """Upsampling generator conditioned on a harmonic source signal.

    ``h`` supplies the hyper-parameters read here: ``resblock``,
    ``resblock_kernel_sizes``, ``resblock_dilation_sizes``,
    ``upsample_rates``, ``upsample_kernel_sizes``,
    ``upsample_initial_channel``, ``num_mels`` and ``sampling_rate``.
    """

    def __init__(self, h):
        super().__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.m_source = SourceModuleHnNSF(sampling_rate=h.sampling_rate, harmonic_num=8)
        self.noise_convs = nn.ModuleList()
        self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == "1" else ResBlock2

        self.ups = nn.ModuleList()
        n_stages = len(h.upsample_rates)
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            # Channel count halves at every upsampling stage.
            c_prev = h.upsample_initial_channel // (2**i)
            c_cur = h.upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(weight_norm(ConvTranspose1d(c_prev, c_cur, k, u, padding=(k - u) // 2)))
            if i + 1 < n_stages:
                # Stride by the product of the remaining upsample rates.
                stride_f0 = int(np.prod(h.upsample_rates[i + 1 :]))
                source_conv = Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)
            else:
                source_conv = Conv1d(1, c_cur, kernel_size=1)
            self.noise_convs.append(source_conv)

        self.resblocks = nn.ModuleList()
        ch = h.upsample_initial_channel
        for _ in range(len(self.ups)):
            ch //= 2
            for k, d in zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        # Total upsampling factor across all stages.
        self.upp = int(np.prod(h.upsample_rates))

    def forward(self, x, f0):
        """Return the tanh-bounded output for input features ``x`` and ``f0``."""
        har_source = self.m_source(f0, self.upp).transpose(1, 2)
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            x = x + self.noise_convs[i](har_source)

            # Average the outputs of this stage's parallel residual blocks.
            first = i * self.num_kernels
            xs = None
            for idx in range(first, first + self.num_kernels):
                branch = self.resblocks[idx](x)
                xs = branch if xs is None else xs + branch
            x = xs / self.num_kernels
        # This activation uses F.leaky_relu's default slope, not LRELU_SLOPE.
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalization from all normalized layers."""
        print("Removing weight norm...")
        for up in self.ups:
            remove_weight_norm(up)
        for block in self.resblocks:
            block.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
|
||||
|
||||
|
||||
class DiscriminatorP(torch.nn.Module):
    """Period discriminator: folds the signal by ``period`` and applies 2-D convs.

    Args:
        period: Width the time axis is folded into.
        kernel_size: Kernel height of the stacked convolutions. The padding
            is now derived from it; it used to be hard-coded for size 5, so
            other kernel sizes got the wrong padding.
        stride: Stride along the folded time axis for the first four layers.
        use_spectral_norm: Use spectral norm instead of weight norm.
    """

    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super().__init__()
        self.period = period
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        # Same value as before for the default kernel_size=5.
        pad = (get_padding(kernel_size, 1), 0)
        channels = [1, 32, 128, 512, 1024]
        convs = [
            norm_f(Conv2d(c_in, c_out, (kernel_size, 1), (stride, 1), padding=pad))
            for c_in, c_out in zip(channels[:-1], channels[1:])
        ]
        # Last layer keeps the channel count and uses stride 1.
        convs.append(norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=pad)))
        self.convs = nn.ModuleList(convs)
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for ``x`` of shape (b, c, t)."""
        fmap = []

        # 1d to 2d: right-pad t to a multiple of the period, then fold.
        b, c, t = x.shape
        if t % self.period != 0:
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for layer in self.convs:
            x = layer(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap
|
||||
|
||||
|
||||
class MultiPeriodDiscriminator(torch.nn.Module):
    """Set of ``DiscriminatorP`` modules, one per period.

    ``periods`` defaults to ``[2, 3, 5, 7, 11]`` when not given.
    """

    def __init__(self, periods=None):
        super().__init__()
        self.periods = [2, 3, 5, 7, 11] if periods is None else periods
        self.discriminators = nn.ModuleList()
        for p in self.periods:
            self.discriminators.append(DiscriminatorP(p))

    def forward(self, y, y_hat):
        """Run every discriminator on ``y`` and ``y_hat``.

        Returns four lists, each with one entry per discriminator: scores
        for ``y``, scores for ``y_hat``, feature maps for ``y``, feature maps
        for ``y_hat``.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for disc in self.discriminators:
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            y_d_rs.append(score_r)
            fmap_rs.append(maps_r)
            y_d_gs.append(score_g)
            fmap_gs.append(maps_g)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
class DiscriminatorS(torch.nn.Module):
    """Scale discriminator: a stack of 1-D convolutions over the waveform."""

    def __init__(self, use_spectral_norm=False):
        super().__init__()
        if use_spectral_norm == False:  # NOQA - equality test kept as in the original
            norm_f = weight_norm
        else:
            norm_f = spectral_norm
        # (in_channels, out_channels, kernel, stride, groups, padding)
        layer_specs = [
            (1, 128, 15, 1, 1, 7),
            (128, 128, 41, 2, 4, 20),
            (128, 256, 41, 2, 16, 20),
            (256, 512, 41, 4, 16, 20),
            (512, 1024, 41, 4, 16, 20),
            (1024, 1024, 41, 1, 16, 20),
            (1024, 1024, 5, 1, 1, 2),
        ]
        self.convs = nn.ModuleList(
            [norm_f(Conv1d(c_in, c_out, k, s, groups=g, padding=p)) for c_in, c_out, k, s, g, p in layer_specs]
        )
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        """Return ``(flattened_score, feature_maps)`` for input ``x``."""
        fmap = []
        for layer in self.convs:
            x = F.leaky_relu(layer(x), LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        return torch.flatten(x, 1, -1), fmap
|
||||
|
||||
|
||||
class MultiScaleDiscriminator(torch.nn.Module):
    """Three ``DiscriminatorS`` modules applied at successively pooled scales.

    The first discriminator uses spectral norm and sees the raw input. Each
    later one sees the previous input after one more average pooling.
    """

    def __init__(self):
        super().__init__()
        scales = [DiscriminatorS(use_spectral_norm=True), DiscriminatorS(), DiscriminatorS()]
        self.discriminators = nn.ModuleList(scales)
        self.meanpools = nn.ModuleList([AvgPool1d(4, 2, padding=2), AvgPool1d(4, 2, padding=2)])

    def forward(self, y, y_hat):
        """Run every scale on ``y`` and ``y_hat``.

        Returns four lists, each with one entry per discriminator: scores
        for ``y``, scores for ``y_hat``, feature maps for ``y``, feature maps
        for ``y_hat``.
        """
        y_d_rs, y_d_gs, fmap_rs, fmap_gs = [], [], [], []
        for i, disc in enumerate(self.discriminators):
            if i > 0:
                # Pooling is cumulative: each scale pools the previous input.
                pool = self.meanpools[i - 1]
                y = pool(y)
                y_hat = pool(y_hat)
            score_r, maps_r = disc(y)
            score_g, maps_g = disc(y_hat)
            y_d_rs.append(score_r)
            fmap_rs.append(maps_r)
            y_d_gs.append(score_g)
            fmap_gs.append(maps_g)
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
|
||||
|
||||
|
||||
def feature_loss(fmap_r, fmap_g):
    """Return twice the sum of mean absolute differences between feature maps.

    ``fmap_r`` and ``fmap_g`` are parallel lists (one per discriminator) of
    lists of tensors (one per layer). Returns the integer 0 for empty input.
    """
    total = 0
    for real_maps, gen_maps in zip(fmap_r, fmap_g):
        for real, gen in zip(real_maps, gen_maps):
            total = total + torch.mean(torch.abs(real - gen))
    return total * 2
|
||||
|
||||
|
||||
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    """Least-squares discriminator loss.

    For each pair, the real term is ``mean((1 - dr) ** 2)`` and the generated
    term is ``mean(dg ** 2)``.

    Returns:
        ``(loss, r_losses, g_losses)``: the summed loss and the per-pair real
        and generated terms as Python floats.
    """
    total = 0
    r_losses, g_losses = [], []
    for real_out, gen_out in zip(disc_real_outputs, disc_generated_outputs):
        real_term = torch.mean((1 - real_out) ** 2)
        gen_term = torch.mean(gen_out**2)
        total = total + (real_term + gen_term)
        r_losses.append(real_term.item())
        g_losses.append(gen_term.item())
    return total, r_losses, g_losses
|
||||
|
||||
|
||||
def generator_loss(disc_outputs):
    """Least-squares generator loss.

    Returns:
        ``(loss, gen_losses)``: the sum of ``mean((1 - dg) ** 2)`` over all
        discriminator outputs and the list of those per-output terms (tensors).
    """
    gen_losses = [torch.mean((1 - dg) ** 2) for dg in disc_outputs]
    total = 0
    for term in gen_losses:
        total = total + term
    return total, gen_losses
|
@ -0,0 +1,137 @@
|
||||
import os
|
||||
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
import torch
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
import librosa
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
import soundfile as sf
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
    """Load the first channel of an audio file as a normalized float tensor.

    Args:
        full_path: Path of the audio file, read with ``soundfile``.
        target_sr: If given and different from the file's rate, the audio is
            resampled to this rate.
        return_empty_on_exception: If true, a read failure or inf/NaN samples
            return ``([], rate)`` instead of raising, where ``rate`` is the
            file's rate if known, else ``target_sr``, else 48000.

    Returns:
        ``(data, sampling_rate)`` with ``data`` a 1-D float32 tensor.

    Raises:
        Whatever the reader raised, unchanged, when
        ``return_empty_on_exception`` is false.
    """
    sampling_rate = None
    try:
        data, sampling_rate = sf.read(full_path, always_2d=True)
    except Exception as ex:
        print(f"'{full_path}' failed to load.\nException:")
        print(ex)
        if return_empty_on_exception:
            return [], sampling_rate or target_sr or 48000
        # Re-raise the original error: wrapping it in a bare Exception
        # discarded its type and traceback.
        raise

    if len(data.shape) > 1:
        data = data[:, 0]
        # More than 2 samples expected; otherwise the slice above was
        # taken on the wrong dimension.
        assert len(data) > 2

    if np.issubdtype(data.dtype, np.integer):
        # Integer audio: scale by the magnitude of the dtype's minimum value.
        max_mag = -np.iinfo(data.dtype).min
    else:
        # Float audio: guess the range from the peak. Data should be 16-bit
        # int, 32-bit int or [-1, 1] float values.
        max_mag = max(np.amax(data), -np.amin(data))
        max_mag = (2**31) + 1 if max_mag > (2**15) else ((2**15) + 1 if max_mag > 1.01 else 1.0)

    data = torch.FloatTensor(data.astype(np.float32)) / max_mag

    # Resampling would crash on inf/NaN input; return empty instead if asked to.
    if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:
        return [], sampling_rate or target_sr or 48000
    if target_sr is not None and sampling_rate != target_sr:
        data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
        sampling_rate = target_sr

    return data, sampling_rate
|
||||
|
||||
|
||||
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Return ``log(max(x, clip_val) * C)`` element-wise (NumPy)."""
    floored = np.clip(x, a_min=clip_val, a_max=None)
    return np.log(floored * C)
|
||||
|
||||
|
||||
def dynamic_range_decompression(x, C=1):
    """Invert ``dynamic_range_compression``: return ``exp(x) / C`` (NumPy)."""
    linear = np.exp(x)
    return linear / C
|
||||
|
||||
|
||||
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """Return ``log(max(x, clip_val) * C)`` element-wise (torch)."""
    floored = torch.clamp(x, min=clip_val)
    return torch.log(floored * C)
|
||||
|
||||
|
||||
def dynamic_range_decompression_torch(x, C=1):
    """Invert ``dynamic_range_compression_torch``: return ``exp(x) / C``."""
    linear = torch.exp(x)
    return linear / C
|
||||
|
||||
|
||||
class STFT:
    """Log-mel spectrogram extractor with optional key shift and speed change.

    Mel filter banks and Hann windows are cached per device (and, for the
    windows, per key shift) on the instance.
    """

    def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
        self.target_sr = sr

        self.n_mels = n_mels
        self.n_fft = n_fft
        self.win_size = win_size
        self.hop_length = hop_length
        self.fmin = fmin
        self.fmax = fmax
        self.clip_val = clip_val
        # Caches keyed by "<fmax>_<device>" and "<keyshift>_<device>".
        self.mel_basis = {}
        self.hann_window = {}

    def get_mel(self, y, keyshift=0, speed=1, center=False):
        """Return the log-mel spectrogram of ``y``.

        Args:
            y: Waveform tensor. A channel axis is inserted at dim 1 for
                padding, so a (batch, samples) tensor is expected. Values
                outside [-1, 1] are reported on stdout.
            keyshift: Pitch shift in semitones. The FFT and window sizes are
                scaled by ``2 ** (keyshift / 12)``.
            speed: Multiplier applied to the hop length.
            center: Passed through to ``torch.stft``.

        Returns:
            Tensor of shape (batch, n_mels, frames).
        """
        sampling_rate = self.target_sr
        n_mels = self.n_mels
        n_fft = self.n_fft
        win_size = self.win_size
        hop_length = self.hop_length
        fmin = self.fmin
        fmax = self.fmax
        clip_val = self.clip_val

        factor = 2 ** (keyshift / 12)
        n_fft_new = int(np.round(n_fft * factor))
        win_size_new = int(np.round(win_size * factor))
        hop_length_new = int(np.round(hop_length * speed))

        if torch.min(y) < -1.0:
            print("min value is ", torch.min(y))
        if torch.max(y) > 1.0:
            print("max value is ", torch.max(y))

        mel_basis_key = str(fmax) + "_" + str(y.device)
        if mel_basis_key not in self.mel_basis:
            mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
            self.mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)

        keyshift_key = str(keyshift) + "_" + str(y.device)
        if keyshift_key not in self.hann_window:
            self.hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)

        pad_left = (win_size_new - hop_length_new) // 2
        pad_right = max((win_size_new - hop_length_new + 1) // 2, win_size_new - y.size(-1) - pad_left)
        # Use constant padding when the right pad is not shorter than the signal.
        mode = "reflect" if pad_right < y.size(-1) else "constant"
        y = F.pad(y.unsqueeze(1), (pad_left, pad_right), mode=mode)
        y = y.squeeze(1)

        # return_complex=False is deprecated in torch.stft. Request a complex
        # result and view it as (..., 2) real pairs, which is the same layout
        # the old real-valued output had.
        spec = torch.stft(
            y,
            n_fft_new,
            hop_length=hop_length_new,
            win_length=win_size_new,
            window=self.hann_window[keyshift_key],
            center=center,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True,
        )
        spec = torch.view_as_real(spec)
        # Magnitude; the 1e-9 is added inside the square root.
        spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
        if keyshift != 0:
            # Bring the frequency axis back to the unshifted bin count and
            # rescale for the changed window size.
            size = n_fft // 2 + 1
            resize = spec.size(1)
            if resize < size:
                spec = F.pad(spec, (0, 0, 0, size - resize))
            spec = spec[:, :size, :] * win_size / win_size_new

        spec = torch.matmul(self.mel_basis[mel_basis_key], spec)
        spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
        return spec

    def __call__(self, audiopath):
        """Load ``audiopath`` at the target rate and return its log-mel spectrogram."""
        audio, _ = load_wav_to_torch(audiopath, target_sr=self.target_sr)
        return self.get_mel(audio.unsqueeze(0)).squeeze(0)
|
||||
|
||||
|
||||
# Module-level STFT instance built with the constructor defaults.
stft = STFT()
|
@ -0,0 +1,67 @@
|
||||
import glob
|
||||
import os
|
||||
import matplotlib
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pylab as plt
|
||||
|
||||
|
||||
def plot_spectrogram(spectrogram):
    """Render ``spectrogram`` as an image with a colorbar and return the figure.

    The figure is drawn on its canvas, then ``plt.close()`` is called before
    returning.
    """
    figure, axis = plt.subplots(figsize=(10, 2))
    image = axis.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
    plt.colorbar(image, ax=axis)

    figure.canvas.draw()
    plt.close()

    return figure
|
||||
|
||||
|
||||
def init_weights(m, mean=0.0, std=0.01):
    """Fill ``m.weight`` from N(mean, std) if the class name contains "Conv".

    Modules whose class name has no "Conv" in it are left untouched.
    """
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)
|
||||
|
||||
|
||||
def apply_weight_norm(m):
    """Apply ``weight_norm`` to ``m`` if its class name contains "Conv"."""
    is_conv = "Conv" in m.__class__.__name__
    if is_conv:
        weight_norm(m)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
    """Return ``int((kernel_size * dilation - dilation) / 2)``.

    The result is truncated toward zero.
    """
    span = kernel_size * dilation - dilation
    return int(span / 2)
|
||||
|
||||
|
||||
def load_checkpoint(filepath, device):
    """Load a checkpoint with ``torch.load`` and map its tensors to ``device``.

    Asserts that ``filepath`` is an existing file. A progress line is printed
    before the load and another after it.
    """
    assert os.path.isfile(filepath)
    banner = "Loading '{}'".format(filepath)
    print(banner)
    state = torch.load(filepath, map_location=device)
    print("Complete.")
    return state
|
||||
|
||||
|
||||
def save_checkpoint(filepath, obj):
    """Write ``obj`` to ``filepath`` with ``torch.save``.

    A progress line is printed before the write and another after it.
    """
    banner = "Saving checkpoint to {}".format(filepath)
    print(banner)
    torch.save(obj, filepath)
    print("Complete.")
|
||||
|
||||
|
||||
def del_old_checkpoints(cp_dir, prefix, n_models=2):
    """Delete all but the last ``n_models`` checkpoints found in ``cp_dir``.

    Checkpoints are the files whose name is ``prefix`` followed by exactly
    eight characters. They are ordered by path name, so the files that sort
    last are the ones kept.

    Args:
        cp_dir: Directory to search.
        prefix: File-name prefix of the checkpoints.
        n_models: Number of checkpoints to keep. ``0`` removes every match;
            negative values are treated as ``0``.
    """
    pattern = os.path.join(cp_dir, prefix + "????????")
    cp_list = sorted(glob.glob(pattern))

    # Slice with an explicit end index: ``cp_list[:-n_models]`` is empty for
    # n_models == 0 and would keep every file instead of none.
    n_stale = max(len(cp_list) - max(n_models, 0), 0)
    for cp in cp_list[:n_stale]:
        # Truncate before unlinking so the space is released even where
        # unlink only moves the file to a trash folder (e.g. on Colab).
        with open(cp, "w"):
            pass
        os.unlink(cp)
|
||||
|
||||
|
||||
def scan_checkpoint(cp_dir, prefix):
    """Return the checkpoint path that sorts last, or ``None`` if none match.

    A checkpoint is any entry in ``cp_dir`` named ``prefix`` plus exactly
    eight further characters.
    """
    candidates = glob.glob(os.path.join(cp_dir, prefix + "????????"))
    if not candidates:
        return None
    return max(candidates)
|
Loading…
Reference in New Issue
Block a user