diff --git a/demo/mod/attentions.py b/demo/mod/attentions.py deleted file mode 100755 index 4e0b0c1f..00000000 --- a/demo/mod/attentions.py +++ /dev/null @@ -1,303 +0,0 @@ -import copy -import math -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - -import commons -import modules -from modules import LayerNorm - - -class Encoder(nn.Module): - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - - self.drop = nn.Dropout(p_dropout) - self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask): - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x - - -class Decoder(nn.Module): - def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - - self.drop = nn.Dropout(p_dropout) - self.self_attn_layers = nn.ModuleList() - self.norm_layers_0 = nn.ModuleList() - self.encdec_attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - for i in range(self.n_layers): - self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init)) - self.norm_layers_0.append(LayerNorm(hidden_channels)) - self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout)) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True)) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask, h, h_mask): - """ - x: decoder input - h: encoder output - """ - self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype) - encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.self_attn_layers[i](x, x, self_attn_mask) - y = self.drop(y) - x = self.norm_layers_0[i](x + y) - - y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x - - -class MultiHeadAttention(nn.Module): - def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False): - super().__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.p_dropout = p_dropout - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels**-0.5 - self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev) - - nn.init.xavier_uniform_(self.conv_q.weight) - nn.init.xavier_uniform_(self.conv_k.weight) - nn.init.xavier_uniform_(self.conv_v.weight) - if proximal_init: - with torch.no_grad(): - self.conv_k.weight.copy_(self.conv_q.weight) - self.conv_k.bias.copy_(self.conv_q.bias) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # reshape [b, d, t] -> [b, n_h, t, d_k] - b, d, t_s, t_t = (*key.size(), query.size(2)) - query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) - if self.window_size is not None: - assert t_s == t_t, "Relative attention is only available for self-attention." - key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings) - scores_local = self._relative_position_to_absolute_position(rel_logits) - scores = scores + scores_local - if self.proximal_bias: - assert t_s == t_t, "Proximal bias is only available for self-attention." - scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert t_s == t_t, "Local attention is only available for self-attention." - block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) - scores = scores.masked_fill(block_mask == 0, -1e4) - p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position(p_attn) - value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) - output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) - output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t] - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - """ - x: [b, h, l, m] - y: [h or 1, m, d] - ret: [b, h, l, d] - """ - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - """ - x: [b, h, l, d] - y: [h or 1, m, d] - ret: [b, h, l, m] - """ - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - # Pad first before slice to avoid using cond ops. - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]])) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - """ - x: [b, h, l, 2*l-1] - ret: [b, h, l, l] - """ - batch, heads, length, _ = x.size() - # Concat columns of pad to shift from relative to absolute indexing. - x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]])) - - # Concat extra elements so to add up to shape (len+1, 2*len-1). - x_flat = x.view([batch, heads, length * 2 * length]) - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]])) - - # Reshape and slice out the padded elements. - x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:] - return x_final - - def _absolute_position_to_relative_position(self, x): - """ - x: [b, h, l, l] - ret: [b, h, l, 2*l-1] - """ - batch, heads, length, _ = x.size() - # padd along column - x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]])) - x_flat = x.view([batch, heads, length**2 + length*(length -1)]) - # add 0's in the beginning that will skew the elements after reshape - x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) - x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:] - return x_final - - def _attention_bias_proximal(self, length): - """Bias for self-attention to encourage attention to close positions. - Args: - length: an integer scalar. - Returns: - a Tensor with shape [1, 1, length, length] - """ - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class FFN(nn.Module): - def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - self.causal = causal - - if causal: - self.padding = self._causal_padding - else: - self.padding = self._same_padding - - self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) - self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask): - x = self.conv_1(self.padding(x * x_mask)) - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - x = self.conv_2(self.padding(x * x_mask)) - return x * x_mask - - def _causal_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = self.kernel_size - 1 - pad_r = 0 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x - - def _same_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = (self.kernel_size - 1) // 2 - pad_r = self.kernel_size // 2 - padding = [[0, 0], [0, 0], [pad_l, pad_r]] - x = F.pad(x, commons.convert_pad_shape(padding)) - return x diff --git a/demo/mod/commons.py b/demo/mod/commons.py deleted file mode 100755 index 9ad0444b..00000000 --- a/demo/mod/commons.py +++ /dev/null @@ -1,161 +0,0 @@ -import math -import numpy as np -import torch -from torch import nn -from torch.nn import functional as F - - -def init_weights(m, mean=0.0, std=0.01): - classname = m.__class__.__name__ - if classname.find("Conv") != -1: - m.weight.data.normal_(mean, std) - - -def get_padding(kernel_size, dilation=1): - return int((kernel_size*dilation - dilation)/2) - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def intersperse(lst, item): - result = [item] * (len(lst) * 2 + 1) - result[1::2] = lst - return result - - -def kl_divergence(m_p, logs_p, m_q, logs_q): - """KL(P||Q)""" - kl = (logs_q - logs_p) - 0.5 - kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q) - return kl - - -def rand_gumbel(shape): - """Sample from the Gumbel distribution, protect from overflows.""" - uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 - return -torch.log(-torch.log(uniform_samples)) - - -def rand_gumbel_like(x): - g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) - return g - - -def slice_segments(x, ids_str, segment_size=4): - ret = torch.zeros_like(x[:, :, :segment_size]) - for i in range(x.size(0)): - idx_str = ids_str[i] - idx_end = idx_str + segment_size - ret[i] = x[i, :, idx_str:idx_end] - return ret - - -def rand_slice_segments(x, x_lengths=None, segment_size=4): - b, d, t = x.size() - if x_lengths is None: - x_lengths = t - ids_str_max = x_lengths - segment_size + 1 - ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) - ret = slice_segments(x, ids_str, segment_size) - return ret, ids_str - - -def get_timing_signal_1d( - length, channels, min_timescale=1.0, max_timescale=1.0e4): - position = torch.arange(length, dtype=torch.float) - num_timescales = channels // 2 - log_timescale_increment = ( - math.log(float(max_timescale) / float(min_timescale)) / - (num_timescales - 1)) - inv_timescales = min_timescale * torch.exp( - torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment) - scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) - signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) - signal = F.pad(signal, [0, 0, 0, channels % 2]) - signal = signal.view(1, channels, length) - return signal - - -def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return x + signal.to(dtype=x.dtype, device=x.device) - - -def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): - b, channels, length = x.size() - signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) - return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) - - -def subsequent_mask(length): - mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) - return mask - - -@torch.jit.script -def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): - n_channels_int = n_channels[0] - in_act = input_a + input_b - t_act = torch.tanh(in_act[:, :n_channels_int, :]) - s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) - acts = t_act * s_act - return acts - - -def convert_pad_shape(pad_shape): - l = pad_shape[::-1] - pad_shape = [item for sublist in l for item in sublist] - return pad_shape - - -def shift_1d(x): - x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] - return x - - -def sequence_mask(length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - -def generate_path(duration, mask): - """ - duration: [b, 1, t_x] - mask: [b, 1, t_y, t_x] - """ - device = duration.device - - b, _, t_y, t_x = mask.shape - cum_duration = torch.cumsum(duration, -1) - - cum_duration_flat = cum_duration.view(b * t_x) - path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) - path = path.view(b, t_x, t_y) - path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] - path = path.unsqueeze(1).transpose(2,3) * mask - return path - - -def clip_grad_value_(parameters, clip_value, norm_type=2): - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - norm_type = float(norm_type) - if clip_value is not None: - clip_value = float(clip_value) - - total_norm = 0 - for p in parameters: - param_norm = p.grad.data.norm(norm_type) - total_norm += param_norm.item() ** norm_type - if clip_value is not None: - p.grad.data.clamp_(min=-clip_value, max=clip_value) - total_norm = total_norm ** (1. / norm_type) - return total_norm diff --git a/demo/mod/data_utils.py b/demo/mod/data_utils.py deleted file mode 100755 index 3e06620f..00000000 --- a/demo/mod/data_utils.py +++ /dev/null @@ -1,492 +0,0 @@ -import time -import os -import random -import numpy as np -import torch -import torch.utils.data -import tqdm - -import commons -from mel_processing import spectrogram_torch -from utils import load_wav_to_torch, load_filepaths_and_text -from text import text_to_sequence, cleaned_text_to_sequence -import struct -#add -from retry import retry -import random -import torchaudio -from scipy.io.wavfile import write - -class TextAudioLoader(torch.utils.data.Dataset): - """ - 1) loads audio, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. - """ - def __init__(self, audiopaths_and_text, hparams, use_test = True): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.text_cleaners = hparams.text_cleaners - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.use_test = use_test - - self.cleaned_text = getattr(hparams, "cleaned_text", False) - - self.add_blank = hparams.add_blank - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 190) - - random.seed(1234) - random.shuffle(self.audiopaths_and_text) - self._filter() - - - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - - audiopaths_and_text_new = [] - lengths = [] - for audiopath, text in self.audiopaths_and_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text]) - lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length)) - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_audio_text_pair(self, audiopath_and_text): - # separate filename and text - audiopath, text = audiopath_and_text[0], audiopath_and_text[1] - text = self.get_text(text) - if self.use_test != True: - text = torch.as_tensor("a") - spec, wav = self.get_audio(audiopath) - return (text, spec, wav) - - def get_audio(self, filename): - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: - raise ValueError("{} {} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate)) - audio_norm = audio / self.max_wav_value - audio_norm = audio_norm.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - if os.path.exists(spec_filename): - spec = torch.load(spec_filename) - else: - spec = spectrogram_torch(audio_norm, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename) - return spec, audio_norm - - def get_text(self, text): - if self.cleaned_text: - text_norm = cleaned_text_to_sequence(text) - else: - text_norm = text_to_sequence(text, self.text_cleaners) - if self.add_blank: - text_norm = commons.intersperse(text_norm, 0) - text_norm = torch.LongTensor(text_norm) - return text_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - - -class TextAudioCollate(): - """ Zero-pads model inputs and targets - """ - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) - - max_text_len = max([len(x[0]) for x in batch]) - max_spec_len = max([x[1].size(1) for x in batch]) - max_wav_len = max([x[2].size(1) for x in batch]) - - text_lengths = torch.LongTensor(len(batch)) - spec_lengths = torch.LongTensor(len(batch)) - wav_lengths = torch.LongTensor(len(batch)) - - text_padded = torch.LongTensor(len(batch), max_text_len) - spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) - wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) - text_padded.zero_() - spec_padded.zero_() - wav_padded.zero_() - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - text = row[0] - text_padded[i, :text.size(0)] = text - text_lengths[i] = text.size(0) - - spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav - wav_lengths[i] = wav.size(1) - - if self.return_ids: - return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing - return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths - - -"""Multi speaker version""" -class TextAudioSpeakerLoader(torch.utils.data.Dataset): - """ - 1) loads audio, speaker_id, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. - """ - def __init__(self, audiopaths_sid_text, hparams, no_text=False, augmentation=False, augmentation_params=None, no_use_textfile = False): - if no_use_textfile: - self.audiopaths_sid_text = list() - else: - self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text) - - self.text_cleaners = hparams.text_cleaners - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.no_text = no_text - self.augmentation = augmentation - if augmentation : - self.gain_p = augmentation_params.gain_p - self.min_gain_in_db = augmentation_params.min_gain_in_db - self.max_gain_in_db = augmentation_params.max_gain_in_db - self.time_stretch_p = augmentation_params.time_stretch_p - self.min_rate = augmentation_params.min_rate - self.max_rate = augmentation_params.max_rate - self.pitch_shift_p = augmentation_params.pitch_shift_p - self.min_semitones = augmentation_params.min_semitones - self.max_semitones = augmentation_params.max_semitones - self.add_gaussian_noise_p = augmentation_params.add_gaussian_noise_p - self.min_amplitude = augmentation_params.min_amplitude - self.max_amplitude = augmentation_params.max_amplitude - self.frequency_mask_p = augmentation_params.frequency_mask_p - - self.cleaned_text = getattr(hparams, "cleaned_text", False) - - self.add_blank = hparams.add_blank - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 1000) - - random.seed(1234) - random.shuffle(self.audiopaths_sid_text) - self._filter() - - @retry(tries=30, delay=10) - def _filter(self): - """ - Filter text & store spec lengths - """ - audiopaths_sid_text_new = [] - lengths = [] - - # for audiopath, sid, text in tqdm.tqdm(self.audiopaths_sid_text): - for audiopath, sid, text in self.audiopaths_sid_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_sid_text_new.append([audiopath, sid, text]) - lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length)) - self.audiopaths_sid_text = audiopaths_sid_text_new - self.lengths = lengths - - def get_audio_text_speaker_pair(self, audiopath_sid_text): - # separate filename, speaker_id and text - wavdata, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2] - text = self.get_text(text) - if self.no_text: - text = self.get_text("a") - spec, wav = self.get_audio(wavdata) - sid = self.get_sid(sid) - return (text, spec, wav, sid) - - @retry(exceptions=(PermissionError), tries=100, delay=10) - def get_audio(self, wavdata): - # 音声データは±1.0内に正規化したtorchベクトルでunsqueeze(0)で外側1次元くるんだものを扱う - audio = torch.FloatTensor(wavdata.astype(np.float32)) - sampling_rate=24000 - try: - if sampling_rate != self.sampling_rate: - raise ValueError("[Error] Exception: source {} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate)) - except ValueError as e: - print(e) - exit() - audio_norm = self.get_normalized_audio(audio, self.max_wav_value) - - if self.augmentation: - audio_augmented = self.add_augmentation(audio_norm, sampling_rate) - audio_noised = self.add_noise(audio_augmented, sampling_rate) - # ノーマライズ後のaugmentationとnoise付加で範囲外になったところを削る - audio_augmented = torch.clamp(audio_augmented, -1, 1) - audio_noised = torch.clamp(audio_noised, -1, 1) - # audio(音声波形)は教師信号となるのでノイズは含まずaugmentationのみしたものを使用 - audio_norm = audio_augmented - # spec(スペクトログラム)は入力信号となるのでaugmentationしてさらにノイズを付加したものを使用 - spec = spectrogram_torch(audio_noised, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) - spec_noised = self.add_spectrogram_noise(spec) - spec = torch.squeeze(spec_noised, 0) - else: - spec = spectrogram_torch(audio_norm, self.filter_length, - self.sampling_rate, self.hop_length, self.win_length, - center=False) - spec = torch.squeeze(spec, 0) - return spec, audio_norm - - def add_augmentation(self, audio, sampling_rate): - gain_in_db = 0.0 - if random.random() <= self.gain_p: - gain_in_db = random.uniform(self.min_gain_in_db, self.max_gain_in_db) - time_stretch_rate = 1.0 - if random.random() <= self.time_stretch_p: - time_stretch_rate = random.uniform(self.min_rate, self.max_rate) - pitch_shift_semitones = 0 - if random.random() <= self.pitch_shift_p: - pitch_shift_semitones = random.uniform(self.min_semitones, self.max_semitones) * 100 # 1/100 semitone 単位指定のため - augmentation_effects = [ - ["gain", f"{gain_in_db}"], - ["tempo", f"{time_stretch_rate}"], - ["pitch", f"{pitch_shift_semitones}"], - ["rate", f"{sampling_rate}"] - ] - audio_augmented, _ = torchaudio.sox_effects.apply_effects_tensor(audio, sampling_rate, augmentation_effects) - return audio_augmented - - def add_noise(self, audio, sampling_rate): - # AddGaussianNoise - audio = self.add_gaussian_noise(audio) - return audio - - def add_gaussian_noise(self, audio): - assert self.min_amplitude >= 0.0 - assert self.max_amplitude >= 0.0 - assert self.max_amplitude >= self.min_amplitude - if random.random() > self.add_gaussian_noise_p: - return audio - amplitude = random.uniform(self.min_amplitude, self.max_amplitude) - noise = torch.randn(audio.size()) - noised_audio = audio + amplitude * noise - return noised_audio - - def add_spectrogram_noise(self, spec): - # FrequencyMask - masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80) - masked = masking(spec) - return masked - - def get_normalized_audio(self, audio, max_wav_value): - audio_norm = audio / max_wav_value - audio_norm = audio_norm.unsqueeze(0) - return audio_norm - - def get_text(self, text): - if self.cleaned_text: - text_norm = cleaned_text_to_sequence(text) - else: - text_norm = text_to_sequence(text, self.text_cleaners) - if self.add_blank: - text_norm = commons.intersperse(text_norm, 0) - text_norm = torch.LongTensor(text_norm) - return text_norm - - def get_sid(self, sid): - sid = torch.LongTensor([int(sid)]) - return sid - - def __getitem__(self, index): - return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index]) - - def __len__(self): - return len(self.audiopaths_sid_text) - - -class TextAudioSpeakerCollate(): - """ Zero-pads model inputs and targets - """ - def __init__(self, return_ids=False, no_text = False): - self.return_ids = return_ids - self.no_text = no_text - - def __call__(self, batch): - """Collate's training batch from normalized text, audio and speaker identities - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized, sid] - """ - # Right zero-pad all one-hot text sequences to max input length - - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[1].size(1) for x in batch]), - dim=0, descending=True) - - max_text_len = max([len(x[0]) for x in batch]) - max_spec_len = max([x[1].size(1) for x in batch]) - max_wav_len = max([x[2].size(1) for x in batch]) - - text_lengths = torch.LongTensor(len(batch)) - spec_lengths = torch.LongTensor(len(batch)) - wav_lengths = torch.LongTensor(len(batch)) - sid = torch.LongTensor(len(batch)) - - text_padded = torch.LongTensor(len(batch), max_text_len) - spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len) - wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len) - text_padded.zero_() - spec_padded.zero_() - wav_padded.zero_() - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - text = row[0] - text_padded[i, :text.size(0)] = text - text_lengths[i] = text.size(0) - - spec = row[1] - spec_padded[i, :, :spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wav = row[2] - wav_padded[i, :, :wav.size(1)] = wav - wav_lengths[i] = wav.size(1) - - sid[i] = row[3] - - if self.return_ids: - return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing - return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid - - -class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): - """ - Maintain similar input lengths in a batch. - Length groups are specified by boundaries. - Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. - - It removes samples which are not included in the boundaries. - Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. - """ - def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True): - super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - self.lengths = dataset.lengths - self.batch_size = batch_size - self.boundaries = boundaries - - self.buckets, self.num_samples_per_bucket = self._create_buckets() - self.total_size = sum(self.num_samples_per_bucket) - self.num_samples = self.total_size // self.num_replicas - - def _create_buckets(self): - buckets = [[] for _ in range(len(self.boundaries) - 1)] - for i in range(len(self.lengths)): - length = self.lengths[i] - idx_bucket = self._bisect(length) - if idx_bucket != -1: - buckets[idx_bucket].append(i) - - for i in range(len(buckets) - 1, 0, -1): - if len(buckets[i]) == 0: - buckets.pop(i) - self.boundaries.pop(i+1) - - num_samples_per_bucket = [] - for i in range(len(buckets)): - len_bucket = len(buckets[i]) - total_batch_size = self.num_replicas * self.batch_size - rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size - num_samples_per_bucket.append(len_bucket + rem) - return buckets, num_samples_per_bucket - - def __iter__(self): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(self.epoch) - - indices = [] - if self.shuffle: - for bucket in self.buckets: - indices.append(torch.randperm(len(bucket), generator=g).tolist()) - else: - for bucket in self.buckets: - indices.append(list(range(len(bucket)))) - - batches = [] - for i in range(len(self.buckets)): - next_bucket = (i+1) % len(self.buckets) - bucket = self.buckets[i] - len_bucket = len(bucket) - ids_bucket = indices[i] - num_samples_bucket = self.num_samples_per_bucket[i] - - if len_bucket == 0: - print("[Warn] Exception: length of buckets {} is 0. ID:{} Skip.".format(i,i)) - continue - - # add extra samples to make it evenly divisible - rem = num_samples_bucket - len_bucket - ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)] - - # subsample - ids_bucket = ids_bucket[self.rank::self.num_replicas] - - # batching - for j in range(len(ids_bucket) // self.batch_size): - batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]] - batches.append(batch) - - if self.shuffle: - batch_ids = torch.randperm(len(batches), generator=g).tolist() - batches = [batches[i] for i in batch_ids] - self.batches = batches - - assert len(self.batches) * self.batch_size == self.num_samples - return iter(self.batches) - - def _bisect(self, x, lo=0, hi=None): - if hi is None: - hi = len(self.boundaries) - 1 - - if hi > lo: - mid = (hi + lo) // 2 - if self.boundaries[mid] < x and x <= self.boundaries[mid+1]: - return mid - elif x <= self.boundaries[mid]: - return self._bisect(x, lo, mid) - else: - return self._bisect(x, mid + 1, hi) - else: - return -1 - - def __len__(self): - return self.num_samples // self.batch_size diff --git a/demo/mod/mel_processing.py b/demo/mod/mel_processing.py deleted file mode 100755 index cb33b76d..00000000 --- a/demo/mod/mel_processing.py +++ /dev/null @@ -1,114 +0,0 @@ -import math -import os -import random -import torch -from torch import nn -import torch.nn.functional as F -import torch.utils.data -import numpy as np -import librosa -import librosa.util as librosa_util -from librosa.util import normalize, pad_center, tiny -from scipy.signal import get_window -from scipy.io.wavfile import read -from librosa.filters import mel as librosa_mel_fn - -MAX_WAV_VALUE = 32768.0 - - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression_torch(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def spectral_normalize_torch(magnitudes): - output = dynamic_range_compression_torch(magnitudes) - return output - - -def spectral_de_normalize_torch(magnitudes): - output = dynamic_range_decompression_torch(magnitudes) - return output - - -mel_basis = {} -hann_window = {} - - -def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - if torch.min(y) < -1.: - print('min value is ', torch.min(y)) - if torch.max(y) > 1.: - print('max value is ', torch.max(y)) - - global hann_window - dtype_device = str(y.dtype) + '_' + str(y.device) - wnsize_dtype_device = str(win_size) + '_' + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') - y = y.squeeze(1) - - spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) - spec = torch.view_as_real(spec) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): - global mel_basis - dtype_device = str(spec.dtype) + '_' + str(spec.device) - fmax_dtype_device = str(fmax) + '_' + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device) - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - return spec - - -def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False): - if torch.min(y) < -1.: - print('min value is ', torch.min(y)) - if torch.max(y) > 1.: - print('max value is ', torch.max(y)) - - global mel_basis, hann_window - dtype_device = str(y.dtype) + '_' + str(y.device) - fmax_dtype_device = str(fmax) + '_' + dtype_device - wnsize_dtype_device = str(win_size) + '_' + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device) - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) - - y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect') - y = y.squeeze(1) - - spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], - center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) - spec = torch.view_as_real(spec) - - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - - spec = torch.matmul(mel_basis[fmax_dtype_device], spec) - spec = spectral_normalize_torch(spec) - - return spec diff --git a/demo/mod/models.py b/demo/mod/models.py deleted file mode 100755 index 7b40b95f..00000000 --- a/demo/mod/models.py +++ /dev/null @@ -1,407 +0,0 @@ -import copy -import math -import torch -from torch import nn -from torch.nn import functional as F - -import commons -import modules -import attentions -import monotonic_align - -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from commons import init_weights, get_padding - - -class TextEncoder(nn.Module): - def __init__(self, - n_vocab, - out_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout): - super().__init__() - self.n_vocab = n_vocab - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - - self.emb = nn.Embedding(n_vocab, hidden_channels) - nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5) - - self.encoder = attentions.Encoder( - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths): - x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h] - x = torch.transpose(x, 1, -1) # [b, h, t] - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - - x = self.encoder(x * x_mask, x_mask) - stats = self.proj(x) * x_mask - - m, logs = torch.split(stats, self.out_channels, dim=1) - return x, m, logs, x_mask - - -class ResidualCouplingBlock(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - n_flows=4, - gin_channels=0): - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.n_flows = n_flows - self.gin_channels = gin_channels - - self.flows = nn.ModuleList() - for i in range(n_flows): - self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True)) - self.flows.append(modules.Flip()) - - def forward(self, x, x_mask, g=None, reverse=False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x - - -class PosteriorEncoder(nn.Module): - def __init__(self, - in_channels, - out_channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - gin_channels=0): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - - self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype) - x = self.pre(x) * x_mask - x = self.enc(x, x_mask, g=g) - stats = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask - return z, m, logs, x_mask - - -class Generator(torch.nn.Module): - def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0): - super(Generator, self).__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) - resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2 - - self.ups = nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - self.ups.append(weight_norm( - ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)), - k, u, padding=(k-u)//2))) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel//(2**(i+1)) - for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): - self.resblocks.append(resblock(ch, k, d)) - - self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) - self.ups.apply(init_weights) - - if gin_channels != 0: - #self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - gin_channels = 0 - - def forward(self, x, g=None): - x = self.conv_pre(x) - if g is not None: - #x = x + self.cond(g) - g=None - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, modules.LRELU_SLOPE) - x = self.ups[i](x) - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i*self.num_kernels+j](x) - else: - xs += self.resblocks[i*self.num_kernels+j](x) - x = xs / self.num_kernels - x = F.leaky_relu(x) - x = self.conv_post(x) - x = torch.tanh(x) - - return x - - def remove_weight_norm(self): - print('Removing weight norm...') - for l in self.ups: - remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -class DiscriminatorP(torch.nn.Module): - def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): - super(DiscriminatorP, self).__init__() - self.period = period - self.use_spectral_norm = use_spectral_norm - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList([ - norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))), - norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))), - ]) - self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) - - def forward(self, x): - fmap = [] - - # 1d to 2d - b, c, t = x.shape - if t % self.period != 0: # pad first - n_pad = self.period - (t % self.period) - x = F.pad(x, (0, n_pad), "reflect") - t = t + n_pad - x = x.view(b, c, t // self.period, self.period) - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class DiscriminatorS(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(DiscriminatorS, self).__init__() - norm_f = weight_norm if use_spectral_norm == False else spectral_norm - self.convs = nn.ModuleList([ - norm_f(Conv1d(1, 16, 15, 1, padding=7)), - norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), - norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), - norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), - norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), - norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), - ]) - self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) - - def forward(self, x): - fmap = [] - - for l in self.convs: - x = l(x) - x = F.leaky_relu(x, modules.LRELU_SLOPE) - fmap.append(x) - x = self.conv_post(x) - fmap.append(x) - x = torch.flatten(x, 1, -1) - - return x, fmap - - -class MultiPeriodDiscriminator(torch.nn.Module): - def __init__(self, use_spectral_norm=False): - super(MultiPeriodDiscriminator, self).__init__() - periods = [2,3,5,7,11] - - discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] - discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods] - self.discriminators = nn.ModuleList(discs) - - def forward(self, y, y_hat): - y_d_rs = [] - y_d_gs = [] - fmap_rs = [] - fmap_gs = [] - for i, d in enumerate(self.discriminators): - y_d_r, fmap_r = d(y) - y_d_g, fmap_g = d(y_hat) - y_d_rs.append(y_d_r) - y_d_gs.append(y_d_g) - fmap_rs.append(fmap_r) - fmap_gs.append(fmap_g) - - return y_d_rs, y_d_gs, fmap_rs, fmap_gs - - - -class SynthesizerTrn(nn.Module): - """ - Synthesizer for Training - """ - - def __init__(self, - n_vocab, - spec_channels, - segment_size, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout, - resblock, - resblock_kernel_sizes, - resblock_dilation_sizes, - upsample_rates, - upsample_initial_channel, - upsample_kernel_sizes, - n_flow, - n_speakers=0, - gin_channels=0, - use_sdp=True, - **kwargs): - - super().__init__() - self.n_vocab = n_vocab - self.spec_channels = spec_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.n_speakers = n_speakers - self.gin_channels = gin_channels - - self.use_sdp = use_sdp - - self.enc_p = TextEncoder(n_vocab, - inter_channels, - hidden_channels, - filter_channels, - n_heads, - n_layers, - kernel_size, - p_dropout) - self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels) - self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels) - self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, n_flows=n_flow, gin_channels=gin_channels) - - if n_speakers > 1: - self.emb_g = nn.Embedding(n_speakers, gin_channels) - - def forward(self, x, x_lengths, y, y_lengths, sid=None): - - x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths) - if self.n_speakers > 0: - g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1] - else: - g = None - - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) - z_p = self.flow(z, y_mask, g=g) - - with torch.no_grad(): - # negative cross-entropy - s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t] - neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s] - neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] - neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s] - neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s] - neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4 - - attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1) - attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach() - - # expand prior - m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) - logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) - - z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size) - o = self.dec(z_slice, g=g) - return o, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - - def voice_conversion(self, y, y_lengths, sid_src, sid_tgt): - assert self.n_speakers > 0, "n_speakers have to be larger than 0." - g_src = self.emb_g(sid_src).unsqueeze(-1) - g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) - z_p = self.flow(z, y_mask, g=g_src) - z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) - o_hat = self.dec(z_hat * y_mask, g=g_tgt) - return o_hat, y_mask, (z, z_p, z_hat) - - def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt): - assert self.n_speakers > 0, "n_speakers have to be larger than 0." - g_src = self.emb_g(sid_src).unsqueeze(-1) - g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) - o_hat = self.dec(z * y_mask, g=g_tgt) - return o_hat, y_mask, (z) - - def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt): - assert self.n_speakers > 0, "n_speakers have to be larger than 0." - g_src = self.emb_g(sid_src).unsqueeze(-1) - g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) - o_hat = self.dec(z * y_mask, g=g_src) - return o_hat, y_mask, (z) - - def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt): - assert self.n_speakers > 0, "n_speakers have to be larger than 0." - g_src = self.emb_g(sid_src).unsqueeze(-1) - g_tgt = self.emb_g(sid_tgt).unsqueeze(-1) - z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src) - z_p = self.flow(z, y_mask, g=g_src) - z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True) - z_p_hat = self.flow(z_hat, y_mask, g=g_tgt) - z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True) - o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt) - return o_hat, y_mask, (z, z_p, z_hat) - - diff --git a/demo/mod/modules.py b/demo/mod/modules.py deleted file mode 100755 index 9c7fd9cd..00000000 --- a/demo/mod/modules.py +++ /dev/null @@ -1,390 +0,0 @@ -import copy -import math -import numpy as np -import scipy -import torch -from torch import nn -from torch.nn import functional as F - -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm - -import commons -from commons import init_weights, get_padding -from transforms import piecewise_rational_quadratic_transform - - -LRELU_SLOPE = 0.1 - - -class LayerNorm(nn.Module): - def __init__(self, channels, eps=1e-5): - super().__init__() - self.channels = channels - self.eps = eps - - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - x = x.transpose(1, -1) - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) - - -class ConvReluNorm(nn.Module): - def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout): - super().__init__() - self.in_channels = in_channels - self.hidden_channels = hidden_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - assert n_layers > 1, "Number of layers should be larger than 0." - - self.conv_layers = nn.ModuleList() - self.norm_layers = nn.ModuleList() - self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2)) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.relu_drop = nn.Sequential( - nn.ReLU(), - nn.Dropout(p_dropout)) - for _ in range(n_layers-1): - self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2)) - self.norm_layers.append(LayerNorm(hidden_channels)) - self.proj = nn.Conv1d(hidden_channels, out_channels, 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask): - x_org = x - for i in range(self.n_layers): - x = self.conv_layers[i](x * x_mask) - x = self.norm_layers[i](x) - x = self.relu_drop(x) - x = x_org + self.proj(x) - return x * x_mask - - -class DDSConv(nn.Module): - """ - Dialted and Depth-Separable Convolution - """ - def __init__(self, channels, kernel_size, n_layers, p_dropout=0.): - super().__init__() - self.channels = channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.p_dropout = p_dropout - - self.drop = nn.Dropout(p_dropout) - self.convs_sep = nn.ModuleList() - self.convs_1x1 = nn.ModuleList() - self.norms_1 = nn.ModuleList() - self.norms_2 = nn.ModuleList() - for i in range(n_layers): - dilation = kernel_size ** i - padding = (kernel_size * dilation - dilation) // 2 - self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size, - groups=channels, dilation=dilation, padding=padding - )) - self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) - self.norms_1.append(LayerNorm(channels)) - self.norms_2.append(LayerNorm(channels)) - - def forward(self, x, x_mask, g=None): - if g is not None: - x = x + g - for i in range(self.n_layers): - y = self.convs_sep[i](x * x_mask) - y = self.norms_1[i](y) - y = F.gelu(y) - y = self.convs_1x1[i](y) - y = self.norms_2[i](y) - y = F.gelu(y) - y = self.drop(y) - x = x + y - return x * x_mask - - -class WN(torch.nn.Module): - def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0): - super(WN, self).__init__() - assert(kernel_size % 2 == 1) - self.hidden_channels =hidden_channels - self.kernel_size = kernel_size, - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.gin_channels = gin_channels - self.p_dropout = p_dropout - - self.in_layers = torch.nn.ModuleList() - self.res_skip_layers = torch.nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - - if gin_channels != 0: - cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1) - self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight') - - for i in range(n_layers): - dilation = dilation_rate ** i - padding = int((kernel_size * dilation - dilation) / 2) - in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size, - dilation=dilation, padding=padding) - in_layer = torch.nn.utils.weight_norm(in_layer, name='weight') - self.in_layers.append(in_layer) - - # last one is not necessary - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - - res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) - res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight') - self.res_skip_layers.append(res_skip_layer) - - def forward(self, x, x_mask, g=None, **kwargs): - output = torch.zeros_like(x) - n_channels_tensor = torch.IntTensor([self.hidden_channels]) - - if g is not None: - g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:] - else: - g_l = torch.zeros_like(x_in) - - acts = commons.fused_add_tanh_sigmoid_multiply( - x_in, - g_l, - n_channels_tensor) - acts = self.drop(acts) - - res_skip_acts = self.res_skip_layers[i](acts) - if i < self.n_layers - 1: - res_acts = res_skip_acts[:,:self.hidden_channels,:] - x = (x + res_acts) * x_mask - output = output + res_skip_acts[:,self.hidden_channels:,:] - else: - output = output + res_skip_acts - return output * x_mask - - def remove_weight_norm(self): - if self.gin_channels != 0: - torch.nn.utils.remove_weight_norm(self.cond_layer) - for l in self.in_layers: - torch.nn.utils.remove_weight_norm(l) - for l in self.res_skip_layers: - torch.nn.utils.remove_weight_norm(l) - - -class ResBlock1(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): - super(ResBlock1, self).__init__() - self.convs1 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2], - padding=get_padding(kernel_size, dilation[2]))) - ]) - self.convs1.apply(init_weights) - - self.convs2 = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, - padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, - padding=get_padding(kernel_size, 1))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1, - padding=get_padding(kernel_size, 1))) - ]) - self.convs2.apply(init_weights) - - def forward(self, x, x_mask=None): - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c1(xt) - xt = F.leaky_relu(xt, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c2(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs1: - remove_weight_norm(l) - for l in self.convs2: - remove_weight_norm(l) - - -class ResBlock2(torch.nn.Module): - def __init__(self, channels, kernel_size=3, dilation=(1, 3)): - super(ResBlock2, self).__init__() - self.convs = nn.ModuleList([ - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0], - padding=get_padding(kernel_size, dilation[0]))), - weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1], - padding=get_padding(kernel_size, dilation[1]))) - ]) - self.convs.apply(init_weights) - - def forward(self, x, x_mask=None): - for c in self.convs: - xt = F.leaky_relu(x, LRELU_SLOPE) - if x_mask is not None: - xt = xt * x_mask - xt = c(xt) - x = xt + x - if x_mask is not None: - x = x * x_mask - return x - - def remove_weight_norm(self): - for l in self.convs: - remove_weight_norm(l) - - -class Log(nn.Module): - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask - logdet = torch.sum(-y, [1, 2]) - return y, logdet - else: - x = torch.exp(x) * x_mask - return x - - -class Flip(nn.Module): - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - if not reverse: - logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) - return x, logdet - else: - return x - - -class ElementwiseAffine(nn.Module): - def __init__(self, channels): - super().__init__() - self.channels = channels - self.m = nn.Parameter(torch.zeros(channels,1)) - self.logs = nn.Parameter(torch.zeros(channels,1)) - - def forward(self, x, x_mask, reverse=False, **kwargs): - if not reverse: - y = self.m + torch.exp(self.logs) * x - y = y * x_mask - logdet = torch.sum(self.logs * x_mask, [1,2]) - return y, logdet - else: - x = (x - self.m) * torch.exp(-self.logs) * x_mask - return x - - -class ResidualCouplingLayer(nn.Module): - def __init__(self, - channels, - hidden_channels, - kernel_size, - dilation_rate, - n_layers, - p_dropout=0, - gin_channels=0, - mean_only=False): - assert channels % 2 == 0, "channels should be divisible by 2" - super().__init__() - self.channels = channels - self.hidden_channels = hidden_channels - self.kernel_size = kernel_size - self.dilation_rate = dilation_rate - self.n_layers = n_layers - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels) - self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels]*2, 1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - if not self.mean_only: - m, logs = torch.split(stats, [self.half_channels]*2, 1) - else: - m = stats - logs = torch.zeros_like(m) - - if not reverse: - x1 = m + x1 * torch.exp(logs) * x_mask - x = torch.cat([x0, x1], 1) - logdet = torch.sum(logs, [1,2]) - return x, logdet - else: - x1 = (x1 - m) * torch.exp(-logs) * x_mask - x = torch.cat([x0, x1], 1) - return x - - -class ConvFlow(nn.Module): - def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0): - super().__init__() - self.in_channels = in_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.n_layers = n_layers - self.num_bins = num_bins - self.tail_bound = tail_bound - self.half_channels = in_channels // 2 - - self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) - self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.) - self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1) - self.proj.weight.data.zero_() - self.proj.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels]*2, 1) - h = self.pre(x0) - h = self.convs(h, x_mask, g=g) - h = self.proj(h) * x_mask - - b, c, t = x0.shape - h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] - - unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels) - unnormalized_derivatives = h[..., 2 * self.num_bins:] - - x1, logabsdet = piecewise_rational_quadratic_transform(x1, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=reverse, - tails='linear', - tail_bound=self.tail_bound - ) - - x = torch.cat([x0, x1], 1) * x_mask - logdet = torch.sum(logabsdet * x_mask, [1,2]) - if not reverse: - return x, logdet - else: - return x diff --git a/demo/mod/monotonic_align/__init__.py b/demo/mod/monotonic_align/__init__.py deleted file mode 100755 index 3d7009c4..00000000 --- a/demo/mod/monotonic_align/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -import numpy as np -import torch -from .monotonic_align.core import maximum_path_c - - -def maximum_path(neg_cent, mask): - """ Cython optimized version. - neg_cent: [b, t_t, t_s] - mask: [b, t_t, t_s] - """ - device = neg_cent.device - dtype = neg_cent.dtype - neg_cent = neg_cent.data.cpu().numpy().astype(np.float32) - path = np.zeros(neg_cent.shape, dtype=np.int32) - - t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32) - t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32) - maximum_path_c(path, neg_cent, t_t_max, t_s_max) - return torch.from_numpy(path).to(device=device, dtype=dtype) diff --git a/demo/mod/monotonic_align/core.pyx b/demo/mod/monotonic_align/core.pyx deleted file mode 100755 index bfaabd4d..00000000 --- a/demo/mod/monotonic_align/core.pyx +++ /dev/null @@ -1,42 +0,0 @@ -cimport cython -from cython.parallel import prange - - -@cython.boundscheck(False) -@cython.wraparound(False) -cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil: - cdef int x - cdef int y - cdef float v_prev - cdef float v_cur - cdef float tmp - cdef int index = t_x - 1 - - for y in range(t_y): - for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)): - if x == y: - v_cur = max_neg_val - else: - v_cur = value[y-1, x] - if x == 0: - if y == 0: - v_prev = 0. - else: - v_prev = max_neg_val - else: - v_prev = value[y-1, x-1] - value[y, x] += max(v_prev, v_cur) - - for y in range(t_y - 1, -1, -1): - path[y, index] = 1 - if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]): - index = index - 1 - - -@cython.boundscheck(False) -@cython.wraparound(False) -cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil: - cdef int b = paths.shape[0] - cdef int i - for i in prange(b, nogil=True): - maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i]) diff --git a/demo/mod/monotonic_align/monotonic_align/core.cp39-win_amd64.pyd b/demo/mod/monotonic_align/monotonic_align/core.cp39-win_amd64.pyd deleted file mode 100755 index 9e9bacb7..00000000 Binary files a/demo/mod/monotonic_align/monotonic_align/core.cp39-win_amd64.pyd and /dev/null differ diff --git a/demo/mod/monotonic_align/monotonic_align/core.cpython-37m-x86_64-linux-gnu.so b/demo/mod/monotonic_align/monotonic_align/core.cpython-37m-x86_64-linux-gnu.so deleted file mode 100755 index 0ea56ee9..00000000 Binary files a/demo/mod/monotonic_align/monotonic_align/core.cpython-37m-x86_64-linux-gnu.so and /dev/null differ diff --git a/demo/mod/monotonic_align/monotonic_align/core.cpython-39-x86_64-linux-gnu.so b/demo/mod/monotonic_align/monotonic_align/core.cpython-39-x86_64-linux-gnu.so deleted file mode 100755 index d42eff71..00000000 Binary files a/demo/mod/monotonic_align/monotonic_align/core.cpython-39-x86_64-linux-gnu.so and /dev/null differ diff --git a/demo/mod/monotonic_align/setup.py b/demo/mod/monotonic_align/setup.py deleted file mode 100755 index d68e6672..00000000 --- a/demo/mod/monotonic_align/setup.py +++ /dev/null @@ -1,23 +0,0 @@ -import numpy as np -import torch -import sys -print(sys.path) -sys.path.append("/backend/mod/") -print(sys.path) -from .monotonic_align.core import maximum_path_c - - -def maximum_path(neg_cent, mask): - """ Cython optimized version. - neg_cent: [b, t_t, t_s] - mask: [b, t_t, t_s] - """ - device = neg_cent.device - dtype = neg_cent.dtype - neg_cent = neg_cent.data.cpu().numpy().astype(np.float32) - path = np.zeros(neg_cent.shape, dtype=np.int32) - - t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32) - t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32) - maximum_path_c(path, neg_cent, t_t_max, t_s_max) - return torch.from_numpy(path).to(device=device, dtype=dtype) diff --git a/demo/mod/text/__init__.py b/demo/mod/text/__init__.py deleted file mode 100755 index 4ac41f90..00000000 --- a/demo/mod/text/__init__.py +++ /dev/null @@ -1,54 +0,0 @@ -""" from https://github.com/keithito/tacotron """ -from text import cleaners -from text.symbols import symbols - - -# Mappings from symbol to numeric ID and vice versa: -_symbol_to_id = {s: i for i, s in enumerate(symbols)} -_id_to_symbol = {i: s for i, s in enumerate(symbols)} - - -def text_to_sequence(text, cleaner_names): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - Args: - text: string to convert to a sequence - cleaner_names: names of the cleaner functions to run the text through - Returns: - List of integers corresponding to the symbols in the text - ''' - sequence = [] - - clean_text = _clean_text(text, cleaner_names) - for symbol in clean_text: - symbol_id = _symbol_to_id[symbol] - sequence += [symbol_id] - return sequence - - -def cleaned_text_to_sequence(cleaned_text): - '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - Args: - text: string to convert to a sequence - Returns: - List of integers corresponding to the symbols in the text - ''' - sequence = [_symbol_to_id[symbol] for symbol in cleaned_text] - return sequence - - -def sequence_to_text(sequence): - '''Converts a sequence of IDs back to a string''' - result = '' - for symbol_id in sequence: - s = _id_to_symbol[symbol_id] - result += s - return result - - -def _clean_text(text, cleaner_names): - for name in cleaner_names: - cleaner = getattr(cleaners, name) - if not cleaner: - raise Exception('Unknown cleaner: %s' % name) - text = cleaner(text) - return text diff --git a/demo/mod/text/cleaners.py b/demo/mod/text/cleaners.py deleted file mode 100755 index 32afd52c..00000000 --- a/demo/mod/text/cleaners.py +++ /dev/null @@ -1,105 +0,0 @@ -""" The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d. """ -""" from https://github.com/keithito/tacotron """ - -''' -Cleaners are transformations that run over the input text at both training and eval time. - -Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners" -hyperparameter. Some cleaners are English-specific. You'll typically want to use: - 1. "english_cleaners" for English text - 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using - the Unidecode library (https://pypi.python.org/pypi/Unidecode) - 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update - the symbols in symbols.py to match your data). -''' - -import re -from unidecode import unidecode -from phonemizer import phonemize - - -# Regular expression matching whitespace: -_whitespace_re = re.compile(r'\s+') - -# List of (regular expression, replacement) pairs for abbreviations: -_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [ - ('mrs', 'misess'), - ('mr', 'mister'), - ('dr', 'doctor'), - ('st', 'saint'), - ('co', 'company'), - ('jr', 'junior'), - ('maj', 'major'), - ('gen', 'general'), - ('drs', 'doctors'), - ('rev', 'reverend'), - ('lt', 'lieutenant'), - ('hon', 'honorable'), - ('sgt', 'sergeant'), - ('capt', 'captain'), - ('esq', 'esquire'), - ('ltd', 'limited'), - ('col', 'colonel'), - ('ft', 'fort'), -]] - - -def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text - - -def expand_numbers(text): - return normalize_numbers(text) - - -def lowercase(text): - return text.lower() - - -def collapse_whitespace(text): - return re.sub(_whitespace_re, ' ', text) - - -def convert_to_ascii(text): - return unidecode(text) - - -def basic_cleaners(text): - '''Basic pipeline that lowercases and collapses whitespace without transliteration.''' - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def transliteration_cleaners(text): - '''Pipeline for non-English text that transliterates to ASCII.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text - - -def english_cleaners(text): - '''Pipeline for English text, including abbreviation expansion.''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_abbreviations(text) - phonemes = phonemize(text, language='en-us', backend='espeak', strip=True) - phonemes = collapse_whitespace(phonemes) - return phonemes - - -def english_cleaners2(text): - '''Pipeline for English text, including abbreviation expansion. + punctuation + stress''' - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_abbreviations(text) - phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True) - phonemes = collapse_whitespace(phonemes) - return phonemes - -def japanese_cleaners(text): - phonemes = text.split('-') - return phonemes \ No newline at end of file diff --git a/demo/mod/text/symbols.py b/demo/mod/text/symbols.py deleted file mode 100755 index 2a46c1fa..00000000 --- a/demo/mod/text/symbols.py +++ /dev/null @@ -1,64 +0,0 @@ -""" The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d """ -""" from https://github.com/keithito/tacotron """ - -''' -Defines the set of symbols used in text input to the model. -''' -_pad = '_' -_punctuation = ';:,.!?¡¿—…"«»“” ' -_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' -_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ" - - -# Export all symbols: -symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa) - -# Special symbol ids -SPACE_ID = symbols.index(" ") - -symbols = [ - "A", - "E", - "I", - "N", - "O", - "U", - "a", - "b", - "by", - "ch", - "cl", - "d", - "dy", - "e", - "f", - "g", - "gy", - "h", - "hy", - "i", - "j", - "k", - "ky", - "m", - "my", - "n", - "ny", - "o", - "p", - "py", - "r", - "ry", - "s", - "sh", - "t", - "ts", - "ty", - "u", - "v", - "w", - "y", - "z", - "pau", - "sil" -] diff --git a/demo/mod/transforms.py b/demo/mod/transforms.py deleted file mode 100755 index 4793d67c..00000000 --- a/demo/mod/transforms.py +++ /dev/null @@ -1,193 +0,0 @@ -import torch -from torch.nn import functional as F - -import numpy as np - - -DEFAULT_MIN_BIN_WIDTH = 1e-3 -DEFAULT_MIN_BIN_HEIGHT = 1e-3 -DEFAULT_MIN_DERIVATIVE = 1e-3 - - -def piecewise_rational_quadratic_transform(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails=None, - tail_bound=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): - - if tails is None: - spline_fn = rational_quadratic_spline - spline_kwargs = {} - else: - spline_fn = unconstrained_rational_quadratic_spline - spline_kwargs = { - 'tails': tails, - 'tail_bound': tail_bound - } - - outputs, logabsdet = spline_fn( - inputs=inputs, - unnormalized_widths=unnormalized_widths, - unnormalized_heights=unnormalized_heights, - unnormalized_derivatives=unnormalized_derivatives, - inverse=inverse, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative, - **spline_kwargs - ) - return outputs, logabsdet - - -def searchsorted(bin_locations, inputs, eps=1e-6): - bin_locations[..., -1] += eps - return torch.sum( - inputs[..., None] >= bin_locations, - dim=-1 - ) - 1 - - -def unconstrained_rational_quadratic_spline(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - tails='linear', - tail_bound=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): - inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) - outside_interval_mask = ~inside_interval_mask - - outputs = torch.zeros_like(inputs) - logabsdet = torch.zeros_like(inputs) - - if tails == 'linear': - unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) - constant = np.log(np.exp(1 - min_derivative) - 1) - unnormalized_derivatives[..., 0] = constant - unnormalized_derivatives[..., -1] = constant - - outputs[outside_interval_mask] = inputs[outside_interval_mask] - logabsdet[outside_interval_mask] = 0 - else: - raise RuntimeError('{} tails are not implemented.'.format(tails)) - - outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline( - inputs=inputs[inside_interval_mask], - unnormalized_widths=unnormalized_widths[inside_interval_mask, :], - unnormalized_heights=unnormalized_heights[inside_interval_mask, :], - unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], - inverse=inverse, - left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound, - min_bin_width=min_bin_width, - min_bin_height=min_bin_height, - min_derivative=min_derivative - ) - - return outputs, logabsdet - -def rational_quadratic_spline(inputs, - unnormalized_widths, - unnormalized_heights, - unnormalized_derivatives, - inverse=False, - left=0., right=1., bottom=0., top=1., - min_bin_width=DEFAULT_MIN_BIN_WIDTH, - min_bin_height=DEFAULT_MIN_BIN_HEIGHT, - min_derivative=DEFAULT_MIN_DERIVATIVE): - if torch.min(inputs) < left or torch.max(inputs) > right: - raise ValueError('Input to a transform is not within its domain') - - num_bins = unnormalized_widths.shape[-1] - - if min_bin_width * num_bins > 1.0: - raise ValueError('Minimal bin width too large for the number of bins') - if min_bin_height * num_bins > 1.0: - raise ValueError('Minimal bin height too large for the number of bins') - - widths = F.softmax(unnormalized_widths, dim=-1) - widths = min_bin_width + (1 - min_bin_width * num_bins) * widths - cumwidths = torch.cumsum(widths, dim=-1) - cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0) - cumwidths = (right - left) * cumwidths + left - cumwidths[..., 0] = left - cumwidths[..., -1] = right - widths = cumwidths[..., 1:] - cumwidths[..., :-1] - - derivatives = min_derivative + F.softplus(unnormalized_derivatives) - - heights = F.softmax(unnormalized_heights, dim=-1) - heights = min_bin_height + (1 - min_bin_height * num_bins) * heights - cumheights = torch.cumsum(heights, dim=-1) - cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0) - cumheights = (top - bottom) * cumheights + bottom - cumheights[..., 0] = bottom - cumheights[..., -1] = top - heights = cumheights[..., 1:] - cumheights[..., :-1] - - if inverse: - bin_idx = searchsorted(cumheights, inputs)[..., None] - else: - bin_idx = searchsorted(cumwidths, inputs)[..., None] - - input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] - input_bin_widths = widths.gather(-1, bin_idx)[..., 0] - - input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] - delta = heights / widths - input_delta = delta.gather(-1, bin_idx)[..., 0] - - input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] - input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] - - input_heights = heights.gather(-1, bin_idx)[..., 0] - - if inverse: - a = (((inputs - input_cumheights) * (input_derivatives - + input_derivatives_plus_one - - 2 * input_delta) - + input_heights * (input_delta - input_derivatives))) - b = (input_heights * input_derivatives - - (inputs - input_cumheights) * (input_derivatives - + input_derivatives_plus_one - - 2 * input_delta)) - c = - input_delta * (inputs - input_cumheights) - - discriminant = b.pow(2) - 4 * a * c - assert (discriminant >= 0).all() - - root = (2 * c) / (-b - torch.sqrt(discriminant)) - outputs = root * input_bin_widths + input_cumwidths - - theta_one_minus_theta = root * (1 - root) - denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta) - derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - root).pow(2)) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, -logabsdet - else: - theta = (inputs - input_cumwidths) / input_bin_widths - theta_one_minus_theta = theta * (1 - theta) - - numerator = input_heights * (input_delta * theta.pow(2) - + input_derivatives * theta_one_minus_theta) - denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta) - * theta_one_minus_theta) - outputs = input_cumheights + numerator / denominator - - derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2) - + 2 * input_delta * theta_one_minus_theta - + input_derivatives * (1 - theta).pow(2)) - logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) - - return outputs, logabsdet diff --git a/demo/mod/utils.py b/demo/mod/utils.py deleted file mode 100755 index fae83f01..00000000 --- a/demo/mod/utils.py +++ /dev/null @@ -1,270 +0,0 @@ -import os -import glob -import sys -import argparse -import logging -import json -import subprocess -import numpy as np -from scipy.io.wavfile import read -import torch - -MATPLOTLIB_FLAG = False - -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) -logger = logging - - -def load_checkpoint(checkpoint_path, model, optimizer=None): - assert os.path.isfile(checkpoint_path), f"No such file or directory: {checkpoint_path}" - checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') - iteration = checkpoint_dict['iteration'] - learning_rate = checkpoint_dict['learning_rate'] - if optimizer is not None: - optimizer.load_state_dict(checkpoint_dict['optimizer']) - saved_state_dict = checkpoint_dict['model'] - if hasattr(model, 'module'): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict= {} - for k, v in state_dict.items(): - try: - new_state_dict[k] = saved_state_dict[k] - except: - logger.info("%s is not in the checkpoint" % k) - new_state_dict[k] = v - if hasattr(model, 'module'): - model.module.load_state_dict(new_state_dict) - else: - model.load_state_dict(new_state_dict) - logger.info("Loaded checkpoint '{}' (iteration {})" .format( - checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration - - -def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info("Saving model and optimizer state at iteration {} to {}".format( - iteration, checkpoint_path)) - if hasattr(model, 'module'): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save({'model': state_dict, - 'iteration': iteration, - 'optimizer': optimizer.state_dict(), - 'learning_rate': learning_rate}, checkpoint_path) - - -def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050): - for k, v in scalars.items(): - writer.add_scalar(k, v, global_step) - for k, v in histograms.items(): - writer.add_histogram(k, v, global_step) - for k, v in images.items(): - writer.add_image(k, v, global_step, dataformats='HWC') - for k, v in audios.items(): - writer.add_audio(k, v, global_step, audio_sampling_rate) - - -def latest_checkpoint_path(dir_path, regex="G_*.pth"): - f_list = glob.glob(os.path.join(dir_path, regex)) - f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) - x = f_list[-1] - print(x) - return x - - -def plot_spectrogram_to_numpy(spectrogram): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger('matplotlib') - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(10,2)) - im = ax.imshow(spectrogram, aspect="auto", origin="lower", - interpolation='none') - plt.colorbar(im, ax=ax) - plt.xlabel("Frames") - plt.ylabel("Channels") - plt.tight_layout() - - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data - - -def plot_alignment_to_numpy(alignment, info=None): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger('matplotlib') - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(6, 4)) - im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower', - interpolation='none') - fig.colorbar(im, ax=ax) - xlabel = 'Decoder timestep' - if info is not None: - xlabel += '\n\n' + info - plt.xlabel(xlabel) - plt.ylabel('Encoder timestep') - plt.tight_layout() - - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='') - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data - - -def load_wav_to_torch(full_path): - sampling_rate, data = read(full_path) # scipy.io.wavfile - return torch.FloatTensor(data.astype(np.float32)), sampling_rate - - -def load_filepaths_and_text(filename, split="|"): - with open(filename, encoding='utf-8') as f: - filepaths_and_text = [line.strip().split(split) for line in f] - return filepaths_and_text - - -def get_hparams(init=True): - parser = argparse.ArgumentParser() - parser.add_argument('-c', '--config', type=str, default="./configs/base.json", - help='JSON file for configuration') - parser.add_argument('-m', '--model', type=str, required=True, - help='Model name') - parser.add_argument('-fg', '--fine_tuning_g', type=str, default=None, - help='If fine tuning, please specify model(G)') - parser.add_argument('-fd', '--fine_tuning_d', type=str, default=None, - help='If fine tuning, please specify model(D)') - - args = parser.parse_args() - model_dir = os.path.join("./logs", args.model) - - if not os.path.exists(model_dir): - os.makedirs(model_dir) - - config_path = args.config - config_save_path = os.path.join(model_dir, "config.json") - if init: - with open(config_path, "r") as f: - data = f.read() - with open(config_save_path, "w") as f: - f.write(data) - else: - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) - - #Added about fine tuning - if args.fine_tuning_g != None and args.fine_tuning_d != None: - config['fine_flag'] = True - config['fine_model_g'] = args.fine_tuning_g - config['fine_model_d'] = args.fine_tuning_d - else: - config['fine_flag'] = False - - hparams = HParams(**config) - hparams.model_dir = model_dir - return hparams - - -def get_hparams_from_dir(model_dir): - config_save_path = os.path.join(model_dir, "config.json") - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams =HParams(**config) - hparams.model_dir = model_dir - return hparams - - -def get_hparams_from_file(config_path): - with open(config_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams =HParams(**config) - return hparams - - -def check_git_hash(model_dir): - source_dir = os.path.dirname(os.path.realpath(__file__)) - if not os.path.exists(os.path.join(source_dir, ".git")): - logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir - )) - return - - cur_hash = subprocess.getoutput("git rev-parse HEAD") - - path = os.path.join(model_dir, "githash") - if os.path.exists(path): - saved_hash = open(path).read() - if saved_hash != cur_hash: - logger.warn("git hash values are different. {}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8])) - else: - open(path, "w").write(cur_hash) - - -def get_logger(model_dir, filename="train.log"): - global logger - logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") - if not os.path.exists(model_dir): - os.makedirs(model_dir) - h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.DEBUG) - h.setFormatter(formatter) - logger.addHandler(h) - return logger - - -class HParams(): - def __init__(self, **kwargs): - for k, v in kwargs.items(): - if type(v) == dict: - v = HParams(**v) - self[k] = v - - def keys(self): - return self.__dict__.keys() - - def items(self): - return self.__dict__.items() - - def values(self): - return self.__dict__.values() - - def __len__(self): - return len(self.__dict__) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - return setattr(self, key, value) - - def __contains__(self, key): - return key in self.__dict__ - - def __repr__(self): - return self.__dict__.__repr__() diff --git a/demo/setupFlask.sh b/demo/setupFlask.sh deleted file mode 100755 index 7bef48bd..00000000 --- a/demo/setupFlask.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -echo config: $1 -echo model: $2 -cp -r /resources/* . - -if [[ -e ./setting.json ]]; then - cp ./setting.json ../frontend/dist/assets/setting.json -fi -pip install flask -pip install flask_cors -python3 serverFlask.py 8080 $1 $2 - - diff --git a/frontend/dist/index.html b/frontend/dist/index.html index 89df0d0b..a333ab9c 100755 --- a/frontend/dist/index.html +++ b/frontend/dist/index.html @@ -1,13 +1 @@ - - -
- -n.far?null:{distance:c,point:cn.clone(),object:t}}(t,e,n,r,Je,Ze,Ke,ln);if(p){a&&(sn.fromBufferAttribute(a,c),on.fromBufferAttribute(a,h),an.fromBufferAttribute(a,u),p.uv=Re.getUV(ln,Je,Ze,Ke,sn,on,an,new J)),l&&(sn.fromBufferAttribute(l,c),on.fromBufferAttribute(l,h),an.fromBufferAttribute(l,u),p.uv2=Re.getUV(ln,Je,Ze,Ke,sn,on,an,new J));const t={a:c,b:h,c:u,normal:new mt,materialIndex:0};Re.getNormal(Je,Ze,Ke,t.normal),p.face=t}return p}hn.prototype.isMesh=!0;class dn extends je{constructor(t=1,e=1,n=1,r=1,i=1,s=1){super(),this.type=\"BoxGeometry\",this.parameters={width:t,height:e,depth:n,widthSegments:r,heightSegments:i,depthSegments:s};const o=this;r=Math.floor(r),i=Math.floor(i),s=Math.floor(s);const a=[],l=[],c=[],h=[];let u=0,d=0;function p(t,e,n,r,i,s,p,f,m,g,v){const y=s/m,_=p/g,x=s/2,w=p/2,M=f/2,b=m+1,E=g+1;let S=0,T=0;const A=new mt;for(let s=0;s