From 954a26b0c6ea3bfaae2c1133bc5fe36b07c6a980 Mon Sep 17 00:00:00 2001 From: wataru Date: Fri, 10 Feb 2023 15:38:34 +0900 Subject: [PATCH] WIP: docker support v1.5.x trial 4 --- docker_trainer/Dockerfile | 12 +- docker_trainer/README.md | 5 + docker_trainer/scripts/client_modules.py | 208 +++++++++++++++++++++++ docker_trainer/scripts/conver_test.py | 104 ++++++++++++ docker_trainer/start_trainer.sh | 2 +- 5 files changed, 327 insertions(+), 4 deletions(-) create mode 100644 docker_trainer/scripts/client_modules.py create mode 100644 docker_trainer/scripts/conver_test.py diff --git a/docker_trainer/Dockerfile b/docker_trainer/Dockerfile index cebd39a2..efc032ee 100644 --- a/docker_trainer/Dockerfile +++ b/docker_trainer/Dockerfile @@ -28,18 +28,24 @@ RUN python3 warmup.py ADD dummy / -RUN git clone --depth 1 -b v1.5.0.0_SiFiGAN https://github.com/isletennos/MMVC_Trainer.git +RUN git clone -b v1.5.0.0_SiFiGAN https://github.com/isletennos/MMVC_Trainer.git WORKDIR /MMVC_Trainer/ +RUN git checkout c242d3d1cf7f768af70d9735082ca2bdd90c45f3 -#RUN git checkout 1109f1201e1ee9008ee61dc221d620eb1c93d7d0 -RUN git checkout f28a1514b6731c36bd68989c600bff0a3b217d75 +RUN git clone https://github.com/isletennos/MMVC_Client.git +WORKDIR /MMVC_Trainer/MMVC_Client +RUN git checkout 3374a1177b73e3f6d600e5dbe93af033c36ee120 +WORKDIR / +# ↓ テストスクリプトはTrainerのrootに置くとmodelsがconflictする。 +ADD /scripts /MMVC_Trainer/MMVC_Client/python/ ADD /model/D_v15_best.pth /MMVC_Trainer/fine_model/ ADD /model/G_v15_best.pth /MMVC_Trainer/fine_model/ RUN cp -r /MMVC_Trainer/configs /MMVC_Trainer/configs_org +WORKDIR /MMVC_Trainer/ diff --git a/docker_trainer/README.md b/docker_trainer/README.md index 69df18b0..189eff45 100644 --- a/docker_trainer/README.md +++ b/docker_trainer/README.md @@ -58,3 +58,8 @@ $ python3 train_ms.py -c configs/train_config.json -m 20220306_24000 -fg fine_mo $ python3 train_ms.py -c configs/train_config.json -m 20220306_24000 ``` + +(x) テスト +``` +$ python3 MMVC_Client/python/conver_test.py -m logs/G_40000.pth -c configs/train_config.json -s 0 -t 101 --input dataset/00_myvoice/wav/emotion011.wav --output dataset/test.wav --f0_scale 3 +``` \ No newline at end of file diff --git a/docker_trainer/scripts/client_modules.py b/docker_trainer/scripts/client_modules.py new file mode 100644 index 00000000..aa3d4760 --- /dev/null +++ b/docker_trainer/scripts/client_modules.py @@ -0,0 +1,208 @@ + + +from features import SignalGenerator, dilated_factor +from scipy.interpolate import interp1d +import torch +import numpy as np +import json +import os +hann_window = {} + + +class TextAudioSpeakerCollate(): + """ Zero-pads model inputs and targets + """ + + def __init__( + self, + sample_rate, + hop_size, + f0_factor=1.0, + dense_factors=[0.5, 1, 4, 8], + upsample_scales=[8, 4, 2, 2], + sine_amp=0.1, + noise_amp=0.003, + signal_types=["sine"], + ): + self.dense_factors = dense_factors + self.prod_upsample_scales = np.cumprod(upsample_scales) + self.sample_rate = sample_rate + self.signal_generator = SignalGenerator( + sample_rate=sample_rate, + hop_size=hop_size, + sine_amp=sine_amp, + noise_amp=noise_amp, + signal_types=signal_types, + ) + self.f0_factor = f0_factor + + def __call__(self, batch): + """Collate's training batch from normalized text, audio and speaker identities + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized, sid, note] + """ + + spec_lengths = torch.LongTensor(len(batch)) + sid = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), batch[0][0].size(1)) + f0_padded = torch.FloatTensor(len(batch), 1, batch[0][2].size(0)) + # 返り値の初期化 + spec_padded.zero_() + f0_padded.zero_() + + # dfs + dfs_batch = [[] for _ in range(len(self.dense_factors))] + + # row spec, sid, f0 + for i in range(len(batch)): + row = batch[i] + + spec = row[0] + spec_padded[i, :, :spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + sid[i] = row[1] + # 推論時 f0/cf0にf0の倍率を乗算してf0/cf0を求める + f0 = row[2] * self.f0_factor + f0_padded[i, :, :f0.size(0)] = f0 + + # dfs + dfs = [] + # dilated_factor の入力はnumpy!! + for df, us in zip(self.dense_factors, self.prod_upsample_scales): + dfs += [ + np.repeat(dilated_factor(torch.unsqueeze(f0, dim=1).to('cpu').detach().numpy(), self.sample_rate, df), us) + ] + + # よくわからないけど、後で論文ちゃんと読む + for i in range(len(self.dense_factors)): + dfs_batch[i] += [ + dfs[i].astype(np.float32).reshape(-1, 1) + ] # [(T', 1), ...] + # よくわからないdfsを転置 + for i in range(len(self.dense_factors)): + dfs_batch[i] = torch.FloatTensor(np.array(dfs_batch[i])).transpose( + 2, 1 + ) # (B, 1, T') + + # f0/cf0を実際に使うSignalに変換する + in_batch = self.signal_generator(f0_padded) + + return spec_padded, spec_lengths, sid, in_batch, dfs_batch + + +def convert_continuos_f0(f0, f0_size): + # get start and end of f0 + if (f0 == 0).all(): + return np.zeros((f0_size,)) + start_f0 = f0[f0 != 0][0] + end_f0 = f0[f0 != 0][-1] + # padding start and end of f0 sequence + cf0 = f0 + start_idx = np.where(cf0 == start_f0)[0][0] + end_idx = np.where(cf0 == end_f0)[0][-1] + cf0[:start_idx] = start_f0 + cf0[end_idx:] = end_f0 + # get non-zero frame index + nz_frames = np.where(cf0 != 0)[0] + # perform linear interpolation + f = interp1d(nz_frames, cf0[nz_frames], bounds_error=False, fill_value=0.0) + cf0_ = f(np.arange(0, f0_size)) + # print(cf0.shape, cf0_.shape, f0.shape, f0_size) + # print(cf0_) + return f(np.arange(0, f0_size)) + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.: + print('min value is ', torch.min(y)) + if torch.max(y) > 1.: + print('max value is ', torch.max(y)) + + dtype_device = str(y.dtype) + '_' + str(y.device) + wnsize_dtype_device = str(win_size) + '_' + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device) + + y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect') + y = y.squeeze(1) + + spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device], + center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True) + spec = torch.view_as_real(spec) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def get_hparams_from_file(config_path): + with open(config_path, "r", encoding="utf-8") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +class HParams(): + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() + + +def load_checkpoint(checkpoint_path, model, optimizer=None): + assert os.path.isfile(checkpoint_path), f"No such file or directory: {checkpoint_path}" + checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') + iteration = checkpoint_dict['iteration'] + learning_rate = checkpoint_dict['learning_rate'] + if optimizer is not None: + optimizer.load_state_dict(checkpoint_dict['optimizer']) + saved_state_dict = { + **checkpoint_dict['pe'], + **checkpoint_dict['flow'], + **checkpoint_dict['text_enc'], + **checkpoint_dict['dec'], + **checkpoint_dict['emb_g'] + } + if hasattr(model, 'module'): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): + try: + new_state_dict[k] = saved_state_dict[k] + except: + new_state_dict[k] = v + if hasattr(model, 'module'): + model.module.load_state_dict(new_state_dict) + else: + model.load_state_dict(new_state_dict) + return model, optimizer, learning_rate, iteration diff --git a/docker_trainer/scripts/conver_test.py b/docker_trainer/scripts/conver_test.py new file mode 100644 index 00000000..03d7ad4d --- /dev/null +++ b/docker_trainer/scripts/conver_test.py @@ -0,0 +1,104 @@ +import sys +sys.path.append(".") # sifiganへのパスが必要。 +import argparse + +import torch + +import numpy as np +from scipy.io.wavfile import write, read +import pyworld as pw +from logging import getLogger + +# import utils +from models import SynthesizerTrn + +# from mmvc_client import Hyperparameters # <- pyaudioなどが必要になるため必要なロジックのみコピペ +from client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint + +logger = getLogger(__name__) + + +def setupArgParser(): + parser = argparse.ArgumentParser() + parser.add_argument("-c", type=str, required=True, help="path for the config.json") + parser.add_argument("-m", type=str, help="path for the pytorch model file") + parser.add_argument("-o", type=str, help="path for the onnx model file") + parser.add_argument("-s", type=int, required=True, help="source speaker id") + parser.add_argument("-t", type=int, required=True, help="target speaker id") + parser.add_argument("--input", type=str, required=True, help="input wav file") + parser.add_argument("--output", type=str, required=True, help="outpu wav file") + parser.add_argument("--f0_scale", type=float, required=True, help="f0 scale") + return parser + + +def create_model(hps, pytorch_model_file): + net_g = SynthesizerTrn( + spec_channels=hps.data.filter_length // 2 + 1, + segment_size=hps.train.segment_size // hps.data.hop_length, + inter_channels=hps.model.inter_channels, + hidden_channels=hps.model.hidden_channels, + upsample_rates=hps.model.upsample_rates, + upsample_initial_channel=hps.model.upsample_initial_channel, + upsample_kernel_sizes=hps.model.upsample_kernel_sizes, + n_flow=hps.model.n_flow, + dec_out_channels=1, + dec_kernel_size=7, + n_speakers=hps.data.n_speakers, + gin_channels=hps.model.gin_channels, + requires_grad_pe=hps.requires_grad.pe, + requires_grad_flow=hps.requires_grad.flow, + requires_grad_text_enc=hps.requires_grad.text_enc, + requires_grad_dec=hps.requires_grad.dec + ) + _ = net_g.eval() + + _ = load_checkpoint(pytorch_model_file, net_g, None) + return net_g + + +def convert(hps, ssid, tsid, input, output, f0_scale): + sr, signal = read(input) + signal = signal / hps.data.max_wav_value + _f0, _time = pw.dio(signal, hps.data.sampling_rate, frame_period=5.5) + f0 = pw.stonemask(signal, _f0, _time, hps.data.sampling_rate) + f0 = convert_continuos_f0(f0, int(signal.shape[0] / hps.data.hop_length)) + f0 = torch.from_numpy(f0.astype(np.float32)) + signal = torch.from_numpy(signal.astype(np.float32)).clone() + signal = signal.unsqueeze(0) + spec = spectrogram_torch(signal, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, center=False) + spec = torch.squeeze(spec, 0) + sid = torch.LongTensor([int(ssid)]) + data = TextAudioSpeakerCollate( + sample_rate=hps.data.sampling_rate, + hop_size=hps.data.hop_length, + f0_factor=f0_scale + )([(spec, sid, f0)]) + + spec, spec_lengths, sid_src, sin, d = data + spec = spec.cuda() + spec_lengths = spec_lengths.cuda() + sid_src = sid_src.cuda() + sin = sin.cuda() + d = tuple([d[:1].cuda() for d in d]) + sid_target = torch.LongTensor([tsid]).cuda() + audio = net_g.cuda().voice_conversion(spec, spec_lengths, sin, d, sid_src, sid_target)[0, 0].data.cpu().float().numpy() + # print(audio) + write(output, 24000, audio) + + +if __name__ == '__main__': + print("main") + parser = setupArgParser() + args = parser.parse_args() + + CONFIG_PATH = args.c + hps = get_hparams_from_file(CONFIG_PATH) + pytorch_model_file = args.m + ssid = args.s + tsid = args.t + input = args.input + output = args.output + f0_scale = args.f0_scale + + net_g = create_model(hps, pytorch_model_file) + convert(hps, ssid, tsid, input, output, f0_scale) diff --git a/docker_trainer/start_trainer.sh b/docker_trainer/start_trainer.sh index c7093906..c404c147 100644 --- a/docker_trainer/start_trainer.sh +++ b/docker_trainer/start_trainer.sh @@ -1,7 +1,7 @@ #!/bin/bash set -eu -DOCKER_IMAGE=dannadori/trainer:20230209_214044 +DOCKER_IMAGE=dannadori/trainer:20230210_153105 # DOCKER_IMAGE=trainer docker run --gpus all --rm -ti \