voice-changer/docker_trainer/scripts/conver_test.py

import sys
sys.path.append(".") # sifiganへのパスが必要。
import argparse
import torch
import numpy as np
from scipy.io.wavfile import write, read
import pyworld as pw
from logging import getLogger
# import utils
from models import SynthesizerTrn
# from mmvc_client import Hyperparameters  # <- would pull in pyaudio etc., so only the required logic is copied over
from client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
logger = getLogger(__name__)
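
# Offline conversion test: load a trained SynthesizerTrn checkpoint and convert
# one input wav from the source speaker's voice to the target speaker's voice.
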
def setupArgParser():
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type=str, required=True, help="path for the config.json")
    parser.add_argument("-m", type=str, help="path for the pytorch model file")
    parser.add_argument("-o", type=str, help="path for the onnx model file")
    parser.add_argument("-s", type=int, required=True, help="source speaker id")
    parser.add_argument("-t", type=int, required=True, help="target speaker id")
    parser.add_argument("--input", type=str, required=True, help="input wav file")
    parser.add_argument("--output", type=str, required=True, help="output wav file")
    parser.add_argument("--f0_scale", type=float, required=True, help="f0 scale")
    return parser
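
# Example invocation (paths and speaker ids below are hypothetical):
#   python conver_test.py -c configs/config.json -m logs/G_latest.pth \
#       -s 107 -t 100 --input source.wav --output converted.wav --f0_scale 1.0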

def create_model(hps, pytorch_model_file):
    net_g = SynthesizerTrn(
        spec_channels=hps.data.filter_length // 2 + 1,
        segment_size=hps.train.segment_size // hps.data.hop_length,
        inter_channels=hps.model.inter_channels,
        hidden_channels=hps.model.hidden_channels,
        upsample_rates=hps.model.upsample_rates,
        upsample_initial_channel=hps.model.upsample_initial_channel,
        upsample_kernel_sizes=hps.model.upsample_kernel_sizes,
        n_flow=hps.model.n_flow,
        dec_out_channels=1,
        dec_kernel_size=7,
        n_speakers=hps.data.n_speakers,
        gin_channels=hps.model.gin_channels,
        requires_grad_pe=hps.requires_grad.pe,
        requires_grad_flow=hps.requires_grad.flow,
        requires_grad_text_enc=hps.requires_grad.text_enc,
        requires_grad_dec=hps.requires_grad.dec
    )
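    # Inference only: switch to eval mode, then load the trained checkpoint.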
    _ = net_g.eval()
    _ = load_checkpoint(pytorch_model_file, net_g, None)
    return net_g
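
# Conversion pipeline: WORLD F0 extraction -> linear spectrogram -> one-item
# batch via the collate function -> SynthesizerTrn.voice_conversion on the GPU.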
def convert(net_g, hps, ssid, tsid, input_wav, output_wav, f0_scale):
    sr, signal = read(input_wav)
    signal = signal / hps.data.max_wav_value
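    # F0 with WORLD: dio estimates a coarse contour, stonemask refines it, and
    # convert_continuos_f0 makes it continuous across unvoiced frames, one value per hop.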
    _f0, _time = pw.dio(signal, hps.data.sampling_rate, frame_period=5.5)
    f0 = pw.stonemask(signal, _f0, _time, hps.data.sampling_rate)
    f0 = convert_continuos_f0(f0, int(signal.shape[0] / hps.data.hop_length))
    f0 = torch.from_numpy(f0.astype(np.float32))
    signal = torch.from_numpy(signal.astype(np.float32)).clone()
    signal = signal.unsqueeze(0)
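    # Linear spectrogram computed with the model's STFT settings (center=False).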
    spec = spectrogram_torch(signal, hps.data.filter_length, hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length, center=False)
    spec = torch.squeeze(spec, 0)
    sid = torch.LongTensor([int(ssid)])
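    # Collate the single utterance into a batch; it also derives the F0-based
    # inputs (sin, d) that voice_conversion consumes, scaling F0 by f0_factor.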
    data = TextAudioSpeakerCollate(
        sample_rate=hps.data.sampling_rate,
        hop_size=hps.data.hop_length,
        f0_factor=f0_scale
    )([(spec, sid, f0)])
    spec, spec_lengths, sid_src, sin, d = data
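    # Move all model inputs to the GPU and run voice conversion.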
    spec = spec.cuda()
    spec_lengths = spec_lengths.cuda()
    sid_src = sid_src.cuda()
    sin = sin.cuda()
    d = tuple([t[:1].cuda() for t in d])
    sid_target = torch.LongTensor([tsid]).cuda()
    audio = net_g.cuda().voice_conversion(spec, spec_lengths, sin, d, sid_src, sid_target)[0, 0].data.cpu().float().numpy()
    # print(audio)
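    # Note: the output sample rate is hardcoded to 24000 Hz, not taken from hps.data.sampling_rate.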
    write(output_wav, 24000, audio)

if __name__ == '__main__':
    print("main")
    parser = setupArgParser()
    args = parser.parse_args()

    CONFIG_PATH = args.c
    hps = get_hparams_from_file(CONFIG_PATH)
    pytorch_model_file = args.m
    ssid = args.s
    tsid = args.t
    input_wav = args.input
    output_wav = args.output
    f0_scale = args.f0_scale

    net_g = create_model(hps, pytorch_model_file)
    convert(net_g, hps, ssid, tsid, input_wav, output_wav, f0_scale)