mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-01-23 05:25:01 +03:00
105 lines
3.9 KiB
Python
105 lines
3.9 KiB
Python
import sys
|
|
sys.path.append(".") # sifiganへのパスが必要。
|
|
import argparse
|
|
|
|
import torch
|
|
|
|
import numpy as np
|
|
from scipy.io.wavfile import write, read
|
|
import pyworld as pw
|
|
from logging import getLogger
|
|
|
|
# import utils
|
|
from models import SynthesizerTrn
|
|
|
|
# from mmvc_client import Hyperparameters  # <- would pull in pyaudio and other dependencies, so only the required logic was copy-pasted instead
|
|
from client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
|
|
|
|
logger = getLogger(__name__)
|
|
|
|
|
|
def setupArgParser():
    """Build the command-line parser for this conversion script.

    Recognised options:
        -c          path to the config.json (required)
        -m          path to the PyTorch model file (optional)
        -o          path to the ONNX model file (optional)
        -s          source speaker id, int (required)
        -t          target speaker id, int (required)
        --input     input wav file (required)
        --output    output wav file (required)
        --f0_scale  f0 scale, float (required)

    Returns:
        argparse.ArgumentParser: the configured parser. Parsing is left to
        the caller (``parser.parse_args()``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-c", type=str, required=True, help="path for the config.json")
    parser.add_argument("-m", type=str, help="path for the pytorch model file")
    parser.add_argument("-o", type=str, help="path for the onnx model file")
    parser.add_argument("-s", type=int, required=True, help="source speaker id")
    parser.add_argument("-t", type=int, required=True, help="target speaker id")
    parser.add_argument("--input", type=str, required=True, help="input wav file")
    # Help text previously read "outpu wav file" (typo).
    parser.add_argument("--output", type=str, required=True, help="output wav file")
    parser.add_argument("--f0_scale", type=float, required=True, help="f0 scale")
    return parser
|
|
|
|
|
|
def create_model(hps, pytorch_model_file):
    """Instantiate a SynthesizerTrn from hyperparameters and load its weights.

    Args:
        hps: hyperparameter object; its ``data``, ``train``, ``model`` and
            ``requires_grad`` sections are read to configure the network.
        pytorch_model_file: checkpoint path handed to ``load_checkpoint``.

    Returns:
        The SynthesizerTrn instance, put into eval mode and then passed
        through ``load_checkpoint``.
    """
    data_cfg = hps.data
    model_cfg = hps.model
    grad_cfg = hps.requires_grad

    generator_kwargs = dict(
        spec_channels=data_cfg.filter_length // 2 + 1,
        segment_size=hps.train.segment_size // data_cfg.hop_length,
        inter_channels=model_cfg.inter_channels,
        hidden_channels=model_cfg.hidden_channels,
        upsample_rates=model_cfg.upsample_rates,
        upsample_initial_channel=model_cfg.upsample_initial_channel,
        upsample_kernel_sizes=model_cfg.upsample_kernel_sizes,
        n_flow=model_cfg.n_flow,
        # The decoder output channels and kernel size are fixed here rather
        # than read from the config.
        dec_out_channels=1,
        dec_kernel_size=7,
        n_speakers=data_cfg.n_speakers,
        gin_channels=model_cfg.gin_channels,
        requires_grad_pe=grad_cfg.pe,
        requires_grad_flow=grad_cfg.flow,
        requires_grad_text_enc=grad_cfg.text_enc,
        requires_grad_dec=grad_cfg.dec,
    )

    generator = SynthesizerTrn(**generator_kwargs)
    generator.eval()

    # No optimizer is restored (third argument is None).
    load_checkpoint(pytorch_model_file, generator, None)
    return generator
|
|
|
|
|
|
def convert(hps, ssid, tsid, input, output, f0_scale, model=None):
    """Convert the voice in a wav file from one speaker to another and save it.

    The input wav is read, scaled by ``hps.data.max_wav_value``, analysed for
    f0 with pyworld (``dio`` + ``stonemask``), turned into a spectrogram, and
    fed through the model's ``voice_conversion``. The result is written to
    ``output`` as a wav file.

    Args:
        hps: hyperparameter object; the ``data`` section is read
            (``max_wav_value``, ``sampling_rate``, ``hop_length``,
            ``filter_length``, ``win_length``).
        ssid: source speaker id.
        tsid: target speaker id.
        input: path of the wav file to convert. (The name shadows the
            builtin; kept for backward compatibility with existing callers.)
        output: path of the wav file to write.
        f0_scale: value passed as ``f0_factor`` to ``TextAudioSpeakerCollate``.
        model: network exposing ``voice_conversion``. When omitted, the
            module-level ``net_g`` is used, which matches the previous
            behaviour of this function.

    Returns:
        None. The converted audio is written to ``output``.
    """
    if model is None:
        # Backward compatible: the function used to rely on the global
        # ``net_g`` bound by the script entry point.
        model = net_g

    # Use the GPU when present, otherwise run on CPU instead of crashing in
    # ``.cuda()``.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    sr, signal = read(input)
    if sr != hps.data.sampling_rate:
        # The signal is analysed below as if it were sampled at
        # hps.data.sampling_rate, so a mismatch gives a wrong f0 / spectrogram.
        logger.warning(
            "input sample rate %s differs from configured sampling_rate %s",
            sr,
            hps.data.sampling_rate,
        )

    signal = signal / hps.data.max_wav_value

    # f0 extraction with pyworld: coarse estimate (dio) refined by stonemask.
    _f0, _time = pw.dio(signal, hps.data.sampling_rate, frame_period=5.5)
    f0 = pw.stonemask(signal, _f0, _time, hps.data.sampling_rate)
    f0 = convert_continuos_f0(f0, int(signal.shape[0] / hps.data.hop_length))
    f0 = torch.from_numpy(f0.astype(np.float32))

    signal = torch.from_numpy(signal.astype(np.float32)).clone()
    signal = signal.unsqueeze(0)
    spec = spectrogram_torch(
        signal,
        hps.data.filter_length,
        hps.data.sampling_rate,
        hps.data.hop_length,
        hps.data.win_length,
        center=False,
    )
    spec = torch.squeeze(spec, 0)
    sid = torch.LongTensor([int(ssid)])

    # Collate a single-item batch of (spectrogram, speaker id, f0).
    collate = TextAudioSpeakerCollate(
        sample_rate=hps.data.sampling_rate,
        hop_size=hps.data.hop_length,
        f0_factor=f0_scale,
    )
    spec, spec_lengths, sid_src, sin, d = collate([(spec, sid, f0)])

    spec = spec.to(device)
    spec_lengths = spec_lengths.to(device)
    sid_src = sid_src.to(device)
    sin = sin.to(device)
    # Keep only the first entry along dim 0 of each element of ``d``.
    d = tuple(item[:1].to(device) for item in d)
    sid_target = torch.LongTensor([tsid]).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        converted = model.to(device).voice_conversion(
            spec, spec_lengths, sin, d, sid_src, sid_target
        )
    audio = converted[0, 0].detach().cpu().float().numpy()

    # NOTE(review): the output rate is hard-coded to 24000 rather than taken
    # from hps.data.sampling_rate — confirm the model always emits 24 kHz.
    write(output, 24000, audio)
|
|
|
|
|
|
if __name__ == '__main__':
    print("main")
    args = setupArgParser().parse_args()

    # Hyperparameters come from the config file given with -c.
    hps = get_hparams_from_file(args.c)

    # convert() looks the model up through the module-level name ``net_g``,
    # so it has to be bound under exactly this name before the call.
    net_g = create_model(hps, args.m)

    convert(
        hps,
        args.s,
        args.t,
        args.input,
        args.output,
        args.f0_scale,
    )
|