voice-changer/server/voice_changer/VoiceChanger.py

499 lines
21 KiB
Python
Executable File

import sys
sys.path.append("MMVC_Client/python")
from const import ERROR_NO_ONNX_SESSION, TMP_DIR
import torch
import os
import traceback
import numpy as np
from dataclasses import dataclass, asdict
import resampy
import onnxruntime
from symbols import symbols
from models import SynthesizerTrn
import pyworld as pw
from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
from voice_changer.MMVCv15 import MMVCv15
import time
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
import wave
import matplotlib
matplotlib.use('Agg')
import pylab
import librosa
import librosa.display
SAMPLING_RATE = 24000
class MockStream:
"""gi
オーディオストリーミング入出力をファイル入出力にそのまま置き換えるためのモック
"""
def __init__(self, sampling_rate):
self.sampling_rate = sampling_rate
self.start_count = 2
self.end_count = 2
self.fr = None
self.fw = None
def open_inputfile(self, input_filename):
self.fr = wave.open(input_filename, 'rb')
def open_outputfile(self, output_filename):
self.fw = wave.open(output_filename, 'wb')
self.fw.setnchannels(1)
self.fw.setsampwidth(2)
self.fw.setframerate(self.sampling_rate)
def read(self, length, exception_on_overflow=False):
if self.start_count > 0:
wav = bytes(length * 2)
self.start_count -= 1 # 最初の2回はダミーの空データ送る
else:
wav = self.fr.readframes(length)
if len(wav) <= 0: # データなくなってから最後の2回はダミーの空データを送る
wav = bytes(length * 2)
self.end_count -= 1
if self.end_count < 0:
Hyperparameters.VC_END_FLAG = True
return wav
def write(self, wav):
self.fw.writeframes(wav)
def stop_stream(self):
pass
def close(self):
if self.fr != None:
self.fr.close()
self.fr = None
if self.fw != None:
self.fw.close()
self.fw = None
@dataclass
class VocieChangerSettings():
gpu: int = 0
srcId: int = 0
dstId: int = 101
inputSampleRate: int = 24000 # 48000 or 24000
crossFadeOffsetRate: float = 0.1
crossFadeEndRate: float = 0.9
crossFadeOverlapSize: int = 4096
f0Factor: float = 1.0
f0Detector: str = "dio" # dio or harvest
recordIO: int = 0 # 0:off, 1:on
framework: str = "PyTorch" # PyTorch or ONNX
pyTorchModelFile: str = ""
onnxModelFile: str = ""
configFile: str = ""
# ↓mutableな物だけ列挙
intData = ["gpu", "srcId", "dstId", "inputSampleRate", "crossFadeOverlapSize", "recordIO"]
floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "f0Factor"]
strData = ["framework", "f0Detector"]
def readMicrophone(queue, sid, deviceIndex):
print("READ MIC", queue, sid, deviceIndex)
class VoiceChanger():
def __init__(self):
# 初期化
self.settings = VocieChangerSettings()
self.unpackedData_length = 0
self.net_g = None
self.onnx_session = None
self.currentCrossFadeOffsetRate = 0
self.currentCrossFadeEndRate = 0
self.currentCrossFadeOverlapSize = 0
self.voiceChanger = MMVCv15()
self.gpu_num = torch.cuda.device_count()
self.text_norm = torch.LongTensor([0, 6, 0])
self.audio_buffer = torch.zeros(1, 0)
self.prev_audio = np.zeros(1)
self.mps_enabled = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
def _setupRecordIO(self):
# IO Recorder Setup
if hasattr(self, "stream_out"):
self.stream_out.close()
mock_stream_out = MockStream(24000)
stream_output_file = os.path.join(TMP_DIR, "out.wav")
if os.path.exists(stream_output_file):
print("delete old analyze file.", stream_output_file)
os.remove(stream_output_file)
else:
print("old analyze file not exist.", stream_output_file)
mock_stream_out.open_outputfile(stream_output_file)
self.stream_out = mock_stream_out
if hasattr(self, "stream_in"):
self.stream_in.close()
mock_stream_in = MockStream(24000)
stream_input_file = os.path.join(TMP_DIR, "in.wav")
if os.path.exists(stream_input_file):
print("delete old analyze file.", stream_input_file)
os.remove(stream_input_file)
else:
print("old analyze file not exist.", stream_output_file)
mock_stream_in.open_outputfile(stream_input_file)
self.stream_in = mock_stream_in
def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
self.settings.configFile = config
self.hps = get_hparams_from_file(config)
if pyTorch_model_file != None:
self.settings.pyTorchModelFile = pyTorch_model_file
if onnx_model_file:
self.settings.onnxModelFile = onnx_model_file
# PyTorchモデル生成
if pyTorch_model_file != None:
self.net_g = SynthesizerTrn(
spec_channels=self.hps.data.filter_length // 2 + 1,
segment_size=self.hps.train.segment_size // self.hps.data.hop_length,
inter_channels=self.hps.model.inter_channels,
hidden_channels=self.hps.model.hidden_channels,
upsample_rates=self.hps.model.upsample_rates,
upsample_initial_channel=self.hps.model.upsample_initial_channel,
upsample_kernel_sizes=self.hps.model.upsample_kernel_sizes,
n_flow=self.hps.model.n_flow,
dec_out_channels=1,
dec_kernel_size=7,
n_speakers=self.hps.data.n_speakers,
gin_channels=self.hps.model.gin_channels,
requires_grad_pe=self.hps.requires_grad.pe,
requires_grad_flow=self.hps.requires_grad.flow,
requires_grad_text_enc=self.hps.requires_grad.text_enc,
requires_grad_dec=self.hps.requires_grad.dec
)
self.net_g.eval()
load_checkpoint(pyTorch_model_file, self.net_g, None)
# utils.load_checkpoint(pyTorch_model_file, self.net_g, None)
# ONNXモデル生成
if onnx_model_file != None:
ort_options = onnxruntime.SessionOptions()
ort_options.intra_op_num_threads = 8
self.onnx_session = onnxruntime.InferenceSession(
onnx_model_file,
providers=providers
)
return self.get_info()
def destroy(self):
del self.net_g
del self.onnx_session
def get_info(self):
data = asdict(self.settings)
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
for f in files:
if data[f] != None and os.path.exists(data[f]):
data[f] = os.path.basename(data[f])
else:
data[f] = ""
return data
def _get_f0_dio(self, y, sr=SAMPLING_RATE):
_f0, time = pw.dio(y, sr, frame_period=5)
f0 = pw.stonemask(y, _f0, time, sr)
time = np.linspace(0, y.shape[0] / sr, len(time))
return f0, time
def _get_f0_harvest(self, y, sr=SAMPLING_RATE):
_f0, time = pw.harvest(y, sr, frame_period=5)
f0 = pw.stonemask(y, _f0, time, sr)
time = np.linspace(0, y.shape[0] / sr, len(time))
return f0, time
def update_setteings(self, key: str, val: any):
if key == "onnxExecutionProvider" and self.onnx_session != None:
if val == "CUDAExecutionProvider":
if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
self.settings.gpu = 0
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
else:
self.onnx_session.set_providers(providers=[val])
elif key in self.settings.intData:
setattr(self.settings, key, int(val))
if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session != None:
providers = self.onnx_session.get_providers()
print("Providers:", providers)
if "CUDAExecutionProvider" in providers:
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
self.unpackedData_length = 0
if key == "recordIO" and val == 1:
self._setupRecordIO()
if key == "recordIO" and val == 0:
pass
if key == "recordIO" and val == 2:
try:
stream_input_file = os.path.join(TMP_DIR, "in.wav")
analyze_file_dio = os.path.join(TMP_DIR, "analyze-dio.png")
analyze_file_harvest = os.path.join(TMP_DIR, "analyze-harvest.png")
y, sr = librosa.load(stream_input_file, SAMPLING_RATE)
y = y.astype(np.float64)
spec = librosa.amplitude_to_db(np.abs(librosa.stft(y, n_fft=2048, win_length=2048, hop_length=128)), ref=np.max)
f0_dio, times = self._get_f0_dio(y)
f0_harvest, times = self._get_f0_harvest(y)
pylab.close()
HOP_LENGTH = 128
img = librosa.display.specshow(spec, sr=SAMPLING_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', )
pylab.plot(times, f0_dio, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
pylab.savefig(analyze_file_dio)
pylab.close()
HOP_LENGTH = 128
img = librosa.display.specshow(spec, sr=SAMPLING_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', )
pylab.plot(times, f0_harvest, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
pylab.savefig(analyze_file_harvest)
except Exception as e:
print("recordIO exception", e)
elif key in self.settings.floatData:
setattr(self.settings, key, float(val))
elif key in self.settings.strData:
setattr(self.settings, key, str(val))
else:
print(f"{key} is not mutalbe variable!")
return self.get_info()
def _generate_strength(self, dataLength: int):
if self.unpackedData_length != dataLength or \
self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or \
self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or \
self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
self.unpackedData_length = dataLength
self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize
overlapSize = min(self.settings.crossFadeOverlapSize, self.unpackedData_length)
cf_offset = int(overlapSize * self.settings.crossFadeOffsetRate)
cf_end = int(overlapSize * self.settings.crossFadeEndRate)
cf_range = cf_end - cf_offset
percent = np.arange(cf_range) / cf_range
np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2
self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength, np.zeros(overlapSize - cf_offset - len(np_prev_strength))])
self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(overlapSize - cf_offset - len(np_cur_strength))])
self.prev_strength = torch.FloatTensor(self.np_prev_strength)
self.cur_strength = torch.FloatTensor(self.np_cur_strength)
# torch.set_printoptions(edgeitems=2100)
print("Generated Strengths")
# print(f"cross fade: start:{cf_offset} end:{cf_end} range:{cf_range}")
# print(f"target_len:{unpackedData.shape[0]}, prev_len:{len(self.prev_strength)} cur_len:{len(self.cur_strength)}")
# print("Prev", self.prev_strength)
# print("Cur", self.cur_strength)
# ひとつ前の結果とサイズが変わるため、記録は消去する。
if hasattr(self, 'prev_audio1') == True:
delattr(self, "prev_audio1")
def _generate_input(self, unpackedData: any, convertSize: int):
# 今回変換するデータをテンソルとして整形する
audio = torch.FloatTensor(unpackedData.astype(np.float32)) # float32でtensorfを作成
audio_norm = audio / self.hps.data.max_wav_value # normalize
audio_norm = audio_norm.unsqueeze(0) # unsqueeze
self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1) # 過去のデータに連結
# audio_norm = self.audio_buffer[:, -(convertSize + 1280 * 2):] # 変換対象の部分だけ抽出
audio_norm = self.audio_buffer[:, -(convertSize):] # 変換対象の部分だけ抽出
self.audio_buffer = audio_norm
# TBD: numpy <--> pytorch変換が行ったり来たりしているが、まずは動かすことを最優先。
audio_norm_np = audio_norm.squeeze().numpy().astype(np.float64)
if self.settings.f0Detector == "dio":
_f0, _time = pw.dio(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5)
f0 = pw.stonemask(audio_norm_np, _f0, _time, self.hps.data.sampling_rate)
else:
f0, t = pw.harvest(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5, f0_floor=71.0, f0_ceil=1000.0)
f0 = convert_continuos_f0(f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length))
f0 = torch.from_numpy(f0.astype(np.float32))
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
# dispose_stft_specs = 2
# spec = spec[:, dispose_stft_specs:-dispose_stft_specs]
# f0 = f0[dispose_stft_specs:-dispose_stft_specs]
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(self.settings.srcId)])
# data = (self.text_norm, spec, audio_norm, sid)
# data = TextAudioSpeakerCollate()([data])
data = TextAudioSpeakerCollate(
sample_rate=self.hps.data.sampling_rate,
hop_size=self.hps.data.hop_length,
f0_factor=self.settings.f0Factor
)([(spec, sid, f0)])
return data
def _onnx_inference(self, data):
if hasattr(self, "onnx_session") == False or self.onnx_session == None:
print("[Voice Changer] No ONNX session.")
return np.zeros(1).astype(np.int16)
spec, spec_lengths, sid_src, sin, d = data
sid_tgt1 = torch.LongTensor([self.settings.dstId])
audio1 = self.onnx_session.run(
["audio"],
{
"specs": spec.numpy(),
"lengths": spec_lengths.numpy(),
"sin": sin.numpy(),
"d0": d[0][:1].numpy(),
"d1": d[1][:1].numpy(),
"d2": d[2][:1].numpy(),
"d3": d[3][:1].numpy(),
"sid_src": sid_src.numpy(),
"sid_tgt": sid_tgt1.numpy()
})[0][0, 0] * self.hps.data.max_wav_value
return audio1
def _pyTorch_inference(self, data):
if hasattr(self, "net_g") == False or self.net_g == None:
print("[Voice Changer] No pyTorch session.")
return np.zeros(1).astype(np.int16)
if self.settings.gpu < 0 or self.gpu_num == 0:
dev = torch.device("cpu")
else:
dev = torch.device("cuda", index=self.settings.gpu)
with torch.no_grad():
spec, spec_lengths, sid_src, sin, d = data
spec = spec.to(dev)
spec_lengths = spec_lengths.to(dev)
sid_src = sid_src.to(dev)
sin = sin.to(dev)
d = tuple([d[:1].to(dev) for d in d])
sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, sin, d, sid_src, sid_target)[0, 0].data * self.hps.data.max_wav_value
result = audio1.float().cpu().numpy()
return result
def on_request(self, unpackedData: any):
with Timer("pre-process") as t:
if self.settings.inputSampleRate != 24000:
unpackedData = resampy.resample(unpackedData, 48000, 24000)
convertSize = unpackedData.shape[0] + min(self.settings.crossFadeOverlapSize, unpackedData.shape[0])
# print(convertSize, unpackedData.shape[0])
if convertSize < 8192:
convertSize = 8192
if convertSize % 128 != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。
convertSize = convertSize + (128 - (convertSize % 128))
self._generate_strength(unpackedData.shape[0])
data = self._generate_input(unpackedData, convertSize)
preprocess_time = t.secs
with Timer("main-process") as t:
try:
if self.settings.framework == "ONNX":
audio = self._onnx_inference(data)
# result = self.voiceChanger._onnx_inference(data, unpackedData.shape[0])
else:
audio = self._pyTorch_inference(data)
# result = self.voiceChanger._pyTorch_inference(data, unpackedData.shape[0])
inputSize = unpackedData.shape[0]
if hasattr(self, 'np_prev_audio1') == True:
np.set_printoptions(threshold=10000)
overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
prev_overlap = self.np_prev_audio1[-1 * overlapSize:]
cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
# print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
# print(">>>>>>>>>>>", -1 * (inputSize + overlapSize), -1 * inputSize, self.np_prev_audio1.shape, overlapSize)
powered_prev = prev_overlap * self.np_prev_strength
powered_cur = cur_overlap * self.np_cur_strength
powered_result = powered_prev + powered_cur
cur = audio[-1 * inputSize:-1 * overlapSize]
result = np.concatenate([powered_result, cur], axis=0)
else:
result = np.zeros(1).astype(np.int16)
self.np_prev_audio1 = audio
except Exception as e:
print("VC PROCESSING!!!! EXCEPTION!!!", e)
print(traceback.format_exc())
if hasattr(self, "np_prev_audio1"):
del self.np_prev_audio1
if hasattr(self, "prev_audio1"):
del self.prev_audio1
return np.zeros(1).astype(np.int16), [0, 0, 0]
mainprocess_time = t.secs
with Timer("post-process") as t:
result = result.astype(np.int16)
# print("on_request result size:",result.shape)
if self.settings.recordIO == 1:
self.stream_in.write(unpackedData.astype(np.int16).tobytes())
self.stream_out.write(result.tobytes())
if self.settings.inputSampleRate != 24000:
result = resampy.resample(result, 24000, 48000).astype(np.int16)
postprocess_time = t.secs
perf = [preprocess_time, mainprocess_time, postprocess_time]
return result, perf
##############
class Timer(object):
def __init__(self, title: str):
self.title = title
def __enter__(self):
self.start = time.time()
return self
def __exit__(self, *args):
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs