WIP: refactor, separate mmvc main process

wataru 2023-03-08 00:38:09 +09:00
parent 6f97320c0d
commit bbb068be72
3 changed files with 100 additions and 381 deletions


@@ -28,8 +28,7 @@ class MMVC_Namespace(socketio.AsyncNamespace):
             print(data)
             await self.emit('response', [timestamp, 0], to=sid)
         else:
-            # tuple of short
-            unpackedData = struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data)
+            unpackedData = np.array(struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data)).astype(np.int16)
             # audio1, perf = self.voiceChangerManager.changeVoice(unpackedData)
             res = self.voiceChangerManager.changeVoice(unpackedData)
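The new line converts the little-endian 16-bit payload straight into an int16 NumPy array, which is what the refactored voice changer expects. A minimal sketch of that conversion, assuming `data` is raw little-endian PCM bytes (the `np.frombuffer` variant is only an equivalent shortcut, not the call the commit uses):

```python
import struct
import numpy as np

def unpack_pcm16(data: bytes) -> np.ndarray:
    """Sketch: decode little-endian 16-bit PCM bytes into an int16 array."""
    n = len(data) // struct.calcsize('<h')
    via_struct = np.array(struct.unpack('<%sh' % n, data)).astype(np.int16)  # what the diff does
    via_frombuffer = np.frombuffer(data, dtype='<i2').astype(np.int16)       # equivalent shortcut
    assert np.array_equal(via_struct, via_frombuffer)
    return via_struct
```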


@@ -1,18 +1,14 @@
 import sys
 sys.path.append("MMVC_Client/python")
-import os
 from dataclasses import dataclass, asdict
+import os
 import numpy as np
 import torch
 import onnxruntime
 import pyworld as pw
+from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
 from models import SynthesizerTrn
-from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
-from const import ERROR_NO_ONNX_SESSION, TMP_DIR
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
@@ -23,15 +19,8 @@ class MMVCv15Settings():
     srcId: int = 0
     dstId: int = 101
-    inputSampleRate: int = 24000  # 48000 or 24000
-    crossFadeOffsetRate: float = 0.1
-    crossFadeEndRate: float = 0.9
-    crossFadeOverlapSize: int = 4096
     f0Factor: float = 1.0
     f0Detector: str = "dio"  # dio or harvest
-    recordIO: int = 0  # 0:off, 1:on
     framework: str = "PyTorch"  # PyTorch or ONNX
     pyTorchModelFile: str = ""
@@ -39,28 +28,23 @@ class MMVCv15Settings():
     configFile: str = ""
     # only the mutable fields are listed below
-    intData = ["gpu", "srcId", "dstId", "inputSampleRate", "crossFadeOverlapSize", "recordIO"]
-    floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "f0Factor"]
+    intData = ["gpu", "srcId", "dstId"]
+    floatData = ["f0Factor"]
     strData = ["framework", "f0Detector"]
 class MMVCv15:
     def __init__(self):
-        # initialization
         self.settings = MMVCv15Settings()
         self.net_g = None
         self.onnx_session = None
         self.gpu_num = torch.cuda.device_count()
-        self.text_norm = torch.LongTensor([0, 6, 0])
-        self.audio_buffer = torch.zeros(1, 0)
-        self.mps_enabled = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
-        print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
     def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
         self.settings.configFile = config
         self.hps = get_hparams_from_file(config)
         if pyTorch_model_file != None:
             self.settings.pyTorchModelFile = pyTorch_model_file
         if onnx_model_file:
@@ -88,6 +72,7 @@ class MMVCv15:
             )
             self.net_g.eval()
             load_checkpoint(pyTorch_model_file, self.net_g, None)
+            # utils.load_checkpoint(pyTorch_model_file, self.net_g, None)
         # build the ONNX model
         if onnx_model_file != None:
@@ -99,23 +84,6 @@ class MMVCv15:
             )
         return self.get_info()
-    def destroy(self):
-        del self.net_g
-        del self.onnx_session
-    def get_info(self):
-        data = asdict(self.settings)
-        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
-        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
-        for f in files:
-            if data[f] != None and os.path.exists(data[f]):
-                data[f] = os.path.basename(data[f])
-            else:
-                data[f] = ""
-        return data
     def update_setteings(self, key: str, val: any):
         if key == "onnxExecutionProvider" and self.onnx_session != None:
             if val == "CUDAExecutionProvider":
@@ -133,49 +101,64 @@ class MMVCv15:
                 if "CUDAExecutionProvider" in providers:
                     provider_options = [{'device_id': self.settings.gpu}]
                     self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
-            if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
-                self.unpackedData_length = 0
         elif key in self.settings.floatData:
             setattr(self.settings, key, float(val))
         elif key in self.settings.strData:
             setattr(self.settings, key, str(val))
         else:
-            print(f"{key} is not mutalbe variable!")
-        return self.get_info()
-    def _generate_input(self, unpackedData: any, convertSize: int):
-        # shape the data to be converted this time into a tensor
-        audio = torch.FloatTensor(unpackedData.astype(np.float32))  # create the tensor as float32
-        audio_norm = audio / self.hps.data.max_wav_value  # normalize
-        audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
-        self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1)  # concatenate with the past data
-        # audio_norm = self.audio_buffer[:, -(convertSize + 1280 * 2):]  # extract only the portion to be converted
-        audio_norm = self.audio_buffer[:, -(convertSize):]  # extract only the portion to be converted
-        self.audio_buffer = audio_norm
-        # TBD: the data bounces back and forth between numpy and pytorch; getting it working comes first.
-        audio_norm_np = audio_norm.squeeze().numpy().astype(np.float64)
-        if self.settings.f0Detector == "dio":
+            return False
+        return True
+    def get_info(self):
+        data = asdict(self.settings)
+        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
+        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
+        for f in files:
+            if data[f] != None and os.path.exists(data[f]):
+                data[f] = os.path.basename(data[f])
+            else:
+                data[f] = ""
+        return data
+    def _get_f0(self, detector: str, newData: any):
+        audio_norm_np = newData.astype(np.float64)
+        if detector == "dio":
             _f0, _time = pw.dio(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5)
             f0 = pw.stonemask(audio_norm_np, _f0, _time, self.hps.data.sampling_rate)
         else:
             f0, t = pw.harvest(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5, f0_floor=71.0, f0_ceil=1000.0)
         f0 = convert_continuos_f0(f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length))
         f0 = torch.from_numpy(f0.astype(np.float32))
+        return f0
+    def _get_spec(self, newData: any):
+        audio = torch.FloatTensor(newData)
+        audio_norm = audio / self.hps.data.max_wav_value  # normalize
+        audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
         spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
                                  self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
                                  center=False)
-        # dispose_stft_specs = 2
-        # spec = spec[:, dispose_stft_specs:-dispose_stft_specs]
-        # f0 = f0[dispose_stft_specs:-dispose_stft_specs]
         spec = torch.squeeze(spec, 0)
+        return spec
+    def generate_input(self, newData: any, convertSize: int):
+        newData = newData.astype(np.float32)
+        if hasattr(self, "audio_buffer"):
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with the past data
+        else:
+            self.audio_buffer = newData
+        self.audio_buffer = self.audio_buffer[-(convertSize):]  # extract only the portion to be converted
+        f0 = self._get_f0(self.settings.f0Detector, self.audio_buffer)  # generate f0
+        spec = self._get_spec(self.audio_buffer)
         sid = torch.LongTensor([int(self.settings.srcId)])
-        # data = (self.text_norm, spec, audio_norm, sid)
-        # data = TextAudioSpeakerCollate()([data])
         data = TextAudioSpeakerCollate(
             sample_rate=self.hps.data.sampling_rate,
             hop_size=self.hps.data.hop_length,
@@ -184,17 +167,13 @@ class MMVCv15:
         return data
-    def _onnx_inference(self, data, inputSize):
+    def _onnx_inference(self, data):
         if hasattr(self, "onnx_session") == False or self.onnx_session == None:
             print("[Voice Changer] No ONNX session.")
             return np.zeros(1).astype(np.int16)
-        # x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x for x in data]
-        # sid_tgt1 = torch.LongTensor([self.settings.dstId])
         spec, spec_lengths, sid_src, sin, d = data
         sid_tgt1 = torch.LongTensor([self.settings.dstId])
-        # if spec.size()[2] >= 8:
         audio1 = self.onnx_session.run(
             ["audio"],
             {
@@ -208,107 +187,38 @@ class MMVCv15:
                 "sid_src": sid_src.numpy(),
                 "sid_tgt": sid_tgt1.numpy()
             })[0][0, 0] * self.hps.data.max_wav_value
-        if hasattr(self, 'np_prev_audio1') == True:
-            overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
-            prev_overlap = self.np_prev_audio1[-1 * overlapSize:]
-            cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
-            # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
-            # print(">>>>>>>>>>>", -1*(inputSize + overlapSize) , -1*inputSize)
-            powered_prev = prev_overlap * self.np_prev_strength
-            powered_cur = cur_overlap * self.np_cur_strength
-            powered_result = powered_prev + powered_cur
-            cur = audio1[-1 * inputSize:-1 * overlapSize]
-            result = np.concatenate([powered_result, cur], axis=0)
-        else:
-            result = np.zeros(1).astype(np.int16)
-        self.np_prev_audio1 = audio1
-        return result
-    def _pyTorch_inference(self, data, inputSize):
+        return audio1
+    def _pyTorch_inference(self, data):
         if hasattr(self, "net_g") == False or self.net_g == None:
             print("[Voice Changer] No pyTorch session.")
             return np.zeros(1).astype(np.int16)
         if self.settings.gpu < 0 or self.gpu_num == 0:
-            with torch.no_grad():
-                spec, spec_lengths, sid_src, sin, d = data
-                spec = spec.cpu()
-                spec_lengths = spec_lengths.cpu()
-                sid_src = sid_src.cpu()
-                sin = sin.cpu()
-                d = tuple([d[:1].cpu() for d in d])
-                sid_target = torch.LongTensor([self.settings.dstId]).cpu()
-                audio1 = self.net_g.cpu().voice_conversion(spec, spec_lengths, sin, d, sid_src, sid_target)[0, 0].data * self.hps.data.max_wav_value
-                if self.prev_strength.device != torch.device('cpu'):
-                    print(f"prev_strength move from {self.prev_strength.device} to cpu")
-                    self.prev_strength = self.prev_strength.cpu()
-                if self.cur_strength.device != torch.device('cpu'):
-                    print(f"cur_strength move from {self.cur_strength.device} to cpu")
-                    self.cur_strength = self.cur_strength.cpu()
-                if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cpu'):  # if prev_audio1 is not on the desired device, skip this round
-                    overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
-                    prev_overlap = self.prev_audio1[-1 * overlapSize:]
-                    cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
-                    powered_prev = prev_overlap * self.prev_strength
-                    powered_cur = cur_overlap * self.cur_strength
-                    powered_result = powered_prev + powered_cur
-                    cur = audio1[-1 * inputSize:-1 * overlapSize]  # the raw part of this input (the input minus the next crossfade part)
-                    result = torch.cat([powered_result, cur], axis=0)  # join the crossfaded part and the raw part of this input
-                else:
-                    cur = audio1[-2 * inputSize:-1 * inputSize]
-                    result = cur
-                self.prev_audio1 = audio1
-                result = result.cpu().float().numpy()
+            dev = torch.device("cpu")
         else:
-            with torch.no_grad():
-                spec, spec_lengths, sid_src, sin, d = data
-                spec = spec.cuda(self.settings.gpu)
-                spec_lengths = spec_lengths.cuda(self.settings.gpu)
-                sid_src = sid_src.cuda(self.settings.gpu)
-                sin = sin.cuda(self.settings.gpu)
-                d = tuple([d[:1].cuda(self.settings.gpu) for d in d])
-                sid_target = torch.LongTensor([self.settings.dstId]).cuda(self.settings.gpu)
-                # audio1 = self.net_g.cuda(self.settings.gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src,
-                #                                                              sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value
-                audio1 = self.net_g.cuda(self.settings.gpu).voice_conversion(spec, spec_lengths, sin, d,
-                                                                             sid_src, sid_target)[0, 0].data * self.hps.data.max_wav_value
-                if self.prev_strength.device != torch.device('cuda', self.settings.gpu):
-                    print(f"prev_strength move from {self.prev_strength.device} to gpu{self.settings.gpu}")
-                    self.prev_strength = self.prev_strength.cuda(self.settings.gpu)
-                if self.cur_strength.device != torch.device('cuda', self.settings.gpu):
-                    print(f"cur_strength move from {self.cur_strength.device} to gpu{self.settings.gpu}")
-                    self.cur_strength = self.cur_strength.cuda(self.settings.gpu)
-                if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cuda', self.settings.gpu):
-                    overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
-                    prev_overlap = self.prev_audio1[-1 * overlapSize:]
-                    cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
-                    powered_prev = prev_overlap * self.prev_strength
-                    powered_cur = cur_overlap * self.cur_strength
-                    powered_result = powered_prev + powered_cur
-                    # print(overlapSize, prev_overlap.shape, cur_overlap.shape, self.prev_strength.shape, self.cur_strength.shape)
-                    # print(self.prev_audio1.shape, audio1.shape, inputSize, overlapSize)
-                    cur = audio1[-1 * inputSize:-1 * overlapSize]  # the raw part of this input (the input minus the next crossfade part)
-                    result = torch.cat([powered_result, cur], axis=0)  # join the crossfaded part and the raw part of this input
-                else:
-                    cur = audio1[-2 * inputSize:-1 * inputSize]
-                    result = cur
-                self.prev_audio1 = audio1
-                result = result.cpu().float().numpy()
+            dev = torch.device("cuda", index=self.settings.gpu)
+        with torch.no_grad():
+            spec, spec_lengths, sid_src, sin, d = data
+            spec = spec.to(dev)
+            spec_lengths = spec_lengths.to(dev)
+            sid_src = sid_src.to(dev)
+            sin = sin.to(dev)
+            d = tuple([d[:1].to(dev) for d in d])
+            sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
+            audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, sin, d, sid_src, sid_target)[0, 0].data * self.hps.data.max_wav_value
+            result = audio1.float().cpu().numpy()
         return result
+    def inference(self, data):
+        if self.settings.framework == "ONNX":
+            audio = self._onnx_inference(data)
+        else:
+            audio = self._pyTorch_inference(data)
+        return audio
+    def destroy(self):
+        del self.net_g
+        del self.onnx_session
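After this hunk, MMVCv15 owns everything model-specific (the settings it understands, f0 extraction, spectrogram computation, collation, and ONNX/PyTorch inference) and exposes it through a small interface that VoiceChanger calls. A rough sketch of that contract, with method names taken from the diff but hypothetical type hints and docstrings added for illustration:

```python
import numpy as np

class MMVCv15Like:
    """Sketch of the surface the refactor gives MMVCv15; signatures simplified."""

    def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None) -> dict:
        ...  # build net_g and/or onnx_session, then return get_info()

    def update_setteings(self, key: str, val) -> bool:
        ...  # True if the key belongs to this model, False so the caller can report an unknown key

    def get_info(self) -> dict:
        ...  # asdict(settings) plus ONNX providers and basenames of the model files

    def generate_input(self, newData: np.ndarray, convertSize: int):
        ...  # buffer audio, compute f0 and spectrogram, collate into model inputs

    def inference(self, data) -> np.ndarray:
        ...  # dispatch to _onnx_inference or _pyTorch_inference based on settings.framework

    def destroy(self) -> None:
        ...  # drop the model handles
```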


@@ -2,7 +2,7 @@
 import sys
 sys.path.append("MMVC_Client/python")
-from const import ERROR_NO_ONNX_SESSION, TMP_DIR
+from const import TMP_DIR
 import torch
 import os
 import traceback
@@ -10,13 +10,6 @@ import numpy as np
 from dataclasses import dataclass, asdict
 import resampy
-import onnxruntime
-from symbols import symbols
-from models import SynthesizerTrn
-import pyworld as pw
-from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
 from voice_changer.MMVCv15 import MMVCv15
 from voice_changer.IORecorder import IORecorder
@@ -27,17 +20,6 @@ import time
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
-import wave
-import matplotlib
-matplotlib.use('Agg')
-import pylab
-import librosa
-import librosa.display
-SAMPLING_RATE = 24000
 STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
 STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
 STREAM_ANALYZE_FILE_DIO = os.path.join(TMP_DIR, "analyze-dio.png")
@@ -46,33 +28,18 @@ STREAM_ANALYZE_FILE_HARVEST = os.path.join(TMP_DIR, "analyze-harvest.png")
 @dataclass
 class VocieChangerSettings():
-    gpu: int = 0
-    srcId: int = 0
-    dstId: int = 101
     inputSampleRate: int = 24000  # 48000 or 24000
     crossFadeOffsetRate: float = 0.1
     crossFadeEndRate: float = 0.9
     crossFadeOverlapSize: int = 4096
-    f0Factor: float = 1.0
-    f0Detector: str = "dio"  # dio or harvest
     recordIO: int = 0  # 0:off, 1:on
-    framework: str = "PyTorch"  # PyTorch or ONNX
-    pyTorchModelFile: str = ""
-    onnxModelFile: str = ""
-    configFile: str = ""
     # only the mutable fields are listed below
-    intData = ["gpu", "srcId", "dstId", "inputSampleRate", "crossFadeOverlapSize", "recordIO"]
-    floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "f0Factor"]
-    strData = ["framework", "f0Detector"]
+    intData = ["inputSampleRate", "crossFadeOverlapSize", "recordIO"]
+    floatData = ["crossFadeOffsetRate", "crossFadeEndRate"]
+    strData = []
-def readMicrophone(queue, sid, deviceIndex):
-    print("READ MIC", queue, sid, deviceIndex)
 class VoiceChanger():
@@ -81,7 +48,6 @@ class VoiceChanger():
         # initialization
         self.settings = VocieChangerSettings()
         self.unpackedData_length = 0
-        self.net_g = None
         self.onnx_session = None
         self.currentCrossFadeOffsetRate = 0
         self.currentCrossFadeEndRate = 0
@@ -90,88 +56,22 @@ class VoiceChanger():
         self.voiceChanger = MMVCv15()
         self.gpu_num = torch.cuda.device_count()
-        self.text_norm = torch.LongTensor([0, 6, 0])
         self.prev_audio = np.zeros(1)
         self.mps_enabled = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
         print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
     def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
-        self.settings.configFile = config
-        self.hps = get_hparams_from_file(config)
-        if pyTorch_model_file != None:
-            self.settings.pyTorchModelFile = pyTorch_model_file
-        if onnx_model_file:
-            self.settings.onnxModelFile = onnx_model_file
-        # build the PyTorch model
-        if pyTorch_model_file != None:
-            self.net_g = SynthesizerTrn(
-                spec_channels=self.hps.data.filter_length // 2 + 1,
-                segment_size=self.hps.train.segment_size // self.hps.data.hop_length,
-                inter_channels=self.hps.model.inter_channels,
-                hidden_channels=self.hps.model.hidden_channels,
-                upsample_rates=self.hps.model.upsample_rates,
-                upsample_initial_channel=self.hps.model.upsample_initial_channel,
-                upsample_kernel_sizes=self.hps.model.upsample_kernel_sizes,
-                n_flow=self.hps.model.n_flow,
-                dec_out_channels=1,
-                dec_kernel_size=7,
-                n_speakers=self.hps.data.n_speakers,
-                gin_channels=self.hps.model.gin_channels,
-                requires_grad_pe=self.hps.requires_grad.pe,
-                requires_grad_flow=self.hps.requires_grad.flow,
-                requires_grad_text_enc=self.hps.requires_grad.text_enc,
-                requires_grad_dec=self.hps.requires_grad.dec
-            )
-            self.net_g.eval()
-            load_checkpoint(pyTorch_model_file, self.net_g, None)
-            # utils.load_checkpoint(pyTorch_model_file, self.net_g, None)
-        # build the ONNX model
-        if onnx_model_file != None:
-            ort_options = onnxruntime.SessionOptions()
-            ort_options.intra_op_num_threads = 8
-            self.onnx_session = onnxruntime.InferenceSession(
-                onnx_model_file,
-                providers=providers
-            )
-        return self.get_info()
-    def destroy(self):
-        del self.net_g
-        del self.onnx_session
+        return self.voiceChanger.loadModel(config, pyTorch_model_file, onnx_model_file)
     def get_info(self):
         data = asdict(self.settings)
+        data.update(self.voiceChanger.get_info())
-        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
-        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
-        for f in files:
-            if data[f] != None and os.path.exists(data[f]):
-                data[f] = os.path.basename(data[f])
-            else:
-                data[f] = ""
         return data
     def update_setteings(self, key: str, val: any):
-        if key == "onnxExecutionProvider" and self.onnx_session != None:
-            if val == "CUDAExecutionProvider":
-                if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
-                    self.settings.gpu = 0
-                provider_options = [{'device_id': self.settings.gpu}]
-                self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
-            else:
-                self.onnx_session.set_providers(providers=[val])
-        elif key in self.settings.intData:
+        if key in self.settings.intData:
             setattr(self.settings, key, int(val))
-            if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session != None:
-                providers = self.onnx_session.get_providers()
-                print("Providers:", providers)
-                if "CUDAExecutionProvider" in providers:
-                    provider_options = [{'device_id': self.settings.gpu}]
-                    self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
             if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
                 self.unpackedData_length = 0
             if key == "recordIO" and val == 1:
@@ -194,13 +94,14 @@ class VoiceChanger():
             except Exception as e:
                 print("recordIO exception", e)
         elif key in self.settings.floatData:
             setattr(self.settings, key, float(val))
         elif key in self.settings.strData:
             setattr(self.settings, key, str(val))
         else:
-            print(f"{key} is not mutalbe variable!")
+            ret = self.voiceChanger.update_setteings(key, val)
+            if ret == False:
+                print(f"{key} is not mutalbe variable or unknown variable!")
         return self.get_info()
@@ -234,138 +135,49 @@ class VoiceChanger():
         if hasattr(self, 'np_prev_audio1') == True:
             delattr(self, "np_prev_audio1")
-    def _get_f0(self, newData: any):
-        audio_norm_np = newData.astype(np.float64)
-        if self.settings.f0Detector == "dio":
-            _f0, _time = pw.dio(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5)
-            f0 = pw.stonemask(audio_norm_np, _f0, _time, self.hps.data.sampling_rate)
-        else:
-            f0, t = pw.harvest(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5, f0_floor=71.0, f0_ceil=1000.0)
-        f0 = convert_continuos_f0(f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length))
-        f0 = torch.from_numpy(f0.astype(np.float32))
-        return f0
-    def _get_spec(self, newData: any):
-        audio = torch.FloatTensor(newData)
-        audio_norm = audio / self.hps.data.max_wav_value  # normalize
-        audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
-        spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
-                                 self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
-                                 center=False)
-        spec = torch.squeeze(spec, 0)
-        return spec
-    def _generate_input(self, newData: any, convertSize: int):
-        newData = np.array(newData).astype(np.float32)
-        if hasattr(self, "audio_buffer"):
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with the past data
-        else:
-            self.audio_buffer = newData
-        self.audio_buffer = self.audio_buffer[-(convertSize):]  # extract only the portion to be converted
-        f0 = self._get_f0(self.audio_buffer)  # generate f0
-        spec = self._get_spec(self.audio_buffer)
-        sid = torch.LongTensor([int(self.settings.srcId)])
-        data = TextAudioSpeakerCollate(
-            sample_rate=self.hps.data.sampling_rate,
-            hop_size=self.hps.data.hop_length,
-            f0_factor=self.settings.f0Factor
-        )([(spec, sid, f0)])
-        return data
-    def _onnx_inference(self, data):
-        if hasattr(self, "onnx_session") == False or self.onnx_session == None:
-            print("[Voice Changer] No ONNX session.")
-            return np.zeros(1).astype(np.int16)
-        spec, spec_lengths, sid_src, sin, d = data
-        sid_tgt1 = torch.LongTensor([self.settings.dstId])
-        audio1 = self.onnx_session.run(
-            ["audio"],
-            {
-                "specs": spec.numpy(),
-                "lengths": spec_lengths.numpy(),
-                "sin": sin.numpy(),
-                "d0": d[0][:1].numpy(),
-                "d1": d[1][:1].numpy(),
-                "d2": d[2][:1].numpy(),
-                "d3": d[3][:1].numpy(),
-                "sid_src": sid_src.numpy(),
-                "sid_tgt": sid_tgt1.numpy()
-            })[0][0, 0] * self.hps.data.max_wav_value
-        return audio1
-    def _pyTorch_inference(self, data):
-        if hasattr(self, "net_g") == False or self.net_g == None:
-            print("[Voice Changer] No pyTorch session.")
-            return np.zeros(1).astype(np.int16)
-        if self.settings.gpu < 0 or self.gpu_num == 0:
-            dev = torch.device("cpu")
-        else:
-            dev = torch.device("cuda", index=self.settings.gpu)
-        with torch.no_grad():
-            spec, spec_lengths, sid_src, sin, d = data
-            spec = spec.to(dev)
-            spec_lengths = spec_lengths.to(dev)
-            sid_src = sid_src.to(dev)
-            sin = sin.to(dev)
-            d = tuple([d[:1].to(dev) for d in d])
-            sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
-            audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, sin, d, sid_src, sid_target)[0, 0].data * self.hps.data.max_wav_value
-            result = audio1.float().cpu().numpy()
-        return result
     # receivedData: tuple of short
     def on_request(self, receivedData: any):
-        # preprocessing
         with Timer("pre-process") as t:
             if self.settings.inputSampleRate != 24000:
                 newData = resampy.resample(receivedData, self.settings.inputSampleRate, 24000)
             else:
                 newData = receivedData
-            convertSize = len(newData) + min(self.settings.crossFadeOverlapSize, len(newData))
+            inputSize = newData.shape[0]
+            convertSize = inputSize + min(self.settings.crossFadeOverlapSize, inputSize)
             # print(convertSize, unpackedData.shape[0])
             if convertSize < 8192:
                 convertSize = 8192
             if convertSize % 128 != 0:  # pad, because the model's output hop size would otherwise truncate it
                 convertSize = convertSize + (128 - (convertSize % 128))
-            self._generate_strength(len(newData))
-            data = self._generate_input(newData, convertSize)
+            self._generate_strength(inputSize)
+            data = self.voiceChanger.generate_input(newData, convertSize)
             preprocess_time = t.secs
-        # conversion
         with Timer("main-process") as t:
             try:
-                if self.settings.framework == "ONNX":
-                    audio = self._onnx_inference(data)
-                    # result = self.voiceChanger._onnx_inference(data, unpackedData.shape[0])
-                else:
-                    audio = self._pyTorch_inference(data)
-                    # result = self.voiceChanger._pyTorch_inference(data, unpackedData.shape[0])
-                inputSize = len(newData)
+                # Inference
+                audio = self.voiceChanger.inference(data)
+                # CrossFade
                 if hasattr(self, 'np_prev_audio1') == True:
                     np.set_printoptions(threshold=10000)
                     overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
                     prev_overlap = self.np_prev_audio1[-1 * overlapSize:]
                     cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
-                    # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
-                    # print(">>>>>>>>>>>", -1 * (inputSize + overlapSize), -1 * inputSize, self.np_prev_audio1.shape, overlapSize)
                     powered_prev = prev_overlap * self.np_prev_strength
                     powered_cur = cur_overlap * self.np_cur_strength
                     powered_result = powered_prev + powered_cur
                     cur = audio[-1 * inputSize:-1 * overlapSize]
                     result = np.concatenate([powered_result, cur], axis=0)
+                    # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
+                    # print(">>>>>>>>>>>", -1 * (inputSize + overlapSize), -1 * inputSize, self.np_prev_audio1.shape, overlapSize)
                 else:
                     result = np.zeros(1).astype(np.int16)
                 self.np_prev_audio1 = audio
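The crossfade kept in VoiceChanger blends the tail of the previous output chunk with the head of the new one before returning audio to the client. A minimal sketch of that overlap-add, assuming linear fade curves (the real curves come from _generate_strength, which is not part of this diff):

```python
import numpy as np

def crossfade_chunks(prev_audio: np.ndarray, audio: np.ndarray, input_size: int, overlap: int) -> np.ndarray:
    """Sketch of the overlap-add above: blend prev_audio's tail into audio's head."""
    overlap = min(overlap, input_size)
    prev_strength = np.linspace(1.0, 0.0, overlap)   # assumed fade-out curve
    cur_strength = 1.0 - prev_strength                # assumed fade-in curve
    prev_overlap = prev_audio[-overlap:]
    cur_overlap = audio[-(input_size + overlap):-input_size]
    blended = prev_overlap * prev_strength + cur_overlap * cur_strength
    cur = audio[-input_size:-overlap]                 # raw part, kept for the next crossfade
    return np.concatenate([blended, cur], axis=0)     # output length equals input_size
```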
@@ -378,18 +190,16 @@ class VoiceChanger():
                 return np.zeros(1).astype(np.int16), [0, 0, 0]
             mainprocess_time = t.secs
-        # post-processing
         with Timer("post-process") as t:
             result = result.astype(np.int16)
-            # print("on_request result size:", result.shape)
-            if self.settings.recordIO == 1:
-                # self.stream_in.write(unpackedData.astype(np.int16).tobytes())
-                # self.stream_out.write(result.tobytes())
-                self.ioRecorder.writeInput(receivedData.astype(np.int16).tobytes())
-                self.ioRecorder.writeOutput(result.tobytes())
             if self.settings.inputSampleRate != 24000:
                 result = resampy.resample(result, 24000, self.settings.inputSampleRate).astype(np.int16)
+            if self.settings.recordIO == 1:
+                self.ioRecorder.writeInput(receivedData)
+                self.ioRecorder.writeOutput(result.tobytes())
             postprocess_time = t.secs
         perf = [preprocess_time, mainprocess_time, postprocess_time]
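Taken together, on_request now keeps only the model-agnostic plumbing and hands everything model-specific to self.voiceChanger. A condensed sketch of the resulting flow; the function and parameter names below, other than the generate_input/inference calls taken from the diff, are placeholders, and timing, recording, and crossfading are omitted:

```python
import numpy as np
import resampy

def on_request_sketch(vc, settings, received: np.ndarray) -> np.ndarray:
    """Condensed view of the refactored pipeline; 'vc' stands for the MMVCv15 instance."""
    # 1. resample the incoming chunk to the 24 kHz the model expects
    if settings.inputSampleRate != 24000:
        new_data = resampy.resample(received, settings.inputSampleRate, 24000)
    else:
        new_data = received
    input_size = new_data.shape[0]
    convert_size = max(8192, input_size + min(settings.crossFadeOverlapSize, input_size))
    convert_size += (-convert_size) % 128          # round up to a multiple of the 128-sample hop
    # 2. model-specific feature extraction and inference now live inside MMVCv15
    data = vc.generate_input(new_data, convert_size)
    audio = vc.inference(data)
    # 3. crossfade against the previous chunk and resample back (omitted here)
    return audio[-input_size:]
```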