WIP: refactoring — move logger setup into misc/log_control.setup_loggers(), add type hints (LoadModelParams, AudioInOut) to MMVCv13, replace `!= None` comparisons with `is not None`, and apply black-style formatting

wataru 2023-04-28 13:49:40 +09:00
parent bd3667117e
commit c96609640d
5 changed files with 249 additions and 148 deletions

File 1 of 5 — server entry point

@@ -16,6 +16,9 @@ from restapi.MMVC_Rest import MMVC_Rest
 from const import NATIVE_CLIENT_FILE_MAC, NATIVE_CLIENT_FILE_WIN, SSL_KEY_DIR
 import subprocess
 import multiprocessing as mp
+from misc.log_control import setup_loggers
+
+setup_loggers()
 
 
 def setupArgParser():
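Note: the logger setup now runs at import time of the entry point, before uvicorn and the model runtimes emit anything. A minimal sketch of what the two added lines accomplish (repo modules assumed importable; the debug call is illustrative):

from misc.log_control import setup_loggers

setup_loggers()  # installs the suppression filters defined in misc/log_control.py

import logging

logging.getLogger("numba.core.byteflow").debug("dropped")  # filter returns False, record is discarded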

File 2 of 5 — misc/log_control.py

@@ -8,32 +8,31 @@ class UvicornSuppressFilter(logging.Filter):
         return False
 
-# logger = logging.getLogger("uvicorn.error")
-# logger.addFilter(UvicornSuppressFilter())
-logger = logging.getLogger("fairseq.tasks.hubert_pretraining")
-logger.addFilter(UvicornSuppressFilter())
-logger = logging.getLogger("fairseq.models.hubert.hubert")
-logger.addFilter(UvicornSuppressFilter())
-logger = logging.getLogger("fairseq.tasks.text_to_speech")
-logger.addFilter(UvicornSuppressFilter())
-logger = logging.getLogger("numba.core.ssa")
-logger.addFilter(UvicornSuppressFilter())
-logger = logging.getLogger("numba.core.interpreter")
-logger.addFilter(UvicornSuppressFilter())
-logger = logging.getLogger("numba.core.byteflow")
-logger.addFilter(UvicornSuppressFilter())
-# logger.propagate = False
-logger = logging.getLogger("multipart.multipart")
-logger.propagate = False
-logging.getLogger('asyncio').setLevel(logging.WARNING)
+def setup_loggers():
+    # logger = logging.getLogger("uvicorn.error")
+    # logger.addFilter(UvicornSuppressFilter())
+    logger = logging.getLogger("fairseq.tasks.hubert_pretraining")
+    logger.addFilter(UvicornSuppressFilter())
+    logger = logging.getLogger("fairseq.models.hubert.hubert")
+    logger.addFilter(UvicornSuppressFilter())
+    logger = logging.getLogger("fairseq.tasks.text_to_speech")
+    logger.addFilter(UvicornSuppressFilter())
+    logger = logging.getLogger("numba.core.ssa")
+    logger.addFilter(UvicornSuppressFilter())
+    logger = logging.getLogger("numba.core.interpreter")
+    logger.addFilter(UvicornSuppressFilter())
+    logger = logging.getLogger("numba.core.byteflow")
+    logger.addFilter(UvicornSuppressFilter())
+    # logger.propagate = False
+    logger = logging.getLogger("multipart.multipart")
+    logger.propagate = False
+    logging.getLogger("asyncio").setLevel(logging.WARNING)
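Note: the repeated getLogger/addFilter pairs could be collapsed into a loop; a minimal, behavior-equivalent sketch (not part of this commit):

import logging


def setup_loggers():
    noisy = [
        "fairseq.tasks.hubert_pretraining",
        "fairseq.models.hubert.hubert",
        "fairseq.tasks.text_to_speech",
        "numba.core.ssa",
        "numba.core.interpreter",
        "numba.core.byteflow",
    ]
    for name in noisy:
        # UvicornSuppressFilter.filter() returns False, so every record is dropped
        logging.getLogger(name).addFilter(UvicornSuppressFilter())
    logging.getLogger("multipart.multipart").propagate = False
    logging.getLogger("asyncio").setLevel(logging.WARNING)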

File 3 of 5 — voice_changer/MMVCv13/MMVCv13.py

@@ -1,6 +1,10 @@
 import sys
 import os
-if sys.platform.startswith('darwin'):
+
+from voice_changer.utils.LoadModelParams import LoadModelParams
+from voice_changer.utils.VoiceChangerModel import AudioInOut
+
+if sys.platform.startswith("darwin"):
     baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
     if len(baseDir) != 1:
         print("baseDir should be only one ", baseDir)
@@ -12,23 +16,32 @@ else:
     sys.path.append(modulePath)
 
-from dataclasses import dataclass, asdict
+from dataclasses import dataclass, asdict, field
 import numpy as np
 import torch
 import onnxruntime
 import pyworld as pw
 
-from symbols import symbols
-from models import SynthesizerTrn
-from voice_changer.MMVCv13.TrainerFunctions import TextAudioSpeakerCollate, spectrogram_torch, load_checkpoint, get_hparams_from_file
+from symbols import symbols  # type:ignore
+from models import SynthesizerTrn  # type:ignore
+from voice_changer.MMVCv13.TrainerFunctions import (
+    TextAudioSpeakerCollate,
+    spectrogram_torch,
+    load_checkpoint,
+    get_hparams_from_file,
+)
 from Exceptions import NoModeLoadedException
 
-providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
+providers = [
+    "OpenVINOExecutionProvider",
+    "CUDAExecutionProvider",
+    "DmlExecutionProvider",
+    "CPUExecutionProvider",
+]
 
 
 @dataclass
-class MMVCv13Settings():
+class MMVCv13Settings:
     gpu: int = 0
     srcId: int = 0
     dstId: int = 101
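Note: the providers list above is an ordered preference; only providers present in the installed onnxruntime build can actually be activated. A quick way to check what the local package supports (a sketch; output varies by install):

import onnxruntime

print(onnxruntime.get_available_providers())
# e.g. ["CUDAExecutionProvider", "CPUExecutionProvider"] on a CUDA build;
# entries not in this list (OpenVINO, DirectML on other platforms) are unusable.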
@@ -40,11 +53,13 @@ class MMVCv13Settings():
     # ↓ list only the mutable fields
     intData = ["gpu", "srcId", "dstId"]
-    floatData = []
+    floatData: list[str] = field(default_factory=lambda: [])
     strData = ["framework"]
 
 
 class MMVCv13:
+    audio_buffer: AudioInOut | None = None
+
     def __init__(self):
         self.settings = MMVCv13Settings()
         self.net_g = None
@@ -53,51 +68,62 @@ class MMVCv13:
         self.gpu_num = torch.cuda.device_count()
         self.text_norm = torch.LongTensor([0, 6, 0])
 
-    def loadModel(self, props):
-        self.settings.configFile = props["files"]["configFilename"]
+    def loadModel(self, props: LoadModelParams):
+        self.settings.configFile = props.files.configFilename
         self.hps = get_hparams_from_file(self.settings.configFile)
 
-        self.settings.pyTorchModelFile = props["files"]["pyTorchModelFilename"]
-        self.settings.onnxModelFile = props["files"]["onnxModelFilename"]
+        self.settings.pyTorchModelFile = props.files.pyTorchModelFilename
+        self.settings.onnxModelFile = props.files.onnxModelFilename
 
         # Build the PyTorch model
-        if self.settings.pyTorchModelFile != None:
+        if self.settings.pyTorchModelFile is not None:
             self.net_g = SynthesizerTrn(
                 len(symbols),
                 self.hps.data.filter_length // 2 + 1,
                 self.hps.train.segment_size // self.hps.data.hop_length,
                 n_speakers=self.hps.data.n_speakers,
-                **self.hps.model)
+                **self.hps.model
+            )
             self.net_g.eval()
             load_checkpoint(self.settings.pyTorchModelFile, self.net_g, None)
 
         # Build the ONNX model
-        if self.settings.onnxModelFile != None:
+        if self.settings.onnxModelFile is not None:
             ort_options = onnxruntime.SessionOptions()
             ort_options.intra_op_num_threads = 8
             self.onnx_session = onnxruntime.InferenceSession(
-                self.settings.onnxModelFile,
-                providers=providers
+                self.settings.onnxModelFile, providers=providers
             )
         return self.get_info()
 
-    def update_settings(self, key: str, val: any):
-        if key == "onnxExecutionProvider" and self.onnx_session != None:
+    def update_settings(self, key: str, val: int | float | str):
+        if key == "onnxExecutionProvider" and self.onnx_session is not None:
             if val == "CUDAExecutionProvider":
                 if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
                     self.settings.gpu = 0
-                provider_options = [{'device_id': self.settings.gpu}]
-                self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
+                provider_options = [{"device_id": self.settings.gpu}]
+                self.onnx_session.set_providers(
+                    providers=[val], provider_options=provider_options
+                )
             else:
                 self.onnx_session.set_providers(providers=[val])
         elif key in self.settings.intData:
-            setattr(self.settings, key, int(val))
-            if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session != None:
+            val = int(val)
+            setattr(self.settings, key, val)
+            if (
+                key == "gpu"
+                and val >= 0
+                and val < self.gpu_num
+                and self.onnx_session is not None
+            ):
                 providers = self.onnx_session.get_providers()
                 print("Providers:", providers)
                 if "CUDAExecutionProvider" in providers:
-                    provider_options = [{'device_id': self.settings.gpu}]
-                    self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
+                    provider_options = [{"device_id": self.settings.gpu}]
+                    self.onnx_session.set_providers(
+                        providers=["CUDAExecutionProvider"],
+                        provider_options=provider_options,
+                    )
         elif key in self.settings.floatData:
             setattr(self.settings, key, float(val))
         elif key in self.settings.strData:
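Note: the `val = int(val)` change is more than style. Values can arrive from the REST layer as strings, and the old code stored the converted value but then compared the raw one, so `"1" >= 0` would raise TypeError in Python 3. Coercing once up front makes the comparison and the stored setting consistent. A hypothetical call illustrating the path:

vc = MMVCv13()  # hypothetical: assume a model has been loaded
vc.update_settings("gpu", "1")  # "1" is coerced to 1 before the gpu_num bounds check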
@@ -110,10 +136,12 @@ class MMVCv13:
     def get_info(self):
         data = asdict(self.settings)
-        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
+        data["onnxExecutionProviders"] = (
+            self.onnx_session.get_providers() if self.onnx_session is not None else []
+        )
         files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
         for f in files:
-            if data[f] != None and os.path.exists(data[f]):
+            if data[f] is not None and os.path.exists(data[f]):
                 data[f] = os.path.basename(data[f])
             else:
                 data[f] = ""
@@ -121,22 +149,35 @@ class MMVCv13:
         return data
 
     def get_processing_sampling_rate(self):
-        if hasattr(self, "hps") == False:
+        if hasattr(self, "hps") is False:
             raise NoModeLoadedException("config")
         return self.hps.data.sampling_rate
 
-    def _get_spec(self, audio: any):
-        spec = spectrogram_torch(audio, self.hps.data.filter_length,
-                                 self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
-                                 center=False)
+    def _get_spec(self, audio: AudioInOut):
+        spec = spectrogram_torch(
+            audio,
+            self.hps.data.filter_length,
+            self.hps.data.sampling_rate,
+            self.hps.data.hop_length,
+            self.hps.data.win_length,
+            center=False,
+        )
         spec = torch.squeeze(spec, 0)
         return spec
 
-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
+    def generate_input(
+        self,
+        newData: AudioInOut,
+        inputSize: int,
+        crossfadeSize: int,
+        solaSearchFrame: int = 0,
+    ):
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value
 
-        if hasattr(self, "audio_buffer"):
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # append to the past data
+        if self.audio_buffer is not None:
+            self.audio_buffer = np.concatenate(
+                [self.audio_buffer, newData], 0
+            )  # append to the past data
         else:
             self.audio_buffer = newData
@@ -145,9 +186,12 @@ class MMVCv13:
         if convertSize < 8192:
             convertSize = 8192
         if convertSize % self.hps.data.hop_length != 0:  # compensate: output is truncated to the model's hop size
-            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
+            convertSize = convertSize + (
+                self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)
+            )
 
-        self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the portion to convert
+        convertOffset = -1 * convertSize
+        self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to convert
 
         audio = torch.FloatTensor(self.audio_buffer)
         audio_norm = audio.unsqueeze(0)  # unsqueeze
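Note: the rounding plus tail-slice above keeps the buffer a whole number of hops long. A self-contained sketch of the arithmetic (the hop length value is illustrative):

import numpy as np

hop_length = 128  # illustrative stand-in for hps.data.hop_length
convertSize = 8000
if convertSize < 8192:
    convertSize = 8192
if convertSize % hop_length != 0:  # round up to a whole hop
    convertSize = convertSize + (hop_length - (convertSize % hop_length))

audio_buffer = np.zeros(20000, dtype=np.float32)
convertOffset = -1 * convertSize
audio_buffer = audio_buffer[convertOffset:]  # keep only the tail to convert
assert audio_buffer.shape[0] == convertSize and convertSize % hop_length == 0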
@@ -160,25 +204,29 @@ class MMVCv13:
         return data
 
     def _onnx_inference(self, data):
-        if hasattr(self, "onnx_session") == False or self.onnx_session == None:
+        if hasattr(self, "onnx_session") is False or self.onnx_session is None:
             print("[Voice Changer] No ONNX session.")
             raise NoModeLoadedException("ONNX")
 
         x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x for x in data]
         sid_tgt1 = torch.LongTensor([self.settings.dstId])
         # if spec.size()[2] >= 8:
-        audio1 = self.onnx_session.run(
-            ["audio"],
-            {
-                "specs": spec.numpy(),
-                "lengths": spec_lengths.numpy(),
-                "sid_src": sid_src.numpy(),
-                "sid_tgt": sid_tgt1.numpy()
-            })[0][0, 0] * self.hps.data.max_wav_value
+        audio1 = (
+            self.onnx_session.run(
+                ["audio"],
+                {
+                    "specs": spec.numpy(),
+                    "lengths": spec_lengths.numpy(),
+                    "sid_src": sid_src.numpy(),
+                    "sid_tgt": sid_tgt1.numpy(),
+                },
+            )[0][0, 0]
+            * self.hps.data.max_wav_value
+        )
         return audio1
 
     def _pyTorch_inference(self, data):
-        if hasattr(self, "net_g") == False or self.net_g == None:
+        if hasattr(self, "net_g") is False or self.net_g is None:
             print("[Voice Changer] No pyTorch session.")
             raise NoModeLoadedException("pytorch")
@@ -188,11 +236,19 @@ class MMVCv13:
             dev = torch.device("cuda", index=self.settings.gpu)
 
         with torch.no_grad():
-            x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.to(dev) for x in data]
+            x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [
+                x.to(dev) for x in data
+            ]
             sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
-            audio1 = (self.net_g.to(dev).voice_conversion(spec, spec_lengths, sid_src=sid_src,
-                                                          sid_tgt=sid_target)[0, 0].data * self.hps.data.max_wav_value)
+            audio1 = (
+                self.net_g.to(dev)
+                .voice_conversion(
+                    spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_target
+                )[0, 0]
+                .data
+                * self.hps.data.max_wav_value
+            )
             result = audio1.float().cpu().numpy()
         return result
@@ -208,7 +264,7 @@ class MMVCv13:
         del self.net_g
         del self.onnx_session
 
        remove_path = os.path.join("MMVC_Client_v13", "python")
-        sys.path = [x for x in sys.path if x.endswith(remove_path) == False]
+        sys.path = [x for x in sys.path if x.endswith(remove_path) is False]
 
         for key in list(sys.modules):
             val = sys.modules.get(key)
@@ -217,5 +273,5 @@ class MMVCv13:
                 if file_path.find(remove_path + os.path.sep) >= 0:
                     print("remove", key, file_path)
                     sys.modules.pop(key)
-            except Exception as e:
+            except:  # type:ignore
                 pass
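Note: loadModel now takes a typed LoadModelParams instead of a nested dict, so filenames are attributes (props.files.configFilename) rather than string keys. The real dataclass lives in voice_changer/utils/LoadModelParams.py, which is not part of this diff; a minimal sketch of the shape this file relies on (field names taken from the usage above, everything else assumed):

from dataclasses import dataclass


@dataclass
class LoadModelParamFile:  # hypothetical name; only the fields used above
    configFilename: str | None
    pyTorchModelFilename: str | None
    onnxModelFilename: str | None


@dataclass
class LoadModelParams:
    files: LoadModelParamFile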

File 4 of 5 — voice_changer/MMVCv13/TrainerFunctions.py

@@ -1,36 +1,58 @@
 import torch
-import os, sys, json
+import os
+import sys
+import json
 import logging
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 logger = logging
 
 hann_window = {}
 
 
 def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
-    if torch.min(y) < -1.:
-        print('min value is ', torch.min(y))
-    if torch.max(y) > 1.:
-        print('max value is ', torch.max(y))
+    if torch.min(y) < -1.0:
+        print("min value is ", torch.min(y))
+    if torch.max(y) > 1.0:
+        print("max value is ", torch.max(y))
 
     global hann_window
-    dtype_device = str(y.dtype) + '_' + str(y.device)
-    wnsize_dtype_device = str(win_size) + '_' + dtype_device
+    dtype_device = str(y.dtype) + "_" + str(y.device)
+    wnsize_dtype_device = str(win_size) + "_" + dtype_device
     if wnsize_dtype_device not in hann_window:
-        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
+        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
+            dtype=y.dtype, device=y.device
+        )
 
-    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
+    y = torch.nn.functional.pad(
+        y.unsqueeze(1),
+        (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
+        mode="reflect",
+    )
     y = y.squeeze(1)
 
-    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
-                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
+    spec = torch.stft(
+        y,
+        n_fft,
+        hop_length=hop_size,
+        win_length=win_size,
+        window=hann_window[wnsize_dtype_device],
+        center=center,
+        pad_mode="reflect",
+        normalized=False,
+        onesided=True,
+        return_complex=True,
+    )
     spec = torch.view_as_real(spec)
 
     spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
     return spec
 
 
-class TextAudioSpeakerCollate():
-    """ Zero-pads model inputs and targets
-    """
-    def __init__(self, return_ids=False, no_text = False):
+class TextAudioSpeakerCollate:
+    """Zero-pads model inputs and targets"""
+
+    def __init__(self, return_ids=False, no_text=False):
         self.return_ids = return_ids
         self.no_text = no_text
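Note: a minimal, self-contained call of the spectrogram_torch defined above (parameter values are illustrative; with center=False the frame count follows from the reflect padding inside the function):

import torch

y = torch.zeros(1, 24000)  # one second of silence at an assumed 24 kHz
spec = spectrogram_torch(
    y, n_fft=512, sampling_rate=24000, hop_size=128, win_size=512, center=False
)
print(spec.shape)  # torch.Size([1, 257, T]); n_fft // 2 + 1 frequency bins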
@@ -42,8 +64,8 @@ class TextAudioSpeakerCollate():
         """
         # Right zero-pad all one-hot text sequences to max input length
         _, ids_sorted_decreasing = torch.sort(
-            torch.LongTensor([x[1].size(1) for x in batch]),
-            dim=0, descending=True)
+            torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True
+        )
 
         max_text_len = max([len(x[0]) for x in batch])
         max_spec_len = max([x[1].size(1) for x in batch])
@@ -64,88 +86,108 @@ class TextAudioSpeakerCollate():
             row = batch[ids_sorted_decreasing[i]]
 
             text = row[0]
-            text_padded[i, :text.size(0)] = text
+            text_padded[i, : text.size(0)] = text
             text_lengths[i] = text.size(0)
 
             spec = row[1]
-            spec_padded[i, :, :spec.size(1)] = spec
+            spec_padded[i, :, : spec.size(1)] = spec
             spec_lengths[i] = spec.size(1)
 
             wav = row[2]
-            wav_padded[i, :, :wav.size(1)] = wav
+            wav_padded[i, :, : wav.size(1)] = wav
             wav_lengths[i] = wav.size(1)
 
             sid[i] = row[3]
 
         if self.return_ids:
-            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
-        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
+            return (
+                text_padded,
+                text_lengths,
+                spec_padded,
+                spec_lengths,
+                wav_padded,
+                wav_lengths,
+                sid,
+                ids_sorted_decreasing,
+            )
+        return (
+            text_padded,
+            text_lengths,
+            spec_padded,
+            spec_lengths,
+            wav_padded,
+            wav_lengths,
+            sid,
+        )
 
 
 def load_checkpoint(checkpoint_path, model, optimizer=None):
-  assert os.path.isfile(checkpoint_path), f"No such file or directory: {checkpoint_path}"
-  checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
-  iteration = checkpoint_dict['iteration']
-  learning_rate = checkpoint_dict['learning_rate']
-  if optimizer is not None:
-    optimizer.load_state_dict(checkpoint_dict['optimizer'])
-  saved_state_dict = checkpoint_dict['model']
-  if hasattr(model, 'module'):
-    state_dict = model.module.state_dict()
-  else:
-    state_dict = model.state_dict()
-  new_state_dict= {}
-  for k, v in state_dict.items():
-    try:
-      new_state_dict[k] = saved_state_dict[k]
-    except:
-      logger.info("%s is not in the checkpoint" % k)
-      new_state_dict[k] = v
-  if hasattr(model, 'module'):
-    model.module.load_state_dict(new_state_dict)
-  else:
-    model.load_state_dict(new_state_dict)
-  logger.info("Loaded checkpoint '{}' (iteration {})" .format(
-    checkpoint_path, iteration))
-  return model, optimizer, learning_rate, iteration
+    assert os.path.isfile(
+        checkpoint_path
+    ), f"No such file or directory: {checkpoint_path}"
+    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
+    iteration = checkpoint_dict["iteration"]
+    learning_rate = checkpoint_dict["learning_rate"]
+    if optimizer is not None:
+        optimizer.load_state_dict(checkpoint_dict["optimizer"])
+    saved_state_dict = checkpoint_dict["model"]
+    if hasattr(model, "module"):
+        state_dict = model.module.state_dict()
+    else:
+        state_dict = model.state_dict()
+    new_state_dict = {}
+    for k, v in state_dict.items():
+        try:
+            new_state_dict[k] = saved_state_dict[k]
+        except:
+            logger.info("%s is not in the checkpoint" % k)
+            new_state_dict[k] = v
+    if hasattr(model, "module"):
+        model.module.load_state_dict(new_state_dict)
+    else:
+        model.load_state_dict(new_state_dict)
+    logger.info(
+        "Loaded checkpoint '{}' (iteration {})".format(checkpoint_path, iteration)
+    )
+    return model, optimizer, learning_rate, iteration
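Note: load_checkpoint expects a dict with "model", "iteration", and "learning_rate" keys (plus "optimizer" when an optimizer is passed); parameters missing from the checkpoint fall back to the model's current weights via the try/except above. A round-trip sketch with a stand-in module (the path is hypothetical):

import torch

net = torch.nn.Linear(4, 4)  # stand-in for SynthesizerTrn
torch.save(
    {"model": net.state_dict(), "iteration": 1, "learning_rate": 1e-4},
    "ckpt.pth",
)
net2 = torch.nn.Linear(4, 4)
net2, _, lr, it = load_checkpoint("ckpt.pth", net2)
print(lr, it)  # 0.0001 1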
 def get_hparams_from_file(config_path):
-  with open(config_path, "r") as f:
-    data = f.read()
-  config = json.loads(data)
-  hparams =HParams(**config)
-  return hparams
+    with open(config_path, "r") as f:
+        data = f.read()
+    config = json.loads(data)
+    hparams = HParams(**config)
+    return hparams
 
 
-class HParams():
-  def __init__(self, **kwargs):
-    for k, v in kwargs.items():
-      if type(v) == dict:
-        v = HParams(**v)
-      self[k] = v
-
-  def keys(self):
-    return self.__dict__.keys()
-
-  def items(self):
-    return self.__dict__.items()
-
-  def values(self):
-    return self.__dict__.values()
-
-  def __len__(self):
-    return len(self.__dict__)
-
-  def __getitem__(self, key):
-    return getattr(self, key)
-
-  def __setitem__(self, key, value):
-    return setattr(self, key, value)
-
-  def __contains__(self, key):
-    return key in self.__dict__
-
-  def __repr__(self):
-    return self.__dict__.__repr__()
+class HParams:
+    def __init__(self, **kwargs):
+        for k, v in kwargs.items():
+            if type(v) == dict:
+                v = HParams(**v)
+            self[k] = v
+
+    def keys(self):
+        return self.__dict__.keys()
+
+    def items(self):
+        return self.__dict__.items()
+
+    def values(self):
+        return self.__dict__.values()
+
+    def __len__(self):
+        return len(self.__dict__)
+
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+    def __setitem__(self, key, value):
+        return setattr(self, key, value)
+
+    def __contains__(self, key):
+        return key in self.__dict__
+
+    def __repr__(self):
+        return self.__dict__.__repr__()
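Note: HParams wraps nested config dicts with both attribute and mapping access; the reindentation does not change behavior. For example:

config = {"data": {"sampling_rate": 24000}, "train": {"segment_size": 8192}}
hps = HParams(**config)
print(hps.data.sampling_rate)        # 24000 (attribute access)
print(hps["train"]["segment_size"])  # 8192 (mapping access)
print("data" in hps, len(hps))       # True 2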

File 5 of 5 — voice_changer/VoiceChanger.py

@@ -124,6 +124,7 @@ class VoiceChanger:
         try:
             return self.voiceChanger.loadModel(props)
         except Exception as e:
+            print(traceback.format_exc())
             print("[Voice Changer] Model Load Error! Check your model is valid.", e)
             return {"status": "NG"}