From c14ea07dd54de41a8c326ba4193692f8dd0d311c Mon Sep 17 00:00:00 2001 From: wataru Date: Wed, 8 Mar 2023 01:40:03 +0900 Subject: [PATCH] WIP: refactor, move v13 --- .gitignore | 4 +- server/README.md | 2 +- server/voice_changer/MMVCv13/MMVCv13.py | 187 ++++++++++++++++++ .../{ => MMVCv13}/TrainerFunctions.py | 0 server/voice_changer/{ => MMVCv15}/MMVCv15.py | 4 +- .../{ => MMVCv15}/client_modules.py | 0 server/voice_changer/VoiceChanger.py | 7 +- 7 files changed, 196 insertions(+), 8 deletions(-) create mode 100644 server/voice_changer/MMVCv13/MMVCv13.py rename server/voice_changer/{ => MMVCv13}/TrainerFunctions.py (100%) rename server/voice_changer/{ => MMVCv15}/MMVCv15.py (97%) rename server/voice_changer/{ => MMVCv15}/client_modules.py (100%) diff --git a/.gitignore b/.gitignore index 3be54042..c76cb45f 100644 --- a/.gitignore +++ b/.gitignore @@ -4,8 +4,8 @@ node_modules __pycache__ server/upload_dir/ -server/MMVC_Trainer/ -server/MMVC_Client/ +server/MMVC_Client_v13/ +server/MMVC_Client_v15/ server/keys server/info server/in.wav diff --git a/server/README.md b/server/README.md index a7cf49b0..d070cc73 100644 --- a/server/README.md +++ b/server/README.md @@ -25,6 +25,6 @@ cd .. # for 1.5 cd MMVC_Client -git checkout 1424609e53c79e2d629add10ae4bfb16fc0c3c82 +git checkout 6dd4f2451fec701d85f611fa831d7e5f4ddce8da cd .. 
``` \ No newline at end of file
diff --git a/server/voice_changer/MMVCv13/MMVCv13.py b/server/voice_changer/MMVCv13/MMVCv13.py
new file mode 100644
index 00000000..f31eb24d
--- /dev/null
+++ b/server/voice_changer/MMVCv13/MMVCv13.py
@@ -0,0 +1,239 @@
+import sys
+sys.path.append("MMVC_Client_v13/python")
+from dataclasses import dataclass, asdict
+from typing import Any, Optional
+import os
+import numpy as np
+import torch
+import onnxruntime
+import pyworld as pw
+
+from symbols import symbols
+from models import SynthesizerTrn
+from voice_changer.MMVCv13.TrainerFunctions import TextAudioSpeakerCollate, spectrogram_torch, load_checkpoint, get_hparams_from_file
+
+providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
+
+
+@dataclass
+class MMVCv13Settings():
+    """Runtime settings of the MMVC v1.3 voice changer."""
+
+    gpu: int = 0
+    srcId: int = 0
+    dstId: int = 101
+
+    framework: str = "PyTorch"  # PyTorch or ONNX
+    pyTorchModelFile: str = ""
+    onnxModelFile: str = ""
+    configFile: str = ""
+
+    # Only the mutable settings are listed here, grouped by the type that
+    # update_setteings() coerces them to. These names carry no annotation, so
+    # they are class attributes, not dataclass fields (asdict() skips them).
+    intData = ["gpu", "srcId", "dstId"]
+    floatData = []
+    strData = ["framework"]
+
+
+class MMVCv13:
+    """Voice conversion backend for MMVC v1.3 models (PyTorch or ONNX)."""
+
+    def __init__(self):
+        self.settings = MMVCv13Settings()
+        self.net_g = None
+        self.onnx_session = None
+
+        self.gpu_num = torch.cuda.device_count()
+        self.text_norm = torch.LongTensor([0, 6, 0])
+
+    def loadModel(self, config: str, pyTorch_model_file: Optional[str] = None, onnx_model_file: Optional[str] = None):
+        """Load the hyper-parameters and whichever model files are given.
+
+        Args:
+            config: Path passed to get_hparams_from_file().
+            pyTorch_model_file: Checkpoint loaded into a SynthesizerTrn; None skips it.
+            onnx_model_file: Model an ONNX Runtime session is opened on; None skips it.
+
+        Returns:
+            The dictionary built by get_info().
+        """
+        self.settings.configFile = config
+        self.hps = get_hparams_from_file(config)
+
+        if pyTorch_model_file is not None:
+            self.settings.pyTorchModelFile = pyTorch_model_file
+        if onnx_model_file:
+            self.settings.onnxModelFile = onnx_model_file
+
+        # Build the PyTorch model
+        if pyTorch_model_file is not None:
+            self.net_g = SynthesizerTrn(
+                len(symbols),
+                self.hps.data.filter_length // 2 + 1,
+                self.hps.train.segment_size // self.hps.data.hop_length,
+                n_speakers=self.hps.data.n_speakers,
+                **self.hps.model)
+            self.net_g.eval()
+            load_checkpoint(pyTorch_model_file, self.net_g, None)
+
+        # Build the ONNX session
+        if onnx_model_file is not None:
+            ort_options = onnxruntime.SessionOptions()
+            ort_options.intra_op_num_threads = 8
+            # Pass the options to the session; building them alone has no effect.
+            self.onnx_session = onnxruntime.InferenceSession(
+                onnx_model_file,
+                sess_options=ort_options,
+                providers=providers
+            )
+        return self.get_info()
+
+    def update_setteings(self, key: str, val: Any):
+        """Apply one setting and report whether the key was recognised.
+
+        The method name, spelling included, is the public name and is kept as-is.
+
+        Args:
+            key: "onnxExecutionProvider", or a name listed in the settings'
+                intData / floatData / strData.
+            val: New value; it is converted with int(), float() or str()
+                according to the list the key belongs to.
+
+        Returns:
+            True when the key was handled, False otherwise.
+        """
+        if key == "onnxExecutionProvider" and self.onnx_session is not None:
+            if val == "CUDAExecutionProvider":
+                # Fall back to device 0 when the configured GPU index is out of range.
+                if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
+                    self.settings.gpu = 0
+                provider_options = [{'device_id': self.settings.gpu}]
+                self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
+            else:
+                self.onnx_session.set_providers(providers=[val])
+        elif key in self.settings.intData:
+            int_val = int(val)
+            setattr(self.settings, key, int_val)
+            # Range-check the converted value; val itself is not guaranteed to be an int.
+            if key == "gpu" and 0 <= int_val < self.gpu_num and self.onnx_session is not None:
+                active_providers = self.onnx_session.get_providers()
+                print("Providers:", active_providers)
+                if "CUDAExecutionProvider" in active_providers:
+                    provider_options = [{'device_id': self.settings.gpu}]
+                    self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
+        elif key in self.settings.floatData:
+            setattr(self.settings, key, float(val))
+        elif key in self.settings.strData:
+            setattr(self.settings, key, str(val))
+        else:
+            return False
+
+        return True
+
+    def get_info(self):
+        """Return the settings as a dict plus the session's ONNX providers.
+
+        Each of the three file settings is reduced to its base name, or to ""
+        when the path does not exist.
+        """
+        data = asdict(self.settings)
+
+        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
+        for f in ("configFile", "pyTorchModelFile", "onnxModelFile"):
+            if data[f] is not None and os.path.exists(data[f]):
+                data[f] = os.path.basename(data[f])
+            else:
+                data[f] = ""
+
+        return data
+
+    def _get_spec(self, audio: Any):
+        """Compute the spectrogram of audio with the loaded hyper-parameters."""
+        spec = spectrogram_torch(audio, self.hps.data.filter_length,
+                                 self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
+                                 center=False)
+        spec = torch.squeeze(spec, 0)
+        return spec
+
+    def generate_input(self, newData: Any, convertSize: int):
+        """Append newData to the audio buffer and build the model input.
+
+        Args:
+            newData: Incoming samples; converted to float32 with astype().
+            convertSize: Number of trailing buffer samples to keep and convert.
+
+        Returns:
+            The batch produced by TextAudioSpeakerCollate for this one item.
+        """
+        newData = newData.astype(np.float32)
+
+        if hasattr(self, "audio_buffer"):
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # append to the earlier data
+        else:
+            self.audio_buffer = newData
+
+        self.audio_buffer = self.audio_buffer[-(convertSize):]  # keep only the part to convert
+
+        audio = torch.FloatTensor(self.audio_buffer)
+        audio_norm = audio / self.hps.data.max_wav_value  # normalize
+        audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
+        spec = self._get_spec(audio_norm)
+        sid = torch.LongTensor([int(self.settings.srcId)])
+
+        data = (self.text_norm, spec, audio_norm, sid)
+        data = TextAudioSpeakerCollate()([data])
+
+        return data
+
+    def _onnx_inference(self, data):
+        """Run the ONNX session on data and return the converted audio."""
+        if getattr(self, "onnx_session", None) is None:
+            print("[Voice Changer] No ONNX session.")
+            return np.zeros(1).astype(np.int16)
+
+        # Only the spectrogram, its lengths and the source speaker id are used.
+        _, _, spec, spec_lengths, _, _, sid_src = data
+        sid_tgt1 = torch.LongTensor([self.settings.dstId])
+        audio1 = self.onnx_session.run(
+            ["audio"],
+            {
+                "specs": spec.numpy(),
+                "lengths": spec_lengths.numpy(),
+                "sid_src": sid_src.numpy(),
+                "sid_tgt": sid_tgt1.numpy()
+            })[0][0, 0] * self.hps.data.max_wav_value
+        return audio1
+
+    def _pyTorch_inference(self, data):
+        """Run the PyTorch model on data and return the converted audio."""
+        if getattr(self, "net_g", None) is None:
+            print("[Voice Changer] No pyTorch session.")
+            return np.zeros(1).astype(np.int16)
+
+        if self.settings.gpu < 0 or self.gpu_num == 0:
+            dev = torch.device("cpu")
+        else:
+            dev = torch.device("cuda", index=self.settings.gpu)
+
+        with torch.no_grad():
+            _, _, spec, spec_lengths, _, _, sid_src = [t.to(dev) for t in data]
+            sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
+
+            audio1 = (self.net_g.to(dev).voice_conversion(spec, spec_lengths, sid_src=sid_src,
+                                                          sid_tgt=sid_target)[0, 0].data * self.hps.data.max_wav_value)
+            result = audio1.float().cpu().numpy()
+
+        return result
+
+    def inference(self, data):
+        """Convert data with ONNX when framework is "ONNX", else with PyTorch."""
+        if self.settings.framework == "ONNX":
+            return self._onnx_inference(data)
+        return self._pyTorch_inference(data)
+
+    def destroy(self):
+        """Drop the model and session attributes; safe to call more than once."""
+        for attr in ("net_g", "onnx_session"):
+            if hasattr(self, attr):
+                delattr(self, attr)
diff --git a/server/voice_changer/TrainerFunctions.py b/server/voice_changer/MMVCv13/TrainerFunctions.py
similarity index 100%
rename from server/voice_changer/TrainerFunctions.py
rename to server/voice_changer/MMVCv13/TrainerFunctions.py
diff --git a/server/voice_changer/MMVCv15.py b/server/voice_changer/MMVCv15/MMVCv15.py
similarity index 97%
rename from server/voice_changer/MMVCv15.py
rename to server/voice_changer/MMVCv15/MMVCv15.py
index 5c8b0e06..109a3735 100644
--- a/server/voice_changer/MMVCv15.py
+++ b/server/voice_changer/MMVCv15/MMVCv15.py
@@ -1,5 +1,5 @@ import sys -sys.path.append("MMVC_Client/python") +sys.path.append("MMVC_Client_v15/python") from dataclasses import dataclass, asdict import os import numpy as np
@@ -8,7 +8,7 @@ import onnxruntime import pyworld as pw from models import SynthesizerTrn -from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint +from voice_changer.MMVCv15.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
diff --git a/server/voice_changer/client_modules.py b/server/voice_changer/MMVCv15/client_modules.py
similarity index 100%
rename from server/voice_changer/client_modules.py
rename to server/voice_changer/MMVCv15/client_modules.py
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index b6fb5199..8ed87908 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -1,6 +1,5 @@ import sys -sys.path.append("MMVC_Client/python") from const import TMP_DIR import torch
@@ -11,7 +10,8 @@ from dataclasses import
dataclass, asdict import resampy -from voice_changer.MMVCv15 import MMVCv15 +# from voice_changer.MMVCv15.MMVCv15 import MMVCv15 +from voice_changer.MMVCv13.MMVCv13 import MMVCv13 from voice_changer.IORecorder import IORecorder from voice_changer.IOAnalyzer import IOAnalyzer @@ -53,7 +53,8 @@ class VoiceChanger(): self.currentCrossFadeEndRate = 0 self.currentCrossFadeOverlapSize = 0 - self.voiceChanger = MMVCv15() + # self.voiceChanger = MMVCv15() + self.voiceChanger = MMVCv13() self.gpu_num = torch.cuda.device_count() self.prev_audio = np.zeros(1)