diff --git a/.gitignore b/.gitignore
index c76cb45f..e69503c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,7 @@ __pycache__
 server/upload_dir/
 server/MMVC_Client_v13/
 server/MMVC_Client_v15/
+server/so-vits-svc-40v2/
 server/keys
 server/info
 server/in.wav
diff --git a/server/MMVCServerSIO.py b/server/MMVCServerSIO.py
index 28de244b..665bdbeb 100755
--- a/server/MMVCServerSIO.py
+++ b/server/MMVCServerSIO.py
@@ -38,7 +38,7 @@ def setupArgParser():
     parser.add_argument("--colab", type=strtobool, default=False, help="run on colab")
     parser.add_argument("--modelType", type=str,
-                        default="MMVCv15", help="model type")
+                        default="MMVCv15", help="model type: MMVCv13, MMVCv15, so-vits-svc-40v2")
     return parser
diff --git a/server/README.md b/server/README.md
index d070cc73..5cb04459 100644
--- a/server/README.md
+++ b/server/README.md
@@ -27,4 +27,10 @@ cd ..
 cd MMVC_Client
 git checkout 6dd4f2451fec701d85f611fa831d7e5f4ddce8da
 cd ..
+
+# for so-vits-svc
+cd so-vits-svc/
+git checkout 016db3de81f6a4034b85ffba120554d07829f132
+cd ..
+
 ```
\ No newline at end of file
diff --git a/server/const.py b/server/const.py
index 8b5144dc..8c62d05b 100644
--- a/server/const.py
+++ b/server/const.py
@@ -18,10 +18,6 @@ NATIVE_CLIENT_FILE_MAC = os.path.join(sys._MEIPASS, "voice-changer-native-client
 TMP_DIR = os.path.join(tmpdir.name, "tmp_dir") if hasattr(sys, "_MEIPASS") else "tmp_dir"
 os.makedirs(TMP_DIR, exist_ok=True)

-# SSL_KEY_DIR = os.path.join(sys._MEIPASS, "keys") if hasattr(sys, "_MEIPASS") else "keys"
-# MODEL_DIR = os.path.join(sys._MEIPASS, "logs") if hasattr(sys, "_MEIPASS") else "logs"
-# UPLOAD_DIR = os.path.join(sys._MEIPASS, "upload_dir") if hasattr(sys, "_MEIPASS") else "upload_dir"
-
 modelType = "MMVCv15"
@@ -40,5 +36,6 @@ def getFrontendPath():
         frontend_path = os.path.join(sys._MEIPASS, "dist_v15") if hasattr(sys, "_MEIPASS") else "../client/demo_v15/dist"
     elif modelType == "MMVCv13":
         frontend_path = os.path.join(sys._MEIPASS, "dist_v13") if hasattr(sys, "_MEIPASS") else "../client/demo_v13/dist"
-
+    elif modelType == "so-vits-svc-40v2":
+        frontend_path = os.path.join(sys._MEIPASS, "dist_v13") if hasattr(sys, "_MEIPASS") else "../client/demo_v13/dist"
     return frontend_path
diff --git a/server/misc/log_control.py b/server/misc/log_control.py
index e692fbdb..e4fa09fb 100644
--- a/server/misc/log_control.py
+++ b/server/misc/log_control.py
@@ -2,13 +2,18 @@ import logging

 # logging.getLogger('numba').setLevel(logging.WARNING)

+
 class UvicornSuppressFilter(logging.Filter):
     def filter(self, record):
         return False

+
 logger = logging.getLogger("uvicorn.error")
 logger.addFilter(UvicornSuppressFilter())

+logger = logging.getLogger("fairseq.tasks.hubert_pretraining")
+logger.addFilter(UvicornSuppressFilter())
+
 # logger.propagate = False

 logger = logging.getLogger("multipart.multipart")
diff --git a/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
new file mode 100644
index 00000000..57ff5054
--- /dev/null
+++ b/server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py
@@ -0,0 +1,224 @@
+import sys
+import os
+if sys.platform.startswith('darwin'):
+    baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
+    if len(baseDir) != 1:
+        print("baseDir should be only one ", baseDir)
+        sys.exit()
+    modulePath = os.path.join(baseDir[0], "so-vits-svc-40v2")
+    sys.path.append(modulePath)
+else:
+    sys.path.append("so-vits-svc-40v2")
+
+import io
+from dataclasses import dataclass, asdict
+from functools import reduce
+import numpy as np
+import torch
+import onnxruntime
+import pyworld as pw
+
+from models import SynthesizerTrn
+import utils
+from fairseq import checkpoint_utils
+import librosa
+from inference import infer_tool
+
+providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
+
+
+@dataclass
+class SoVitsSvc40v2Settings():
+    gpu: int = 0
+    srcId: int = 0
+    dstId: int = 101
+
+    f0Factor: float = 1.0
+    f0Detector: str = "dio"  # dio or harvest
+
+    framework: str = "PyTorch"  # PyTorch or ONNX
+    pyTorchModelFile: str = ""
+    onnxModelFile: str = ""
+    configFile: str = ""
+
+    # only the mutable fields are listed below
+    intData = ["gpu", "srcId", "dstId"]
+    floatData = ["f0Factor"]
+    strData = ["framework", "f0Detector"]
+
+
+class SoVitsSvc40v2:
+    def __init__(self):
+        self.settings = SoVitsSvc40v2Settings()
+        self.net_g = None
+        self.onnx_session = None
+
+        self.raw_path = io.BytesIO()
+        self.gpu_num = torch.cuda.device_count()
+        self.prevVol = 0
+
+    def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
+        self.settings.configFile = config
+        self.hps = utils.get_hparams_from_file(config)
+
+        # hubert model
+        print("loading hubert model")
+        vec_path = "hubert/checkpoint_best_legacy_500.pt"
+        print("load model(s) from {}".format(vec_path))
+        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+            [vec_path],
+            suffix="",
+        )
+        model = models[0]
+        model.eval()
+        self.hubert_model = utils.get_hubert_model().cpu()
+
+        if pyTorch_model_file != None:
+            self.settings.pyTorchModelFile = pyTorch_model_file
+        if onnx_model_file:
+            self.settings.onnxModelFile = onnx_model_file
+
+        # create the PyTorch model
+        if pyTorch_model_file != None:
+            self.net_g = SynthesizerTrn(
+                self.hps
+            )
+            self.net_g.eval()
+            utils.load_checkpoint(pyTorch_model_file, self.net_g, None)
+
+        # # create the ONNX model
+        # if onnx_model_file != None:
+        #     ort_options = onnxruntime.SessionOptions()
+        #     ort_options.intra_op_num_threads = 8
+        #     self.onnx_session = onnxruntime.InferenceSession(
+        #         onnx_model_file,
+        #         providers=providers
+        #     )
+        return self.get_info()
+
+    def update_setteings(self, key: str, val: any):
+        if key == "onnxExecutionProvider" and self.onnx_session != None:
+            if val == "CUDAExecutionProvider":
+                if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
+                    self.settings.gpu = 0
+                provider_options = [{'device_id': self.settings.gpu}]
+                self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
+            else:
+                self.onnx_session.set_providers(providers=[val])
+        elif key in self.settings.intData:
+            setattr(self.settings, key, int(val))
+            if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session != None:
+                providers = self.onnx_session.get_providers()
+                print("Providers:", providers)
+                if "CUDAExecutionProvider" in providers:
+                    provider_options = [{'device_id': self.settings.gpu}]
+                    self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
+        elif key in self.settings.floatData:
+            setattr(self.settings, key, float(val))
+        elif key in self.settings.strData:
+            setattr(self.settings, key, str(val))
+        else:
+            return False
+
+        return True
+
+    def get_info(self):
+        data = asdict(self.settings)
+
+        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
+        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
+        for f in files:
+            if data[f] != None and os.path.exists(data[f]):
+                data[f] = os.path.basename(data[f])
+            else:
+                data[f] = ""
+
+        return data
+
+    def get_processing_sampling_rate(self):
+        return self.hps.data.sampling_rate
+
+    def get_unit_f0(self, audio_buffer, tran):
+        wav_44k = audio_buffer
+        # f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
+        f0 = utils.compute_f0_dio(wav_44k, sampling_rate=self.hps.data.sampling_rate, hop_length=self.hps.data.hop_length)
+        f0, uv = utils.interpolate_f0(f0)
+        f0 = torch.FloatTensor(f0)
+        uv = torch.FloatTensor(uv)
+        f0 = f0 * 2 ** (tran / 12)
+        f0 = f0.unsqueeze(0)
+        uv = uv.unsqueeze(0)
+
+        # wav16k = librosa.resample(audio_buffer, orig_sr=24000, target_sr=16000)
+        wav16k = librosa.resample(audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000)
+        wav16k = torch.from_numpy(wav16k)
+        c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
+        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
+        c = c.unsqueeze(0)
+        return c, f0, uv
+
+    def generate_input(self, newData: any, convertSize: int, cropRange):
+        newData = newData.astype(np.float32) / self.hps.data.max_wav_value
+
+        if hasattr(self, "audio_buffer"):
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with the previous data
+        else:
+            self.audio_buffer = newData
+
+        self.audio_buffer = self.audio_buffer[-(convertSize):]  # keep only the portion to be converted
+
+        crop = self.audio_buffer[cropRange[0]:cropRange[1]]
+
+        rms = np.sqrt(np.square(crop).mean(axis=0))
+        vol = max(rms, self.prevVol * 0.1)
+        self.prevVol = vol
+        print(f"  Crop:{crop.shape}, vol:{vol}")
+
+        c, f0, uv = self.get_unit_f0(self.audio_buffer, 20)
+        return (c, f0, uv, convertSize, vol)
+
+    def _onnx_inference(self, data):
+        pass
+
+    def _pyTorch_inference(self, data):
+        if hasattr(self, "net_g") == False or self.net_g == None:
+            print("[Voice Changer] No pyTorch session.")
+            return np.zeros(1).astype(np.int16)
+
+        if self.settings.gpu < 0 or self.gpu_num == 0:
+            dev = torch.device("cpu")
+        else:
+            dev = torch.device("cuda", index=self.settings.gpu)
+
+        convertSize = data[3]
+        vol = data[4]
+        data = (data[0], data[1], data[2],)
+
+        if vol < 0.00001:
+            print("silent")
+            return np.zeros(convertSize).astype(np.int16)
+        print(vol)
+
+        with torch.no_grad():
+            c, f0, uv = [x.to(dev) for x in data]
+            sid_target = torch.LongTensor([0]).to(dev)
+            self.net_g.to(dev)
+            # audio1 = self.net_g.infer(c, f0=f0, g=sid_target, uv=uv, predict_f0=True, noice_scale=0.1)[0][0, 0].data.float()
+            audio1 = self.net_g.infer(c, f0=f0, g=sid_target, uv=uv, predict_f0=False, noice_scale=0.4)[0][0, 0].data.float()
+            audio1 = audio1 * self.hps.data.max_wav_value
+
+        result = audio1.float().cpu().numpy()
+
+        # result = infer_tool.pad_array(result, length)
+        return result
+
+    def inference(self, data):
+        if self.settings.framework == "ONNX":
+            audio = self._onnx_inference(data)
+        else:
+            audio = self._pyTorch_inference(data)
+        return audio
+
+    def destroy(self):
+        del self.net_g
+        del self.onnx_session
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index 9f1a16ad..45b63c24 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -12,7 +12,7 @@ from voice_changer.IOAnalyzer import IOAnalyzer

 import time

-
+import librosa
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]

 STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
@@ -53,6 +53,13 @@ class VoiceChanger():
         if modelType == "MMVCv15":
             from voice_changer.MMVCv15.MMVCv15 import MMVCv15
             self.voiceChanger = MMVCv15()
+        elif modelType == "MMVCv13":
+            from voice_changer.MMVCv13.MMVCv13 import MMVCv13
+            self.voiceChanger = MMVCv13()
+        elif modelType == "so-vits-svc-40v2":
+            from voice_changer.SoVitsSvc40v2.SoVitsSvc40v2 import SoVitsSvc40v2
+            self.voiceChanger = SoVitsSvc40v2()
+
         else:
             from voice_changer.MMVCv13.MMVCv13 import MMVCv13
             self.voiceChanger = MMVCv13()
@@ -139,24 +146,32 @@ class VoiceChanger():

     #  receivedData: tuple of short
     def on_request(self, receivedData: any):
-
+        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
+        print(f"------------ Convert processing.... ------------")
         # 前処理
         with Timer("pre-process") as t:
-            if self.settings.inputSampleRate != 24000:
-                newData = resampy.resample(receivedData, self.settings.inputSampleRate, 24000)
+            if self.settings.inputSampleRate != processing_sampling_rate:
+                newData = resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate)
             else:
                 newData = receivedData

             inputSize = newData.shape[0]
             convertSize = inputSize + min(self.settings.crossFadeOverlapSize, inputSize)
-            # print(convertSize, unpackedData.shape[0])
+            print(f"  Input data size of {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
+
             if convertSize < 8192:
                 convertSize = 8192
             if convertSize % 128 != 0:  # モデルの出力のホップサイズで切り捨てが発生するので補う。
                 convertSize = convertSize + (128 - (convertSize % 128))
+
+            overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
+            cropRange = (-1 * (inputSize + overlapSize), -1 * overlapSize)
+
+            print(f"  Convert input data size of {convertSize}")
+            print(f"  overlap:{overlapSize}, cropRange:{cropRange}")
             self._generate_strength(inputSize)
-            data = self.voiceChanger.generate_input(newData, convertSize)
+            data = self.voiceChanger.generate_input(newData, convertSize, cropRange)
         preprocess_time = t.secs

         # 変換処理
@@ -165,10 +180,8 @@ class VoiceChanger():

             # Inference
             audio = self.voiceChanger.inference(data)

-            # CrossFade
             if hasattr(self, 'np_prev_audio1') == True:
                 np.set_printoptions(threshold=10000)
-                overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
                 prev_overlap = self.np_prev_audio1[-1 * overlapSize:]
                 cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
                 powered_prev = prev_overlap * self.np_prev_strength
                 powered_cur = cur_overlap * self.np_cur_strength
                 powered_result = powered_prev + powered_cur

                 cur = audio[-1 * inputSize:-1 * overlapSize]
                 result = np.concatenate([powered_result, cur], axis=0)
+                print(f"  overlap:{overlapSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")

                 # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
                 # print(">>>>>>>>>>>", -1 * (inputSize + overlapSize), -1 * inputSize, self.np_prev_audio1.shape, overlapSize)
@@ -195,20 +209,43 @@ class VoiceChanger():

         # 後処理
         with Timer("post-process") as t:
             result = result.astype(np.int16)
-            if self.settings.inputSampleRate != 24000:
-                result = resampy.resample(result, 24000, self.settings.inputSampleRate).astype(np.int16)
+            if self.settings.inputSampleRate != processing_sampling_rate:
+                outputData = resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16)
+            else:
+                outputData = result
+
+            print(f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

             if self.settings.recordIO == 1:
                 self.ioRecorder.writeInput(receivedData)
-                self.ioRecorder.writeOutput(result.tobytes())
+                self.ioRecorder.writeOutput(outputData.tobytes())
+
+            if receivedData.shape[0] != outputData.shape[0]:
+                outputData = pad_array(outputData, receivedData.shape[0])
+                print(
+                    f"  Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

         postprocess_time = t.secs

+        print("  [fin] Input/Output size:", receivedData.shape[0], outputData.shape[0])
         perf = [preprocess_time, mainprocess_time, postprocess_time]
-        return result, perf
+        return outputData, perf
+
+
+def pad_array(arr, target_length):
+    current_length = arr.shape[0]
+    if current_length >= target_length:
+        return arr
+    else:
+        pad_width = target_length - current_length
+        pad_left = pad_width // 2
+        pad_right = pad_width - pad_left
+        padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
+        return padded_arr
+

 ##############
+
+
 class Timer(object):
     def __init__(self, title: str):
         self.title = title
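
Note on the new cropRange/overlap bookkeeping in on_request(): the model converts convertSize samples, the trailing overlapSize samples are cross-faded against the previous block, and the slice just before them is emitted unchanged, so the concatenated result is again inputSize samples long ("result should be same as input"). A minimal standalone sketch of that slice arithmetic, not part of the patch; the block sizes below are hypothetical:

import numpy as np

# hypothetical block sizes, chosen only to illustrate the arithmetic
inputSize = 4096      # samples received from the client (after resampling)
overlapSize = 1024    # min(crossFadeOverlapSize, inputSize)
convertSize = 8192    # padded up to the model's minimum size / hop multiple

# same bounds as computed in on_request(); generate_input() uses cropRange
# to measure the volume of the newest part of the audio buffer
cropRange = (-1 * (inputSize + overlapSize), -1 * overlapSize)

audio = np.zeros(convertSize)                                       # stand-in for the model output
cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]  # cross-faded with the previous block
cur = audio[-1 * inputSize:-1 * overlapSize]                        # emitted unchanged after the cross-fade
print(cur_overlap.shape[0] + cur.shape[0])                          # 4096 == inputSize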
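
The pad_array() helper added at module level covers the case where resampling between the model's processing rate and the client's inputSampleRate leaves the output a few samples short: the converted block is centered and zero-padded back to the exact input length. A standalone sketch of the same logic; the array sizes below are hypothetical:

import numpy as np

def pad_array(arr, target_length):
    # same logic as the helper in server/voice_changer/VoiceChanger.py:
    # center the signal and zero-pad both ends up to target_length
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr
    pad_width = target_length - current_length
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left
    return np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))

out_block = np.ones(4090, dtype=np.int16)        # hypothetical: output came back 6 samples short
padded = pad_array(out_block, 4096)              # padded back to the input block size
print(padded.shape[0], padded[:3], padded[-3:])  # 4096 [0 0 0] [0 0 0]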