diff --git a/server/.vscode/settings.json b/server/.vscode/settings.json new file mode 100644 index 00000000..88a7b800 --- /dev/null +++ b/server/.vscode/settings.json @@ -0,0 +1,16 @@ +{ + "workbench.colorCustomizations": { + "tab.activeBackground": "#65952acc" + }, + "python.formatting.provider": "black", + "python.linting.mypyEnabled": true, + "[python]": { + "editor.defaultFormatter": null, // don't let Prettier format Python + "editor.formatOnSave": true // auto-format on save + }, + "flake8.args": [ + "--ignore=E501" + // "--max-line-length=150", + // "--max-complexity=20" + ] +} diff --git a/server/MMVCServerSIO.py b/server/MMVCServerSIO.py index bb37fb57..3a5ff55c 100755 --- a/server/MMVCServerSIO.py +++ b/server/MMVCServerSIO.py @@ -2,12 +2,12 @@ import sys from distutils.util import strtobool from datetime import datetime -from dataclasses import dataclass -import misc.log_control import socket import platform import os import argparse +from voice_changer.utils.VoiceChangerParams import VoiceChangerParams + import uvicorn from mods.ssl import create_self_signed_cert from voice_changer.VoiceChangerManager import VoiceChangerManager @@ -21,30 +21,48 @@ import multiprocessing as mp def setupArgParser(): parser = argparse.ArgumentParser() parser.add_argument("-p", type=int, default=18888, help="port") - parser.add_argument("--https", type=strtobool, - default=False, help="use https") - parser.add_argument("--httpsKey", type=str, - default="ssl.key", help="path for the key of https") - parser.add_argument("--httpsCert", type=str, - default="ssl.cert", help="path for the cert of https") - parser.add_argument("--httpsSelfSigned", type=strtobool, - default=True, help="generate self-signed certificate") + parser.add_argument("--https", type=strtobool, default=False, help="use https") + parser.add_argument( + "--httpsKey", type=str, default="ssl.key", help="path for the key of https" + ) + parser.add_argument( + "--httpsCert", type=str, default="ssl.cert", help="path for the cert of https" + ) + parser.add_argument( + "--httpsSelfSigned", + type=strtobool, + default=True, + help="generate self-signed certificate", + ) - # parser.add_argument("--internal", type=strtobool, default=False, help="convert various paths to the mac app bundle internals") - - parser.add_argument("--content_vec_500", type=str, help="path to content_vec_500 model(pytorch)") - parser.add_argument("--content_vec_500_onnx", type=str, help="path to content_vec_500 model(onnx)") - parser.add_argument("--content_vec_500_onnx_on", type=strtobool, default=False, help="use or not onnx for content_vec_500") - parser.add_argument("--hubert_base", type=str, help="path to hubert_base model(pytorch)") - parser.add_argument("--hubert_soft", type=str, help="path to hubert_soft model(pytorch)") - parser.add_argument("--nsf_hifigan", type=str, help="path to nsf_hifigan model(pytorch)") + parser.add_argument( + "--content_vec_500", type=str, help="path to content_vec_500 model(pytorch)" + ) + parser.add_argument( + "--content_vec_500_onnx", type=str, help="path to content_vec_500 model(onnx)" + ) + parser.add_argument( + "--content_vec_500_onnx_on", + type=strtobool, + default=False, + help="use or not onnx for content_vec_500", + ) + parser.add_argument( + "--hubert_base", type=str, help="path to hubert_base model(pytorch)" + ) + parser.add_argument( + "--hubert_soft", type=str, help="path to hubert_soft model(pytorch)" + ) + parser.add_argument( + "--nsf_hifigan", type=str, help="path to nsf_hifigan model(pytorch)" + ) return parser def printMessage(message, level=0): pf = 
platform.system() - if pf == 'Windows': + if pf == "Windows": if level == 0: print(f"{message}") elif level == 1: @@ -78,37 +96,38 @@ def localServer(): host="0.0.0.0", port=int(PORT), reload=False if hasattr(sys, "_MEIPASS") else True, - log_level="warning" + log_level="warning", ) -if __name__ == 'MMVCServerSIO': - voiceChangerManager = VoiceChangerManager.get_instance({ - "content_vec_500": args.content_vec_500, - "content_vec_500_onnx": args.content_vec_500_onnx, - "content_vec_500_onnx_on": args.content_vec_500_onnx_on, - "hubert_base": args.hubert_base, - "hubert_soft": args.hubert_soft, - "nsf_hifigan": args.nsf_hifigan, - }) +if __name__ == "MMVCServerSIO": + voiceChangerParams = VoiceChangerParams( + content_vec_500=args.content_vec_500, + content_vec_500_onnx=args.content_vec_500_onnx, + content_vec_500_onnx_on=args.content_vec_500_onnx_on, + hubert_base=args.hubert_base, + hubert_soft=args.hubert_soft, + nsf_hifigan=args.nsf_hifigan, + ) + voiceChangerManager = VoiceChangerManager.get_instance(voiceChangerParams) + print("voiceChangerManager", voiceChangerManager) app_fastapi = MMVC_Rest.get_instance(voiceChangerManager) app_socketio = MMVC_SocketIOApp.get_instance(app_fastapi, voiceChangerManager) -if __name__ == '__mp_main__': - printMessage(f"サーバプロセスを起動しています。", level=2) +if __name__ == "__mp_main__": + printMessage("サーバプロセスを起動しています。", level=2) -if __name__ == '__main__': +if __name__ == "__main__": mp.freeze_support() - printMessage(f"Voice Changerを起動しています。", level=2) + printMessage("Voice Changerを起動しています。", level=2) PORT = args.p if os.getenv("EX_PORT"): EX_PORT = os.environ["EX_PORT"] - printMessage( - f"External_Port:{EX_PORT} Internal_Port:{PORT}", level=1) + printMessage(f"External_Port:{EX_PORT} Internal_Port:{PORT}", level=1) else: printMessage(f"Internal_Port:{PORT}", level=1) @@ -123,38 +142,42 @@ if __name__ == '__main__': key_base_name = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}" keyname = f"{key_base_name}.key" certname = f"{key_base_name}.cert" - create_self_signed_cert(certname, keyname, certargs={"Country": "JP", - "State": "Tokyo", - "City": "Chuo-ku", - "Organization": "F", - "Org. Unit": "F"}, cert_dir=SSL_KEY_DIR) + create_self_signed_cert( + certname, + keyname, + certargs={ + "Country": "JP", + "State": "Tokyo", + "City": "Chuo-ku", + "Organization": "F", + "Org. 
Unit": "F", + }, + cert_dir=SSL_KEY_DIR, + ) key_path = os.path.join(SSL_KEY_DIR, keyname) cert_path = os.path.join(SSL_KEY_DIR, certname) printMessage( - f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1) + f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1 + ) elif args.https and args.httpsSelfSigned == 0: # HTTPS key_path = args.httpsKey cert_path = args.httpsCert - printMessage( - f"protocol: HTTPS, key:{key_path}, cert:{cert_path}", level=1) + printMessage(f"protocol: HTTPS, key:{key_path}, cert:{cert_path}", level=1) else: # HTTP - printMessage(f"protocol: HTTP", level=1) - printMessage(f"-- ---- -- ", level=1) + printMessage("protocol: HTTP", level=1) + printMessage("-- ---- -- ", level=1) # アドレス表示 - printMessage( - f"ブラウザで次のURLを開いてください.", level=2) + printMessage("ブラウザで次のURLを開いてください.", level=2) if args.https == 1: - printMessage( - f"https://:/", level=1) + printMessage("https://:/", level=1) else: - printMessage( - f"http://:/", level=1) + printMessage("http://:/", level=1) - printMessage(f"多くの場合は次のいずれかのURLにアクセスすると起動します。", level=2) + printMessage("多くの場合は次のいずれかのURLにアクセスすると起動します。", level=2) if "EX_PORT" in locals() and "EX_IP" in locals(): # シェルスクリプト経由起動(docker) if args.https == 1: printMessage(f"https://localhost:{EX_PORT}/", level=1) @@ -175,7 +198,7 @@ if __name__ == '__main__': # サーバ起動 if args.https: # HTTPS サーバ起動 - res = uvicorn.run( + uvicorn.run( f"{os.path.basename(__file__)[:-3]}:app_socketio", host="0.0.0.0", port=int(PORT), @@ -188,13 +211,17 @@ if __name__ == '__main__': p = mp.Process(name="p", target=localServer) p.start() try: - if sys.platform.startswith('win'): - process = subprocess.Popen([NATIVE_CLIENT_FILE_WIN, "-u", f"http://localhost:{PORT}/"]) + if sys.platform.startswith("win"): + process = subprocess.Popen( + [NATIVE_CLIENT_FILE_WIN, "-u", f"http://localhost:{PORT}/"] + ) return_code = process.wait() print("client closed.") p.terminate() - elif sys.platform.startswith('darwin'): - process = subprocess.Popen([NATIVE_CLIENT_FILE_MAC, "-u", f"http://localhost:{PORT}/"]) + elif sys.platform.startswith("darwin"): + process = subprocess.Popen( + [NATIVE_CLIENT_FILE_MAC, "-u", f"http://localhost:{PORT}/"] + ) return_code = process.wait() print("client closed.") p.terminate() diff --git a/server/const.py b/server/const.py index 3519a6c7..fe877166 100644 --- a/server/const.py +++ b/server/const.py @@ -26,14 +26,6 @@ TMP_DIR = os.path.join(tmpdir.name, "tmp_dir") if hasattr(sys, "_MEIPASS") else os.makedirs(TMP_DIR, exist_ok=True) -# modelType: ModelType = "MMVCv15" -# def getModelType() -> ModelType: -# return modelType -# def setModelType(_modelType: ModelType): -# global modelType -# modelType = _modelType - - def getFrontendPath(): frontend_path = os.path.join(sys._MEIPASS, "dist") if hasattr(sys, "_MEIPASS") else "../client/demo/dist" return frontend_path diff --git a/server/restapi/MMVC_Rest.py b/server/restapi/MMVC_Rest.py index e1b0a947..61652c4b 100644 --- a/server/restapi/MMVC_Rest.py +++ b/server/restapi/MMVC_Rest.py @@ -1,7 +1,8 @@ -from fastapi import FastAPI, Request, Response +from fastapi import FastAPI, Request, Response, HTTPException from fastapi.routing import APIRoute from fastapi.middleware.cors import CORSMiddleware from fastapi.staticfiles import StaticFiles +from fastapi.exceptions import RequestValidationError from typing import Callable from voice_changer.VoiceChangerManager import VoiceChangerManager @@ -18,7 +19,7 @@ class ValidationErrorLoggingRoute(APIRoute): async def 
custom_route_handler(request: Request) -> Response: try: return await original_route_handler(request) - except Exception as exc: + except RequestValidationError as exc: print("Exception", request.url, str(exc)) body = await request.body() detail = {"errors": exc.errors(), "body": body.decode()} @@ -28,10 +29,11 @@ class ValidationErrorLoggingRoute(APIRoute): class MMVC_Rest: + _instance = None @classmethod def get_instance(cls, voiceChangerManager: VoiceChangerManager): - if not hasattr(cls, "_instance"): + if cls._instance is None: app_fastapi = FastAPI() app_fastapi.router.route_class = ValidationErrorLoggingRoute app_fastapi.add_middleware( @@ -43,15 +45,25 @@ class MMVC_Rest: ) app_fastapi.mount( - "/front", StaticFiles(directory=f'{getFrontendPath()}', html=True), name="static") + "/front", + StaticFiles(directory=f"{getFrontendPath()}", html=True), + name="static", + ) app_fastapi.mount( - "/trainer", StaticFiles(directory=f'{getFrontendPath()}', html=True), name="static") + "/trainer", + StaticFiles(directory=f"{getFrontendPath()}", html=True), + name="static", + ) app_fastapi.mount( - "/recorder", StaticFiles(directory=f'{getFrontendPath()}', html=True), name="static") + "/recorder", + StaticFiles(directory=f"{getFrontendPath()}", html=True), + name="static", + ) app_fastapi.mount( - "/tmp", StaticFiles(directory=f'{TMP_DIR}'), name="static") + "/tmp", StaticFiles(directory=f"{TMP_DIR}"), name="static" + ) restHello = MMVC_Rest_Hello() app_fastapi.include_router(restHello.router) diff --git a/server/restapi/MMVC_Rest_Fileuploader.py b/server/restapi/MMVC_Rest_Fileuploader.py index a4ef0ff4..fa1b5a93 100644 --- a/server/restapi/MMVC_Rest_Fileuploader.py +++ b/server/restapi/MMVC_Rest_Fileuploader.py @@ -4,12 +4,13 @@ from typing import Union from fastapi import APIRouter from fastapi.encoders import jsonable_encoder from fastapi.responses import JSONResponse -from fastapi import HTTPException, FastAPI, UploadFile, File, Form +from fastapi import UploadFile, File, Form from restapi.mods.FileUploader import upload_file, concat_file_chunks from voice_changer.VoiceChangerManager import VoiceChangerManager from const import MODEL_DIR, UPLOAD_DIR, ModelType + os.makedirs(UPLOAD_DIR, exist_ok=True) os.makedirs(MODEL_DIR, exist_ok=True) @@ -19,12 +20,22 @@ class MMVC_Rest_Fileuploader: self.voiceChangerManager = voiceChangerManager self.router = APIRouter() self.router.add_api_route("/info", self.get_info, methods=["GET"]) - self.router.add_api_route("/upload_file", self.post_upload_file, methods=["POST"]) - self.router.add_api_route("/concat_uploaded_file", self.post_concat_uploaded_file, methods=["POST"]) - self.router.add_api_route("/update_settings", self.post_update_settings, methods=["POST"]) + self.router.add_api_route( + "/upload_file", self.post_upload_file, methods=["POST"] + ) + self.router.add_api_route( + "/concat_uploaded_file", self.post_concat_uploaded_file, methods=["POST"] + ) + self.router.add_api_route( + "/update_settings", self.post_update_settings, methods=["POST"] + ) self.router.add_api_route("/load_model", self.post_load_model, methods=["POST"]) - self.router.add_api_route("/load_model_for_train", self.post_load_model_for_train, methods=["POST"]) - self.router.add_api_route("/extract_voices", self.post_extract_voices, methods=["POST"]) + self.router.add_api_route( + "/load_model_for_train", self.post_load_model_for_train, methods=["POST"] + ) + self.router.add_api_route( + "/extract_voices", self.post_extract_voices, methods=["POST"] + ) 
self.router.add_api_route("/model_type", self.post_model_type, methods=["POST"]) self.router.add_api_route("/model_type", self.get_model_type, methods=["GET"]) self.router.add_api_route("/onnx", self.get_onnx, methods=["GET"]) @@ -34,9 +45,13 @@ class MMVC_Rest_Fileuploader: json_compatible_item_data = jsonable_encoder(res) return JSONResponse(content=json_compatible_item_data) - def post_concat_uploaded_file(self, filename: str = Form(...), filenameChunkNum: int = Form(...)): + def post_concat_uploaded_file( + self, filename: str = Form(...), filenameChunkNum: int = Form(...) + ): slot = 0 - res = concat_file_chunks(slot, UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR) + res = concat_file_chunks( + slot, UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR + ) json_compatible_item_data = jsonable_encoder(res) return JSONResponse(content=json_compatible_item_data) @@ -45,7 +60,9 @@ class MMVC_Rest_Fileuploader: json_compatible_item_data = jsonable_encoder(info) return JSONResponse(content=json_compatible_item_data) - def post_update_settings(self, key: str = Form(...), val: Union[int, str, float] = Form(...)): + def post_update_settings( + self, key: str = Form(...), val: Union[int, str, float] = Form(...) + ): print("post_update_settings", key, val) info = self.voiceChangerManager.update_settings(key, val) json_compatible_item_data = jsonable_encoder(info) @@ -63,7 +80,6 @@ class MMVC_Rest_Fileuploader: isHalf: bool = Form(...), params: str = Form(...), ): - props = { "slot": slot, "isHalf": isHalf, @@ -73,9 +89,9 @@ class MMVC_Rest_Fileuploader: "onnxModelFilename": onnxModelFilename, "clusterTorchModelFilename": clusterTorchModelFilename, "featureFilename": featureFilename, - "indexFilename": indexFilename + "indexFilename": indexFilename, }, - "params": params + "params": params, } # Change Filepath for key, val in props["files"].items(): @@ -103,9 +119,11 @@ class MMVC_Rest_Fileuploader: modelDFilenameChunkNum: int = Form(...), ): modelGFilePath = concat_file_chunks( - UPLOAD_DIR, modelGFilename, modelGFilenameChunkNum, MODEL_DIR) + UPLOAD_DIR, modelGFilename, modelGFilenameChunkNum, MODEL_DIR + ) modelDFilePath = concat_file_chunks( - UPLOAD_DIR, modelDFilename, modelDFilenameChunkNum, MODEL_DIR) + UPLOAD_DIR, modelDFilename, modelDFilenameChunkNum, MODEL_DIR + ) return {"File saved": f"{modelGFilePath}, {modelDFilePath}"} def post_extract_voices( @@ -114,7 +132,8 @@ class MMVC_Rest_Fileuploader: zipFileChunkNum: int = Form(...), ): zipFilePath = concat_file_chunks( - UPLOAD_DIR, zipFilename, zipFileChunkNum, UPLOAD_DIR) + UPLOAD_DIR, zipFilename, zipFileChunkNum, UPLOAD_DIR + ) shutil.unpack_archive(zipFilePath, "MMVC_Trainer/dataset/textful/") return {"Zip file unpacked": f"{zipFilePath}"} diff --git a/server/restapi/MMVC_Rest_Hello.py b/server/restapi/MMVC_Rest_Hello.py index aa3eebaf..4966d57e 100644 --- a/server/restapi/MMVC_Rest_Hello.py +++ b/server/restapi/MMVC_Rest_Hello.py @@ -1,6 +1,6 @@ from fastapi import APIRouter -from fastapi.encoders import jsonable_encoder -from fastapi.responses import JSONResponse + + class MMVC_Rest_Hello: def __init__(self): self.router = APIRouter() @@ -8,6 +8,3 @@ class MMVC_Rest_Hello: def hello(self): return {"result": "Index"} - - - diff --git a/server/restapi/MMVC_Rest_VoiceChanger.py b/server/restapi/MMVC_Rest_VoiceChanger.py index 45f87d69..3be438ae 100644 --- a/server/restapi/MMVC_Rest_VoiceChanger.py +++ b/server/restapi/MMVC_Rest_VoiceChanger.py @@ -31,24 +31,24 @@ class MMVC_Rest_VoiceChanger: buffer = voice.buffer wav = 
base64.b64decode(buffer) - if wav == 0: - samplerate, data = read("dummy.wav") - unpackedData = data - else: - unpackedData = np.array(struct.unpack( - '<%sh' % (len(wav) // struct.calcsize('<h')), wav)) diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num: self.settings.gpu = 0 - provider_options = [{'device_id': self.settings.gpu}] - self.onnx_session.set_providers(providers=[val], provider_options=provider_options) + provider_options = [{"device_id": self.settings.gpu}] + self.onnx_session.set_providers( + providers=[val], provider_options=provider_options + ) if hasattr(self, "hubert_onnx"): - self.hubert_onnx.set_providers(providers=[val], provider_options=provider_options) + self.hubert_onnx.set_providers( + providers=[val], provider_options=provider_options + ) else: self.onnx_session.set_providers(providers=[val]) if hasattr(self, "hubert_onnx"): @@ -294,12 +341,20 @@ class RVC: print("Onnx is not enabled. Please load model.") return False elif key in self.settings.intData: - if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session != None: + if ( + key == "gpu" + and val >= 0 + and val < self.gpu_num + and self.onnx_session != None + ): providers = self.onnx_session.get_providers() print("Providers:", providers) if "CUDAExecutionProvider" in providers: - provider_options = [{'device_id': self.settings.gpu}] - self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options) + provider_options = [{"device_id": self.settings.gpu}] + self.onnx_session.set_providers( + providers=["CUDAExecutionProvider"], + provider_options=provider_options, + ) if key == "modelSlotIndex": # self.switchModel(int(val)) val = int(val) % 1000 # Quick hack for when the same slot is selected @@ -318,7 +373,9 @@ class RVC: def get_info(self): data = asdict(self.settings) - data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else [] + data["onnxExecutionProviders"] = ( + self.onnx_session.get_providers() if self.onnx_session != None else [] + ) files = ["configFile", "pyTorchModelFile", "onnxModelFile"] for f in files: if data[f] != None and os.path.exists(data[f]): @@ -331,22 +388,30 @@ class RVC: def get_processing_sampling_rate(self): return self.settings.modelSamplingRate - def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0): + def generate_input( + self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0 + ): newData = newData.astype(np.float32) / 32768.0 if hasattr(self, "audio_buffer"): - self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) # concatenate with the previous data + self.audio_buffer = np.concatenate( + [self.audio_buffer, newData], 0 + ) # concatenate with the previous data else: self.audio_buffer = newData - convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize + convertSize = ( + inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize + ) if convertSize % 128 != 0: # pad so the model's output hop size causes no truncation. convertSize = convertSize + (128 - (convertSize % 128)) - self.audio_buffer = self.audio_buffer[-1 * convertSize:] # extract only the conversion target + self.audio_buffer = self.audio_buffer[-1 * convertSize :] # extract only the conversion target - crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)] # crop only the output part and check its volume. (the relation to SOLA is not considered yet) + crop = self.audio_buffer[ + -1 * (inputSize + crossfadeSize) : -1 * (crossfadeSize) + ] # crop only the output part and check its volume. (the relation to SOLA is not considered yet) rms = np.sqrt(np.square(crop).mean(axis=0)) vol = max(rms, self.prevVol * 0.0) self.prevVol = vol
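A note on the 128-sample rounding in generate_input above: the model emits audio in hops of 128 samples, so the conversion window is rounded up to the next hop boundary before slicing the buffer. A minimal, self-contained sketch of that arithmetic (the frame sizes below are illustrative, not the changer's real settings):

HOP_SIZE = 128  # output hop size of the model

def rounded_convert_size(input_size, crossfade_size, sola_search_frame, extra_convert_size):
    # same rounding as generate_input: pad up to the next hop boundary
    convert_size = input_size + crossfade_size + sola_search_frame + extra_convert_size
    if convert_size % HOP_SIZE != 0:
        convert_size += HOP_SIZE - (convert_size % HOP_SIZE)
    return convert_size

assert rounded_convert_size(4096, 1024, 512, 100) % HOP_SIZE == 0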
@@ -390,15 +455,34 @@ class RVC: f0 = self.settings.modelSlots[self.currentSlot].f0 embChannels = self.settings.modelSlots[self.currentSlot].embChannels - audio_out = vc.pipeline(self.hubert_model, self.onnx_session, sid, audio, times, f0_up_key, f0_method, - file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, embChannels=embChannels) + audio_out = vc.pipeline( + self.hubert_model, + self.onnx_session, + sid, + audio, + times, + f0_up_key, + f0_method, + file_index, + file_big_npy, + index_rate, + if_f0, + f0_file=f0_file, + silence_front=self.settings.extraConvertSize + / self.settings.modelSamplingRate, + embChannels=embChannels, + ) result = audio_out * np.sqrt(vol) return result def _pyTorch_inference(self, data): if hasattr(self, "net_g") == False or self.net_g == None: - print("[Voice Changer] No pyTorch session.", hasattr(self, "net_g"), self.net_g) + print( + "[Voice Changer] No pyTorch session.", + hasattr(self, "net_g"), + self.net_g, + ) raise NoModeLoadedException("pytorch") if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled == False): @@ -436,8 +520,23 @@ class RVC: f0_file = None embChannels = self.settings.modelSlots[self.currentSlot].embChannels - audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method, - file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, embChannels=embChannels) + audio_out = vc.pipeline( + self.hubert_model, + self.net_g, + sid, + audio, + times, + f0_up_key, + f0_method, + file_index, + file_big_npy, + index_rate, + if_f0, + f0_file=f0_file, + silence_front=self.settings.extraConvertSize + / self.settings.modelSamplingRate, + embChannels=embChannels, + ) result = audio_out * np.sqrt(vol) @@ -445,7 +544,11 @@ class RVC: def inference(self, data): if self.settings.modelSlotIndex < 0: - print("[Voice Changer] wait for loading model...", self.settings.modelSlotIndex, self.currentSlot) + print( + "[Voice Changer] wait for loading model...", + self.settings.modelSlotIndex, + self.currentSlot, + ) raise NoModeLoadedException("model_common") if self.currentSlot != self.settings.modelSlotIndex: @@ -482,7 +585,9 @@ print("[Voice Changer] export2onnx, No pyTorch session.") return {"status": "ng", "path": f""} - pyTorchModelFile = self.settings.modelSlots[self.settings.modelSlotIndex].pyTorchModelFile # slot, not currentSlot, so export is possible before inference + pyTorchModelFile = self.settings.modelSlots[ + self.settings.modelSlotIndex + ].pyTorchModelFile # slot, not currentSlot, so export is possible before inference if pyTorchModelFile == None: print("[Voice Changer] export2onnx, No pyTorch filepath.") @@ -490,23 +595,45 @@ import voice_changer.RVC.export2onnx as onnxExporter output_file = os.path.splitext(os.path.basename(pyTorchModelFile))[0] + ".onnx" - output_file_simple = os.path.splitext(os.path.basename(pyTorchModelFile))[0] + "_simple.onnx" + output_file_simple = ( + os.path.splitext(os.path.basename(pyTorchModelFile))[0] + "_simple.onnx" + ) output_path = os.path.join(TMP_DIR, output_file) output_path_simple = os.path.join(TMP_DIR, output_file_simple) - print("embChannels", self.settings.modelSlots[self.settings.modelSlotIndex].embChannels) + print( + "embChannels", + self.settings.modelSlots[self.settings.modelSlotIndex].embChannels, + ) metadata = { "application": "VC_CLIENT", "version": "1", - "modelType": self.settings.modelSlots[self.settings.modelSlotIndex].modelType, - "samplingRate": self.settings.modelSlots[self.settings.modelSlotIndex].samplingRate, + "modelType": self.settings.modelSlots[ + self.settings.modelSlotIndex + ].modelType, + "samplingRate": self.settings.modelSlots[ + self.settings.modelSlotIndex + ].samplingRate, "f0": self.settings.modelSlots[self.settings.modelSlotIndex].f0, - "embChannels": self.settings.modelSlots[self.settings.modelSlotIndex].embChannels, + "embChannels": self.settings.modelSlots[ + self.settings.modelSlotIndex + ].embChannels, + "embedder": self.settings.modelSlots[self.settings.modelSlotIndex].embedder, } if torch.cuda.device_count() > 0: - onnxExporter.export2onnx(pyTorchModelFile, output_path, output_path_simple, True, metadata) + onnxExporter.export2onnx( + pyTorchModelFile, output_path, output_path_simple, True, metadata + ) else: - print("[Voice Changer] Warning!!! onnx export with float32. maybe size is doubled.") - onnxExporter.export2onnx(pyTorchModelFile, output_path, output_path_simple, False, metadata) + print( + "[Voice Changer] Warning!!! onnx export with float32. maybe size is doubled." + ) + onnxExporter.export2onnx( + pyTorchModelFile, output_path, output_path_simple, False, metadata + ) - return {"status": "ok", "path": f"/tmp/{output_file_simple}", "filename": output_file_simple} + return { + "status": "ok", + "path": f"/tmp/{output_file_simple}", + "filename": output_file_simple, + }
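For context, the metadata dict embedded at export time travels with the ONNX file and can be read back through onnxruntime's model-metadata API. A rough sketch of such a readback (the file name is a placeholder, and the exact key layout depends on how export2onnx writes the metadata props):

import onnxruntime

# "model_simple.onnx" stands in for the exported file placed in TMP_DIR
session = onnxruntime.InferenceSession(
    "model_simple.onnx", providers=["CPUExecutionProvider"]
)
meta = session.get_modelmeta().custom_metadata_map  # dict of embedded key/value pairs
print(meta.get("samplingRate"), meta.get("embChannels"), meta.get("embedder"))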
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py index dcbbdce8..f60045ec 100755 --- a/server/voice_changer/VoiceChanger.py +++ b/server/voice_changer/VoiceChanger.py @@ -9,14 +9,18 @@ import resampy from voice_changer.IORecorder import IORecorder -# from voice_changer.IOAnalyzer import IOAnalyzer from voice_changer.utils.Timer import Timer from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut -import time from Exceptions import NoModeLoadedException, ONNXInputArgumentException +from voice_changer.utils.VoiceChangerParams import VoiceChangerParams -providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"] +providers = [ + "OpenVINOExecutionProvider", + "CUDAExecutionProvider", + "DmlExecutionProvider", + "CPUExecutionProvider", +] STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav") STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav") @@ -25,7 +29,7 @@ STREAM_ANALYZE_FILE_HARVEST = os.path.join(TMP_DIR, "analyze-harvest.png") @dataclass -class VoiceChangerSettings(): +class VoiceChangerSettings: inputSampleRate: int = 48000 # 48000 or 24000 crossFadeOffsetRate: float = 0.1 @@ -41,16 +45,14 @@ class VoiceChangerSettings(): floatData: list[str] = field( default_factory=lambda: ["crossFadeOffsetRate", "crossFadeEndRate"] ) - strData: list[str] = field( - default_factory=lambda: [] - ) + strData: list[str] = field(default_factory=lambda: []) -class VoiceChanger(): +class VoiceChanger: settings: VoiceChangerSettings voiceChanger: VoiceChangerModel - def __init__(self, params): + def __init__(self, params: VoiceChangerParams): # initialization self.settings = VoiceChangerSettings() self.onnx_session = None @@ -64,9 +66,14 @@ class VoiceChanger(): self.params = params self.gpu_num = torch.cuda.device_count() self.prev_audio = np.zeros(4096) - self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available() + self.mps_enabled: bool = ( + getattr(torch.backends, "mps", None) is not None + and 
torch.backends.mps.is_available() + ) - print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})") + print( + f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})" + ) def switchModelType(self, modelType: ModelType): if hasattr(self, "voiceChanger") and self.voiceChanger != None: @@ -77,24 +84,31 @@ class VoiceChanger(): self.modelType = modelType if self.modelType == "MMVCv15": from voice_changer.MMVCv15.MMVCv15 import MMVCv15 + self.voiceChanger = MMVCv15() # type: ignore elif self.modelType == "MMVCv13": from voice_changer.MMVCv13.MMVCv13 import MMVCv13 + self.voiceChanger = MMVCv13() elif self.modelType == "so-vits-svc-40v2": from voice_changer.SoVitsSvc40v2.SoVitsSvc40v2 import SoVitsSvc40v2 + self.voiceChanger = SoVitsSvc40v2(self.params) elif self.modelType == "so-vits-svc-40" or self.modelType == "so-vits-svc-40_c": from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40 + self.voiceChanger = SoVitsSvc40(self.params) elif self.modelType == "DDSP-SVC": from voice_changer.DDSP_SVC.DDSP_SVC import DDSP_SVC + self.voiceChanger = DDSP_SVC(self.params) elif self.modelType == "RVC": from voice_changer.RVC.RVC import RVC + self.voiceChanger = RVC(self.params) else: from voice_changer.MMVCv13.MMVCv13 import MMVCv13 + self.voiceChanger = MMVCv13() return {"status": "OK", "msg": "vc is switched."} @@ -109,7 +123,6 @@ class VoiceChanger(): self, props, ): - try: return self.voiceChanger.loadModel(props) except Exception as e: @@ -143,7 +156,9 @@ class VoiceChanger(): if key == "recordIO" and val == 1: if hasattr(self, "ioRecorder"): self.ioRecorder.close() - self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate) + self.ioRecorder = IORecorder( + STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate + ) if key == "recordIO" and val == 0: if hasattr(self, "ioRecorder"): self.ioRecorder.close() @@ -174,12 +189,12 @@ class VoiceChanger(): return self.get_info() def _generate_strength(self, crossfadeSize: int): - - if self.crossfadeSize != crossfadeSize or \ - self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or \ - self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or \ - self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize: - + if ( + self.crossfadeSize != crossfadeSize + or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate + or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate + or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize + ): self.crossfadeSize = crossfadeSize self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate self.currentCrossFadeEndRate = self.settings.crossFadeEndRate @@ -193,30 +208,54 @@ class VoiceChanger(): np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2 np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2 - self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength, - np.zeros(crossfadeSize - cf_offset - len(np_prev_strength))]) - self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(crossfadeSize - cf_offset - len(np_cur_strength))]) + self.np_prev_strength = np.concatenate( + [ + np.ones(cf_offset), + np_prev_strength, + np.zeros(crossfadeSize - cf_offset - len(np_prev_strength)), + ] + ) + self.np_cur_strength = np.concatenate( + [ + np.zeros(cf_offset), + np_cur_strength, + np.ones(crossfadeSize - cf_offset - len(np_cur_strength)), + ] + ) - print(f"Generated Strengths: 
for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}") + print( + f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}" + ) # the size differs from the previous result, so clear the recorded buffers. - if hasattr(self, 'np_prev_audio1') == True: + if hasattr(self, "np_prev_audio1") == True: delattr(self, "np_prev_audio1") if hasattr(self, "sola_buffer"): del self.sola_buffer # receivedData: tuple of short - def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]: + def on_request( + self, receivedData: AudioInOut + ) -> tuple[AudioInOut, list[Union[int, float]]]: return self.on_request_sola(receivedData) - def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]: + def on_request_sola( + self, receivedData: AudioInOut + ) -> tuple[AudioInOut, list[Union[int, float]]]: try: processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate() # preprocessing with Timer("pre-process") as t: if self.settings.inputSampleRate != processing_sampling_rate: - newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate)) + newData = cast( + AudioInOut, + resampy.resample( + receivedData, + self.settings.inputSampleRate, + processing_sampling_rate, + ), + ) else: newData = receivedData @@ -226,7 +265,9 @@ class VoiceChanger(): crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame) self._generate_strength(crossfade_frame) - data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame) + data = self.voiceChanger.generate_input( + newData, block_frame, crossfade_frame, sola_search_frame + ) preprocess_time = t.secs # conversion with Timer("main-process") as t: # Inference audio = self.voiceChanger.inference(data) - if hasattr(self, 'sola_buffer') == True: + if hasattr(self, "sola_buffer") == True: np.set_printoptions(threshold=10000) - audio = audio[-sola_search_frame - crossfade_frame - block_frame:] + audio = audio[-sola_search_frame - crossfade_frame - block_frame :] # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI - cor_nom = np.convolve(audio[: crossfade_frame + sola_search_frame], np.flip(self.sola_buffer), 'valid') - cor_den = np.sqrt(np.convolve(audio[: crossfade_frame + sola_search_frame] ** 2, np.ones(crossfade_frame), 'valid') + 1e-3) + cor_nom = np.convolve( + audio[: crossfade_frame + sola_search_frame], + np.flip(self.sola_buffer), + "valid", + ) + cor_den = np.sqrt( + np.convolve( + audio[: crossfade_frame + sola_search_frame] ** 2, + np.ones(crossfade_frame), + "valid", + ) + + 1e-3 + ) sola_offset = np.argmax(cor_nom / cor_den) - output_wav = audio[sola_offset: sola_offset + block_frame].astype(np.float64) + output_wav = audio[sola_offset : sola_offset + block_frame].astype( + np.float64 + ) output_wav[:crossfade_frame] *= self.np_cur_strength output_wav[:crossfade_frame] += self.sola_buffer[:] @@ -251,11 +305,19 @@ else: print("[Voice Changer] no sola buffer. (You can ignore this.)") result = np.zeros(4096).astype(np.int16) - if hasattr(self, 'sola_buffer') == True and sola_offset < sola_search_frame: - sola_buf_org = audio[- sola_search_frame - crossfade_frame + sola_offset: -sola_search_frame + sola_offset] + if ( + hasattr(self, "sola_buffer") == True + and sola_offset < sola_search_frame + ): + sola_buf_org = audio[ + -sola_search_frame + - crossfade_frame + + sola_offset : -sola_search_frame + + sola_offset + ] self.sola_buffer = sola_buf_org * self.np_prev_strength else: - self.sola_buffer = audio[- crossfade_frame:] * self.np_prev_strength + self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength # self.sola_buffer = audio[- crossfade_frame:] mainprocess_time = t.secs
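The SOLA block above picks the splice point by maximizing a normalized cross-correlation between the freshly converted audio and the faded-out tail of the previous block. A self-contained sketch of the same search (frame sizes and signals are made up for illustration):

import numpy as np

crossfade_frame = 128
sola_search_frame = 64

rng = np.random.default_rng(0)
audio = rng.standard_normal(4096).astype(np.float32)  # newly converted block
sola_buffer = rng.standard_normal(crossfade_frame).astype(np.float32)  # faded tail of the previous block

head = audio[: crossfade_frame + sola_search_frame]
# correlation of the new head against the previous tail, normalized by local energy
cor_nom = np.convolve(head, np.flip(sola_buffer), "valid")
cor_den = np.sqrt(np.convolve(head**2, np.ones(crossfade_frame), "valid") + 1e-3)
sola_offset = int(np.argmax(cor_nom / cor_den))  # 0 <= sola_offset <= sola_search_frame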
(You can ignore this.)") result = np.zeros(4096).astype(np.int16) - if hasattr(self, 'sola_buffer') == True and sola_offset < sola_search_frame: - sola_buf_org = audio[- sola_search_frame - crossfade_frame + sola_offset: -sola_search_frame + sola_offset] + if ( + hasattr(self, "sola_buffer") == True + and sola_offset < sola_search_frame + ): + sola_buf_org = audio[ + -sola_search_frame + - crossfade_frame + + sola_offset : -sola_search_frame + + sola_offset + ] self.sola_buffer = sola_buf_org * self.np_prev_strength else: - self.sola_buffer = audio[- crossfade_frame:] * self.np_prev_strength + self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength # self.sola_buffer = audio[- crossfade_frame:] mainprocess_time = t.secs @@ -263,12 +325,20 @@ class VoiceChanger(): with Timer("post-process") as t: result = result.astype(np.int16) if self.settings.inputSampleRate != processing_sampling_rate: - outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16)) + outputData = cast( + AudioInOut, + resampy.resample( + result, + processing_sampling_rate, + self.settings.inputSampleRate, + ).astype(np.int16), + ) else: outputData = result print_convert_processing( - f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz") + f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz" + ) if self.settings.recordIO == 1: self.ioRecorder.writeInput(receivedData) @@ -281,7 +351,9 @@ class VoiceChanger(): # # f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz") postprocess_time = t.secs - print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}") + print_convert_processing( + f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}" + ) perf = [preprocess_time, mainprocess_time, postprocess_time] return outputData, perf @@ -299,8 +371,9 @@ class VoiceChanger(): def export2onnx(self): return self.voiceChanger.export2onnx() - ############## + + PRINT_CONVERT_PROCESSING: bool = False # PRINT_CONVERT_PROCESSING = True @@ -318,5 +391,7 @@ def pad_array(arr: AudioInOut, target_length: int): pad_width = target_length - current_length pad_left = pad_width // 2 pad_right = pad_width - pad_left - padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0)) + padded_arr = np.pad( + arr, (pad_left, pad_right), "constant", constant_values=(0, 0) + ) return padded_arr diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index 023be96e..03bb6149 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -1,12 +1,16 @@ import numpy as np from voice_changer.VoiceChanger import VoiceChanger from const import ModelType +from voice_changer.utils.VoiceChangerParams import VoiceChangerParams -class VoiceChangerManager(): +class VoiceChangerManager(object): + _instance = None + voiceChanger: VoiceChanger = None + @classmethod - def get_instance(cls, params): - if not hasattr(cls, "_instance"): + def get_instance(cls, params: VoiceChangerParams): + if cls._instance is None: cls._instance = cls() cls._instance.voiceChanger = VoiceChanger(params) return cls._instance @@ -20,7 +24,7 @@ class VoiceChangerManager(): return info def get_info(self): - if 
hasattr(self, 'voiceChanger'): + if hasattr(self, "voiceChanger"): info = self.voiceChanger.get_info() info["status"] = "OK" return info @@ -28,7 +32,7 @@ class VoiceChangerManager(): return {"status": "ERROR", "msg": "no model loaded"} def update_settings(self, key: str, val: any): - if hasattr(self, 'voiceChanger'): + if hasattr(self, "voiceChanger"): info = self.voiceChanger.update_settings(key, val) info["status"] = "OK" return info @@ -36,7 +40,7 @@ class VoiceChangerManager(): return {"status": "ERROR", "msg": "no model loaded"} def changeVoice(self, receivedData: any): - if hasattr(self, 'voiceChanger') == True: + if hasattr(self, "voiceChanger") is True: return self.voiceChanger.on_request(receivedData) else: print("Voice Change is not loaded. Did you load a correct model?") diff --git a/server/voice_changer/utils/VoiceChangerParams.py b/server/voice_changer/utils/VoiceChangerParams.py new file mode 100644 index 00000000..c755ba1d --- /dev/null +++ b/server/voice_changer/utils/VoiceChangerParams.py @@ -0,0 +1,11 @@ +from dataclasses import dataclass + + +@dataclass +class VoiceChangerParams(): + content_vec_500: str + content_vec_500_onnx: str + content_vec_500_onnx_on: bool + hubert_base: str + hubert_soft: str + nsf_hifigan: str
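Taken together with the MMVCServerSIO.py changes, this dataclass replaces the untyped dict that used to be passed to VoiceChangerManager.get_instance. A minimal sketch of the wiring (run from the server directory; the model paths are illustrative placeholders, not shipped defaults):

from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from voice_changer.VoiceChangerManager import VoiceChangerManager

params = VoiceChangerParams(
    content_vec_500="pretrain/checkpoint_best_legacy_500.pt",  # placeholder path
    content_vec_500_onnx="pretrain/content_vec_500.onnx",      # placeholder path
    content_vec_500_onnx_on=False,
    hubert_base="pretrain/hubert_base.pt",                     # placeholder path
    hubert_soft="pretrain/hubert_soft.pt",                     # placeholder path
    nsf_hifigan="pretrain/nsf_hifigan/model",                  # placeholder path
)

manager = VoiceChangerManager.get_instance(params)
assert manager is VoiceChangerManager.get_instance(params)  # class-level singleton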