refactoring

This commit is contained in:
wataru 2023-04-27 23:38:25 +09:00
parent a59631609c
commit 55118815b4
13 changed files with 525 additions and 280 deletions

16
server/.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,16 @@
{
"workbench.colorCustomizations": {
"tab.activeBackground": "#65952acc"
},
"python.formatting.provider": "black",
"python.linting.mypyEnabled": true,
"[python]": {
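"editor.defaultFormatter": null, // do not use Prettier for Python files; formatting comes from the provider configured above (black)
"editor.formatOnSave": true // format the file automatically on save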
},
"flake8.args": [
"--ignore=E501"
// "--max-line-length=150",
// "--max-complexity=20"
]
}

View File

@ -2,12 +2,12 @@ import sys
from distutils.util import strtobool
from datetime import datetime
from dataclasses import dataclass
import misc.log_control
import socket
import platform
import os
import argparse
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
import uvicorn
from mods.ssl import create_self_signed_cert
from voice_changer.VoiceChangerManager import VoiceChangerManager
@ -21,30 +21,48 @@ import multiprocessing as mp
def setupArgParser():
    """Build the command-line parser for the voice changer server.

    Each option is registered exactly once. Registering the same option
    string twice makes ``argparse`` raise a conflicting-option error, so the
    stale single-line registrations that duplicated the ones below are gone.

    Returns:
        argparse.ArgumentParser: parser exposing ``-p``, the HTTPS options and
        the model-path options.
    """
    parser = argparse.ArgumentParser()

    # Listening port and HTTPS configuration.
    parser.add_argument("-p", type=int, default=18888, help="port")
    parser.add_argument("--https", type=strtobool, default=False, help="use https")
    parser.add_argument(
        "--httpsKey", type=str, default="ssl.key", help="path for the key of https"
    )
    parser.add_argument(
        "--httpsCert", type=str, default="ssl.cert", help="path for the cert of https"
    )
    parser.add_argument(
        "--httpsSelfSigned",
        type=strtobool,
        default=True,
        help="generate self-signed certificate",
    )

    # NOTE: an "--internal" flag (rewrite the various paths to point inside
    # the mac app bundle) used to live here and is currently disabled.

    # Locations of the pretrained models.
    parser.add_argument(
        "--content_vec_500", type=str, help="path to content_vec_500 model(pytorch)"
    )
    parser.add_argument(
        "--content_vec_500_onnx", type=str, help="path to content_vec_500 model(onnx)"
    )
    parser.add_argument(
        "--content_vec_500_onnx_on",
        type=strtobool,
        default=False,
        help="use or not onnx for content_vec_500",
    )
    parser.add_argument(
        "--hubert_base", type=str, help="path to hubert_base model(pytorch)"
    )
    parser.add_argument(
        "--hubert_soft", type=str, help="path to hubert_soft model(pytorch)"
    )
    parser.add_argument(
        "--nsf_hifigan", type=str, help="path to nsf_hifigan model(pytorch)"
    )

    return parser
def printMessage(message, level=0):
pf = platform.system()
if pf == 'Windows':
if pf == "Windows":
if level == 0:
print(f"{message}")
elif level == 1:
@ -78,37 +96,38 @@ def localServer():
host="0.0.0.0",
port=int(PORT),
reload=False if hasattr(sys, "_MEIPASS") else True,
log_level="warning"
log_level="warning",
)
# Runs when this module is imported under the name "MMVCServerSIO" rather than
# executed as a script (presumably by uvicorn resolving "<module>:app_socketio"
# — confirm against the uvicorn.run call).
if __name__ == "MMVCServerSIO":
    # Build the manager once, from the typed parameter object. The legacy
    # dict-based call is dropped: the voice changer reads these values as
    # attributes, so a plain dict would fail on first use.
    voiceChangerParams = VoiceChangerParams(
        content_vec_500=args.content_vec_500,
        content_vec_500_onnx=args.content_vec_500_onnx,
        content_vec_500_onnx_on=args.content_vec_500_onnx_on,
        hubert_base=args.hubert_base,
        hubert_soft=args.hubert_soft,
        nsf_hifigan=args.nsf_hifigan,
    )
    voiceChangerManager = VoiceChangerManager.get_instance(voiceChangerParams)
    print("voiceChangerManager", voiceChangerManager)

    # The REST app is created first and then passed to the Socket.IO app.
    app_fastapi = MMVC_Rest.get_instance(voiceChangerManager)
    app_socketio = MMVC_SocketIOApp.get_instance(app_fastapi, voiceChangerManager)
if __name__ == '__mp_main__':
printMessage(f"サーバプロセスを起動しています。", level=2)
if __name__ == "__mp_main__":
printMessage("サーバプロセスを起動しています。", level=2)
if __name__ == '__main__':
if __name__ == "__main__":
mp.freeze_support()
printMessage(f"Voice Changerを起動しています。", level=2)
printMessage("Voice Changerを起動しています。", level=2)
PORT = args.p
if os.getenv("EX_PORT"):
EX_PORT = os.environ["EX_PORT"]
printMessage(
f"External_Port:{EX_PORT} Internal_Port:{PORT}", level=1)
printMessage(f"External_Port:{EX_PORT} Internal_Port:{PORT}", level=1)
else:
printMessage(f"Internal_Port:{PORT}", level=1)
@ -123,38 +142,42 @@ if __name__ == '__main__':
key_base_name = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}"
keyname = f"{key_base_name}.key"
certname = f"{key_base_name}.cert"
create_self_signed_cert(certname, keyname, certargs={"Country": "JP",
"State": "Tokyo",
"City": "Chuo-ku",
"Organization": "F",
"Org. Unit": "F"}, cert_dir=SSL_KEY_DIR)
create_self_signed_cert(
certname,
keyname,
certargs={
"Country": "JP",
"State": "Tokyo",
"City": "Chuo-ku",
"Organization": "F",
"Org. Unit": "F",
},
cert_dir=SSL_KEY_DIR,
)
key_path = os.path.join(SSL_KEY_DIR, keyname)
cert_path = os.path.join(SSL_KEY_DIR, certname)
printMessage(
f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1)
f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1
)
elif args.https and args.httpsSelfSigned == 0:
# HTTPS
key_path = args.httpsKey
cert_path = args.httpsCert
printMessage(
f"protocol: HTTPS, key:{key_path}, cert:{cert_path}", level=1)
printMessage(f"protocol: HTTPS, key:{key_path}, cert:{cert_path}", level=1)
else:
# HTTP
printMessage(f"protocol: HTTP", level=1)
printMessage(f"-- ---- -- ", level=1)
printMessage("protocol: HTTP", level=1)
printMessage("-- ---- -- ", level=1)
# アドレス表示
printMessage(
f"ブラウザで次のURLを開いてください.", level=2)
printMessage("ブラウザで次のURLを開いてください.", level=2)
if args.https == 1:
printMessage(
f"https://<IP>:<PORT>/", level=1)
printMessage("https://<IP>:<PORT>/", level=1)
else:
printMessage(
f"http://<IP>:<PORT>/", level=1)
printMessage("http://<IP>:<PORT>/", level=1)
printMessage(f"多くの場合は次のいずれかのURLにアクセスすると起動します。", level=2)
printMessage("多くの場合は次のいずれかのURLにアクセスすると起動します。", level=2)
if "EX_PORT" in locals() and "EX_IP" in locals(): # シェルスクリプト経由起動(docker)
if args.https == 1:
printMessage(f"https://localhost:{EX_PORT}/", level=1)
@ -175,7 +198,7 @@ if __name__ == '__main__':
# サーバ起動
if args.https:
# HTTPS サーバ起動
res = uvicorn.run(
uvicorn.run(
f"{os.path.basename(__file__)[:-3]}:app_socketio",
host="0.0.0.0",
port=int(PORT),
@ -188,13 +211,17 @@ if __name__ == '__main__':
p = mp.Process(name="p", target=localServer)
p.start()
try:
if sys.platform.startswith('win'):
process = subprocess.Popen([NATIVE_CLIENT_FILE_WIN, "-u", f"http://localhost:{PORT}/"])
if sys.platform.startswith("win"):
process = subprocess.Popen(
[NATIVE_CLIENT_FILE_WIN, "-u", f"http://localhost:{PORT}/"]
)
return_code = process.wait()
print("client closed.")
p.terminate()
elif sys.platform.startswith('darwin'):
process = subprocess.Popen([NATIVE_CLIENT_FILE_MAC, "-u", f"http://localhost:{PORT}/"])
elif sys.platform.startswith("darwin"):
process = subprocess.Popen(
[NATIVE_CLIENT_FILE_MAC, "-u", f"http://localhost:{PORT}/"]
)
return_code = process.wait()
print("client closed.")
p.terminate()

View File

@ -26,14 +26,6 @@ TMP_DIR = os.path.join(tmpdir.name, "tmp_dir") if hasattr(sys, "_MEIPASS") else
os.makedirs(TMP_DIR, exist_ok=True)
# modelType: ModelType = "MMVCv15"
# def getModelType() -> ModelType:
# return modelType
# def setModelType(_modelType: ModelType):
# global modelType
# modelType = _modelType
def getFrontendPath():
    """Return the directory that holds the built front-end assets.

    If ``sys`` has a ``_MEIPASS`` attribute (presumably a frozen/bundled
    build), the ``dist`` folder beneath it is returned. Otherwise the relative
    development path ``../client/demo/dist`` is returned.
    """
    if hasattr(sys, "_MEIPASS"):
        return os.path.join(sys._MEIPASS, "dist")
    return "../client/demo/dist"

View File

@ -1,7 +1,8 @@
from fastapi import FastAPI, Request, Response
from fastapi import FastAPI, Request, Response, HTTPException
from fastapi.routing import APIRoute
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.exceptions import RequestValidationError
from typing import Callable
from voice_changer.VoiceChangerManager import VoiceChangerManager
@ -18,7 +19,7 @@ class ValidationErrorLoggingRoute(APIRoute):
async def custom_route_handler(request: Request) -> Response:
try:
return await original_route_handler(request)
except Exception as exc:
except RequestValidationError as exc:
print("Exception", request.url, str(exc))
body = await request.body()
detail = {"errors": exc.errors(), "body": body.decode()}
@ -28,10 +29,11 @@ class ValidationErrorLoggingRoute(APIRoute):
class MMVC_Rest:
_instance = None
@classmethod
def get_instance(cls, voiceChangerManager: VoiceChangerManager):
if not hasattr(cls, "_instance"):
if cls._instance is None:
app_fastapi = FastAPI()
app_fastapi.router.route_class = ValidationErrorLoggingRoute
app_fastapi.add_middleware(
@ -43,15 +45,25 @@ class MMVC_Rest:
)
app_fastapi.mount(
"/front", StaticFiles(directory=f'{getFrontendPath()}', html=True), name="static")
"/front",
StaticFiles(directory=f"{getFrontendPath()}", html=True),
name="static",
)
app_fastapi.mount(
"/trainer", StaticFiles(directory=f'{getFrontendPath()}', html=True), name="static")
"/trainer",
StaticFiles(directory=f"{getFrontendPath()}", html=True),
name="static",
)
app_fastapi.mount(
"/recorder", StaticFiles(directory=f'{getFrontendPath()}', html=True), name="static")
"/recorder",
StaticFiles(directory=f"{getFrontendPath()}", html=True),
name="static",
)
app_fastapi.mount(
"/tmp", StaticFiles(directory=f'{TMP_DIR}'), name="static")
"/tmp", StaticFiles(directory=f"{TMP_DIR}"), name="static"
)
restHello = MMVC_Rest_Hello()
app_fastapi.include_router(restHello.router)

View File

@ -4,12 +4,13 @@ from typing import Union
from fastapi import APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from fastapi import HTTPException, FastAPI, UploadFile, File, Form
from fastapi import UploadFile, File, Form
from restapi.mods.FileUploader import upload_file, concat_file_chunks
from voice_changer.VoiceChangerManager import VoiceChangerManager
from const import MODEL_DIR, UPLOAD_DIR, ModelType
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)
@ -19,12 +20,22 @@ class MMVC_Rest_Fileuploader:
self.voiceChangerManager = voiceChangerManager
self.router = APIRouter()
self.router.add_api_route("/info", self.get_info, methods=["GET"])
self.router.add_api_route("/upload_file", self.post_upload_file, methods=["POST"])
self.router.add_api_route("/concat_uploaded_file", self.post_concat_uploaded_file, methods=["POST"])
self.router.add_api_route("/update_settings", self.post_update_settings, methods=["POST"])
self.router.add_api_route(
"/upload_file", self.post_upload_file, methods=["POST"]
)
self.router.add_api_route(
"/concat_uploaded_file", self.post_concat_uploaded_file, methods=["POST"]
)
self.router.add_api_route(
"/update_settings", self.post_update_settings, methods=["POST"]
)
self.router.add_api_route("/load_model", self.post_load_model, methods=["POST"])
self.router.add_api_route("/load_model_for_train", self.post_load_model_for_train, methods=["POST"])
self.router.add_api_route("/extract_voices", self.post_extract_voices, methods=["POST"])
self.router.add_api_route(
"/load_model_for_train", self.post_load_model_for_train, methods=["POST"]
)
self.router.add_api_route(
"/extract_voices", self.post_extract_voices, methods=["POST"]
)
self.router.add_api_route("/model_type", self.post_model_type, methods=["POST"])
self.router.add_api_route("/model_type", self.get_model_type, methods=["GET"])
self.router.add_api_route("/onnx", self.get_onnx, methods=["GET"])
@ -34,9 +45,13 @@ class MMVC_Rest_Fileuploader:
json_compatible_item_data = jsonable_encoder(res)
return JSONResponse(content=json_compatible_item_data)
def post_concat_uploaded_file(
    self, filename: str = Form(...), filenameChunkNum: int = Form(...)
):
    """Join the uploaded chunks of ``filename`` and report the result as JSON.

    Args:
        filename: form field naming the uploaded file whose chunks are joined.
        filenameChunkNum: form field giving the number of chunks.

    Returns:
        JSONResponse wrapping whatever ``concat_file_chunks`` returned.
    """
    # The slot is fixed to 0 here; chunks are read from and written back to
    # UPLOAD_DIR. The concatenation runs exactly once per request.
    slot = 0
    res = concat_file_chunks(slot, UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR)
    return JSONResponse(content=jsonable_encoder(res))
@ -45,7 +60,9 @@ class MMVC_Rest_Fileuploader:
json_compatible_item_data = jsonable_encoder(info)
return JSONResponse(content=json_compatible_item_data)
def post_update_settings(self, key: str = Form(...), val: Union[int, str, float] = Form(...)):
def post_update_settings(
self, key: str = Form(...), val: Union[int, str, float] = Form(...)
):
print("post_update_settings", key, val)
info = self.voiceChangerManager.update_settings(key, val)
json_compatible_item_data = jsonable_encoder(info)
@ -63,7 +80,6 @@ class MMVC_Rest_Fileuploader:
isHalf: bool = Form(...),
params: str = Form(...),
):
props = {
"slot": slot,
"isHalf": isHalf,
@ -73,9 +89,9 @@ class MMVC_Rest_Fileuploader:
"onnxModelFilename": onnxModelFilename,
"clusterTorchModelFilename": clusterTorchModelFilename,
"featureFilename": featureFilename,
"indexFilename": indexFilename
"indexFilename": indexFilename,
},
"params": params
"params": params,
}
# Change Filepath
for key, val in props["files"].items():
@ -103,9 +119,11 @@ class MMVC_Rest_Fileuploader:
modelDFilenameChunkNum: int = Form(...),
):
modelGFilePath = concat_file_chunks(
UPLOAD_DIR, modelGFilename, modelGFilenameChunkNum, MODEL_DIR)
UPLOAD_DIR, modelGFilename, modelGFilenameChunkNum, MODEL_DIR
)
modelDFilePath = concat_file_chunks(
UPLOAD_DIR, modelDFilename, modelDFilenameChunkNum, MODEL_DIR)
UPLOAD_DIR, modelDFilename, modelDFilenameChunkNum, MODEL_DIR
)
return {"File saved": f"{modelGFilePath}, {modelDFilePath}"}
def post_extract_voices(
@ -114,7 +132,8 @@ class MMVC_Rest_Fileuploader:
zipFileChunkNum: int = Form(...),
):
zipFilePath = concat_file_chunks(
UPLOAD_DIR, zipFilename, zipFileChunkNum, UPLOAD_DIR)
UPLOAD_DIR, zipFilename, zipFileChunkNum, UPLOAD_DIR
)
shutil.unpack_archive(zipFilePath, "MMVC_Trainer/dataset/textful/")
return {"Zip file unpacked": f"{zipFilePath}"}

View File

@ -1,6 +1,6 @@
from fastapi import APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
class MMVC_Rest_Hello:
def __init__(self):
self.router = APIRouter()
@ -8,6 +8,3 @@ class MMVC_Rest_Hello:
def hello(self):
return {"result": "Index"}

View File

@ -31,24 +31,24 @@ class MMVC_Rest_VoiceChanger:
buffer = voice.buffer
wav = base64.b64decode(buffer)
if wav == 0:
samplerate, data = read("dummy.wav")
unpackedData = data
else:
unpackedData = np.array(struct.unpack(
'<%sh' % (len(wav) // struct.calcsize('<h')), wav))
# write("logs/received_data.wav", 24000,
# unpackedData.astype(np.int16))
# if wav == 0:
# samplerate, data = read("dummy.wav")
# unpackedData = data
# else:
# unpackedData = np.array(
# struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)
# )
unpackedData = np.array(
struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)
)
self.tlock.acquire()
changedVoice = self.voiceChangerManager.changeVoice(unpackedData)
self.tlock.release()
changedVoiceBase64 = base64.b64encode(changedVoice[0]).decode('utf-8')
data = {
"timestamp": timestamp,
"changedVoiceBase64": changedVoiceBase64
}
changedVoiceBase64 = base64.b64encode(changedVoice[0]).decode("utf-8")
data = {"timestamp": timestamp, "changedVoiceBase64": changedVoiceBase64}
json_compatible_item_data = jsonable_encoder(data)
return JSONResponse(content=json_compatible_item_data)

View File

@ -1,40 +0,0 @@
import os
import numpy as np
import pylab
import librosa
import librosa.display
import pyworld as pw
class IOAnalyzer:
    """Save spectrogram images of an audio file overlaid with F0 contours.

    Two images are produced per analysis: one with the contour estimated by
    ``pw.dio`` and one with the contour estimated by ``pw.harvest``.
    """

    # Hop size shared by the STFT and by the spectrogram display.
    _HOP_LENGTH = 128

    def _refine_f0(self, y, sr, raw_f0, time):
        """Refine ``raw_f0`` with ``pw.stonemask`` and rebuild the time axis.

        Returns:
            (f0, time): ``time`` is an evenly spaced axis from 0 to
            ``len(y) / sr`` with one entry per F0 frame.
        """
        f0 = pw.stonemask(y, raw_f0, time, sr)
        time = np.linspace(0, y.shape[0] / sr, len(time))
        return f0, time

    def _get_f0_dio(self, y, sr):
        """Return ``(f0, time)`` estimated with ``pw.dio`` (frame_period=5)."""
        _f0, time = pw.dio(y, sr, frame_period=5)
        return self._refine_f0(y, sr, _f0, time)

    def _get_f0_harvest(self, y, sr):
        """Return ``(f0, time)`` estimated with ``pw.harvest`` (frame_period=5)."""
        _f0, time = pw.harvest(y, sr, frame_period=5)
        return self._refine_f0(y, sr, _f0, time)

    def _save_overlay(self, spec, samplingRate, times, f0, imageFile):
        """Draw ``spec`` with the ``f0`` contour on top and save to ``imageFile``."""
        pylab.close()  # drop any figure left over from earlier drawing
        try:
            librosa.display.specshow(
                spec,
                sr=samplingRate,
                hop_length=self._HOP_LENGTH,
                x_axis="time",
                y_axis="log",
            )
            pylab.plot(times, f0, label="f0", color=(0, 1, 1, 0.6), linewidth=3)
            pylab.savefig(imageFile)
        finally:
            # Always release the figure, including the last one drawn.
            pylab.close()

    def analyze(self, inputDataFile: str, dioImageFile: str, harvestImageFile: str, samplingRate: int):
        """Analyze ``inputDataFile`` and write the DIO and Harvest overlay images.

        Args:
            inputDataFile: path of the audio file to load.
            dioImageFile: output path for the DIO overlay image.
            harvestImageFile: output path for the Harvest overlay image.
            samplingRate: rate the audio is loaded at and analyzed with.
        """
        # Pass the rate by keyword: newer librosa releases accept it only so.
        y, _ = librosa.load(inputDataFile, sr=samplingRate)
        y = y.astype(np.float64)
        spec = librosa.amplitude_to_db(
            np.abs(
                librosa.stft(
                    y, n_fft=2048, win_length=2048, hop_length=self._HOP_LENGTH
                )
            ),
            ref=np.max,
        )

        # Keep each estimator's own time axis so every contour is plotted
        # against the frames it was computed on.
        f0_dio, times_dio = self._get_f0_dio(y, sr=samplingRate)
        f0_harvest, times_harvest = self._get_f0_harvest(y, sr=samplingRate)

        self._save_overlay(spec, samplingRate, times_dio, f0_dio, dioImageFile)
        self._save_overlay(
            spec, samplingRate, times_harvest, f0_harvest, harvestImageFile
        )

View File

@ -30,13 +30,15 @@ class ModelWrapper:
self.embChannels = metadata["embChannels"]
self.modelType = metadata["modelType"]
self.deprecated = False
print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
self.embedder = metadata["embedder"] if "embedder" in metadata else "hubert_base"
print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}, embedder:{self.embedder}")
except:
self.samplingRate = 48000
self.f0 = True
self.embChannels = 256
self.modelType = 0
self.deprecated = True
self.embedder = "hubert_base"
print(f"[Voice Changer] ############## !!!! CAUTION !!!! ####################")
print(f"[Voice Changer] This onnx's version is depricated. Please regenerate onnxfile. Fallback to default")
print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
@ -57,6 +59,9 @@ class ModelWrapper:
def getDeprecated(self):
    """Return the ``deprecated`` flag stored on this wrapper."""
    flag = self.deprecated
    return flag
def getEmbedder(self):
    """Return the embedder name stored on this wrapper."""
    name = self.embedder
    return name
def set_providers(self, providers, provider_options=None):
    """Forward an execution-provider selection to the wrapped ONNX session.

    Args:
        providers: provider names passed through to the session.
        provider_options: per-provider option dicts; when omitted, a fresh
            ``[{}]`` is used.
    """
    if provider_options is None:
        # Build the default per call. A literal ``[{}]`` in the signature is
        # one shared object that any callee mutation would leak into later calls.
        provider_options = [{}]
    self.onnx_session.set_providers(
        providers=providers, provider_options=provider_options
    )

View File

@ -4,11 +4,12 @@ import json
import resampy
from voice_changer.RVC.ModelWrapper import ModelWrapper
from Exceptions import NoModeLoadedException
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
# avoiding parse arg error in RVC
sys.argv = ["MMVCServerSIO.py"]
if sys.platform.startswith('darwin'):
if sys.platform.startswith("darwin"):
baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
if len(baseDir) != 1:
print("baseDir should be only one ", baseDir)
@ -24,6 +25,7 @@ from functools import reduce
import numpy as np
import torch
import onnxruntime
# onnxruntime.set_default_logger_severity(3)
from const import HUBERT_ONNX_MODEL_PATH, TMP_DIR
@ -36,11 +38,17 @@ from .models import SynthesizerTrnMsNSFsidNono as SynthesizerTrnMsNSFsidNono_web
from .const import RVC_MODEL_TYPE_RVC, RVC_MODEL_TYPE_WEBUI
from fairseq import checkpoint_utils
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
providers = [
"OpenVINOExecutionProvider",
"CUDAExecutionProvider",
"DmlExecutionProvider",
"CPUExecutionProvider",
]
@dataclass
class ModelSlot():
class ModelSlot:
pyTorchModelFile: str = ""
onnxModelFile: str = ""
featureFile: str = ""
@ -51,13 +59,11 @@ class ModelSlot():
f0: bool = True
embChannels: int = 256
deprecated: bool = False
# samplingRateOnnx: int = -1
# f0Onnx: bool = True
# embChannelsOnnx: int = 256
embedder: str = "hubert_base" # "hubert_base", "contentvec", "distilhubert"
@dataclass
class RVCSettings():
class RVCSettings:
gpu: int = 0
dstId: int = 0
@ -72,9 +78,7 @@ class RVCSettings():
onnxModelFile: str = ""
configFile: str = ""
modelSlots: list[ModelSlot] = field(
default_factory=lambda: [
ModelSlot(), ModelSlot(), ModelSlot()
]
default_factory=lambda: [ModelSlot(), ModelSlot(), ModelSlot()]
)
indexRatio: float = 0
rvcQuality: int = 0
@ -82,23 +86,28 @@ class RVCSettings():
modelSamplingRate: int = 48000
modelSlotIndex: int = -1
speakers: dict[str, int] = field(
default_factory=lambda: {}
)
speakers: dict[str, int] = field(default_factory=lambda: {})
# ↓mutableな物だけ列挙
intData = ["gpu", "dstId", "tran", "extraConvertSize", "rvcQuality", "modelSamplingRate", "silenceFront", "modelSlotIndex"]
intData = [
"gpu",
"dstId",
"tran",
"extraConvertSize",
"rvcQuality",
"modelSamplingRate",
"silenceFront",
"modelSlotIndex",
]
floatData = ["silentThreshold", "indexRatio"]
strData = ["framework", "f0Detector"]
class RVC:
def __init__(self, params):
def __init__(self, params: VoiceChangerParams):
self.initialLoad = True
self.settings = RVCSettings()
self.inferenceing: bool = False
self.net_g = None
self.onnx_session = None
self.feature_file = None
@ -108,7 +117,10 @@ class RVC:
self.prevVol = 0
self.params = params
self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
self.mps_enabled: bool = (
getattr(torch.backends, "mps", None) is not None
and torch.backends.mps.is_available()
)
self.currentSlot = -1
print("RVC initialization: ", params)
print("mps: ", self.mps_enabled)
@ -120,26 +132,41 @@ class RVC:
params = json.loads(params_str)
newSlot = asdict(self.settings.modelSlots[tmp_slot])
newSlot.update({
"pyTorchModelFile": props["files"]["pyTorchModelFilename"],
"onnxModelFile": props["files"]["onnxModelFilename"],
"featureFile": props["files"]["featureFilename"],
"indexFile": props["files"]["indexFilename"],
"defaultTrans": params["trans"]
})
newSlot.update(
{
"pyTorchModelFile": props["files"]["pyTorchModelFilename"],
"onnxModelFile": props["files"]["onnxModelFilename"],
"featureFile": props["files"]["featureFilename"],
"indexFile": props["files"]["indexFilename"],
"defaultTrans": params["trans"],
}
)
self.settings.modelSlots[tmp_slot] = ModelSlot(**newSlot)
print("[Voice Changer] RVC loading... slot:", tmp_slot)
# Load metadata
if self.settings.modelSlots[tmp_slot].pyTorchModelFile != None and self.settings.modelSlots[tmp_slot].pyTorchModelFile != "":
self._setInfoByPytorch(tmp_slot, self.settings.modelSlots[tmp_slot].pyTorchModelFile)
if self.settings.modelSlots[tmp_slot].onnxModelFile != None and self.settings.modelSlots[tmp_slot].onnxModelFile != "":
self._setInfoByONNX(tmp_slot, self.settings.modelSlots[tmp_slot].onnxModelFile)
if (
self.settings.modelSlots[tmp_slot].pyTorchModelFile != None
and self.settings.modelSlots[tmp_slot].pyTorchModelFile != ""
):
self._setInfoByPytorch(
tmp_slot, self.settings.modelSlots[tmp_slot].pyTorchModelFile
)
if (
self.settings.modelSlots[tmp_slot].onnxModelFile != None
and self.settings.modelSlots[tmp_slot].onnxModelFile != ""
):
self._setInfoByONNX(
tmp_slot, self.settings.modelSlots[tmp_slot].onnxModelFile
)
try:
hubert_path = self.params["hubert_base"]
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([hubert_path], suffix="",)
hubert_path = self.params.hubert_base
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
[hubert_path],
suffix="",
)
model = models[0]
model.eval()
if self.is_half:
@ -164,13 +191,21 @@ class RVC:
if config_len == 18:
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_RVC
self.settings.modelSlots[slot].embChannels = 256
self.settings.modelSlots[slot].embedder = "hubert_base"
else:
self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI
self.settings.modelSlots[slot].embChannels = cpt["config"][17]
self.settings.modelSlots[slot].embedder = cpt["embedder_name"]
if self.settings.modelSlots[slot].embedder.endswith("768"):
self.settings.modelSlots[slot].embedder = self.settings.modelSlots[
slot
].embedder[:-3]
print("embedder....", self.settings.modelSlots[slot].embedder)
self.settings.modelSlots[slot].f0 = True if cpt["f0"] == 1 else False
self.settings.modelSlots[slot].samplingRate = cpt["config"][-1]
self.settings.modelSamplingRate = cpt["config"][-1]
# self.settings.modelSamplingRate = cpt["config"][-1]
def _setInfoByONNX(self, slot, file):
tmp_onnx_session = ModelWrapper(file)
@ -179,6 +214,8 @@ class RVC:
self.settings.modelSlots[slot].f0 = tmp_onnx_session.getF0()
self.settings.modelSlots[slot].samplingRate = tmp_onnx_session.getSamplingRate()
self.settings.modelSlots[slot].deprecated = tmp_onnx_session.getDeprecated()
self.settings.modelSlots[slot].embedder = tmp_onnx_session.getEmbedder()
print("embedder....", self.settings.modelSlots[slot].embedder)
def prepareModel(self, slot: int):
print("[Voice Changer] Prepare Model of slot:", slot)
@ -188,7 +225,7 @@ class RVC:
if pyTorchModelFile != None and pyTorchModelFile != "":
print("[Voice Changer] Loading Pytorch Model...")
cpt = torch.load(pyTorchModelFile, map_location="cpu")
'''
"""
(1) オリジナルとrvc-webuiのモデル判定 config全体の形状
ーマル256
[1025, 32, 192, 192, 768, 2, 6, 3, 0, '1', [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 6, 2, 2, 2], 512, [16, 16, 4, 4, 4], 109, 256, 48000]
@ -200,32 +237,32 @@ class RVC:
0: ピッチレス, 1:ノーマル
(2-2) rvc-webuiの(256 or 768) x (ーマルor pitchレス)判定 256, or 768 は17番目の要素で判定, ーマルor pitchレスはckp["f0"]で判定
'''
# config_len = len(cpt["config"])
# if config_len == 18:
# self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_RVC
# self.settings.modelSlots[slot].embChannels = 256
# else:
# self.settings.modelSlots[slot].modelType = RVC_MODEL_TYPE_WEBUI
# self.settings.modelSlots[slot].embChannels = cpt["config"][17]
# self.settings.modelSlots[slot].f0 = True if cpt["f0"] == 1 else False
# self.settings.modelSlots[slot].samplingRate = cpt["config"][-1]
"""
# self.settings.modelSamplingRate = cpt["config"][-1]
if self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_RVC and self.settings.modelSlots[slot].f0 == True:
if (
self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_RVC
and self.settings.modelSlots[slot].f0 == True
):
net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=self.is_half)
elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_RVC and self.settings.modelSlots[slot].f0 == False:
elif (
self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_RVC
and self.settings.modelSlots[slot].f0 == False
):
net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI and self.settings.modelSlots[slot].f0 == True:
net_g = SynthesizerTrnMsNSFsid_webui(**cpt["params"], is_half=self.is_half)
elif self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI and self.settings.modelSlots[slot].f0 == False:
######################
# TBD
######################
print("webui non-f0 is not supported yet")
net_g = SynthesizerTrnMsNSFsidNono_webui(**cpt["params"], is_half=self.is_half)
elif (
self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI
and self.settings.modelSlots[slot].f0 == True
):
net_g = SynthesizerTrnMsNSFsid_webui(
**cpt["params"], is_half=self.is_half
)
elif (
self.settings.modelSlots[slot].modelType == RVC_MODEL_TYPE_WEBUI
and self.settings.modelSlots[slot].f0 == False
):
net_g = SynthesizerTrnMsNSFsidNono_webui(
**cpt["params"], is_half=self.is_half
)
else:
print("unknwon")
@ -259,11 +296,15 @@ class RVC:
self.next_trans = self.settings.modelSlots[slot].defaultTrans
self.next_samplingRate = self.settings.modelSlots[slot].samplingRate
self.next_framework = "ONNX" if self.next_onnx_session != None else "PyTorch"
print("[Voice Changer] Prepare done.",)
print(
"[Voice Changer] Prepare done.",
)
return self.get_info()
def switchModel(self):
print("[Voice Changer] Switching model..",)
print(
"[Voice Changer] Switching model..",
)
# del self.net_g
# del self.onnx_session
self.net_g = self.next_net_g
@ -275,17 +316,23 @@ class RVC:
self.settings.modelSamplingRate = self.next_samplingRate
self.next_net_g = None
self.next_onnx_session = None
print("[Voice Changer] Switching model..done",)
print(
"[Voice Changer] Switching model..done",
)
def update_settings(self, key: str, val: any):
if key == "onnxExecutionProvider" and self.onnx_session != None:
if val == "CUDAExecutionProvider":
if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
self.settings.gpu = 0
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
provider_options = [{"device_id": self.settings.gpu}]
self.onnx_session.set_providers(
providers=[val], provider_options=provider_options
)
if hasattr(self, "hubert_onnx"):
self.hubert_onnx.set_providers(providers=[val], provider_options=provider_options)
self.hubert_onnx.set_providers(
providers=[val], provider_options=provider_options
)
else:
self.onnx_session.set_providers(providers=[val])
if hasattr(self, "hubert_onnx"):
@ -294,12 +341,20 @@ class RVC:
print("Onnx is not enabled. Please load model.")
return False
elif key in self.settings.intData:
if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session != None:
if (
key == "gpu"
and val >= 0
and val < self.gpu_num
and self.onnx_session != None
):
providers = self.onnx_session.get_providers()
print("Providers:", providers)
if "CUDAExecutionProvider" in providers:
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
provider_options = [{"device_id": self.settings.gpu}]
self.onnx_session.set_providers(
providers=["CUDAExecutionProvider"],
provider_options=provider_options,
)
if key == "modelSlotIndex":
# self.switchModel(int(val))
val = int(val) % 1000 # Quick hack for same slot is selected
@ -318,7 +373,9 @@ class RVC:
def get_info(self):
data = asdict(self.settings)
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
data["onnxExecutionProviders"] = (
self.onnx_session.get_providers() if self.onnx_session != None else []
)
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
for f in files:
if data[f] != None and os.path.exists(data[f]):
@ -331,22 +388,30 @@ class RVC:
def get_processing_sampling_rate(self):
    """Return ``modelSamplingRate`` from the current settings."""
    settings = self.settings
    return settings.modelSamplingRate
def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
def generate_input(
self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0
):
newData = newData.astype(np.float32) / 32768.0
if hasattr(self, "audio_buffer"):
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) # 過去のデータに連結
self.audio_buffer = np.concatenate(
[self.audio_buffer, newData], 0
) # 過去のデータに連結
else:
self.audio_buffer = newData
convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
convertSize = (
inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
)
if convertSize % 128 != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。
convertSize = convertSize + (128 - (convertSize % 128))
self.audio_buffer = self.audio_buffer[-1 * convertSize:] # 変換対象の部分だけ抽出
self.audio_buffer = self.audio_buffer[-1 * convertSize :] # 変換対象の部分だけ抽出
crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)] # 出力部分だけ切り出して音量を確認。(solaとの関係性について、現状は無考慮)
crop = self.audio_buffer[
-1 * (inputSize + crossfadeSize) : -1 * (crossfadeSize)
] # 出力部分だけ切り出して音量を確認。(solaとの関係性について、現状は無考慮)
rms = np.sqrt(np.square(crop).mean(axis=0))
vol = max(rms, self.prevVol * 0.0)
self.prevVol = vol
@ -390,15 +455,34 @@ class RVC:
f0 = self.settings.modelSlots[self.currentSlot].f0
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
audio_out = vc.pipeline(self.hubert_model, self.onnx_session, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, embChannels=embChannels)
audio_out = vc.pipeline(
self.hubert_model,
self.onnx_session,
sid,
audio,
times,
f0_up_key,
f0_method,
file_index,
file_big_npy,
index_rate,
if_f0,
f0_file=f0_file,
silence_front=self.settings.extraConvertSize
/ self.settings.modelSamplingRate,
embChannels=embChannels,
)
result = audio_out * np.sqrt(vol)
return result
def _pyTorch_inference(self, data):
if hasattr(self, "net_g") == False or self.net_g == None:
print("[Voice Changer] No pyTorch session.", hasattr(self, "net_g"), self.net_g)
print(
"[Voice Changer] No pyTorch session.",
hasattr(self, "net_g"),
self.net_g,
)
raise NoModeLoadedException("pytorch")
if self.settings.gpu < 0 or (self.gpu_num == 0 and self.mps_enabled == False):
@ -436,8 +520,23 @@ class RVC:
f0_file = None
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file, silence_front=self.settings.extraConvertSize / self.settings.modelSamplingRate, embChannels=embChannels)
audio_out = vc.pipeline(
self.hubert_model,
self.net_g,
sid,
audio,
times,
f0_up_key,
f0_method,
file_index,
file_big_npy,
index_rate,
if_f0,
f0_file=f0_file,
silence_front=self.settings.extraConvertSize
/ self.settings.modelSamplingRate,
embChannels=embChannels,
)
result = audio_out * np.sqrt(vol)
@ -445,7 +544,11 @@ class RVC:
def inference(self, data):
if self.settings.modelSlotIndex < 0:
print("[Voice Changer] wait for loading model...", self.settings.modelSlotIndex, self.currentSlot)
print(
"[Voice Changer] wait for loading model...",
self.settings.modelSlotIndex,
self.currentSlot,
)
raise NoModeLoadedException("model_common")
if self.currentSlot != self.settings.modelSlotIndex:
@ -482,7 +585,9 @@ class RVC:
print("[Voice Changer] export2onnx, No pyTorch session.")
return {"status": "ng", "path": f""}
pyTorchModelFile = self.settings.modelSlots[self.settings.modelSlotIndex].pyTorchModelFile # inference前にexportできるようにcurrentSlotではなくslot
pyTorchModelFile = self.settings.modelSlots[
self.settings.modelSlotIndex
].pyTorchModelFile # inference前にexportできるようにcurrentSlotではなくslot
if pyTorchModelFile == None:
print("[Voice Changer] export2onnx, No pyTorch filepath.")
@ -490,23 +595,45 @@ class RVC:
import voice_changer.RVC.export2onnx as onnxExporter
output_file = os.path.splitext(os.path.basename(pyTorchModelFile))[0] + ".onnx"
output_file_simple = os.path.splitext(os.path.basename(pyTorchModelFile))[0] + "_simple.onnx"
output_file_simple = (
os.path.splitext(os.path.basename(pyTorchModelFile))[0] + "_simple.onnx"
)
output_path = os.path.join(TMP_DIR, output_file)
output_path_simple = os.path.join(TMP_DIR, output_file_simple)
print("embChannels", self.settings.modelSlots[self.settings.modelSlotIndex].embChannels)
print(
"embChannels",
self.settings.modelSlots[self.settings.modelSlotIndex].embChannels,
)
metadata = {
"application": "VC_CLIENT",
"version": "1",
"modelType": self.settings.modelSlots[self.settings.modelSlotIndex].modelType,
"samplingRate": self.settings.modelSlots[self.settings.modelSlotIndex].samplingRate,
"modelType": self.settings.modelSlots[
self.settings.modelSlotIndex
].modelType,
"samplingRate": self.settings.modelSlots[
self.settings.modelSlotIndex
].samplingRate,
"f0": self.settings.modelSlots[self.settings.modelSlotIndex].f0,
"embChannels": self.settings.modelSlots[self.settings.modelSlotIndex].embChannels,
"embChannels": self.settings.modelSlots[
self.settings.modelSlotIndex
].embChannels,
"embedder": self.settings.modelSlots[self.settings.modelSlotIndex].embedder,
}
if torch.cuda.device_count() > 0:
onnxExporter.export2onnx(pyTorchModelFile, output_path, output_path_simple, True, metadata)
onnxExporter.export2onnx(
pyTorchModelFile, output_path, output_path_simple, True, metadata
)
else:
print("[Voice Changer] Warning!!! onnx export with float32. maybe size is doubled.")
onnxExporter.export2onnx(pyTorchModelFile, output_path, output_path_simple, False, metadata)
print(
"[Voice Changer] Warning!!! onnx export with float32. maybe size is doubled."
)
onnxExporter.export2onnx(
pyTorchModelFile, output_path, output_path_simple, False, metadata
)
return {"status": "ok", "path": f"/tmp/{output_file_simple}", "filename": output_file_simple}
return {
"status": "ok",
"path": f"/tmp/{output_file_simple}",
"filename": output_file_simple,
}

View File

@ -9,14 +9,18 @@ import resampy
from voice_changer.IORecorder import IORecorder
# from voice_changer.IOAnalyzer import IOAnalyzer
from voice_changer.utils.Timer import Timer
from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut
import time
from Exceptions import NoModeLoadedException, ONNXInputArgumentException
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
providers = [
"OpenVINOExecutionProvider",
"CUDAExecutionProvider",
"DmlExecutionProvider",
"CPUExecutionProvider",
]
STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
@ -25,7 +29,7 @@ STREAM_ANALYZE_FILE_HARVEST = os.path.join(TMP_DIR, "analyze-harvest.png")
@dataclass
class VoiceChangerSettings():
class VoiceChangerSettings:
inputSampleRate: int = 48000 # 48000 or 24000
crossFadeOffsetRate: float = 0.1
@ -41,16 +45,14 @@ class VoiceChangerSettings():
floatData: list[str] = field(
default_factory=lambda: ["crossFadeOffsetRate", "crossFadeEndRate"]
)
strData: list[str] = field(
default_factory=lambda: []
)
strData: list[str] = field(default_factory=lambda: [])
class VoiceChanger():
class VoiceChanger:
settings: VoiceChangerSettings
voiceChanger: VoiceChangerModel
def __init__(self, params):
def __init__(self, params: VoiceChangerParams):
# 初期化
self.settings = VoiceChangerSettings()
self.onnx_session = None
@ -64,9 +66,14 @@ class VoiceChanger():
self.params = params
self.gpu_num = torch.cuda.device_count()
self.prev_audio = np.zeros(4096)
self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
self.mps_enabled: bool = (
getattr(torch.backends, "mps", None) is not None
and torch.backends.mps.is_available()
)
print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
print(
f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})"
)
def switchModelType(self, modelType: ModelType):
if hasattr(self, "voiceChanger") and self.voiceChanger != None:
@ -77,24 +84,31 @@ class VoiceChanger():
self.modelType = modelType
if self.modelType == "MMVCv15":
from voice_changer.MMVCv15.MMVCv15 import MMVCv15
self.voiceChanger = MMVCv15() # type: ignore
elif self.modelType == "MMVCv13":
from voice_changer.MMVCv13.MMVCv13 import MMVCv13
self.voiceChanger = MMVCv13()
elif self.modelType == "so-vits-svc-40v2":
from voice_changer.SoVitsSvc40v2.SoVitsSvc40v2 import SoVitsSvc40v2
self.voiceChanger = SoVitsSvc40v2(self.params)
elif self.modelType == "so-vits-svc-40" or self.modelType == "so-vits-svc-40_c":
from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40
self.voiceChanger = SoVitsSvc40(self.params)
elif self.modelType == "DDSP-SVC":
from voice_changer.DDSP_SVC.DDSP_SVC import DDSP_SVC
self.voiceChanger = DDSP_SVC(self.params)
elif self.modelType == "RVC":
from voice_changer.RVC.RVC import RVC
self.voiceChanger = RVC(self.params)
else:
from voice_changer.MMVCv13.MMVCv13 import MMVCv13
self.voiceChanger = MMVCv13()
return {"status": "OK", "msg": "vc is switched."}
@ -109,7 +123,6 @@ class VoiceChanger():
self,
props,
):
try:
return self.voiceChanger.loadModel(props)
except Exception as e:
@ -143,7 +156,9 @@ class VoiceChanger():
if key == "recordIO" and val == 1:
if hasattr(self, "ioRecorder"):
self.ioRecorder.close()
self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate)
self.ioRecorder = IORecorder(
STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate
)
if key == "recordIO" and val == 0:
if hasattr(self, "ioRecorder"):
self.ioRecorder.close()
@ -174,12 +189,12 @@ class VoiceChanger():
return self.get_info()
def _generate_strength(self, crossfadeSize: int):
if self.crossfadeSize != crossfadeSize or \
self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or \
self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or \
self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
if (
self.crossfadeSize != crossfadeSize
or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate
or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate
or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize
):
self.crossfadeSize = crossfadeSize
self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
@ -193,30 +208,54 @@ class VoiceChanger():
np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2
self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength,
np.zeros(crossfadeSize - cf_offset - len(np_prev_strength))])
self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(crossfadeSize - cf_offset - len(np_cur_strength))])
self.np_prev_strength = np.concatenate(
[
np.ones(cf_offset),
np_prev_strength,
np.zeros(crossfadeSize - cf_offset - len(np_prev_strength)),
]
)
self.np_cur_strength = np.concatenate(
[
np.zeros(cf_offset),
np_cur_strength,
np.ones(crossfadeSize - cf_offset - len(np_cur_strength)),
]
)
print(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")
print(
f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}"
)
# ひとつ前の結果とサイズが変わるため、記録は消去する。
if hasattr(self, 'np_prev_audio1') == True:
if hasattr(self, "np_prev_audio1") == True:
delattr(self, "np_prev_audio1")
if hasattr(self, "sola_buffer"):
del self.sola_buffer
# receivedData: tuple of short
def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
def on_request(
self, receivedData: AudioInOut
) -> tuple[AudioInOut, list[Union[int, float]]]:
return self.on_request_sola(receivedData)
def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
def on_request_sola(
self, receivedData: AudioInOut
) -> tuple[AudioInOut, list[Union[int, float]]]:
try:
processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
# 前処理
with Timer("pre-process") as t:
if self.settings.inputSampleRate != processing_sampling_rate:
newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate))
newData = cast(
AudioInOut,
resampy.resample(
receivedData,
self.settings.inputSampleRate,
processing_sampling_rate,
),
)
else:
newData = receivedData
@ -226,7 +265,9 @@ class VoiceChanger():
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
self._generate_strength(crossfade_frame)
data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
data = self.voiceChanger.generate_input(
newData, block_frame, crossfade_frame, sola_search_frame
)
preprocess_time = t.secs
# 変換処理
@ -234,15 +275,28 @@ class VoiceChanger():
# Inference
audio = self.voiceChanger.inference(data)
if hasattr(self, 'sola_buffer') == True:
if hasattr(self, "sola_buffer") == True:
np.set_printoptions(threshold=10000)
audio = audio[-sola_search_frame - crossfade_frame - block_frame:]
audio = audio[-sola_search_frame - crossfade_frame - block_frame :]
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
cor_nom = np.convolve(audio[: crossfade_frame + sola_search_frame], np.flip(self.sola_buffer), 'valid')
cor_den = np.sqrt(np.convolve(audio[: crossfade_frame + sola_search_frame] ** 2, np.ones(crossfade_frame), 'valid') + 1e-3)
cor_nom = np.convolve(
audio[: crossfade_frame + sola_search_frame],
np.flip(self.sola_buffer),
"valid",
)
cor_den = np.sqrt(
np.convolve(
audio[: crossfade_frame + sola_search_frame] ** 2,
np.ones(crossfade_frame),
"valid",
)
+ 1e-3
)
sola_offset = np.argmax(cor_nom / cor_den)
output_wav = audio[sola_offset: sola_offset + block_frame].astype(np.float64)
output_wav = audio[sola_offset : sola_offset + block_frame].astype(
np.float64
)
output_wav[:crossfade_frame] *= self.np_cur_strength
output_wav[:crossfade_frame] += self.sola_buffer[:]
@ -251,11 +305,19 @@ class VoiceChanger():
print("[Voice Changer] no sola buffer. (You can ignore this.)")
result = np.zeros(4096).astype(np.int16)
if hasattr(self, 'sola_buffer') == True and sola_offset < sola_search_frame:
sola_buf_org = audio[- sola_search_frame - crossfade_frame + sola_offset: -sola_search_frame + sola_offset]
if (
hasattr(self, "sola_buffer") == True
and sola_offset < sola_search_frame
):
sola_buf_org = audio[
-sola_search_frame
- crossfade_frame
+ sola_offset : -sola_search_frame
+ sola_offset
]
self.sola_buffer = sola_buf_org * self.np_prev_strength
else:
self.sola_buffer = audio[- crossfade_frame:] * self.np_prev_strength
self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
# self.sola_buffer = audio[- crossfade_frame:]
mainprocess_time = t.secs
@ -263,12 +325,20 @@ class VoiceChanger():
with Timer("post-process") as t:
result = result.astype(np.int16)
if self.settings.inputSampleRate != processing_sampling_rate:
outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16))
outputData = cast(
AudioInOut,
resampy.resample(
result,
processing_sampling_rate,
self.settings.inputSampleRate,
).astype(np.int16),
)
else:
outputData = result
print_convert_processing(
f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz"
)
if self.settings.recordIO == 1:
self.ioRecorder.writeInput(receivedData)
@ -281,7 +351,9 @@ class VoiceChanger():
# # f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
postprocess_time = t.secs
print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
print_convert_processing(
f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}"
)
perf = [preprocess_time, mainprocess_time, postprocess_time]
return outputData, perf
@ -299,8 +371,9 @@ class VoiceChanger():
def export2onnx(self):
return self.voiceChanger.export2onnx()
##############
PRINT_CONVERT_PROCESSING: bool = False
# PRINT_CONVERT_PROCESSING = True
@ -318,5 +391,7 @@ def pad_array(arr: AudioInOut, target_length: int):
pad_width = target_length - current_length
pad_left = pad_width // 2
pad_right = pad_width - pad_left
padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
padded_arr = np.pad(
arr, (pad_left, pad_right), "constant", constant_values=(0, 0)
)
return padded_arr

View File

@ -1,12 +1,16 @@
import numpy as np
from voice_changer.VoiceChanger import VoiceChanger
from const import ModelType
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
class VoiceChangerManager():
class VoiceChangerManager(object):
_instance = None
voiceChanger: VoiceChanger = None
@classmethod
def get_instance(cls, params):
if not hasattr(cls, "_instance"):
def get_instance(cls, params: VoiceChangerParams):
if cls._instance is None:
cls._instance = cls()
cls._instance.voiceChanger = VoiceChanger(params)
return cls._instance
@ -20,7 +24,7 @@ class VoiceChangerManager():
return info
def get_info(self):
if hasattr(self, 'voiceChanger'):
if hasattr(self, "voiceChanger"):
info = self.voiceChanger.get_info()
info["status"] = "OK"
return info
@ -28,7 +32,7 @@ class VoiceChangerManager():
return {"status": "ERROR", "msg": "no model loaded"}
def update_settings(self, key: str, val: any):
if hasattr(self, 'voiceChanger'):
if hasattr(self, "voiceChanger"):
info = self.voiceChanger.update_settings(key, val)
info["status"] = "OK"
return info
@ -36,7 +40,7 @@ class VoiceChangerManager():
return {"status": "ERROR", "msg": "no model loaded"}
def changeVoice(self, receivedData: any):
if hasattr(self, 'voiceChanger') == True:
if hasattr(self, "voiceChanger") is True:
return self.voiceChanger.on_request(receivedData)
else:
print("Voice Change is not loaded. Did you load a correct model?")

View File

@ -0,0 +1,11 @@
from dataclasses import dataclass
@dataclass
class VoiceChangerParams:
    """Startup parameters shared by the voice changer and its model backends.

    An instance is accepted by ``VoiceChangerManager.get_instance`` and
    ``VoiceChanger.__init__``; the latter stores it and hands it to the
    model classes whose constructors take parameters.

    Every field is required (no defaults are declared), so all six values
    must be supplied when the object is created.
    """

    # NOTE(review): the string fields are named after pretrained models
    # (ContentVec, HuBERT, NSF-HiFiGAN) and presumably hold the locations of
    # the corresponding model files -- confirm against the code that builds
    # this object.
    content_vec_500: str
    content_vec_500_onnx: str
    # Presumably switches on use of the ONNX ContentVec model -- TODO confirm.
    content_vec_500_onnx_on: bool
    hubert_base: str
    hubert_soft: str
    nsf_hifigan: str