WIP: add so-vits-svc

wataru 2023-03-11 01:56:10 +09:00
parent 5ea82af9b3
commit a49a1f4558
7 changed files with 288 additions and 18 deletions

.gitignore
View File

@@ -6,6 +6,7 @@ __pycache__
server/upload_dir/
server/MMVC_Client_v13/
server/MMVC_Client_v15/
server/so-vits-svc-40v2/
server/keys
server/info
server/in.wav

View File

@@ -38,7 +38,7 @@ def setupArgParser():
    parser.add_argument("--colab", type=strtobool,
                        default=False, help="run on colab")
    parser.add_argument("--modelType", type=str,
                        default="MMVCv15", help="model type")
                        default="MMVCv15", help="model type: MMVCv13, MMVCv15, so-vits-svc-40v2")
    return parser
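For reference, a minimal sketch of how the extended flag parses, as a standalone replica of just the two arguments shown above (not the server's actual entrypoint):

```python
# Standalone replica of the parser fragment above; only these two arguments
# are mirrored here, purely to demonstrate selecting the new model type.
from argparse import ArgumentParser
from distutils.util import strtobool  # same helper the server uses

parser = ArgumentParser()
parser.add_argument("--colab", type=strtobool, default=False, help="run on colab")
parser.add_argument("--modelType", type=str, default="MMVCv15",
                    help="model type: MMVCv13, MMVCv15, so-vits-svc-40v2")

args = parser.parse_args(["--modelType", "so-vits-svc-40v2"])
print(args.modelType)  # -> so-vits-svc-40v2
```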

View File

@@ -27,4 +27,10 @@ cd ..
cd MMVC_Client
git checkout 6dd4f2451fec701d85f611fa831d7e5f4ddce8da
cd ..
# for so-vits-svc
cd so-vits-svc/
git checkout 016db3de81f6a4034b85ffba120554d07829f132
cd ..
```

View File

@@ -18,10 +18,6 @@ NATIVE_CLIENT_FILE_MAC = os.path.join(sys._MEIPASS, "voice-changer-native-client
TMP_DIR = os.path.join(tmpdir.name, "tmp_dir") if hasattr(sys, "_MEIPASS") else "tmp_dir"
os.makedirs(TMP_DIR, exist_ok=True)
# SSL_KEY_DIR = os.path.join(sys._MEIPASS, "keys") if hasattr(sys, "_MEIPASS") else "keys"
# MODEL_DIR = os.path.join(sys._MEIPASS, "logs") if hasattr(sys, "_MEIPASS") else "logs"
# UPLOAD_DIR = os.path.join(sys._MEIPASS, "upload_dir") if hasattr(sys, "_MEIPASS") else "upload_dir"
modelType = "MMVCv15"
@@ -40,5 +36,6 @@ def getFrontendPath():
        frontend_path = os.path.join(sys._MEIPASS, "dist_v15") if hasattr(sys, "_MEIPASS") else "../client/demo_v15/dist"
    elif modelType == "MMVCv13":
        frontend_path = os.path.join(sys._MEIPASS, "dist_v13") if hasattr(sys, "_MEIPASS") else "../client/demo_v13/dist"
    elif modelType == "so-vits-svc-40v2":
        frontend_path = os.path.join(sys._MEIPASS, "dist_v13") if hasattr(sys, "_MEIPASS") else "../client/demo_v13/dist"
    return frontend_path

View File

@@ -2,13 +2,18 @@ import logging
# logging.getLogger('numba').setLevel(logging.WARNING)


class UvicornSuppressFilter(logging.Filter):
    def filter(self, record):
        return False


logger = logging.getLogger("uvicorn.error")
logger.addFilter(UvicornSuppressFilter())

logger = logging.getLogger("fairseq.tasks.hubert_pretraining")
logger.addFilter(UvicornSuppressFilter())
# logger.propagate = False

logger = logging.getLogger("multipart.multipart")

View File

@@ -0,0 +1,224 @@
import sys
import os

if sys.platform.startswith('darwin'):
    baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
    if len(baseDir) != 1:
        print("baseDir should be only one ", baseDir)
        sys.exit()
    modulePath = os.path.join(baseDir[0], "so-vits-svc-40v2")
    sys.path.append(modulePath)
else:
    sys.path.append("so-vits-svc-40v2")

import io
from dataclasses import dataclass, asdict
from functools import reduce
import numpy as np
import torch
import onnxruntime
import pyworld as pw
from models import SynthesizerTrn
import utils
from fairseq import checkpoint_utils
import librosa
from inference import infer_tool

providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]


@dataclass
class SoVitsSvc40v2Settings():
    gpu: int = 0
    srcId: int = 0
    dstId: int = 101
    f0Factor: float = 1.0
    f0Detector: str = "dio"  # dio or harvest
    framework: str = "PyTorch"  # PyTorch or ONNX
    pyTorchModelFile: str = ""
    onnxModelFile: str = ""
    configFile: str = ""
    # Only mutable fields are listed below
    intData = ["gpu", "srcId", "dstId"]
    floatData = ["f0Factor"]
    strData = ["framework", "f0Detector"]


class SoVitsSvc40v2:
    def __init__(self):
        self.settings = SoVitsSvc40v2Settings()
        self.net_g = None
        self.onnx_session = None
        self.raw_path = io.BytesIO()
        self.gpu_num = torch.cuda.device_count()
        self.prevVol = 0

    def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
        self.settings.configFile = config
        self.hps = utils.get_hparams_from_file(config)

        # hubert model
        print("loading hubert model")
        vec_path = "hubert/checkpoint_best_legacy_500.pt"
        print("load model(s) from {}".format(vec_path))
        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
            [vec_path],
            suffix="",
        )
        model = models[0]
        model.eval()
        self.hubert_model = utils.get_hubert_model().cpu()

        if pyTorch_model_file is not None:
            self.settings.pyTorchModelFile = pyTorch_model_file
        if onnx_model_file:
            self.settings.onnxModelFile = onnx_model_file

        # Build the PyTorch model
        if pyTorch_model_file is not None:
            self.net_g = SynthesizerTrn(
                self.hps
            )
            self.net_g.eval()
            utils.load_checkpoint(pyTorch_model_file, self.net_g, None)

        # # Build the ONNX model
        # if onnx_model_file is not None:
        #     ort_options = onnxruntime.SessionOptions()
        #     ort_options.intra_op_num_threads = 8
        #     self.onnx_session = onnxruntime.InferenceSession(
        #         onnx_model_file,
        #         providers=providers
        #     )
        return self.get_info()

    def update_setteings(self, key: str, val: any):
        if key == "onnxExecutionProvider" and self.onnx_session is not None:
            if val == "CUDAExecutionProvider":
                if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
                    self.settings.gpu = 0
                provider_options = [{'device_id': self.settings.gpu}]
                self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
            else:
                self.onnx_session.set_providers(providers=[val])
        elif key in self.settings.intData:
            setattr(self.settings, key, int(val))
            if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session is not None:
                providers = self.onnx_session.get_providers()
                print("Providers:", providers)
                if "CUDAExecutionProvider" in providers:
                    provider_options = [{'device_id': self.settings.gpu}]
                    self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
        elif key in self.settings.floatData:
            setattr(self.settings, key, float(val))
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
        else:
            return False
        return True

    def get_info(self):
        data = asdict(self.settings)

        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
        for f in files:
            if data[f] is not None and os.path.exists(data[f]):
                data[f] = os.path.basename(data[f])
            else:
                data[f] = ""
        return data

    def get_processing_sampling_rate(self):
        return self.hps.data.sampling_rate

    def get_unit_f0(self, audio_buffer, tran):
        wav_44k = audio_buffer
        # f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
        f0 = utils.compute_f0_dio(wav_44k, sampling_rate=self.hps.data.sampling_rate, hop_length=self.hps.data.hop_length)
        f0, uv = utils.interpolate_f0(f0)
        f0 = torch.FloatTensor(f0)
        uv = torch.FloatTensor(uv)
        f0 = f0 * 2 ** (tran / 12)
        f0 = f0.unsqueeze(0)
        uv = uv.unsqueeze(0)

        # wav16k = librosa.resample(audio_buffer, orig_sr=24000, target_sr=16000)
        wav16k = librosa.resample(audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k)

        c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
        c = c.unsqueeze(0)
        return c, f0, uv

    def generate_input(self, newData: any, convertSize: int, cropRange):
        newData = newData.astype(np.float32) / self.hps.data.max_wav_value

        if hasattr(self, "audio_buffer"):
            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
        else:
            self.audio_buffer = newData

        self.audio_buffer = self.audio_buffer[-(convertSize):]  # keep only the portion to be converted

        crop = self.audio_buffer[cropRange[0]:cropRange[1]]
        rms = np.sqrt(np.square(crop).mean(axis=0))
        vol = max(rms, self.prevVol * 0.1)
        self.prevVol = vol
        print(f" Crop:{crop.shape}, vol:{vol}")

        c, f0, uv = self.get_unit_f0(self.audio_buffer, 20)
        return (c, f0, uv, convertSize, vol)

    def _onnx_inference(self, data):
        pass

    def _pyTorch_inference(self, data):
        if not hasattr(self, "net_g") or self.net_g is None:
            print("[Voice Changer] No pyTorch session.")
            return np.zeros(1).astype(np.int16)

        if self.settings.gpu < 0 or self.gpu_num == 0:
            dev = torch.device("cpu")
        else:
            dev = torch.device("cuda", index=self.settings.gpu)

        convertSize = data[3]
        vol = data[4]
        data = (data[0], data[1], data[2],)

        if vol < 0.00001:
            print("silent")
            return np.zeros(convertSize).astype(np.int16)
        print(vol)

        with torch.no_grad():
            c, f0, uv = [x.to(dev) for x in data]
            sid_target = torch.LongTensor([0]).to(dev)
            self.net_g.to(dev)
            # audio1 = self.net_g.infer(c, f0=f0, g=sid_target, uv=uv, predict_f0=True, noice_scale=0.1)[0][0, 0].data.float()
            audio1 = self.net_g.infer(c, f0=f0, g=sid_target, uv=uv, predict_f0=False, noice_scale=0.4)[0][0, 0].data.float()
            audio1 = audio1 * self.hps.data.max_wav_value
            result = audio1.float().cpu().numpy()
            # result = infer_tool.pad_array(result, length)
        return result

    def inference(self, data):
        if self.settings.framework == "ONNX":
            audio = self._onnx_inference(data)
        else:
            audio = self._pyTorch_inference(data)
        return audio

    def destroy(self):
        del self.net_g
        del self.onnx_session
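Two numeric details in this class are easy to verify in isolation: the semitone transpose applied to f0 in get_unit_f0, and the volume floor in generate_input. A minimal sketch with made-up sample values:

```python
import numpy as np

# Semitone transpose, as in get_unit_f0: f0 * 2 ** (tran / 12).
tran = 20                      # semitones; generate_input passes 20
f0 = np.array([220.0, 440.0])  # Hz
print(f0 * 2 ** (tran / 12))   # -> approx. [698.46, 1396.91]

# Volume floor, as in generate_input; crop and prevVol are made up.
crop = np.full(4096, 0.001, dtype=np.float32)
prevVol = 0.5
rms = float(np.sqrt(np.square(crop).mean(axis=0)))  # 0.001
vol = max(rms, prevVol * 0.1)                       # floor: 10% of last volume
print(vol)                                          # -> 0.05
# _pyTorch_inference returns silence without running the model when vol < 1e-5.
```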

View File

@@ -12,7 +12,7 @@ from voice_changer.IOAnalyzer import IOAnalyzer
import time
import librosa
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
@@ -53,6 +53,13 @@
        if modelType == "MMVCv15":
            from voice_changer.MMVCv15.MMVCv15 import MMVCv15
            self.voiceChanger = MMVCv15()
        elif modelType == "MMVCv13":
            from voice_changer.MMVCv13.MMVCv13 import MMVCv13
            self.voiceChanger = MMVCv13()
        elif modelType == "so-vits-svc-40v2":
            from voice_changer.SoVitsSvc40v2.SoVitsSvc40v2 import SoVitsSvc40v2
            self.voiceChanger = SoVitsSvc40v2()
        else:
            from voice_changer.MMVCv13.MMVCv13 import MMVCv13
            self.voiceChanger = MMVCv13()
@@ -139,24 +146,32 @@ class VoiceChanger():
    # receivedData: tuple of short
    def on_request(self, receivedData: any):
        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
        print(f"------------ Convert processing.... ------------")
        # Pre-processing
        with Timer("pre-process") as t:
            if self.settings.inputSampleRate != 24000:
                newData = resampy.resample(receivedData, self.settings.inputSampleRate, 24000)
            if self.settings.inputSampleRate != processing_sampling_rate:
                newData = resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate)
            else:
                newData = receivedData
            inputSize = newData.shape[0]
            convertSize = inputSize + min(self.settings.crossFadeOverlapSize, inputSize)
            # print(convertSize, unpackedData.shape[0])
            print(f" Input data size of {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
            if convertSize < 8192:
                convertSize = 8192
            if convertSize % 128 != 0:  # pad up to the model's output hop size to avoid truncation
                convertSize = convertSize + (128 - (convertSize % 128))
            overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
            cropRange = (-1 * (inputSize + overlapSize), -1 * overlapSize)
            print(f" Convert input data size of {convertSize}")
            print(f" overlap:{overlapSize}, cropRange:{cropRange}")
            self._generate_strength(inputSize)
            data = self.voiceChanger.generate_input(newData, convertSize)
            data = self.voiceChanger.generate_input(newData, convertSize, cropRange)
            preprocess_time = t.secs
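The size arithmetic above enforces a minimum conversion window and rounds up to a multiple of the model's hop size. A worked example, assuming a 4096-sample input block and crossFadeOverlapSize = 1024:

```python
inputSize = 4096             # assumed input block
crossFadeOverlapSize = 1024  # assumed setting

convertSize = inputSize + min(crossFadeOverlapSize, inputSize)  # 5120
if convertSize < 8192:       # enforce the minimum conversion window
    convertSize = 8192
if convertSize % 128 != 0:   # round up to a multiple of the hop size (128)
    convertSize = convertSize + (128 - (convertSize % 128))

overlapSize = min(crossFadeOverlapSize, inputSize)              # 1024
cropRange = (-1 * (inputSize + overlapSize), -1 * overlapSize)  # (-5120, -1024)
print(convertSize, cropRange)  # -> 8192 (-5120, -1024)
```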
        # Conversion
@@ -165,10 +180,8 @@
            # Inference
            audio = self.voiceChanger.inference(data)

            # CrossFade
            if hasattr(self, 'np_prev_audio1'):
                np.set_printoptions(threshold=10000)
                overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
                prev_overlap = self.np_prev_audio1[-1 * overlapSize:]
                cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
                powered_prev = prev_overlap * self.np_prev_strength
@@ -177,6 +190,7 @@
                cur = audio[-1 * inputSize:-1 * overlapSize]
                result = np.concatenate([powered_result, cur], axis=0)
                print(f" overlap:{overlapSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")
                # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
                # print(">>>>>>>>>>>", -1 * (inputSize + overlapSize), -1 * inputSize, self.np_prev_audio1.shape, overlapSize)
@@ -195,20 +209,43 @@
        # Post-processing
        with Timer("post-process") as t:
            result = result.astype(np.int16)
            if self.settings.inputSampleRate != 24000:
                result = resampy.resample(result, 24000, self.settings.inputSampleRate).astype(np.int16)
            if self.settings.inputSampleRate != processing_sampling_rate:
                outputData = resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16)
            else:
                outputData = result
            print(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

            if self.settings.recordIO == 1:
                self.ioRecorder.writeInput(receivedData)
                self.ioRecorder.writeOutput(result.tobytes())
                self.ioRecorder.writeOutput(outputData.tobytes())

            if receivedData.shape[0] != outputData.shape[0]:
                outputData = pad_array(outputData, receivedData.shape[0])
                print(
                    f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

            postprocess_time = t.secs

        print(" [fin] Input/Output size:", receivedData.shape[0], outputData.shape[0])
        perf = [preprocess_time, mainprocess_time, postprocess_time]
        return result, perf
        return outputData, perf
def pad_array(arr, target_length):
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr
    else:
        pad_width = target_length - current_length
        pad_left = pad_width // 2
        pad_right = pad_width - pad_left
        padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
        return padded_arr
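pad_array centers the signal, splitting the zero padding across both ends (the left side gets the smaller half when the width is odd). A quick check, assuming pad_array from above is in scope:

```python
import numpy as np

# 3 samples padded to 6: pad_width=3, pad_left=1, pad_right=2.
print(pad_array(np.array([1, 2, 3], dtype=np.int16), 6))
# -> [0 1 2 3 0 0]
```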
##############
class Timer(object):
    def __init__(self, title: str):
        self.title = title