mirror of https://github.com/w-okada/voice-changer.git
synced 2025-02-02 16:23:58 +03:00

WIP: add so-vits-svc

parent 5ea82af9b3 · commit a49a1f4558
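This WIP commit starts wiring a third model type, so-vits-svc-40v2, into the server: the module directory is git-ignored, --modelType gains the new value, the setup notes pin a so-vits-svc checkout, getFrontendPath() and the VoiceChanger dispatch learn the new type, and a new server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py wraps the so-vits-svc inference. on_request() also drops its hard-coded 24000 Hz in favor of the model's own processing sampling rate and pads the output back to the input length.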
.gitignore (vendored)
@@ -6,6 +6,7 @@ __pycache__
 server/upload_dir/
 server/MMVC_Client_v13/
 server/MMVC_Client_v15/
+server/so-vits-svc-40v2/
 server/keys
 server/info
 server/in.wav
@@ -38,7 +38,7 @@ def setupArgParser():
     parser.add_argument("--colab", type=strtobool,
                         default=False, help="run on colab")
     parser.add_argument("--modelType", type=str,
-                        default="MMVCv15", help="model type")
+                        default="MMVCv15", help="model type: MMVCv13, MMVCv15, so-vits-svc-40v2")

     return parser
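With this change the server can be launched with the new type via --modelType so-vits-svc-40v2; the other options are unchanged and MMVCv15 remains the default.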
@@ -27,4 +27,10 @@ cd ..
 cd MMVC_Client
 git checkout 6dd4f2451fec701d85f611fa831d7e5f4ddce8da
 cd ..
+
+# for so-vits-svc
+cd so-vits-svc/
+git checkout 016db3de81f6a4034b85ffba120554d07829f132
+cd ..
+
 ```
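Both checkouts pin exact upstream commits, so the vendored MMVC_Client and so-vits-svc trees stay reproducible across setups.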
@@ -18,10 +18,6 @@ NATIVE_CLIENT_FILE_MAC = os.path.join(sys._MEIPASS, "voice-changer-native-client
 TMP_DIR = os.path.join(tmpdir.name, "tmp_dir") if hasattr(sys, "_MEIPASS") else "tmp_dir"
 os.makedirs(TMP_DIR, exist_ok=True)
 
-# SSL_KEY_DIR = os.path.join(sys._MEIPASS, "keys") if hasattr(sys, "_MEIPASS") else "keys"
-# MODEL_DIR = os.path.join(sys._MEIPASS, "logs") if hasattr(sys, "_MEIPASS") else "logs"
-# UPLOAD_DIR = os.path.join(sys._MEIPASS, "upload_dir") if hasattr(sys, "_MEIPASS") else "upload_dir"
-
 modelType = "MMVCv15"
 
@@ -40,5 +36,6 @@ def getFrontendPath():
         frontend_path = os.path.join(sys._MEIPASS, "dist_v15") if hasattr(sys, "_MEIPASS") else "../client/demo_v15/dist"
     elif modelType == "MMVCv13":
         frontend_path = os.path.join(sys._MEIPASS, "dist_v13") if hasattr(sys, "_MEIPASS") else "../client/demo_v13/dist"
-
+    elif modelType == "so-vits-svc-40v2":
+        frontend_path = os.path.join(sys._MEIPASS, "dist_v13") if hasattr(sys, "_MEIPASS") else "../client/demo_v13/dist"
     return frontend_path
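Note that the so-vits-svc-40v2 branch reuses the v13 frontend bundle (dist_v13 / demo_v13); no dedicated UI ships with this WIP commit.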
@@ -2,13 +2,18 @@ import logging
 
 # logging.getLogger('numba').setLevel(logging.WARNING)
 
 
 class UvicornSuppressFilter(logging.Filter):
     def filter(self, record):
         return False
 
 
 logger = logging.getLogger("uvicorn.error")
 logger.addFilter(UvicornSuppressFilter())
 
+logger = logging.getLogger("fairseq.tasks.hubert_pretraining")
+logger.addFilter(UvicornSuppressFilter())
+
+# logger.propagate = False
+
 logger = logging.getLogger("multipart.multipart")
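UvicornSuppressFilter.filter() always returns False, so attaching it to fairseq.tasks.hubert_pretraining drops every record from that logger, which would otherwise log verbosely each time the hubert checkpoint is loaded by the new SoVitsSvc40v2 module.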
server/voice_changer/SoVitsSvc40v2/SoVitsSvc40v2.py (new file, 224 lines)

@@ -0,0 +1,224 @@
import sys
import os
if sys.platform.startswith('darwin'):
    baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
    if len(baseDir) != 1:
        print("baseDir should be only one ", baseDir)
        sys.exit()
    modulePath = os.path.join(baseDir[0], "so-vits-svc-40v2")
    sys.path.append(modulePath)
else:
    sys.path.append("so-vits-svc-40v2")

import io
from dataclasses import dataclass, asdict
from functools import reduce
import numpy as np
import torch
import onnxruntime
import pyworld as pw

from models import SynthesizerTrn
import utils
from fairseq import checkpoint_utils
import librosa
from inference import infer_tool
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]


@dataclass
class SoVitsSvc40v2Settings():
    gpu: int = 0
    srcId: int = 0
    dstId: int = 101

    f0Factor: float = 1.0
    f0Detector: str = "dio"  # dio or harvest

    framework: str = "PyTorch"  # PyTorch or ONNX
    pyTorchModelFile: str = ""
    onnxModelFile: str = ""
    configFile: str = ""

    # only the mutable fields are listed below
    intData = ["gpu", "srcId", "dstId"]
    floatData = ["f0Factor"]
    strData = ["framework", "f0Detector"]


class SoVitsSvc40v2:
    def __init__(self):
        self.settings = SoVitsSvc40v2Settings()
        self.net_g = None
        self.onnx_session = None

        self.raw_path = io.BytesIO()
        self.gpu_num = torch.cuda.device_count()
        self.prevVol = 0

    def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
        self.settings.configFile = config
        self.hps = utils.get_hparams_from_file(config)

        # hubert model
        print("loading hubert model")
        vec_path = "hubert/checkpoint_best_legacy_500.pt"
        print("load model(s) from {}".format(vec_path))
        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
            [vec_path],
            suffix="",
        )
        model = models[0]
        model.eval()
        self.hubert_model = utils.get_hubert_model().cpu()

        if pyTorch_model_file != None:
            self.settings.pyTorchModelFile = pyTorch_model_file
        if onnx_model_file:
            self.settings.onnxModelFile = onnx_model_file

        # build the PyTorch model
        if pyTorch_model_file != None:
            self.net_g = SynthesizerTrn(
                self.hps
            )
            self.net_g.eval()
            utils.load_checkpoint(pyTorch_model_file, self.net_g, None)

        # # build the ONNX model
        # if onnx_model_file != None:
        #     ort_options = onnxruntime.SessionOptions()
        #     ort_options.intra_op_num_threads = 8
        #     self.onnx_session = onnxruntime.InferenceSession(
        #         onnx_model_file,
        #         providers=providers
        #     )
        return self.get_info()

    def update_setteings(self, key: str, val: any):
        if key == "onnxExecutionProvider" and self.onnx_session != None:
            if val == "CUDAExecutionProvider":
                if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
                    self.settings.gpu = 0
                provider_options = [{'device_id': self.settings.gpu}]
                self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
            else:
                self.onnx_session.set_providers(providers=[val])
        elif key in self.settings.intData:
            setattr(self.settings, key, int(val))
            if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session != None:
                providers = self.onnx_session.get_providers()
                print("Providers:", providers)
                if "CUDAExecutionProvider" in providers:
                    provider_options = [{'device_id': self.settings.gpu}]
                    self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
        elif key in self.settings.floatData:
            setattr(self.settings, key, float(val))
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
        else:
            return False

        return True

    def get_info(self):
        data = asdict(self.settings)

        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
        for f in files:
            if data[f] != None and os.path.exists(data[f]):
                data[f] = os.path.basename(data[f])
            else:
                data[f] = ""

        return data

    def get_processing_sampling_rate(self):
        return self.hps.data.sampling_rate

    def get_unit_f0(self, audio_buffer, tran):
        wav_44k = audio_buffer
        # f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
        f0 = utils.compute_f0_dio(wav_44k, sampling_rate=self.hps.data.sampling_rate, hop_length=self.hps.data.hop_length)

        f0, uv = utils.interpolate_f0(f0)
        f0 = torch.FloatTensor(f0)
        uv = torch.FloatTensor(uv)
        f0 = f0 * 2 ** (tran / 12)
        f0 = f0.unsqueeze(0)
        uv = uv.unsqueeze(0)

        # wav16k = librosa.resample(audio_buffer, orig_sr=24000, target_sr=16000)
        wav16k = librosa.resample(audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k)
        c = utils.get_hubert_content(self.hubert_model, wav_16k_tensor=wav16k)
        c = utils.repeat_expand_2d(c.squeeze(0), f0.shape[1])
        c = c.unsqueeze(0)
        return c, f0, uv

    def generate_input(self, newData: any, convertSize: int, cropRange):
        newData = newData.astype(np.float32) / self.hps.data.max_wav_value

        if hasattr(self, "audio_buffer"):
            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # append to the accumulated past data
        else:
            self.audio_buffer = newData

        self.audio_buffer = self.audio_buffer[-(convertSize):]  # keep only the region to be converted

        crop = self.audio_buffer[cropRange[0]:cropRange[1]]

        rms = np.sqrt(np.square(crop).mean(axis=0))
        vol = max(rms, self.prevVol * 0.1)
        self.prevVol = vol
        print(f"  Crop:{crop.shape}, vol{vol}")

        c, f0, uv = self.get_unit_f0(self.audio_buffer, 20)
        return (c, f0, uv, convertSize, vol)

    def _onnx_inference(self, data):
        pass

    def _pyTorch_inference(self, data):
        if hasattr(self, "net_g") == False or self.net_g == None:
            print("[Voice Changer] No pyTorch session.")
            return np.zeros(1).astype(np.int16)

        if self.settings.gpu < 0 or self.gpu_num == 0:
            dev = torch.device("cpu")
        else:
            dev = torch.device("cuda", index=self.settings.gpu)

        convertSize = data[3]
        vol = data[4]
        data = (data[0], data[1], data[2],)

        if vol < 0.00001:
            print("silent")
            return np.zeros(convertSize).astype(np.int16)
        print(vol)

        with torch.no_grad():
            c, f0, uv = [x.to(dev) for x in data]
            sid_target = torch.LongTensor([0]).to(dev)
            self.net_g.to(dev)
            # audio1 = self.net_g.infer(c, f0=f0, g=sid_target, uv=uv, predict_f0=True, noice_scale=0.1)[0][0, 0].data.float()
            audio1 = self.net_g.infer(c, f0=f0, g=sid_target, uv=uv, predict_f0=False, noice_scale=0.4)[0][0, 0].data.float()
            audio1 = audio1 * self.hps.data.max_wav_value

        result = audio1.float().cpu().numpy()

        # result = infer_tool.pad_array(result, length)
        return result

    def inference(self, data):
        if self.settings.framework == "ONNX":
            audio = self._onnx_inference(data)
        else:
            audio = self._pyTorch_inference(data)
        return audio

    def destroy(self):
        del self.net_g
        del self.onnx_session
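Two details of the file above are worth spelling out: get_unit_f0() transposes pitch by multiplying F0 by 2 ** (tran / 12), i.e. tran is a shift in equal-tempered semitones (hard-coded to 20 in generate_input), and generate_input() computes a volume that is the RMS of the cropped frame floored by a decayed previous volume. A minimal NumPy sketch of both calculations (all sample values are made up):

import numpy as np

# semitone transposition: each semitone scales F0 by 2 ** (1 / 12)
f0 = np.array([220.0, 246.9, 261.6])   # made-up F0 contour in Hz
tran = 20                              # semitones, as hard-coded in generate_input
f0_shifted = f0 * 2 ** (tran / 12)     # 20 semitones up, a factor of about 3.17

# volume tracking: RMS of the current crop, floored by a decayed previous volume
crop = np.random.randn(4096).astype(np.float32) * 0.1  # made-up audio crop
rms = np.sqrt(np.square(crop).mean(axis=0))
prevVol = 0.5
vol = max(rms, prevVol * 0.1)          # the silence gate later checks vol < 0.00001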
server/voice_changer/VoiceChanger.py

@@ -12,7 +12,7 @@ from voice_changer.IOAnalyzer import IOAnalyzer
 
 
 import time
-
+import librosa
 providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
 
 STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
@@ -53,6 +53,13 @@ class VoiceChanger():
         if modelType == "MMVCv15":
             from voice_changer.MMVCv15.MMVCv15 import MMVCv15
             self.voiceChanger = MMVCv15()
+        elif modelType == "MMVCv13":
+            from voice_changer.MMVCv13.MMVCv13 import MMVCv13
+            self.voiceChanger = MMVCv13()
+        elif modelType == "so-vits-svc-40v2":
+            from voice_changer.SoVitsSvc40v2.SoVitsSvc40v2 import SoVitsSvc40v2
+            self.voiceChanger = SoVitsSvc40v2()
+
         else:
             from voice_changer.MMVCv13.MMVCv13 import MMVCv13
             self.voiceChanger = MMVCv13()
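MMVCv13 stays in the else branch as the fallback, so an unrecognized --modelType value still yields a v13 voice changer rather than an error.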
@@ -139,24 +146,32 @@ class VoiceChanger():
 
     #  receivedData: tuple of short
     def on_request(self, receivedData: any):
+
+        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
         print(f"------------ Convert processing.... ------------")
         # pre-processing
         with Timer("pre-process") as t:
 
-            if self.settings.inputSampleRate != 24000:
-                newData = resampy.resample(receivedData, self.settings.inputSampleRate, 24000)
+            if self.settings.inputSampleRate != processing_sampling_rate:
+                newData = resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate)
             else:
                 newData = receivedData
 
             inputSize = newData.shape[0]
             convertSize = inputSize + min(self.settings.crossFadeOverlapSize, inputSize)
             # print(convertSize, unpackedData.shape[0])
+            print(f"  Input data size of {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
 
             if convertSize < 8192:
                 convertSize = 8192
             if convertSize % 128 != 0:  # round up to the model's output hop size so nothing is truncated
                 convertSize = convertSize + (128 - (convertSize % 128))
 
+            overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
+            cropRange = (-1 * (inputSize + overlapSize), -1 * overlapSize)
+
+            print(f"  Convert input data size of {convertSize}")
+            print(f"  overlap:{overlapSize}, cropRange:{cropRange}")
             self._generate_strength(inputSize)
-            data = self.voiceChanger.generate_input(newData, convertSize)
+            data = self.voiceChanger.generate_input(newData, convertSize, cropRange)
             preprocess_time = t.secs
 
         # conversion
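Why the rounding step: the model consumes audio in 128-sample hops, so a convertSize that is not a multiple of 128 would be truncated at the output. A quick check of the arithmetic used above (the concrete number is illustrative):

convertSize = 8365                     # example chunk size, not from the diff
if convertSize % 128 != 0:
    convertSize = convertSize + (128 - (convertSize % 128))
assert convertSize % 128 == 0
assert convertSize == 8448             # rounded up to the next multiple of 128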
@@ -165,10 +180,8 @@ class VoiceChanger():
             # Inference
             audio = self.voiceChanger.inference(data)
 
             # CrossFade
             if hasattr(self, 'np_prev_audio1') == True:
-                np.set_printoptions(threshold=10000)
-                overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
                 prev_overlap = self.np_prev_audio1[-1 * overlapSize:]
                 cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
                 powered_prev = prev_overlap * self.np_prev_strength
@@ -177,6 +190,7 @@ class VoiceChanger():
 
                 cur = audio[-1 * inputSize:-1 * overlapSize]
                 result = np.concatenate([powered_result, cur], axis=0)
+                print(f"  overlap:{overlapSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")
                 # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
                 # print(">>>>>>>>>>>", -1 * (inputSize + overlapSize), -1 * inputSize, self.np_prev_audio1.shape, overlapSize)
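The crossfade blends the tail of the previous converted chunk into the head of the current one using the strength ramps built by _generate_strength(), which is not part of this diff. A self-contained sketch of the idea, assuming simple complementary linear ramps (the real ramp shape may differ) and made-up chunk sizes:

import numpy as np

overlapSize = 1024
np_prev_strength = np.linspace(1.0, 0.0, overlapSize, dtype=np.float32)  # fade out
np_cur_strength = 1.0 - np_prev_strength                                 # fade in

prev_audio = np.random.randn(4096).astype(np.float32)  # previous converted chunk
audio = np.random.randn(8192).astype(np.float32)       # current converted chunk
inputSize = 4096

prev_overlap = prev_audio[-overlapSize:]
cur_overlap = audio[-(inputSize + overlapSize):-inputSize]
powered_result = prev_overlap * np_prev_strength + cur_overlap * np_cur_strength

cur = audio[-inputSize:-overlapSize]                    # body of the current chunk
result = np.concatenate([powered_result, cur], axis=0)  # length equals inputSize
assert result.shape[0] == inputSize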
@@ -195,20 +209,43 @@ class VoiceChanger():
         # post-processing
         with Timer("post-process") as t:
             result = result.astype(np.int16)
-            if self.settings.inputSampleRate != 24000:
-                result = resampy.resample(result, 24000, self.settings.inputSampleRate).astype(np.int16)
+            if self.settings.inputSampleRate != processing_sampling_rate:
+                outputData = resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16)
+            else:
+                outputData = result
+
+            print(f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
 
             if self.settings.recordIO == 1:
                 self.ioRecorder.writeInput(receivedData)
-                self.ioRecorder.writeOutput(result.tobytes())
+                self.ioRecorder.writeOutput(outputData.tobytes())
+
+            if receivedData.shape[0] != outputData.shape[0]:
+                outputData = pad_array(outputData, receivedData.shape[0])
+                print(
+                    f"  Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
 
             postprocess_time = t.secs
 
+        print("  [fin] Input/Output size:", receivedData.shape[0], outputData.shape[0])
         perf = [preprocess_time, mainprocess_time, postprocess_time]
-        return result, perf
+        return outputData, perf
 
 
+def pad_array(arr, target_length):
+    current_length = arr.shape[0]
+    if current_length >= target_length:
+        return arr
+    else:
+        pad_width = target_length - current_length
+        pad_left = pad_width // 2
+        pad_right = pad_width - pad_left
+        padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
+        return padded_arr
+
+
 ##############
 
 
 class Timer(object):
     def __init__(self, title: str):
         self.title = title
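pad_array() re-centers output that comes back shorter than the input by splitting the zero padding evenly, with the left side getting the smaller half. For example, using the function defined above:

import numpy as np

print(pad_array(np.arange(6, dtype=np.int16), 10))
# [0 0 0 1 2 3 4 5 0 0]  (4 missing samples split as 2 left / 2 right)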