WIP: support DDSP-SVC

wataru 2023-04-17 04:37:22 +09:00
parent eb37febce8
commit 390a39fa64
5 changed files with 63 additions and 295 deletions

.gitignore
View File

@@ -41,4 +41,8 @@ client/lib/worklet/dist
docker/cudnn/
server/hubert_base.pt
server/hubert-soft-0d54a1f4.pt
server/nsf_hifigan/
start_trainer.sh

View File

@@ -66,6 +66,8 @@ Windows and Mac versions are provided.
- so-vits-svc 4.0/so-vits-svc 4.0v2 and RVC (Retrieval-based-Voice-Conversion) require a hubert model. Download `hubert_base.pt` from [this repository](https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main) and place it in the folder containing the batch file.
- DDSP-SVC requires hubert-soft and enhancer models. Download hubert-soft from [this link](https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt) and place it in the folder containing the batch file. For the enhancer, download `nsf_hifigan_20221211.zip` from [this site](https://github.com/openvpi/vocoders/releases/tag/nsf-hifigan-v1), then place the `nsf_hifigan` folder extracted from it in the folder containing the batch file. A quick sanity check is sketched below.
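As that sanity check, a minimal sketch (not part of the app) that verifies the files named above sit next to the batch file; the file names come from these instructions, and the app's own startup checks may differ:

```python
# Minimal sketch: confirm the downloaded models sit next to the batch file.
# File names are taken from the instructions above; this is not the app's own check.
from pathlib import Path

REQUIRED = [
    "hubert_base.pt",           # so-vits-svc 4.0 / so-vits-svc 4.0v2 / RVC
    "hubert-soft-0d54a1f4.pt",  # DDSP-SVC encoder
    "nsf_hifigan",              # DDSP-SVC enhancer (extracted folder)
]

def check_models(base_dir: str = ".") -> None:
    for name in REQUIRED:
        present = (Path(base_dir) / name).exists()
        print(f"{name}: {'found' if present else 'MISSING'}")

if __name__ == "__main__":
    check_models()
```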
| Version | OS | Framework | Link | Supported VC | Size |
| --------- | --- | --------------------------------- | ---------------------------------------------------------------------------------------- | ------------------------------------------------------------------- | ------ |
| v.1.5.2.2 | mac | ONNX(cpu), PyTorch(cpu) | [standard](https://drive.google.com/uc?id=1dbAiGkPtGWWcQDNL0IHXl4OyTRZR8SIQ&export=download) | MMVC v.1.5.x, MMVC v.1.3.x, so-vits-svc 4.0, so-vits-svc 4.0v2, RVC | 635MB |

View File

@@ -40,9 +40,13 @@ def setupArgParser():
parser.add_argument("--modelType", type=str,
default="MMVCv15", help="model type: MMVCv13, MMVCv15, so-vits-svc-40, so-vits-svc-40v2")
parser.add_argument("--cluster", type=str, help="path to cluster model")
parser.add_argument("--hubert", type=str, help="path to hubert model")
parser.add_argument("--internal", type=strtobool, default=False, help="各種パスをmac appの中身に変換")
parser.add_argument("--hubert", type=str, help="path to hubert model")
parser.add_argument("--useHubertOnnx", type=strtobool, default=False, help="use hubert onnx")
parser.add_argument("--hubertSoftPt", type=str, help="path to hubert-soft model(pytorch)")
parser.add_argument("--enhancerPt", type=str, help="path to enhancer model(pytorch)")
parser.add_argument("--enhancerOnnx", type=str, help="path to enhancer model(onnx)")
return parser
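A hedged usage sketch of the new flags (it assumes the options outside this hunk all have defaults, which the diff does not show; the paths are placeholders):

```python
# Sketch only: exercise the DDSP-SVC flags added above.
# Assumes options outside this hunk have defaults; paths are placeholders.
parser = setupArgParser()
args = parser.parse_args([
    "--hubertSoftPt", "./model_DDSP-SVC/hubert-soft-0d54a1f4.pt",
    "--enhancerPt", "./model_DDSP-SVC/enhancer/model",
])
print(args.hubertSoftPt, args.enhancerPt, args.enhancerOnnx)  # enhancerOnnx stays None
```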
@@ -82,12 +86,11 @@ printMessage(f"Booting PHASE :{__name__}", level=2)
TYPE = args.t
PORT = args.p
CONFIG = args.c if args.c != None else None
MODEL = args.m if args.m != None else None
ONNX_MODEL = args.o if args.o != None else None
HUBERT_MODEL = args.hubert if args.hubert != None else None # hubert is downloaded by the user and placed in the extracted folder.
CONFIG = args.c
MODEL = args.m
ONNX_MODEL = args.o
CLUSTER_MODEL = args.cluster if args.cluster != None else None
USE_HUBERT_ONNX = args.useHubertOnnx
if args.internal and hasattr(sys, "_MEIPASS"):
print("use internal path")
@@ -125,7 +128,13 @@ if args.colab == True:
os.environ["colab"] = "True"
if __name__ == 'MMVCServerSIO':
voiceChangerManager = VoiceChangerManager.get_instance({"hubert": HUBERT_MODEL, "useHubertOnnx": USE_HUBERT_ONNX})
voiceChangerManager = VoiceChangerManager.get_instance({
"hubert": args.hubert,
"useHubertOnnx": args.useHubertOnnx,
"hubertSoftPt": args.hubertSoftPt,
"enhancerPt": args.enhancerPt,
"enhancerOnnx": args.enhancerOnnx
})
if CONFIG and (MODEL or ONNX_MODEL):
if MODEL_TYPE == "MMVCv15" or MODEL_TYPE == "MMVCv13":
voiceChangerManager.loadModel(CONFIG, MODEL, ONNX_MODEL, None)
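The dict passed to `get_instance` above is the contract between the launcher and the backends; a sketch of the consumer side (mirroring how `DDSP_SVC` reads `params` later in this commit, with placeholder paths):

```python
# Sketch of the params contract (keys from the diff above; paths are placeholders).
params = {
    "hubert": None,
    "useHubertOnnx": False,
    "hubertSoftPt": "./model_DDSP-SVC/hubert-soft-0d54a1f4.pt",
    "enhancerPt": "./model_DDSP-SVC/enhancer/model",
    "enhancerOnnx": None,
}
vec_path = params["hubertSoftPt"]     # consumed by vo.Units_Encoder in DDSP_SVC.loadModel
enhancer_path = params["enhancerPt"]  # consumed by Enhancer in DDSP_SVC.loadModel
```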

View File

@@ -20,14 +20,8 @@ import pyworld as pw
import ddsp.vocoder as vo
from ddsp.core import upsample
from enhancer import Enhancer
from slicer import Slicer
import librosa
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
import resampy
from scipy.io import wavfile
SAMPLING_RATE = 44100
@dataclass
class DDSP_SVCSettings():
@@ -69,25 +63,31 @@ class DDSP_SVC:
self.params = params
print("DDSP-SVC initialization:", params)
def useDevice(self):
if self.settings.gpu >= 0 and torch.cuda.is_available():
return torch.device("cuda", index=self.settings.gpu)
else:
return torch.device("cpu")
def loadModel(self, props):
self.settings.configFile = props["files"]["configFilename"]
self.settings.pyTorchModelFile = props["files"]["pyTorchModelFilename"]
# model
model, args = vo.load_model(self.settings.pyTorchModelFile)
model, args = vo.load_model(self.settings.pyTorchModelFile, device=self.useDevice())
self.model = model
self.args = args
self.hop_size = int(self.args.data.block_size * SAMPLING_RATE / self.args.data.sampling_rate)
# self.sampling_rate = args.data.sampling_rate
self.sampling_rate = args.data.sampling_rate
self.hop_size = int(self.args.data.block_size * self.sampling_rate / self.args.data.sampling_rate)
print("-------------------hopsize", self.hop_size)
# hubert
# vec_path = self.params["hubert"]
vec_path = "./model_DDSP-SVC/hubert-soft-0d54a1f4.pt"
self.vec_path = self.params["hubertSoftPt"]
self.encoder = vo.Units_Encoder(
args.data.encoder,
vec_path,
args.data.encoder_sample_rate,
args.data.encoder_hop_size,
device="cpu")
self.args.data.encoder,
self.vec_path,
self.args.data.encoder_sample_rate,
self.args.data.encoder_hop_size,
device=self.useDevice())
# ort_options = onnxruntime.SessionOptions()
# ort_options.intra_op_num_threads = 8
@@ -106,13 +106,14 @@ class DDSP_SVC:
self.f0_detector = vo.F0_Extractor(
# "crepe",
self.settings.f0Detector,
SAMPLING_RATE,
self.sampling_rate,
self.hop_size,
float(50),
float(1100))
self.volume_extractor = vo.Volume_Extractor(self.hop_size)
self.enhancer = Enhancer(self.args.enhancer.type, "./model_DDSP-SVC/enhancer/model", "cpu")
self.enhancer_path = self.params["enhancerPt"]
self.enhancer = Enhancer(self.args.enhancer.type, self.enhancer_path, device=self.useDevice())
return self.get_info()
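The hop-size line above converts the model's block size into samples at the I/O sampling rate; a worked example, assuming the common DDSP-SVC defaults `block_size=512` and `sampling_rate=44100` (values this diff does not confirm):

```python
# Worked example of the hop-size arithmetic in loadModel above.
# block_size=512 and sampling_rate=44100 are assumed defaults, not read from this diff.
block_size = 512
model_sr = 44100      # args.data.sampling_rate
io_sr = model_sr      # self.sampling_rate is taken from the same field
hop_size = int(block_size * io_sr / model_sr)
print(hop_size)       # 512; changes when the model was trained at another rate
```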
def update_settings(self, key: str, val: any):
@@ -132,6 +133,13 @@ class DDSP_SVC:
if "CUDAExecutionProvider" in providers:
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
if key == "gpu":
model, _args = vo.load_model(self.settings.pyTorchModelFile, device=self.useDevice())
self.model = model
self.enhancer = Enhancer(self.args.enhancer.type, self.enhancer_path, device=self.useDevice())
self.encoder = vo.Units_Encoder(self.args.data.encoder, self.vec_path, self.args.data.encoder_sample_rate,
self.args.data.encoder_hop_size, device=self.useDevice())
elif key in self.settings.floatData:
setattr(self.settings, key, float(val))
elif key in self.settings.strData:
@@ -140,9 +148,14 @@ class DDSP_SVC:
print("f0Detector update", val)
if val == "dio":
val = "parselmouth"
if hasattr(self, "sampling_rate") == False:
self.sampling_rate = 44100
self.hop_size = 512
self.f0_detector = vo.F0_Extractor(
val,
SAMPLING_RATE,
self.sampling_rate,
self.hop_size,
float(50),
float(1100))
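When `gpu` changes, everything that captured a device at load time is rebuilt on the new device; a condensed sketch of that pattern (signatures taken from this diff, not a drop-in replacement):

```python
# Condensed sketch of the device-switch pattern in update_settings above.
def rebuild_on_device(svc):
    dev = svc.useDevice()  # cuda:<gpu> when available, otherwise cpu
    svc.model, _ = vo.load_model(svc.settings.pyTorchModelFile, device=dev)
    svc.enhancer = Enhancer(svc.args.enhancer.type, svc.enhancer_path, device=dev)
    svc.encoder = vo.Units_Encoder(
        svc.args.data.encoder, svc.vec_path,
        svc.args.data.encoder_sample_rate, svc.args.data.encoder_hop_size,
        device=dev)
```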
@@ -165,7 +178,7 @@ class DDSP_SVC:
return data
def get_processing_sampling_rate(self):
return SAMPLING_RATE
return self.sampling_rate
def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaSearchFrame: int = 0):
newData = newData.astype(np.float32) / 32768.0
@@ -197,8 +210,8 @@ class DDSP_SVC:
volume = torch.from_numpy(volume).float().unsqueeze(-1).unsqueeze(0)
# embed
audio = torch.from_numpy(self.audio_buffer).float().unsqueeze(0)
seg_units = self.encoder.encode(audio, SAMPLING_RATE, self.hop_size)
audio = torch.from_numpy(self.audio_buffer).float().to(self.useDevice()).unsqueeze(0)
seg_units = self.encoder.encode(audio, self.sampling_rate, self.hop_size)
crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]
@@ -247,22 +260,19 @@ class DDSP_SVC:
print("[Voice Changer] No pyTorch session.")
return np.zeros(1).astype(np.int16)
c = data[0]
f0 = data[1]
volume = data[2]
mask = data[3]
c = data[0].to(self.useDevice())
f0 = data[1].to(self.useDevice())
volume = data[2].to(self.useDevice())
mask = data[3].to(self.useDevice())
convertSize = data[4]
vol = data[5]
print(volume.device)
# if vol < self.settings.silentThreshold:
# print("threshold")
# return np.zeros(convertSize).astype(np.int16)
with torch.no_grad():
spk_id = torch.LongTensor(np.array([[int(1)]]))
spk_id = torch.LongTensor(np.array([[int(1)]])).to(self.useDevice())
seg_output, _, (s_h, s_n) = self.model(c, f0, volume, spk_id=spk_id, spk_mix_dict=None)
seg_output *= mask
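The pattern above moves every model input to `useDevice()` before the forward pass; a self-contained sketch with dummy tensors (shapes are illustrative, not taken from the diff):

```python
# Self-contained sketch of the device discipline in _pyTorch_inference above.
import torch

def to_model_device(tensors, device):
    # Mirror the repeated .to(self.useDevice()) calls in the diff.
    return tuple(t.to(device) for t in tensors)

dev = torch.device("cuda", 0) if torch.cuda.is_available() else torch.device("cpu")
c = torch.zeros(1, 100, 256)     # units (illustrative shape)
f0 = torch.zeros(1, 100, 1)      # pitch track
volume = torch.zeros(1, 100, 1)  # volume envelope
mask = torch.ones(1, 100, 1)     # silence mask
c, f0, volume, mask = to_model_device((c, f0, volume, mask), dev)
spk_id = torch.LongTensor([[1]]).to(dev)  # as in the diff's spk_id construction
```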

View File

@@ -1,257 +0,0 @@
import sys
import os
if sys.platform.startswith('darwin'):
baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
if len(baseDir) != 1:
print("baseDir should be only one ", baseDir)
sys.exit()
modulePath = os.path.join(baseDir[0], "DDSP-SVC")
sys.path.append(modulePath)
else:
sys.path.append("DDSP-SVC")
import io
from dataclasses import dataclass, asdict, field
from functools import reduce
import numpy as np
import torch
import onnxruntime
import pyworld as pw
import ddsp.vocoder as vo
import librosa
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
@dataclass
class DDSP_SVCSettings():
gpu: int = 0
dstId: int = 0
f0Detector: str = "dio" # dio or harvest
tran: int = 20
noiceScale: float = 0.3
predictF0: int = 0 # 0:False, 1:True
silentThreshold: float = 0.00001
extraConvertSize: int = 1024 * 32
clusterInferRatio: float = 0.1
framework: str = "PyTorch" # PyTorch or ONNX
pyTorchModelFile: str = ""
onnxModelFile: str = ""
configFile: str = ""
speakers: dict[str, int] = field(
default_factory=lambda: {}
)
# ↓ only the mutable fields are listed below
intData = ["gpu", "dstId", "tran", "predictF0", "extraConvertSize"]
floatData = ["noiceScale", "silentThreshold", "clusterInferRatio"]
strData = ["framework", "f0Detector"]
class DDSP_SVC:
def __init__(self, params):
self.settings = DDSP_SVCSettings()
self.net_g = None
self.onnx_session = None
self.raw_path = io.BytesIO()
self.gpu_num = torch.cuda.device_count()
self.prevVol = 0
self.params = params
print("DDSP-SVC initialization:", params)
def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None, clusterTorchModel: str = None):
self.settings.configFile = config
# model
model, args = vo.load_model(pyTorch_model_file)
# hubert
self.model = model
self.args = args
vec_path = self.params["hubert"]
self.encoder = vo.Units_Encoder(
args.data.encoder,
vec_path,
args.data.encoder_sample_rate,
args.data.encoder_hop_size,
device="cpu")
# f0dec
self.f0_detector = vo.F0_Extractor(
self.settings.f0Detector,
44100,
512,
float(50),
float(1100))
return self.get_info()
def update_settings(self, key: str, val: any):
if key == "onnxExecutionProvider" and self.onnx_session != None:
if val == "CUDAExecutionProvider":
if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
self.settings.gpu = 0
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
else:
self.onnx_session.set_providers(providers=[val])
elif key in self.settings.intData:
setattr(self.settings, key, int(val))
if key == "gpu" and val >= 0 and val < self.gpu_num and self.onnx_session != None:
providers = self.onnx_session.get_providers()
print("Providers:", providers)
if "CUDAExecutionProvider" in providers:
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
elif key in self.settings.floatData:
setattr(self.settings, key, float(val))
elif key in self.settings.strData:
setattr(self.settings, key, str(val))
else:
return False
return True
def get_info(self):
data = asdict(self.settings)
data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session != None else []
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
for f in files:
if data[f] != None and os.path.exists(data[f]):
data[f] = os.path.basename(data[f])
else:
data[f] = ""
return data
def get_processing_sampling_rate(self):
return 44100
def get_unit_f0(self, audio_buffer, tran):
if (self.settings.gpu < 0 or self.gpu_num == 0) or self.settings.framework == "ONNX":
dev = torch.device("cpu")
else:
dev = torch.device("cpu")
# dev = torch.device("cuda", index=self.settings.gpu)
wav_44k = audio_buffer
f0 = self.f0_detector.extract(wav_44k, uv_interp=True, device=dev)
f0 = torch.from_numpy(f0).float().to(dev).unsqueeze(-1).unsqueeze(0)
f0 = f0 * 2 ** (float(10) / 12)
# print("f0:", f0)
print("wav_44k:::", wav_44k)
c = self.encoder.encode(torch.from_numpy(audio_buffer).float().unsqueeze(0).to(dev), 44100, 512)
# print("c:", c)
return c, f0
def generate_input(self, newData: any, inputSize: int, crossfadeSize: int):
# newData = newData.astype(np.float32) / 32768.0
# newData = newData.astype(np.float32) / self.hps.data.max_wav_value
if hasattr(self, "audio_buffer"):
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) # concatenate with the previous data
else:
self.audio_buffer = newData
convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
hop_size = int(self.args.data.block_size * 44100 / self.args.data.sampling_rate)
print("hopsize", hop_size)
if convertSize % hop_size != 0: # truncation occurs at the model's output hop size, so pad up to a multiple of it.
convertSize = convertSize + (hop_size - (convertSize % hop_size))
print("convsize", convertSize)
self.audio_buffer = self.audio_buffer[-1 * convertSize:] # extract only the portion to be converted
crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]
rms = np.sqrt(np.square(crop).mean(axis=0))
vol = max(rms, self.prevVol * 0.0)
self.prevVol = vol
c, f0 = self.get_unit_f0(self.audio_buffer, self.settings.tran)
return (c, f0, convertSize, vol)
def _onnx_inference(self, data):
if hasattr(self, "onnx_session") == False or self.onnx_session == None:
print("[Voice Changer] No onnx session.")
return np.zeros(1).astype(np.int16)
c = data[0]
f0 = data[1]
convertSize = data[2]
vol = data[3]
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16)
c, f0, uv = [x.numpy() for x in data]
audio1 = self.onnx_session.run(
["audio"],
{
"c": c,
"f0": f0,
"g": np.array([self.settings.dstId]).astype(np.int64),
"uv": np.array([self.settings.dstId]).astype(np.int64),
"predict_f0": np.array([self.settings.dstId]).astype(np.int64),
"noice_scale": np.array([self.settings.dstId]).astype(np.int64),
})[0][0, 0] * self.hps.data.max_wav_value
audio1 = audio1 * vol
result = audio1
return result
pass
def _pyTorch_inference(self, data):
if hasattr(self, "model") == False or self.model == None:
print("[Voice Changer] No pyTorch session.")
return np.zeros(1).astype(np.int16)
if self.settings.gpu < 0 or self.gpu_num == 0:
dev = torch.device("cpu")
else:
dev = torch.device("cpu")
# dev = torch.device("cuda", index=self.settings.gpu)
c = data[0]
f0 = data[1]
convertSize = data[2]
vol = data[3]
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16)
with torch.no_grad():
c.to(dev)
f0.to(dev)
vol = torch.from_numpy(np.array([vol] * c.shape[1])).float().to(dev).unsqueeze(-1).unsqueeze(0)
spk_id = torch.LongTensor(np.array([[1]])).to(dev)
# print("vol", vol)
print("input", c.shape, f0.shape)
seg_output, _, (s_h, s_n) = self.model(c, f0, vol, spk_id=spk_id)
seg_output = seg_output.squeeze().cpu().numpy()
print("SEG:", seg_output)
return seg_output
def inference(self, data):
if self.settings.framework == "ONNX":
audio = self._onnx_inference(data)
else:
audio = self._pyTorch_inference(data)
return audio
def destroy(self):
del self.net_g
del self.onnx_session