Mirror of https://github.com/w-okada/voice-changer.git, synced 2025-02-02 16:23:58 +03:00

import diffusion svc core

This commit is contained in:
parent ad013bf4d3
commit 9c829ac91a
11  client/demo/dist/index.html (vendored)
@ -1 +1,10 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>Voice Changer Client Demo</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
</body>
</html>
1125  client/demo/dist/index.js (vendored)
File diff suppressed because one or more lines are too long
31  client/demo/dist/index.js.LICENSE.txt (vendored)
@ -1,31 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */

/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
@ -78,6 +78,9 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
                !!setting.files.find(x => { return x.kind == "ddspSvcDiffusion" }) &&
                !!setting.files.find(x => { return x.kind == "ddspSvcDiffusionConfig" })
            return enough
        } else if (setting.voiceChangerType == "Diffusion-SVC") {
            const enough = !!setting.files.find(x => { return x.kind == "diffusionSVCModel" })
            return enough
        }
        return false
    }
@ -132,6 +135,8 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
            rows.push(generateFileRow(uploadSetting!, "Model", "ddspSvcModel", ["pth", "pt"], "model/"))
            rows.push(generateFileRow(uploadSetting!, "Config(diff)", "ddspSvcDiffusionConfig", ["yaml"], "diff/"))
            rows.push(generateFileRow(uploadSetting!, "Model(diff)", "ddspSvcDiffusion", ["pth", "pt"], "diff/"))
        } else if (vcType == "Diffusion-SVC") {
            rows.push(generateFileRow(uploadSetting!, "Model", "diffusionSVCModel", ["ptc"]))
        }
        return rows
    }
@ -9,7 +9,8 @@ export const VoiceChangerType = {
    "MMVCv13": "MMVCv13",
    "so-vits-svc-40": "so-vits-svc-40",
    "DDSP-SVC": "DDSP-SVC",
    "RVC": "RVC"
    "RVC": "RVC",
    "Diffusion-SVC":"Diffusion-SVC"

} as const
export type VoiceChangerType = typeof VoiceChangerType[keyof typeof VoiceChangerType]
@ -27,6 +27,8 @@ export const ModelFileKind = {
    "ddspSvcDiffusion": "ddspSvcDiffusion",
    "ddspSvcDiffusionConfig": "ddspSvcDiffusionConfig",

    "diffusionSVCModel": "diffusionSVCModel",

} as const
export type ModelFileKind = typeof ModelFileKind[keyof typeof ModelFileKind]
@ -11,14 +11,7 @@ VoiceChangerType: TypeAlias = Literal[
    "so-vits-svc-40",
    "DDSP-SVC",
    "RVC",
]

ModelType: TypeAlias = Literal[
    "MMVCv15",
    "MMVCv13",
    "so-vits-svc-40",
    "DDSP-SVC",
    "RVC",
    "Diffusion-SVC"
]

STORED_SETTING_FILE = "stored_setting.json"
@ -102,7 +102,22 @@ class DDSPSVCModelSlot(ModelSlot):
    speakers: dict = field(default_factory=lambda: {1: "user"})


ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot]
@dataclass
class DiffusionSVCModelSlot(ModelSlot):
    voiceChangerType: VoiceChangerType = "Diffusion-SVC"
    modelFile: str = ""
    isONNX: bool = False
    modelType: str = "combo"
    dstId: int = 1

    sampleId: str = ""
    defaultTune: int = 0
    kstep: int = 100
    speakers: dict = field(default_factory=lambda: {1: "user"})
    embedder: EmbedderType = "hubert_base"


ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot]


def loadSlotInfo(model_dir: str, slotIndex: int) -> ModelSlots:
@ -122,6 +137,8 @@ def loadSlotInfo(model_dir: str, slotIndex: int) -> ModelSlots:
        return SoVitsSvc40ModelSlot(**jsonDict)
    elif slotInfo.voiceChangerType == "DDSP-SVC":
        return DDSPSVCModelSlot(**jsonDict)
    elif slotInfo.voiceChangerType == "Diffusion-SVC":
        return DiffusionSVCModelSlot(**jsonDict)
    else:
        return ModelSlot()
@ -9,7 +9,7 @@ from fastapi import UploadFile, File, Form
from restapi.mods.FileUploader import upload_file, concat_file_chunks
from voice_changer.VoiceChangerManager import VoiceChangerManager

from const import MODEL_DIR, UPLOAD_DIR, ModelType
from const import MODEL_DIR, UPLOAD_DIR
from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams
@ -27,8 +27,6 @@ class MMVC_Rest_Fileuploader:
        self.router.add_api_route("/concat_uploaded_file", self.post_concat_uploaded_file, methods=["POST"])
        self.router.add_api_route("/update_settings", self.post_update_settings, methods=["POST"])
        self.router.add_api_route("/load_model", self.post_load_model, methods=["POST"])
        self.router.add_api_route("/model_type", self.post_model_type, methods=["POST"])
        self.router.add_api_route("/model_type", self.get_model_type, methods=["GET"])
        self.router.add_api_route("/onnx", self.get_onnx, methods=["GET"])
        self.router.add_api_route("/merge_model", self.post_merge_models, methods=["POST"])
        self.router.add_api_route("/update_model_default", self.post_update_model_default, methods=["POST"])
@ -97,22 +95,6 @@ class MMVC_Rest_Fileuploader:
        except Exception as e:
            print("[Voice Changer] post_load_model ex:", e)

    def post_model_type(self, modelType: ModelType = Form(...)):
        try:
            info = self.voiceChangerManager.switchModelType(modelType)
            json_compatible_item_data = jsonable_encoder(info)
            return JSONResponse(content=json_compatible_item_data)
        except Exception as e:
            print("[Voice Changer] post_model_type ex:", e)

    def get_model_type(self):
        try:
            info = self.voiceChangerManager.getModelType()
            json_compatible_item_data = jsonable_encoder(info)
            return JSONResponse(content=json_compatible_item_data)
        except Exception as e:
            print("[Voice Changer] get_model_type ex:", e)

    def get_onnx(self):
        try:
            info = self.voiceChangerManager.export2onnx()
219  server/voice_changer/DiffusionSVC/DiffusionSVC.py (Normal file)
@ -0,0 +1,219 @@
|
||||
# import sys
|
||||
# import os
|
||||
from dataclasses import asdict
|
||||
import numpy as np
|
||||
import torch
|
||||
import torchaudio
|
||||
from data.ModelSlot import DiffusionSVCModelSlot
|
||||
from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
|
||||
from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
|
||||
|
||||
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
|
||||
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||
from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
|
||||
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
|
||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||
from voice_changer.RVC.pipeline.Pipeline import Pipeline
|
||||
|
||||
from Exceptions import DeviceCannotSupportHalfPrecisionException
|
||||
|
||||
|
||||
class DiffusionSVC(VoiceChangerModel):
|
||||
def __init__(self, params: VoiceChangerParams, slotInfo: DiffusionSVCModelSlot):
|
||||
print("[Voice Changer] [DiffusionSVC] Creating instance ")
|
||||
self.deviceManager = DeviceManager.get_instance()
|
||||
EmbedderManager.initialize(params)
|
||||
PitchExtractorManager.initialize(params)
|
||||
self.settings = DiffusionSVCSettings()
|
||||
self.params = params
|
||||
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
|
||||
|
||||
self.pipeline: Pipeline | None = None
|
||||
|
||||
self.audio_buffer: AudioInOut | None = None
|
||||
self.pitchf_buffer: PitchfInOut | None = None
|
||||
self.feature_buffer: FeatureInOut | None = None
|
||||
self.prevVol = 0.0
|
||||
self.slotInfo = slotInfo
|
||||
self.initialize()
|
||||
|
||||
def initialize(self):
|
||||
print("[Voice Changer] [DiffusionSVC] Initializing... ")
|
||||
|
||||
# create the pipeline
|
||||
self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector)
|
||||
|
||||
# other settings
|
||||
self.settings.tran = self.slotInfo.defaultTune
|
||||
self.settings.dstId = self.slotInfo.dstId
|
||||
self.settings.kstep = self.slotInfo.kstep
|
||||
|
||||
print("[Voice Changer] [DiffusionSVC] Initializing... done")
|
||||
|
||||
def update_settings(self, key: str, val: int | float | str):
|
||||
print("[Voice Changer][DiffusionSVC]: update_settings", key, val)
|
||||
if key in self.settings.intData:
|
||||
setattr(self.settings, key, int(val))
|
||||
if key == "gpu":
|
||||
self.deviceManager.setForceTensor(False)
|
||||
self.initialize()
|
||||
elif key in self.settings.floatData:
|
||||
setattr(self.settings, key, float(val))
|
||||
elif key in self.settings.strData:
|
||||
setattr(self.settings, key, str(val))
|
||||
if key == "f0Detector" and self.pipeline is not None:
|
||||
pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
|
||||
self.pipeline.setPitchExtractor(pitchExtractor)
|
||||
else:
|
||||
return False
|
||||
return True
|
||||
|
||||
def get_info(self):
|
||||
data = asdict(self.settings)
|
||||
if self.pipeline is not None:
|
||||
pipelineInfo = self.pipeline.getPipelineInfo()
|
||||
data["pipelineInfo"] = pipelineInfo
|
||||
return data
|
||||
|
||||
def get_processing_sampling_rate(self):
|
||||
return self.slotInfo.samplingRate
|
||||
|
||||
def generate_input(
|
||||
self,
|
||||
newData: AudioInOut,
|
||||
inputSize: int,
|
||||
crossfadeSize: int,
|
||||
solaSearchFrame: int = 0,
|
||||
):
|
||||
newData = newData.astype(np.float32) / 32768.0  # input arrives at the RVC model's sampling rate (extraDataLength, crossfade, etc. are handled at the same SR) (★1)
|
||||
|
||||
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
|
||||
if self.audio_buffer is not None:
|
||||
# concatenate with the past data
|
||||
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
|
||||
if self.slotInfo.f0:
|
||||
self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
|
||||
self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
|
||||
else:
|
||||
self.audio_buffer = newData
|
||||
if self.slotInfo.f0:
|
||||
self.pitchf_buffer = np.zeros(new_feature_length)
|
||||
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
|
||||
|
||||
convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
|
||||
|
||||
if convertSize % 128 != 0:  # pad so the model's output hop size does not truncate the tail
|
||||
convertSize = convertSize + (128 - (convertSize % 128))
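# e.g. convertSize = 6000 -> 6016, the next multiple of the 128-sample hop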
|
||||
outSize = convertSize - self.settings.extraConvertSize
|
||||
|
||||
# if the buffer has not filled up yet, pad it with zeros
|
||||
if self.audio_buffer.shape[0] < convertSize:
|
||||
self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
|
||||
if self.slotInfo.f0:
|
||||
self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
|
||||
self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])
|
||||
|
||||
convertOffset = -1 * convertSize
|
||||
featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
|
||||
self.audio_buffer = self.audio_buffer[convertOffset:]  # keep only the portion to be converted
|
||||
if self.slotInfo.f0:
|
||||
self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
|
||||
self.feature_buffer = self.feature_buffer[featureOffset:]
|
||||
|
||||
# crop just the output portion and check its volume (TODO: make the muting gradual)
|
||||
cropOffset = -1 * (inputSize + crossfadeSize)
|
||||
cropEnd = -1 * (crossfadeSize)
|
||||
crop = self.audio_buffer[cropOffset:cropEnd]
|
||||
vol = np.sqrt(np.square(crop).mean())
|
||||
vol = max(vol, self.prevVol * 0.0)
|
||||
self.prevVol = vol
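# return everything inference() needs: (audio, pitchf, feature, convertSize, vol, outSize)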
|
||||
|
||||
return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize)
|
||||
|
||||
def inference(self, data):
|
||||
audio = data[0]
|
||||
pitchf = data[1]
|
||||
feature = data[2]
|
||||
convertSize = data[3]
|
||||
vol = data[4]
|
||||
outSize = data[5]
|
||||
|
||||
if vol < self.settings.silentThreshold:
|
||||
return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)
|
||||
|
||||
if self.pipeline is not None:
|
||||
device = self.pipeline.device
|
||||
else:
|
||||
device = torch.device("cpu")
|
||||
audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
|
||||
audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
|
||||
repeat = 1 if self.settings.rvcQuality else 0
|
||||
sid = self.settings.dstId
|
||||
f0_up_key = self.settings.tran
|
||||
index_rate = self.settings.indexRatio
|
||||
protect = self.settings.protect
|
||||
|
||||
if_f0 = 1 if self.slotInfo.f0 else 0
|
||||
embOutputLayer = self.slotInfo.embOutputLayer
|
||||
useFinalProj = self.slotInfo.useFinalProj
|
||||
|
||||
try:
|
||||
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
|
||||
sid,
|
||||
audio,
|
||||
pitchf,
|
||||
feature,
|
||||
f0_up_key,
|
||||
index_rate,
|
||||
if_f0,
|
||||
self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # extraDataSize in seconds, processed at the RVC model's sampling rate (★1)
|
||||
embOutputLayer,
|
||||
useFinalProj,
|
||||
repeat,
|
||||
protect,
|
||||
outSize
|
||||
)
|
||||
result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
|
||||
|
||||
return result
|
||||
except DeviceCannotSupportHalfPrecisionException as e: # NOQA
|
||||
print("[Device Manager] Device cannot support half precision. Fallback to float....")
|
||||
self.deviceManager.setForceTensor(True)
|
||||
self.initialize()
|
||||
# raise e
|
||||
|
||||
return
|
||||
|
||||
def __del__(self):
|
||||
del self.pipeline
|
||||
|
||||
def export2onnx(self):
|
||||
modelSlot = self.slotInfo
|
||||
|
||||
if modelSlot.isONNX:
|
||||
print("[Voice Changer] export2onnx, No pyTorch filepath.")
|
||||
return {"status": "ng", "path": ""}
|
||||
|
||||
output_file_simple = export2onnx(self.settings.gpu, modelSlot)
|
||||
return {
|
||||
"status": "ok",
|
||||
"path": f"/tmp/{output_file_simple}",
|
||||
"filename": output_file_simple,
|
||||
}
|
||||
|
||||
def get_model_current(self):
|
||||
return [
|
||||
{
|
||||
"key": "defaultTune",
|
||||
"val": self.settings.tran,
|
||||
},
|
||||
{
|
||||
"key": "defaultIndexRatio",
|
||||
"val": self.settings.indexRatio,
|
||||
},
|
||||
{
|
||||
"key": "defaultProtect",
|
||||
"val": self.settings.protect,
|
||||
},
|
||||
]
|
@ -0,0 +1,162 @@
|
||||
import os
|
||||
from const import EnumInferenceTypes
|
||||
from dataclasses import asdict
|
||||
import torch
|
||||
import onnxruntime
|
||||
import json
|
||||
|
||||
from data.ModelSlot import DiffusionSVCModelSlot, ModelSlot, RVCModelSlot
|
||||
from voice_changer.utils.LoadModelParams import LoadModelParams
|
||||
from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator
|
||||
|
||||
|
||||
class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
|
||||
@classmethod
|
||||
def loadModel(cls, props: LoadModelParams):
|
||||
slotInfo: DiffusionSVCModelSlot = DiffusionSVCModelSlot()
|
||||
for file in props.files:
|
||||
if file.kind == "diffusionSVCModel":
|
||||
slotInfo.modelFile = file.name
|
||||
slotInfo.defaultTune = 0
|
||||
slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
|
||||
slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
|
||||
slotInfo.iconFile = "/assets/icons/noimage.png"
|
||||
|
||||
# if slotInfo.isONNX:
|
||||
# slotInfo = cls._setInfoByONNX(slotInfo)
|
||||
# else:
|
||||
# slotInfo = cls._setInfoByPytorch(slotInfo)
|
||||
return slotInfo
|
||||
|
||||
@classmethod
|
||||
def _setInfoByPytorch(cls, slot: ModelSlot):
|
||||
cpt = torch.load(slot.modelFile, map_location="cpu")
|
||||
config_len = len(cpt["config"])
|
||||
version = cpt.get("version", "v1")
|
||||
|
||||
slot = RVCModelSlot(**asdict(slot))
|
||||
|
||||
if version == "voras_beta":
|
||||
slot.f0 = True if cpt["f0"] == 1 else False
|
||||
slot.modelType = EnumInferenceTypes.pyTorchVoRASbeta.value
|
||||
slot.embChannels = 768
|
||||
slot.embOutputLayer = cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
|
||||
slot.useFinalProj = False
|
||||
|
||||
slot.embedder = cpt["embedder_name"]
|
||||
if slot.embedder.endswith("768"):
|
||||
slot.embedder = slot.embedder[:-3]
|
||||
|
||||
# if slot.embedder == "hubert":
|
||||
# slot.embedder = "hubert"
|
||||
# elif slot.embedder == "contentvec":
|
||||
# slot.embedder = "contentvec"
|
||||
# elif slot.embedder == "hubert_jp":
|
||||
# slot.embedder = "hubert_jp"
|
||||
else:
|
||||
raise RuntimeError("[Voice Changer][setInfoByPytorch] unknown embedder")
|
||||
|
||||
elif config_len == 18:
|
||||
# Original RVC
|
||||
slot.f0 = True if cpt["f0"] == 1 else False
|
||||
version = cpt.get("version", "v1")
|
||||
if version is None or version == "v1":
|
||||
slot.modelType = EnumInferenceTypes.pyTorchRVC.value if slot.f0 else EnumInferenceTypes.pyTorchRVCNono.value
|
||||
slot.embChannels = 256
|
||||
slot.embOutputLayer = 9
|
||||
slot.useFinalProj = True
|
||||
slot.embedder = "hubert_base"
|
||||
print("[Voice Changer] Official Model(pyTorch) : v1")
|
||||
else:
|
||||
slot.modelType = EnumInferenceTypes.pyTorchRVCv2.value if slot.f0 else EnumInferenceTypes.pyTorchRVCv2Nono.value
|
||||
slot.embChannels = 768
|
||||
slot.embOutputLayer = 12
|
||||
slot.useFinalProj = False
|
||||
slot.embedder = "hubert_base"
|
||||
print("[Voice Changer] Official Model(pyTorch) : v2")
|
||||
|
||||
else:
|
||||
# DDPN RVC
|
||||
slot.f0 = True if cpt["f0"] == 1 else False
|
||||
slot.modelType = EnumInferenceTypes.pyTorchWebUI.value if slot.f0 else EnumInferenceTypes.pyTorchWebUINono.value
|
||||
slot.embChannels = cpt["config"][17]
|
||||
slot.embOutputLayer = cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
|
||||
if slot.embChannels == 256:
|
||||
slot.useFinalProj = True
|
||||
else:
|
||||
slot.useFinalProj = False
|
||||
|
||||
# print the DDPN model info
|
||||
if slot.embChannels == 256 and slot.embOutputLayer == 9 and slot.useFinalProj is True:
|
||||
print("[Voice Changer] DDPN Model(pyTorch) : Official v1 like")
|
||||
elif slot.embChannels == 768 and slot.embOutputLayer == 12 and slot.useFinalProj is False:
|
||||
print("[Voice Changer] DDPN Model(pyTorch): Official v2 like")
|
||||
else:
|
||||
print(f"[Voice Changer] DDPN Model(pyTorch): ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
|
||||
|
||||
slot.embedder = cpt["embedder_name"]
|
||||
if slot.embedder.endswith("768"):
|
||||
slot.embedder = slot.embedder[:-3]
|
||||
|
||||
if "speaker_info" in cpt.keys():
|
||||
for k, v in cpt["speaker_info"].items():
|
||||
slot.speakers[int(k)] = str(v)
|
||||
|
||||
slot.samplingRate = cpt["config"][-1]
|
||||
|
||||
del cpt
|
||||
|
||||
return slot
|
||||
|
||||
@classmethod
|
||||
def _setInfoByONNX(cls, slot: ModelSlot):
|
||||
tmp_onnx_session = onnxruntime.InferenceSession(slot.modelFile, providers=["CPUExecutionProvider"])
|
||||
modelmeta = tmp_onnx_session.get_modelmeta()
|
||||
try:
|
||||
slot = RVCModelSlot(**asdict(slot))
|
||||
metadata = json.loads(modelmeta.custom_metadata_map["metadata"])
|
||||
|
||||
# slot.modelType = metadata["modelType"]
|
||||
slot.embChannels = metadata["embChannels"]
|
||||
|
||||
slot.embOutputLayer = metadata["embOutputLayer"] if "embOutputLayer" in metadata else 9
|
||||
slot.useFinalProj = metadata["useFinalProj"] if "useFinalProj" in metadata else True if slot.embChannels == 256 else False
|
||||
|
||||
if slot.embChannels == 256:
|
||||
slot.useFinalProj = True
|
||||
else:
|
||||
slot.useFinalProj = False
|
||||
|
||||
# print the ONNX model info
|
||||
if slot.embChannels == 256 and slot.embOutputLayer == 9 and slot.useFinalProj is True:
|
||||
print("[Voice Changer] ONNX Model: Official v1 like")
|
||||
elif slot.embChannels == 768 and slot.embOutputLayer == 12 and slot.useFinalProj is False:
|
||||
print("[Voice Changer] ONNX Model: Official v2 like")
|
||||
else:
|
||||
print(f"[Voice Changer] ONNX Model: ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
|
||||
|
||||
if "embedder" not in metadata:
|
||||
slot.embedder = "hubert_base"
|
||||
else:
|
||||
slot.embedder = metadata["embedder"]
|
||||
|
||||
slot.f0 = metadata["f0"]
|
||||
slot.modelType = EnumInferenceTypes.onnxRVC.value if slot.f0 else EnumInferenceTypes.onnxRVCNono.value
|
||||
slot.samplingRate = metadata["samplingRate"]
|
||||
slot.deprecated = False
|
||||
|
||||
except Exception as e:
|
||||
slot.modelType = EnumInferenceTypes.onnxRVC.value
|
||||
slot.embChannels = 256
|
||||
slot.embedder = "hubert_base"
|
||||
slot.f0 = True
|
||||
slot.samplingRate = 48000
|
||||
slot.deprecated = True
|
||||
|
||||
print("[Voice Changer] setInfoByONNX", e)
|
||||
print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
|
||||
print("[Voice Changer] This onnx file is deprecated. Please regenerate the onnx file.")
|
||||
print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
|
||||
|
||||
del tmp_onnx_session
|
||||
return slot
|
32  server/voice_changer/DiffusionSVC/DiffusionSVCSettings.py (Normal file)
@ -0,0 +1,32 @@
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
|
||||
class DiffusionSVCSettings:
|
||||
gpu: int = 0
|
||||
dstId: int = 0
|
||||
|
||||
f0Detector: str = "harvest" # dio or harvest
|
||||
tran: int = 12
|
||||
silentThreshold: float = 0.00001
|
||||
extraConvertSize: int = 1024 * 4
|
||||
|
||||
kstep: int = 100
|
||||
|
||||
silenceFront: int = 1 # 0:off, 1:on
|
||||
modelSamplingRate: int = 44100
|
||||
|
||||
speakers: dict[str, int] = field(default_factory=lambda: {})
|
||||
# isHalf: int = 1 # 0:off, 1:on
|
||||
# enableDirectML: int = 0 # 0:off, 1:on
|
||||
# only the mutable fields are listed below
|
||||
intData = [
|
||||
"gpu",
|
||||
"dstId",
|
||||
"tran",
|
||||
"extraConvertSize",
|
||||
"kstep",
|
||||
"silenceFront",
|
||||
]
|
||||
floatData = ["silentThreshold"]
|
||||
strData = ["f0Detector"]
|
@ -0,0 +1,35 @@
|
||||
import torch
|
||||
|
||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||
|
||||
|
||||
class RVCInferencer(Inferencer):
|
||||
def loadModel(self, file: str, gpu: int):
|
||||
self.setProps("DiffusionSVCCombo", file, True, gpu)
|
||||
|
||||
dev = DeviceManager.get_instance().getDevice(gpu)
|
||||
isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
|
||||
|
||||
cpt = torch.load(file, map_location="cpu")
|
||||
model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
|
||||
|
||||
model.eval()
|
||||
model.load_state_dict(cpt["weight"], strict=False)
|
||||
|
||||
model = model.to(dev)
|
||||
if isHalf:
|
||||
model = model.half()
|
||||
|
||||
self.model = model
|
||||
return self
|
||||
|
||||
def infer(
|
||||
self,
|
||||
feats: torch.Tensor,
|
||||
pitch_length: torch.Tensor,
|
||||
pitch: torch.Tensor,
|
||||
pitchf: torch.Tensor,
|
||||
sid: torch.Tensor,
|
||||
convert_length: int | None,
|
||||
) -> torch.Tensor:
|
||||
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
|
@ -0,0 +1,480 @@
|
||||
import numpy as np
|
||||
import time
|
||||
import os
|
||||
import torch
|
||||
import torch.nn.functional
|
||||
from torchaudio.transforms import Resample
|
||||
from tqdm import tqdm
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import load_model_vocoder, load_model_vocoder_from_combo
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.tools.slicer import split
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.tools.units_index import UnitsIndexer
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.tools.tools import F0_Extractor, Volume_Extractor, Units_Encoder, SpeakerEncoder, cross_fade
|
||||
|
||||
|
||||
class DiffusionSVC:
|
||||
def __init__(self, device=None):
|
||||
if device is not None:
|
||||
self.device = device
|
||||
else:
|
||||
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
self.model_path = None
|
||||
self.model = None
|
||||
self.vocoder = None
|
||||
self.args = None
|
||||
# feature extractors
|
||||
self.units_encoder = None
|
||||
self.f0_extractor = None
|
||||
self.f0_model = None
|
||||
self.f0_min = None
|
||||
self.f0_max = None
|
||||
self.volume_extractor = None
|
||||
self.speaker_encoder = None
|
||||
self.spk_emb_dict = None
|
||||
self.resample_dict_16000 = {}
|
||||
self.units_indexer = None
|
||||
self.naive_model_path = None
|
||||
self.naive_model = None
|
||||
self.naive_model_args = None
|
||||
self.use_combo_model = False
|
||||
|
||||
def load_model(self, model_path, f0_model=None, f0_min=None, f0_max=None):
|
||||
|
||||
if ('1234' + model_path)[-4:] == '.ptc':  # prepend padding so the suffix check is safe even for very short paths
|
||||
self.use_combo_model = True
|
||||
self.model_path = model_path
|
||||
self.naive_model_path = model_path
|
||||
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(model_path,
|
||||
device=self.device)
|
||||
self.model = diff_model
|
||||
self.args = diff_args
|
||||
self.naive_model = naive_model
|
||||
self.naive_model_args = naive_args
|
||||
self.vocoder = vocoder
|
||||
else:
|
||||
self.model_path = model_path
|
||||
self.model, self.vocoder, self.args = load_model_vocoder(model_path, device=self.device)
|
||||
|
||||
self.units_encoder = Units_Encoder(
|
||||
self.args.data.encoder,
|
||||
self.args.data.encoder_ckpt,
|
||||
self.args.data.encoder_sample_rate,
|
||||
self.args.data.encoder_hop_size,
|
||||
cnhubertsoft_gate=self.args.data.cnhubertsoft_gate,
|
||||
device=self.device,
|
||||
units_forced_mode=self.args.data.units_forced_mode
|
||||
)
|
||||
|
||||
self.volume_extractor = Volume_Extractor(
|
||||
hop_size=512,
|
||||
block_size=self.args.data.block_size,
|
||||
model_sampling_rate=self.args.data.sampling_rate
|
||||
)
|
||||
|
||||
self.load_f0_extractor(f0_model=f0_model, f0_min=f0_min, f0_max=f0_max)
|
||||
|
||||
if self.args.model.use_speaker_encoder:
|
||||
self.speaker_encoder = SpeakerEncoder(
|
||||
self.args.data.speaker_encoder,
|
||||
self.args.data.speaker_encoder_config,
|
||||
self.args.data.speaker_encoder_ckpt,
|
||||
self.args.data.speaker_encoder_sample_rate,
|
||||
device=self.device
|
||||
)
|
||||
path_spk_emb_dict = os.path.join(os.path.split(model_path)[0], 'spk_emb_dict.npy')
|
||||
self.set_spk_emb_dict(path_spk_emb_dict)
|
||||
|
||||
self.units_indexer = UnitsIndexer(os.path.split(model_path)[0])
|
||||
|
||||
def flush(self, model_path=None, f0_model=None, f0_min=None, f0_max=None, naive_model_path=None):
|
||||
assert (model_path is not None) or (naive_model_path is not None)
|
||||
# flush model if changed
|
||||
if ((self.model_path != model_path) or (self.f0_model != f0_model)
|
||||
or (self.f0_min != f0_min) or (self.f0_max != f0_max)):
|
||||
self.load_model(model_path, f0_model=f0_model, f0_min=f0_min, f0_max=f0_max)
|
||||
if (self.naive_model_path != naive_model_path) and (naive_model_path is not None):
|
||||
self.load_naive_model(naive_model_path)
|
||||
# check args if use naive
|
||||
if self.naive_model is not None:
|
||||
if self.naive_model_args.data.encoder != self.args.data.encoder:
|
||||
raise ValueError("encoder of Naive Model and Diffusion Model are different")
|
||||
if self.naive_model_args.model.n_spk != self.args.model.n_spk:
|
||||
raise ValueError("n_spk of Naive Model and Diffusion Model are different")
|
||||
if bool(self.naive_model_args.model.use_speaker_encoder) != bool(self.args.model.use_speaker_encoder):
|
||||
raise ValueError("use_speaker_encoder of Naive Model and Diffusion Model are different")
|
||||
if self.naive_model_args.vocoder.type != self.args.vocoder.type:
|
||||
raise ValueError("vocoder of Naive Model and Diffusion Model are different")
|
||||
if self.naive_model_args.data.block_size != self.args.data.block_size:
|
||||
raise ValueError("block_size of Naive Model and Diffusion Model are different")
|
||||
if self.naive_model_args.data.sampling_rate != self.args.data.sampling_rate:
|
||||
raise ValueError("sampling_rate of Naive Model and Diffusion Model are different")
|
||||
|
||||
def flush_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
|
||||
if (f0_model != self.f0_model) and (f0_model is not None):
|
||||
self.load_f0_extractor(f0_model)
|
||||
|
||||
def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
|
||||
self.f0_model = f0_model if (f0_model is not None) else self.args.data.f0_extractor
|
||||
self.f0_min = f0_min if (f0_min is not None) else self.args.data.f0_min
|
||||
self.f0_max = f0_max if (f0_max is not None) else self.args.data.f0_max
|
||||
self.f0_model = f0_model
|
||||
self.f0_extractor = F0_Extractor(
|
||||
f0_extractor=self.f0_model,
|
||||
sample_rate=44100,
|
||||
hop_size=512,
|
||||
f0_min=self.f0_min,
|
||||
f0_max=self.f0_max,
|
||||
block_size=self.args.data.block_size,
|
||||
model_sampling_rate=self.args.data.sampling_rate
|
||||
)
|
||||
|
||||
def load_naive_model(self, naive_model_path):
|
||||
self.naive_model_path = naive_model_path
|
||||
model, _, args = load_model_vocoder(naive_model_path, device=self.device, loaded_vocoder=self.vocoder)
|
||||
self.naive_model = model
|
||||
self.naive_model_args = args
|
||||
print(f" [INFO] Load naive model from {naive_model_path}")
|
||||
|
||||
@torch.no_grad()
|
||||
def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
|
||||
aug_shift=0, spk_emb=None):
|
||||
# spk_id
|
||||
spk_emb_dict = None
|
||||
if self.args.model.use_speaker_encoder: # with speaker encoder
|
||||
spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
|
||||
# without speaker encoder
|
||||
else:
|
||||
spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.device)
|
||||
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.device)
|
||||
out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
|
||||
aug_shift=aug_shift, infer=True,
|
||||
spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
|
||||
return out_spec
|
||||
|
||||
def set_spk_emb_dict(self, spk_emb_dict_or_path):  # load from a path, or set the dict directly
|
||||
if spk_emb_dict_or_path is None:
|
||||
return None
|
||||
if isinstance(spk_emb_dict_or_path, dict):
|
||||
self.spk_emb_dict = spk_emb_dict_or_path
|
||||
print(f" [INFO] Load spk_emb_dict from {spk_emb_dict_or_path}")
|
||||
else:
|
||||
self.spk_emb_dict = np.load(spk_emb_dict_or_path, allow_pickle=True).item()
|
||||
print(f" [INFO] Load spk_emb_dict from {spk_emb_dict_or_path}")
|
||||
|
||||
@torch.no_grad()
|
||||
def encode_units(self, audio, sr=44100, padding_mask=None):
|
||||
assert self.units_encoder is not None
|
||||
hop_size = self.args.data.block_size * sr / self.args.data.sampling_rate
|
||||
return self.units_encoder.encode(audio, sr, hop_size, padding_mask=padding_mask)
|
||||
|
||||
@torch.no_grad()
|
||||
def extract_f0(self, audio, key=0, sr=44100, silence_front=0):
|
||||
assert self.f0_extractor is not None
|
||||
f0 = self.f0_extractor.extract(audio, uv_interp=True, device=self.device, silence_front=silence_front, sr=sr)
|
||||
f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
|
||||
f0 = f0 * 2 ** (float(key) / 12)  # shift the pitch by `key` semitones
|
||||
return f0
|
||||
|
||||
@torch.no_grad()
|
||||
def extract_volume_and_mask(self, audio, sr=44100, threhold=-60.0):
|
||||
assert self.volume_extractor is not None
|
||||
volume = self.volume_extractor.extract(audio, sr)
|
||||
mask = self.volume_extractor.get_mask_from_volume(volume, threhold=threhold, device=self.device)
|
||||
volume = torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
|
||||
return volume, mask
|
||||
|
||||
@torch.no_grad()
|
||||
def extract_mel(self, audio, sr=44100):
|
||||
assert sr == 44100
|
||||
mel = self.vocoder.extract(audio, self.args.data.sampling_rate)
|
||||
return mel
|
||||
|
||||
@torch.no_grad()
|
||||
def encode_spk(self, audio, sr=44100):
|
||||
assert self.speaker_encoder is not None
|
||||
return self.speaker_encoder(audio=audio, sample_rate=sr)
|
||||
|
||||
@torch.no_grad()
|
||||
def encode_spk_from_path(self, path):  # load a pre-extracted speaker embedding from path (must be a .npy file), or extract one from audio (path may be a file or a directory)
|
||||
if path is None:
|
||||
return None
|
||||
assert self.speaker_encoder is not None
|
||||
if (('122333444455555' + path)[-4:] == '.npy') and os.path.isfile(path):
|
||||
spk_emb = np.load(path)
|
||||
else:
|
||||
if os.path.isfile(path):
|
||||
path_list = [path]
|
||||
else:
|
||||
path_list = os.listdir(path)
|
||||
for _index in range(len(path_list)):
|
||||
path_list[_index] = os.path.join(path, path_list[_index])
|
||||
spk_emb = self.speaker_encoder.mean_spk_emb_from_path_list(path_list)
|
||||
return spk_emb
|
||||
|
||||
def pre_spk_emb(self, spk_id, spk_mix_dict, units_len, spk_emb):
|
||||
spk_emb_dict = self.spk_emb_dict
|
||||
if (spk_mix_dict is not None) or (spk_emb is None):
|
||||
assert spk_emb_dict is not None
|
||||
if spk_emb is None:
|
||||
spk_emb = spk_emb_dict[str(spk_id)]
|
||||
# pad and to device
|
||||
spk_emb = np.tile(spk_emb, (units_len, 1))
|
||||
spk_emb = torch.from_numpy(spk_emb).float().to(self.device)
|
||||
return spk_mix_dict, spk_emb
|
||||
|
||||
@torch.no_grad()
|
||||
def mel2wav(self, mel, f0, start_frame=0):
|
||||
if start_frame == 0:
|
||||
return self.vocoder.infer(mel, f0)
|
||||
else: # for realtime speedup
|
||||
mel = mel[:, start_frame:, :]
|
||||
f0 = f0[:, start_frame:, :]
|
||||
out_wav = self.vocoder.infer(mel, f0)
|
||||
return torch.nn.functional.pad(out_wav, (start_frame * self.vocoder.vocoder_hop_size, 0))
|
||||
|
||||
@torch.no_grad()  # core inference code: inputs are standardized tensors and only mel spectrograms are handled here
|
||||
def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None):
|
||||
|
||||
if self.args.model.k_step_max is not None:
|
||||
if k_step is None:
|
||||
raise ValueError("k_step must not be None when running shallow diffusion inference")
|
||||
if k_step > int(self.args.model.k_step_max):
|
||||
raise ValueError("k_step must be <= the k_step_max of the shallow diffusion model")
|
||||
if gt_spec is None:
|
||||
raise ValueError("gt_spec must not be None when running shallow diffusion inference; "
"gt_spec can come from the input mel or from the naive model output")
|
||||
print(f' [INFO] k_step_max is {self.args.model.k_step_max}.')
|
||||
|
||||
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.device)
|
||||
|
||||
# spk_id
|
||||
spk_emb_dict = None
|
||||
if self.args.model.use_speaker_encoder: # with speaker encoder
|
||||
spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
|
||||
# without speaker encoder
|
||||
else:
|
||||
spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.device)
|
||||
|
||||
if k_step is not None:
|
||||
print(f' [INFO] get k_step, do shallow diffusion {k_step} steps')
|
||||
else:
|
||||
print(' [INFO] No k_step given, running the full 1000-step diffusion')
|
||||
print(f" [INFO] method:{method}; infer_speedup:{infer_speedup}")
|
||||
return self.model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
|
||||
gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
|
||||
|
||||
@torch.no_grad()  # like __call__ but also runs the vocoder and returns a waveform
|
||||
def infer(self, units, f0, volume, gt_spec=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None):
|
||||
if k_step is not None:
|
||||
if self.naive_model is not None:
|
||||
gt_spec = self.naive_model_call(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
|
||||
aug_shift=aug_shift, spk_emb=spk_emb)
|
||||
print(f" [INFO] get mel from naive model out.")
|
||||
assert gt_spec is not None
|
||||
if self.naive_model is None:
|
||||
print(f" [INFO] get mel from input wav.")
|
||||
if input(" [WARN] You are attempting shallow diffusion "
|
||||
"on the mel of the input source,"
|
||||
" Please enter 'gt_mel' to continue") != 'gt_mel':
|
||||
raise ValueError("Please understand what you're doing")
|
||||
k_step = int(k_step)
|
||||
gt_spec = gt_spec
|
||||
else:
|
||||
gt_spec = None
|
||||
|
||||
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
|
||||
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
return self.mel2wav(out_mel, f0)
|
||||
|
||||
@torch.no_grad()  # inference path optimized for real-time shallow diffusion; the padded region can be cut off to save compute
|
||||
def infer_for_realtime(self, units, f0, volume, audio_t=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None, silence_front=0, diff_jump_silence_front=False):
|
||||
|
||||
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
|
||||
|
||||
if diff_jump_silence_front:
|
||||
if audio_t is not None:
|
||||
audio_t = audio_t[:, start_frame * self.vocoder.vocoder_hop_size:]
|
||||
f0 = f0[:, start_frame:, :]
|
||||
units = units[:, start_frame:, :]
|
||||
volume = volume[:, start_frame:, :]
|
||||
|
||||
if k_step is not None:
|
||||
assert audio_t is not None
|
||||
k_step = int(k_step)
|
||||
gt_spec = self.vocoder.extract(audio_t, self.args.data.sampling_rate)
|
||||
# if frames come up short, re-enable this line: gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
|
||||
else:
|
||||
gt_spec = None
|
||||
|
||||
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
|
||||
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
|
||||
if diff_jump_silence_front:
|
||||
out_wav = self.mel2wav(out_mel, f0)
|
||||
else:
|
||||
out_wav = self.mel2wav(out_mel, f0, start_frame=start_frame)
|
||||
return out_wav
|
||||
|
||||
@torch.no_grad()  # inference from audio without slicing
|
||||
def infer_from_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None, threhold=-60, index_ratio=0):
|
||||
units = self.encode_units(audio, sr)
|
||||
if index_ratio > 0:
|
||||
units = self.units_indexer(units_t=units, spk_id=spk_id, ratio=index_ratio)
|
||||
f0 = self.extract_f0(audio, key=key, sr=sr)
|
||||
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
|
||||
if k_step is not None:
|
||||
assert 0 < int(k_step) <= 1000
|
||||
k_step = int(k_step)
|
||||
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
|
||||
gt_spec = self.vocoder.extract(audio_t, sr)
|
||||
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
|
||||
else:
|
||||
gt_spec = None
|
||||
output = self.infer(units, f0, volume, gt_spec=gt_spec, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
|
||||
aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
output *= mask
|
||||
return output.squeeze().cpu().numpy(), self.args.data.sampling_rate
|
||||
|
||||
@torch.no_grad()  # inference from audio with slicing
|
||||
def infer_from_long_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None,
|
||||
threhold=-60, threhold_for_split=-40, min_len=5000, index_ratio=0):
|
||||
|
||||
hop_size = self.args.data.block_size * sr / self.args.data.sampling_rate
|
||||
segments = split(audio, sr, hop_size, db_thresh=threhold_for_split, min_len=min_len)
|
||||
|
||||
print(f' [INFO] Extract f0 volume and mask: Use {self.f0_model}, start...')
|
||||
_f0_start_time = time.time()
|
||||
f0 = self.extract_f0(audio, key=key, sr=sr)
|
||||
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
|
||||
_f0_end_time = time.time()
|
||||
_f0_used_time = _f0_end_time - _f0_start_time
|
||||
print(f' [INFO] Extract f0 volume and mask: Done. Use time:{_f0_used_time}')
|
||||
|
||||
if k_step is not None:
|
||||
assert 0 < int(k_step) <= 1000
|
||||
k_step = int(k_step)
|
||||
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
|
||||
gt_spec = self.vocoder.extract(audio_t, sr)
|
||||
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
|
||||
else:
|
||||
gt_spec = None
|
||||
|
||||
result = np.zeros(0)
|
||||
current_length = 0
|
||||
for segment in tqdm(segments):
|
||||
start_frame = segment[0]
|
||||
seg_input = torch.from_numpy(segment[1]).float().unsqueeze(0).to(self.device)
|
||||
seg_units = self.units_encoder.encode(seg_input, sr, hop_size)
|
||||
if index_ratio > 0:
|
||||
seg_units = self.units_indexer(units_t=seg_units, spk_id=spk_id, ratio=index_ratio)
|
||||
seg_f0 = f0[:, start_frame: start_frame + seg_units.size(1), :]
|
||||
seg_volume = volume[:, start_frame: start_frame + seg_units.size(1), :]
|
||||
if gt_spec is not None:
|
||||
seg_gt_spec = gt_spec[:, start_frame: start_frame + seg_units.size(1), :]
|
||||
else:
|
||||
seg_gt_spec = None
|
||||
seg_output = self.infer(seg_units, seg_f0, seg_volume, gt_spec=seg_gt_spec, spk_id=spk_id,
|
||||
spk_mix_dict=spk_mix_dict,
|
||||
aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
_left = start_frame * self.args.data.block_size
|
||||
_right = (start_frame + seg_units.size(1)) * self.args.data.block_size
|
||||
seg_output *= mask[:, _left:_right]
|
||||
seg_output = seg_output.squeeze().cpu().numpy()
|
||||
silent_length = round(start_frame * self.args.data.block_size) - current_length
|
||||
if silent_length >= 0:
|
||||
result = np.append(result, np.zeros(silent_length))
|
||||
result = np.append(result, seg_output)
|
||||
else:
|
||||
result = cross_fade(result, seg_output, current_length + silent_length)
|
||||
current_length = current_length + silent_length + len(seg_output)
|
||||
|
||||
return result, self.args.data.sampling_rate
|
||||
|
||||
@torch.no_grad()  # inference path optimized for real-time use; the padded region can be cut off to save compute
|
||||
def infer_from_audio_for_realtime(self, audio, sr, key, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None, silence_front=0, diff_jump_silence_front=False, threhold=-60,
|
||||
index_ratio=0, use_hubert_mask=False):
|
||||
|
||||
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
|
||||
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
|
||||
|
||||
if self.naive_model is None:
|
||||
print(" [INFO] No combo model or naive model; running diffusion without a shallow model.")
|
||||
else:
|
||||
assert k_step is not None
|
||||
print(" [INFO] Shallow Diffusion mode!")
|
||||
|
||||
key_str = str(sr)
|
||||
if key_str not in self.resample_dict_16000:
|
||||
self.resample_dict_16000[key_str] = Resample(sr, 16000, lowpass_filter_width=128).to(self.device)
|
||||
if int(sr) != 16000:
|
||||
audio_t_16k = self.resample_dict_16000[key_str](audio_t)
|
||||
else:
|
||||
audio_t_16k = audio_t
|
||||
|
||||
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
|
||||
if use_hubert_mask:
|
||||
mask16k = mask.clone().unsqueeze(0).unsqueeze(0)
|
||||
mask16k = torch.nn.functional.interpolate(mask16k, size=tuple(audio_t_16k.shape), mode='nearest')
|
||||
mask16k = ~(mask16k.squeeze(0).squeeze(0).bool())
|
||||
else:
|
||||
mask16k = None
|
||||
units = self.encode_units(audio_t_16k, sr=16000, padding_mask=mask16k)
|
||||
if index_ratio > 0:
|
||||
units = self.units_indexer(units_t=units, spk_id=spk_id, ratio=index_ratio)
|
||||
f0 = self.extract_f0(audio, key=key, sr=sr, silence_front=silence_front)
|
||||
|
||||
if diff_jump_silence_front:
|
||||
audio_t = audio_t[:, start_frame * self.vocoder.vocoder_hop_size:]
|
||||
f0 = f0[:, start_frame:, :]
|
||||
units = units[:, start_frame:, :]
|
||||
volume = volume[:, start_frame:, :]
|
||||
|
||||
if k_step is not None:
|
||||
k_step = int(k_step)
|
||||
if (k_step >= 1000) or (k_step <= 0):
|
||||
k_step = 300
|
||||
print(f" [WARN] k_step must be > 0 and < 1000; now set to {k_step}")
|
||||
if self.args.model.k_step_max is not None:
|
||||
k_step_max = int(self.args.model.k_step_max)
|
||||
if k_step > k_step_max:
|
||||
print(f" [WARN] k_step must be <= k_step_max={k_step_max}; k_step is now set to {k_step_max}.")
|
||||
k_step = k_step_max
|
||||
if int(k_step/infer_speedup) < 3:
|
||||
infer_speedup = int(k_step/4)
|
||||
print(f" [WARN] diffusion steps must be > 4 (3 when using qndm); infer_speedup is now set to {infer_speedup}")
|
||||
if self.naive_model is not None:
|
||||
gt_spec = self.naive_model_call(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
|
||||
aug_shift=aug_shift, spk_emb=spk_emb)
|
||||
else:
|
||||
gt_spec = self.vocoder.extract(audio_t, self.args.data.sampling_rate)
|
||||
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
|
||||
|
||||
else:
|
||||
gt_spec = None
|
||||
|
||||
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
|
||||
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
|
||||
if diff_jump_silence_front:
|
||||
out_wav = self.mel2wav(out_mel, f0)
|
||||
else:
|
||||
out_wav = self.mel2wav(out_mel, f0, start_frame=start_frame)
|
||||
out_wav *= mask
|
||||
return out_wav.squeeze(), self.args.data.sampling_rate
|
@ -0,0 +1,386 @@
|
||||
from collections import deque
|
||||
from functools import partial
|
||||
from inspect import isfunction
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def exists(x):
|
||||
return x is not None
|
||||
|
||||
|
||||
def default(val, d):
|
||||
if exists(val):
|
||||
return val
|
||||
return d() if isfunction(d) else d
|
||||
|
||||
|
||||
def extract(a, t, x_shape):
|
||||
b, *_ = t.shape
|
||||
out = a.gather(-1, t)
|
||||
return out.reshape(b, *((1,) * (len(x_shape) - 1)))
|
||||
|
||||
|
||||
def noise_like(shape, device, repeat=False):
|
||||
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
|
||||
noise = lambda: torch.randn(shape, device=device)
|
||||
return repeat_noise() if repeat else noise()
|
||||
|
||||
|
||||
def linear_beta_schedule(timesteps, max_beta=0.02):
|
||||
"""
|
||||
linear schedule
|
||||
"""
|
||||
betas = np.linspace(1e-4, max_beta, timesteps)
|
||||
return betas
|
||||
|
||||
|
||||
def cosine_beta_schedule(timesteps, s=0.008):
|
||||
"""
|
||||
cosine schedule
|
||||
as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
|
||||
"""
|
||||
steps = timesteps + 1
|
||||
x = np.linspace(0, steps, steps)
|
||||
alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
|
||||
alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
|
||||
betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
|
||||
return np.clip(betas, a_min=0, a_max=0.999)
|
||||
|
||||
|
||||
beta_schedule = {
|
||||
"cosine": cosine_beta_schedule,
|
||||
"linear": linear_beta_schedule,
|
||||
}
|
||||
|
||||
|
||||
class GaussianDiffusion(nn.Module):
|
||||
def __init__(self,
|
||||
denoise_fn,
|
||||
out_dims=128,
|
||||
timesteps=1000,
|
||||
k_step=1000,
|
||||
max_beta=0.02,
|
||||
spec_min=-12,
|
||||
spec_max=2):
|
||||
super().__init__()
|
||||
self.denoise_fn = denoise_fn
|
||||
self.out_dims = out_dims
|
||||
betas = beta_schedule['linear'](timesteps, max_beta=max_beta)
|
||||
|
||||
alphas = 1. - betas
|
||||
alphas_cumprod = np.cumprod(alphas, axis=0)
|
||||
alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
|
||||
|
||||
timesteps, = betas.shape
|
||||
self.num_timesteps = int(timesteps)
|
||||
self.k_step = k_step
|
||||
|
||||
self.noise_list = deque(maxlen=4)
|
||||
|
||||
to_torch = partial(torch.tensor, dtype=torch.float32)
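# precompute the diffusion schedule constants and register them as non-trainable buffers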
|
||||
|
||||
self.register_buffer('betas', to_torch(betas))
|
||||
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
||||
self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
|
||||
|
||||
# calculations for diffusion q(x_t | x_{t-1}) and others
|
||||
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
|
||||
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
|
||||
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
|
||||
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
|
||||
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
|
||||
|
||||
# calculations for posterior q(x_{t-1} | x_t, x_0)
|
||||
posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
|
||||
# above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
|
||||
self.register_buffer('posterior_variance', to_torch(posterior_variance))
|
||||
# below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
|
||||
self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
|
||||
self.register_buffer('posterior_mean_coef1', to_torch(
|
||||
betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
|
||||
self.register_buffer('posterior_mean_coef2', to_torch(
|
||||
(1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
|
||||
|
||||
self.register_buffer('spec_min', torch.FloatTensor([spec_min])[None, None, :out_dims])
|
||||
self.register_buffer('spec_max', torch.FloatTensor([spec_max])[None, None, :out_dims])
|
||||
|
||||
def q_mean_variance(self, x_start, t):
|
||||
mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
|
||||
variance = extract(1. - self.alphas_cumprod, t, x_start.shape)
|
||||
log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape)
|
||||
return mean, variance, log_variance
|
||||
|
||||
def predict_start_from_noise(self, x_t, t, noise):
|
||||
return (
|
||||
extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
|
||||
extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
|
||||
)
|
||||
|
||||
def q_posterior(self, x_start, x_t, t):
|
||||
posterior_mean = (
|
||||
extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
|
||||
extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
|
||||
)
|
||||
posterior_variance = extract(self.posterior_variance, t, x_t.shape)
|
||||
posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
|
||||
return posterior_mean, posterior_variance, posterior_log_variance_clipped
|
||||
|
||||
def p_mean_variance(self, x, t, cond):
|
||||
noise_pred = self.denoise_fn(x, t, cond=cond)
|
||||
x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)
|
||||
|
||||
x_recon.clamp_(-1., 1.)
|
||||
|
||||
model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
|
||||
return model_mean, posterior_variance, posterior_log_variance
|
||||
|
||||
@torch.no_grad()
|
||||
def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
|
||||
b, *_, device = *x.shape, x.device
|
||||
model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond)
|
||||
noise = noise_like(x.shape, device, repeat_noise)
|
||||
# no noise when t == 0
|
||||
nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
|
||||
return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
|
||||
|
||||
@torch.no_grad()
|
||||
def p_sample_ddim(self, x, t, interval, cond):
|
||||
a_t = extract(self.alphas_cumprod, t, x.shape)
|
||||
a_prev = extract(self.alphas_cumprod, torch.max(t - interval, torch.zeros_like(t)), x.shape)
|
||||
|
||||
noise_pred = self.denoise_fn(x, t, cond=cond)
|
||||
x_prev = a_prev.sqrt() * (x / a_t.sqrt() + (((1 - a_prev) / a_prev).sqrt()-((1 - a_t) / a_t).sqrt()) * noise_pred)
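# deterministic DDIM update jumping from step t to max(t - interval, 0)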
|
||||
return x_prev
|
||||
|
||||
@torch.no_grad()
|
||||
def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
|
||||
"""
|
||||
Use the PLMS method from
|
||||
[Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
|
||||
"""
|
||||
|
||||
def get_x_pred(x, noise_t, t):
|
||||
a_t = extract(self.alphas_cumprod, t, x.shape)
|
||||
a_prev = extract(self.alphas_cumprod, torch.max(t - interval, torch.zeros_like(t)), x.shape)
|
||||
a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()
|
||||
|
||||
x_delta = (a_prev - a_t) * ((1 / (a_t_sq * (a_t_sq + a_prev_sq))) * x - 1 / (
|
||||
a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t)
|
||||
x_pred = x + x_delta
|
||||
|
||||
return x_pred
|
||||
|
||||
noise_list = self.noise_list
|
||||
noise_pred = self.denoise_fn(x, t, cond=cond)
|
||||
|
||||
if len(noise_list) == 0:
|
||||
x_pred = get_x_pred(x, noise_pred, t)
|
||||
noise_pred_prev = self.denoise_fn(x_pred, max(t - interval, 0), cond=cond)
|
||||
noise_pred_prime = (noise_pred + noise_pred_prev) / 2
|
||||
elif len(noise_list) == 1:
|
||||
noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
|
||||
elif len(noise_list) == 2:
|
||||
noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
|
||||
else:
|
||||
noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24
|
||||
|
||||
x_prev = get_x_pred(x, noise_pred_prime, t)
|
||||
noise_list.append(noise_pred)
|
||||
|
||||
return x_prev
|
||||
|
||||
def q_sample(self, x_start, t, noise=None):
|
||||
noise = default(noise, lambda: torch.randn_like(x_start))
|
||||
return (
|
||||
extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
|
||||
extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
|
||||
)
|
||||
|
||||
def p_losses(self, x_start, t, cond, noise=None, loss_type='l2'):
|
||||
noise = default(noise, lambda: torch.randn_like(x_start))
|
||||
|
||||
x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
|
||||
x_recon = self.denoise_fn(x_noisy, t, cond)
|
||||
|
||||
if loss_type == 'l1':
|
||||
loss = (noise - x_recon).abs().mean()
|
||||
elif loss_type == 'l2':
|
||||
loss = F.mse_loss(noise, x_recon)
|
||||
else:
|
||||
raise NotImplementedError()
|
||||
|
||||
return loss
|
||||
|
||||
def forward(self,
|
||||
condition,
|
||||
gt_spec=None,
|
||||
infer=True,
|
||||
infer_speedup=10,
|
||||
method='dpm-solver',
|
||||
k_step=None,
|
||||
use_tqdm=True):
|
||||
"""
|
||||
conditioning diffusion, use fastspeech2 encoder output as the condition
|
||||
"""
|
||||
cond = condition.transpose(1, 2)
|
||||
b, device = condition.shape[0], condition.device
|
||||
|
||||
if not infer:
|
||||
spec = self.norm_spec(gt_spec)
|
||||
if k_step is None:
|
||||
t_max = self.k_step
|
||||
else:
|
||||
t_max = k_step
|
||||
t = torch.randint(0, t_max, (b,), device=device).long()
|
||||
norm_spec = spec.transpose(1, 2)[:, None, :, :] # [B, 1, M, T]
|
||||
return self.p_losses(norm_spec, t, cond=cond)
|
||||
else:
|
||||
shape = (cond.shape[0], 1, self.out_dims, cond.shape[2])
|
||||
|
||||
if gt_spec is None or k_step is None:
|
||||
t = self.k_step
|
||||
x = torch.randn(shape, device=device)
|
||||
else:
|
||||
t = k_step
|
||||
norm_spec = self.norm_spec(gt_spec)
|
||||
norm_spec = norm_spec.transpose(1, 2)[:, None, :, :]
|
||||
x = self.q_sample(x_start=norm_spec, t=torch.tensor([t - 1], device=device).long())
|
||||
|
||||
if method is not None and infer_speedup > 1:
|
||||
if method == 'dpm-solver':
|
||||
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
|
||||
# 1. Define the noise schedule.
|
||||
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
|
||||
|
||||
# 2. Convert your discrete-time `model` to the continuous-time
|
||||
# noise prediction model. Here is an example for a diffusion model
|
||||
# `model` with the noise prediction type ("noise") .
|
||||
def my_wrapper(fn):
|
||||
def wrapped(x, t, **kwargs):
|
||||
ret = fn(x, t, **kwargs)
|
||||
if use_tqdm:
|
||||
self.bar.update(1)
|
||||
return ret
|
||||
|
||||
return wrapped
|
||||
|
||||
model_fn = model_wrapper(
|
||||
my_wrapper(self.denoise_fn),
|
||||
noise_schedule,
|
||||
model_type="noise", # or "x_start" or "v" or "score"
|
||||
model_kwargs={"cond": cond}
|
||||
)
|
||||
|
||||
# 3. Define dpm-solver and sample by singlestep DPM-Solver.
|
||||
# (We recommend singlestep DPM-Solver for unconditional sampling)
|
||||
# You can adjust the `steps` to balance the computation
|
||||
# costs and the sample quality.
|
||||
dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
|
||||
|
||||
steps = t // infer_speedup
|
||||
if use_tqdm:
|
||||
self.bar = tqdm(desc="sample time step", total=steps)
|
||||
x = dpm_solver.sample(
|
||||
x,
|
||||
steps=steps,
|
||||
order=2,
|
||||
skip_type="time_uniform",
|
||||
method="multistep",
|
||||
)
|
||||
if use_tqdm:
|
||||
self.bar.close()
|
||||
elif method == 'unipc':
|
||||
from .uni_pc import NoiseScheduleVP, model_wrapper, UniPC
|
||||
# 1. Define the noise schedule.
|
||||
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
|
||||
|
||||
# 2. Convert your discrete-time `model` to the continuous-time
|
||||
# noise prediction model. Here is an example for a diffusion model
|
||||
# `model` with the noise prediction type ("noise") .
|
||||
def my_wrapper(fn):
|
||||
def wrapped(x, t, **kwargs):
|
||||
ret = fn(x, t, **kwargs)
|
||||
if use_tqdm:
|
||||
self.bar.update(1)
|
||||
return ret
|
||||
|
||||
return wrapped
|
||||
|
||||
model_fn = model_wrapper(
|
||||
my_wrapper(self.denoise_fn),
|
||||
noise_schedule,
|
||||
model_type="noise", # or "x_start" or "v" or "score"
|
||||
model_kwargs={"cond": cond}
|
||||
)
|
||||
|
||||
# 3. Define uni_pc and sample by multistep UniPC.
|
||||
# You can adjust the `steps` to balance the computation
|
||||
# costs and the sample quality.
|
||||
uni_pc = UniPC(model_fn, noise_schedule, variant='bh2')
|
||||
|
||||
steps = t // infer_speedup
|
||||
if use_tqdm:
|
||||
self.bar = tqdm(desc="sample time step", total=steps)
|
||||
x = uni_pc.sample(
|
||||
x,
|
||||
steps=steps,
|
||||
order=2,
|
||||
skip_type="time_uniform",
|
||||
method="multistep",
|
||||
)
|
||||
if use_tqdm:
|
||||
self.bar.close()
|
||||
elif method == 'pndm':
|
||||
self.noise_list = deque(maxlen=4)
|
||||
if use_tqdm:
|
||||
for i in tqdm(
|
||||
reversed(range(0, t, infer_speedup)), desc='sample time step',
|
||||
total=t // infer_speedup,
|
||||
):
|
||||
x = self.p_sample_plms(
|
||||
x, torch.full((b,), i, device=device, dtype=torch.long),
|
||||
infer_speedup, cond=cond
|
||||
)
|
||||
else:
|
||||
for i in reversed(range(0, t, infer_speedup)):
|
||||
x = self.p_sample_plms(
|
||||
x, torch.full((b,), i, device=device, dtype=torch.long),
|
||||
infer_speedup, cond=cond
|
||||
)
|
||||
elif method == 'ddim':
|
||||
if use_tqdm:
|
||||
for i in tqdm(
|
||||
reversed(range(0, t, infer_speedup)), desc='sample time step',
|
||||
total=t // infer_speedup,
|
||||
):
|
||||
x = self.p_sample_ddim(
|
||||
x, torch.full((b,), i, device=device, dtype=torch.long),
|
||||
infer_speedup, cond=cond
|
||||
)
|
||||
else:
|
||||
for i in reversed(range(0, t, infer_speedup)):
|
||||
x = self.p_sample_ddim(
|
||||
x, torch.full((b,), i, device=device, dtype=torch.long),
|
||||
infer_speedup, cond=cond
|
||||
)
|
||||
else:
|
||||
raise NotImplementedError(method)
|
||||
else:
|
||||
if use_tqdm:
|
||||
for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
|
||||
x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
|
||||
else:
|
||||
for i in reversed(range(0, t)):
|
||||
x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
|
||||
x = x.squeeze(1).transpose(1, 2) # [B, T, M]
|
||||
return self.denorm_spec(x)
|
||||
|
||||
def norm_spec(self, x):
|
||||
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
|
||||
|
||||
def denorm_spec(self, x):
|
||||
return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
|
@ -0,0 +1,122 @@
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.nn.utils import weight_norm
from .pcmer import PCmer


class Unit2MelNaive(nn.Module):
    def __init__(
            self,
            input_channel,
            n_spk,
            use_pitch_aug=False,
            out_dims=128,
            n_layers=3,
            n_chans=256,
            n_hidden=None,  # deprecated
            use_speaker_encoder=False,
            speaker_encoder_out_channels=256,
            use_full_siren=False,
            l2reg_loss=0
    ):
        super().__init__()
        self.l2reg_loss = l2reg_loss if (l2reg_loss is not None) else 0
        self.f0_embed = nn.Linear(1, n_chans)
        self.volume_embed = nn.Linear(1, n_chans)
        if use_pitch_aug:
            self.aug_shift_embed = nn.Linear(1, n_chans, bias=False)
        else:
            self.aug_shift_embed = None
        self.n_spk = n_spk
        self.use_speaker_encoder = use_speaker_encoder
        if use_speaker_encoder:
            self.spk_embed = nn.Linear(speaker_encoder_out_channels, n_chans, bias=False)
        else:
            if n_spk is not None and n_spk > 1:
                self.spk_embed = nn.Embedding(n_spk, n_chans)

        # conv in stack
        self.stack = nn.Sequential(
            nn.Conv1d(input_channel, n_chans, 3, 1, 1),
            nn.GroupNorm(4, n_chans),
            nn.LeakyReLU(),
            nn.Conv1d(n_chans, n_chans, 3, 1, 1))

        # transformer
        if use_full_siren:
            from .pcmer_siren_full import PCmer as PCmerfs
            self.decoder = PCmerfs(
                num_layers=n_layers,
                num_heads=8,
                dim_model=n_chans,
                dim_keys=n_chans,
                dim_values=n_chans,
                residual_dropout=0.1,
                attention_dropout=0.1)
        else:
            self.decoder = PCmer(
                num_layers=n_layers,
                num_heads=8,
                dim_model=n_chans,
                dim_keys=n_chans,
                dim_values=n_chans,
                residual_dropout=0.1,
                attention_dropout=0.1)
        self.norm = nn.LayerNorm(n_chans)

        # out
        self.n_out = out_dims
        self.dense_out = weight_norm(
            nn.Linear(n_chans, self.n_out))

    def forward(self, units, f0, volume, spk_id=None, spk_mix_dict=None, aug_shift=None,
                gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
                spk_emb=None, spk_emb_dict=None):

        '''
        input:
            B x n_frames x n_unit
        return:
            dict of B x n_frames x feat
        '''
        x = self.stack(units.transpose(1, 2)).transpose(1, 2)
        x = x + self.f0_embed((1 + f0 / 700).log()) + self.volume_embed(volume)
        if self.use_speaker_encoder:
            if spk_mix_dict is not None:
                assert spk_emb_dict is not None
                for k, v in spk_mix_dict.items():
                    spk_id_torch = spk_emb_dict[str(k)]
                    spk_id_torch = np.tile(spk_id_torch, (len(units), 1))
                    spk_id_torch = torch.from_numpy(spk_id_torch).float().to(units.device)
                    x = x + v * self.spk_embed(spk_id_torch)
            else:
                x = x + self.spk_embed(spk_emb)
        else:
            if self.n_spk is not None and self.n_spk > 1:
                if spk_mix_dict is not None:
                    for k, v in spk_mix_dict.items():
                        spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
                        x = x + v * self.spk_embed(spk_id_torch - 1)
                else:
                    x = x + self.spk_embed(spk_id - 1)
        if self.aug_shift_embed is not None and aug_shift is not None:
            x = x + self.aug_shift_embed(aug_shift / 5)

        x = self.decoder(x)
        x = self.norm(x)
        x = self.dense_out(x)
        if not infer:
            x = F.mse_loss(x, gt_spec)
            if self.l2reg_loss > 0:
                x = x + l2_regularization(model=self, l2_alpha=self.l2reg_loss)
        return x


def l2_regularization(model, l2_alpha):
    l2_loss = []
    for module in model.modules():
        if type(module) is nn.Conv2d:
            l2_loss.append((module.weight ** 2).sum() / 2.0)
    return l2_alpha * sum(l2_loss)
@ -0,0 +1,380 @@
import torch

from torch import nn
import math
from functools import partial
from einops import rearrange, repeat

from local_attention import LocalAttention
import torch.nn.functional as F
# import fast_transformers.causal_product.causal_product_cuda


def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device=None):
    b, h, *_ = data.shape
    # (batch size, head, length, model_dim)

    # normalize model dim
    data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.

    # what is ration?, projection_matrix.shape[0] --> 266

    ratio = (projection_matrix.shape[0] ** -0.5)

    projection = repeat(projection_matrix, 'j d -> b h j d', b=b, h=h)
    projection = projection.type_as(data)

    # data_dash = w^T x
    data_dash = torch.einsum('...id,...jd->...ij', (data_normalizer * data), projection)

    # diag_data = D**2
    diag_data = data ** 2
    diag_data = torch.sum(diag_data, dim=-1)
    diag_data = (diag_data / 2.0) * (data_normalizer ** 2)
    diag_data = diag_data.unsqueeze(dim=-1)

    # print ()
    if is_query:
        data_dash = ratio * (
            torch.exp(data_dash - diag_data -
                      torch.max(data_dash, dim=-1, keepdim=True).values) + eps)
    else:
        data_dash = ratio * (
            torch.exp(data_dash - diag_data + eps))  # - torch.max(data_dash)) + eps)

    return data_dash.type_as(data)


def orthogonal_matrix_chunk(cols, qr_uniform_q=False, device=None):
    unstructured_block = torch.randn((cols, cols), device=device)
    q, r = torch.linalg.qr(unstructured_block.cpu(), mode='reduced')
    q, r = map(lambda t: t.to(device), (q, r))

    # proposed by @Parskatt
    # to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf
    if qr_uniform_q:
        d = torch.diag(r, 0)
        q *= d.sign()
    return q.t()


def exists(val):
    return val is not None


def empty(tensor):
    return tensor.numel() == 0


def default(val, d):
    return val if exists(val) else d


def cast_tuple(val):
    return (val,) if not isinstance(val, tuple) else val


class PCmer(nn.Module):
    """The encoder that is used in the Transformer model."""

    def __init__(self,
                 num_layers,
                 num_heads,
                 dim_model,
                 dim_keys,
                 dim_values,
                 residual_dropout,
                 attention_dropout):
        super().__init__()
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dim_model = dim_model
        self.dim_values = dim_values
        self.dim_keys = dim_keys
        self.residual_dropout = residual_dropout
        self.attention_dropout = attention_dropout

        self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])

    # METHODS ########################################################################################################

    def forward(self, phone, mask=None):

        # apply all layers to the input
        for (i, layer) in enumerate(self._layers):
            phone = layer(phone, mask)
        # provide the final sequence
        return phone


# ==================================================================================================================== #
#  CLASS  _ E N C O D E R  L A Y E R                                                                                   #
# ==================================================================================================================== #


class _EncoderLayer(nn.Module):
    """One layer of the encoder.

    Attributes:
        attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence.
        feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism.
    """

    def __init__(self, parent: PCmer):
        """Creates a new instance of ``_EncoderLayer``.

        Args:
            parent (Encoder): The encoder that the layers is created for.
        """
        super().__init__()

        self.conformer = ConformerConvModule(parent.dim_model)
        self.norm = nn.LayerNorm(parent.dim_model)
        self.dropout = nn.Dropout(parent.residual_dropout)

        # selfatt -> fastatt: performer!
        self.attn = SelfAttention(dim=parent.dim_model,
                                  heads=parent.num_heads,
                                  causal=False)

    # METHODS ########################################################################################################

    def forward(self, phone, mask=None):

        # compute attention sub-layer
        phone = phone + (self.attn(self.norm(phone), mask=mask))

        phone = phone + (self.conformer(phone))

        return phone


def calc_same_padding(kernel_size):
    pad = kernel_size // 2
    return (pad, pad - (kernel_size + 1) % 2)


# helper classes

class Swish(nn.Module):
    def forward(self, x):
        return x * x.sigmoid()


class Transpose(nn.Module):
    def __init__(self, dims):
        super().__init__()
        assert len(dims) == 2, 'dims must be a tuple of two dimensions'
        self.dims = dims

    def forward(self, x):
        return x.transpose(*self.dims)


class GLU(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        out, gate = x.chunk(2, dim=self.dim)
        return out * gate.sigmoid()


class DepthWiseConv1d(nn.Module):
    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups=chan_in)

    def forward(self, x):
        x = F.pad(x, self.padding)
        return self.conv(x)


class ConformerConvModule(nn.Module):
    def __init__(
            self,
            dim,
            causal=False,
            expansion_factor=2,
            kernel_size=31,
            dropout=0.):
        super().__init__()

        inner_dim = dim * expansion_factor
        padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)

        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            Transpose((1, 2)),
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(inner_dim, inner_dim, kernel_size=kernel_size, padding=padding),
            # nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            Transpose((1, 2)),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)


def linear_attention(q, k, v):
    if v is None:
        # print (k.size(), q.size())
        out = torch.einsum('...ed,...nd->...ne', k, q)
        return out

    else:
        k_cumsum = k.sum(dim=-2)
        # k_cumsum = k.sum(dim = -2)
        D_inv = 1. / (torch.einsum('...nd,...d->...n', q, k_cumsum.type_as(q)) + 1e-8)

        context = torch.einsum('...nd,...ne->...de', k, v)
        # print ("TRUEEE: ", context.size(), q.size(), D_inv.size())
        out = torch.einsum('...de,...nd,...n->...ne', context, q, D_inv)
        return out


def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling=0, qr_uniform_q=False, device=None):
    nb_full_blocks = int(nb_rows / nb_columns)
    # print (nb_full_blocks)
    block_list = []

    for _ in range(nb_full_blocks):
        q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device)
        block_list.append(q)
    # block_list[n] is a orthogonal matrix ... (model_dim * model_dim)
    # print (block_list[0].size(), torch.einsum('...nd,...nd->...n', block_list[0], torch.roll(block_list[0],1,1)))
    # print (nb_rows, nb_full_blocks, nb_columns)
    remaining_rows = nb_rows - nb_full_blocks * nb_columns
    # print (remaining_rows)
    if remaining_rows > 0:
        q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q=qr_uniform_q, device=device)
        # print (q[:remaining_rows].size())
        block_list.append(q[:remaining_rows])

    final_matrix = torch.cat(block_list)

    if scaling == 0:
        multiplier = torch.randn((nb_rows, nb_columns), device=device).norm(dim=1)
    elif scaling == 1:
        multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device=device)
    else:
        raise ValueError(f'Invalid scaling {scaling}')

    return torch.diag(multiplier) @ final_matrix


class FastAttention(nn.Module):
    def __init__(self, dim_heads, nb_features=None, ortho_scaling=0, causal=False, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, no_projection=False):
        super().__init__()
        nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))

        self.dim_heads = dim_heads
        self.nb_features = nb_features
        self.ortho_scaling = ortho_scaling

        self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows=self.nb_features, nb_columns=dim_heads, scaling=ortho_scaling, qr_uniform_q=qr_uniform_q)
        projection_matrix = self.create_projection()
        self.register_buffer('projection_matrix', projection_matrix)

        self.generalized_attention = generalized_attention
        self.kernel_fn = kernel_fn

        # if this is turned on, no projection will be used
        # queries and keys will be softmax-ed as in the original efficient attention paper
        self.no_projection = no_projection

        self.causal = causal
        if causal:
            try:
                import fast_transformers.causal_product.causal_product_cuda
                self.causal_linear_fn = partial(causal_linear_attention)
            except ImportError:
                print('unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version')
                self.causal_linear_fn = causal_linear_attention_noncuda

    @torch.no_grad()
    def redraw_projection_matrix(self):
        projections = self.create_projection()
        self.projection_matrix.copy_(projections)
        del projections

    def forward(self, q, k, v):
        device = q.device

        if self.no_projection:
            q = q.softmax(dim=-1)
            k = torch.exp(k) if self.causal else k.softmax(dim=-2)

        elif self.generalized_attention:
            create_kernel = partial(generalized_kernel, kernel_fn=self.kernel_fn, projection_matrix=self.projection_matrix, device=device)
            q, k = map(create_kernel, (q, k))

        else:
            create_kernel = partial(softmax_kernel, projection_matrix=self.projection_matrix, device=device)

            q = create_kernel(q, is_query=True)
            k = create_kernel(k, is_query=False)

        attn_fn = linear_attention if not self.causal else self.causal_linear_fn
        if v is None:
            out = attn_fn(q, k, None)
            return out
        else:
            out = attn_fn(q, k, v)
            return out


class SelfAttention(nn.Module):
    def __init__(self, dim, causal=False, heads=8, dim_head=64, local_heads=0, local_window_size=256, nb_features=None, feature_redraw_interval=1000, generalized_attention=False, kernel_fn=nn.ReLU(), qr_uniform_q=False, dropout=0., no_projection=False):
        super().__init__()
        assert dim % heads == 0, 'dimension must be divisible by number of heads'
        dim_head = default(dim_head, dim // heads)
        inner_dim = dim_head * heads
        self.fast_attention = FastAttention(dim_head, nb_features, causal=causal, generalized_attention=generalized_attention, kernel_fn=kernel_fn, qr_uniform_q=qr_uniform_q, no_projection=no_projection)

        self.heads = heads
        self.global_heads = heads - local_heads
        self.local_attn = LocalAttention(window_size=local_window_size, causal=causal, autopad=True, dropout=dropout, look_forward=int(not causal), rel_pos_emb_config=(dim_head, local_heads)) if local_heads > 0 else None

        # print (heads, nb_features, dim_head)
        # name_embedding = torch.zeros(110, heads, dim_head, dim_head)
        # self.name_embedding = nn.Parameter(name_embedding, requires_grad=True)

        self.to_q = nn.Linear(dim, inner_dim)
        self.to_k = nn.Linear(dim, inner_dim)
        self.to_v = nn.Linear(dim, inner_dim)
        self.to_out = nn.Linear(inner_dim, dim)
        self.dropout = nn.Dropout(dropout)

    @torch.no_grad()
    def redraw_projection_matrix(self):
        self.fast_attention.redraw_projection_matrix()
        # torch.nn.init.zeros_(self.name_embedding)
        # print (torch.sum(self.name_embedding))

    def forward(self, x, context=None, mask=None, context_mask=None, name=None, inference=False, **kwargs):
        b, n, _, h, gh = *x.shape, self.heads, self.global_heads

        cross_attend = exists(context)

        context = default(context, x)
        context_mask = default(context_mask, mask) if not cross_attend else context_mask
        # print (torch.sum(self.name_embedding))
        q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)

        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
        (q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))

        attn_outs = []
        # print (name)
        # print (self.name_embedding[name].size())
        if not empty(q):
            if exists(context_mask):
                global_mask = context_mask[:, None, :, None]
                v.masked_fill_(~global_mask, 0.)
            if cross_attend:
                pass
                # print (torch.sum(self.name_embedding))
                # out = self.fast_attention(q,self.name_embedding[name],None)
                # print (torch.sum(self.name_embedding[...,-1:]))
            else:
                out = self.fast_attention(q, k, v)
            attn_outs.append(out)

        if not empty(lq):
            assert not cross_attend, 'local attention is not compatible with cross attention'
            out = self.local_attn(lq, lk, lv, input_mask=mask)
            attn_outs.append(out)

        out = torch.cat(attn_outs, dim=1)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.to_out(out)
        return self.dropout(out)
@ -0,0 +1,178 @@
import os
import yaml
import torch
import torch.nn as nn
import numpy as np
from .diffusion import GaussianDiffusion
from .wavenet import WaveNet
from .vocoder import Vocoder
from .naive.naive import Unit2MelNaive


class DotDict(dict):
    def __getattr__(*args):
        val = dict.get(*args)
        return DotDict(val) if type(val) is dict else val

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__


def load_model_vocoder(
        model_path,
        device='cpu',
        loaded_vocoder=None):
    config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
    with open(config_file, "r") as config:
        args = yaml.safe_load(config)
    args = DotDict(args)

    # load vocoder
    if loaded_vocoder is None:
        vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=device)
    else:
        vocoder = loaded_vocoder

    # load model
    model = load_svc_model(args=args, vocoder_dimension=vocoder.dimension)

    print(' [Loading] ' + model_path)
    ckpt = torch.load(model_path, map_location=torch.device(device))
    model.to(device)
    model.load_state_dict(ckpt['model'])
    model.eval()
    return model, vocoder, args


def load_model_vocoder_from_combo(combo_model_path, device='cpu'):
    read_dict = torch.load(combo_model_path, map_location=torch.device(device))
    # args
    diff_args = DotDict(read_dict["diff_config_dict"])
    naive_args = DotDict(read_dict["naive_config_dict"])
    # vocoder
    vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)

    # diff_model
    print(' [Loading] ' + combo_model_path)
    diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
    diff_model.to(device)
    diff_model.load_state_dict(read_dict["diff_model"]['model'])
    diff_model.eval()

    # naive_model
    naive_model = load_svc_model(args=naive_args, vocoder_dimension=vocoder.dimension)
    naive_model.to(device)
    naive_model.load_state_dict(read_dict["naive_model"]['model'])
    naive_model.eval()
    return diff_model, diff_args, naive_model, naive_args, vocoder


def load_svc_model(args, vocoder_dimension):
    if args.model.type == 'Diffusion':
        model = Unit2Mel(
            args.data.encoder_out_channels,
            args.model.n_spk,
            args.model.use_pitch_aug,
            vocoder_dimension,
            args.model.n_layers,
            args.model.n_chans,
            args.model.n_hidden,
            use_speaker_encoder=args.model.use_speaker_encoder,
            speaker_encoder_out_channels=args.data.speaker_encoder_out_channels)

    elif args.model.type == 'Naive':
        model = Unit2MelNaive(
            args.data.encoder_out_channels,
            args.model.n_spk,
            args.model.use_pitch_aug,
            vocoder_dimension,
            args.model.n_layers,
            args.model.n_chans,
            use_speaker_encoder=args.model.use_speaker_encoder,
            speaker_encoder_out_channels=args.data.speaker_encoder_out_channels)

    elif args.model.type == 'NaiveFS':
        model = Unit2MelNaive(
            args.data.encoder_out_channels,
            args.model.n_spk,
            args.model.use_pitch_aug,
            vocoder_dimension,
            args.model.n_layers,
            args.model.n_chans,
            use_speaker_encoder=args.model.use_speaker_encoder,
            speaker_encoder_out_channels=args.data.speaker_encoder_out_channels,
            use_full_siren=True,
            l2reg_loss=args.model.l2_reg_loss)
    else:
        raise ValueError("Unknown model type: " + str(args.model.type))
    return model


class Unit2Mel(nn.Module):
    def __init__(
            self,
            input_channel,
            n_spk,
            use_pitch_aug=False,
            out_dims=128,
            n_layers=20,
            n_chans=384,
            n_hidden=256,
            use_speaker_encoder=False,
            speaker_encoder_out_channels=256):
        super().__init__()
        self.unit_embed = nn.Linear(input_channel, n_hidden)
        self.f0_embed = nn.Linear(1, n_hidden)
        self.volume_embed = nn.Linear(1, n_hidden)
        if use_pitch_aug:
            self.aug_shift_embed = nn.Linear(1, n_hidden, bias=False)
        else:
            self.aug_shift_embed = None
        self.n_spk = n_spk
        self.use_speaker_encoder = use_speaker_encoder
        if use_speaker_encoder:
            self.spk_embed = nn.Linear(speaker_encoder_out_channels, n_hidden, bias=False)
        else:
            if n_spk is not None and n_spk > 1:
                self.spk_embed = nn.Embedding(n_spk, n_hidden)

        # diffusion
        self.decoder = GaussianDiffusion(WaveNet(out_dims, n_layers, n_chans, n_hidden), out_dims=out_dims)

    def forward(self, units, f0, volume, spk_id=None, spk_mix_dict=None, aug_shift=None,
                gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
                spk_emb=None, spk_emb_dict=None):

        '''
        input:
            B x n_frames x n_unit
        return:
            dict of B x n_frames x feat
        '''

        x = self.unit_embed(units) + self.f0_embed((1 + f0 / 700).log()) + self.volume_embed(volume)
        if self.use_speaker_encoder:
            if spk_mix_dict is not None:
                assert spk_emb_dict is not None
                for k, v in spk_mix_dict.items():
                    spk_id_torch = spk_emb_dict[str(k)]
                    spk_id_torch = np.tile(spk_id_torch, (len(units), 1))
                    spk_id_torch = torch.from_numpy(spk_id_torch).float().to(units.device)
                    x = x + v * self.spk_embed(spk_id_torch)
            else:
                x = x + self.spk_embed(spk_emb)
        else:
            if self.n_spk is not None and self.n_spk > 1:
                if spk_mix_dict is not None:
                    for k, v in spk_mix_dict.items():
                        spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
                        x = x + v * self.spk_embed(spk_id_torch - 1)
                else:
                    x = x + self.spk_embed(spk_id - 1)
        if self.aug_shift_embed is not None and aug_shift is not None:
            x = x + self.aug_shift_embed(aug_shift / 5)

        x = self.decoder(x, gt_spec=gt_spec, infer=infer, infer_speedup=infer_speedup, method=method, k_step=k_step,
                         use_tqdm=use_tqdm)

        return x
@ -0,0 +1,96 @@
import torch
from nsf_hifigan.nvSTFT import STFT
from nsf_hifigan.models import load_model, load_config
from torchaudio.transforms import Resample


class Vocoder:
    def __init__(self, vocoder_type, vocoder_ckpt, device=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

        if vocoder_type == 'nsf-hifigan':
            self.vocoder = NsfHifiGAN(vocoder_ckpt, device=device)
        elif vocoder_type == 'nsf-hifigan-log10':
            self.vocoder = NsfHifiGANLog10(vocoder_ckpt, device=device)
        else:
            raise ValueError(f" [x] Unknown vocoder: {vocoder_type}")

        self.resample_kernel = {}
        self.vocoder_sample_rate = self.vocoder.sample_rate()
        self.vocoder_hop_size = self.vocoder.hop_size()
        self.dimension = self.vocoder.dimension()

    def extract(self, audio, sample_rate, keyshift=0):

        # resample
        if sample_rate == self.vocoder_sample_rate:
            audio_res = audio
        else:
            key_str = str(sample_rate)
            if key_str not in self.resample_kernel:
                self.resample_kernel[key_str] = Resample(sample_rate, self.vocoder_sample_rate,
                                                         lowpass_filter_width=128).to(self.device)
            audio_res = self.resample_kernel[key_str](audio)

        # extract
        mel = self.vocoder.extract(audio_res, keyshift=keyshift)  # B, n_frames, bins
        return mel

    def infer(self, mel, f0):
        f0 = f0[:, :mel.size(1), 0]  # B, n_frames
        audio = self.vocoder(mel, f0)
        return audio


class NsfHifiGAN(torch.nn.Module):
    def __init__(self, model_path, device=None):
        super().__init__()
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.model_path = model_path
        self.model = None
        self.h = load_config(model_path)
        self.stft = STFT(
            self.h.sampling_rate,
            self.h.num_mels,
            self.h.n_fft,
            self.h.win_size,
            self.h.hop_size,
            self.h.fmin,
            self.h.fmax)

    def sample_rate(self):
        return self.h.sampling_rate

    def hop_size(self):
        return self.h.hop_size

    def dimension(self):
        return self.h.num_mels

    def extract(self, audio, keyshift=0):
        mel = self.stft.get_mel(audio, keyshift=keyshift).transpose(1, 2)  # B, n_frames, bins
        return mel

    def forward(self, mel, f0):
        if self.model is None:
            print('| Load HifiGAN: ', self.model_path)
            self.model, self.h = load_model(self.model_path, device=self.device)
        with torch.no_grad():
            c = mel.transpose(1, 2)
            audio = self.model(c, f0)
            return audio


class NsfHifiGANLog10(NsfHifiGAN):
    def forward(self, mel, f0):
        if self.model is None:
            print('| Load HifiGAN: ', self.model_path)
            self.model, self.h = load_model(self.model_path, device=self.device)
        with torch.no_grad():
            c = 0.434294 * mel.transpose(1, 2)
            audio = self.model(c, f0)
            return audio
@ -0,0 +1,108 @@
import math
from math import sqrt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Mish


class Conv1d(torch.nn.Conv1d):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        nn.init.kaiming_normal_(self.weight)


class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class ResidualBlock(nn.Module):
    def __init__(self, encoder_hidden, residual_channels, dilation):
        super().__init__()
        self.residual_channels = residual_channels
        self.dilated_conv = nn.Conv1d(
            residual_channels,
            2 * residual_channels,
            kernel_size=3,
            padding=dilation,
            dilation=dilation
        )
        self.diffusion_projection = nn.Linear(residual_channels, residual_channels)
        self.conditioner_projection = nn.Conv1d(encoder_hidden, 2 * residual_channels, 1)
        self.output_projection = nn.Conv1d(residual_channels, 2 * residual_channels, 1)

    def forward(self, x, conditioner, diffusion_step):
        diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
        conditioner = self.conditioner_projection(conditioner)
        y = x + diffusion_step

        y = self.dilated_conv(y) + conditioner

        # Using torch.split instead of torch.chunk to avoid using onnx::Slice
        gate, filter = torch.split(y, [self.residual_channels, self.residual_channels], dim=1)
        y = torch.sigmoid(gate) * torch.tanh(filter)

        y = self.output_projection(y)

        # Using torch.split instead of torch.chunk to avoid using onnx::Slice
        residual, skip = torch.split(y, [self.residual_channels, self.residual_channels], dim=1)
        return (x + residual) / math.sqrt(2.0), skip


class WaveNet(nn.Module):
    def __init__(self, in_dims=128, n_layers=20, n_chans=384, n_hidden=256):
        super().__init__()
        self.input_projection = Conv1d(in_dims, n_chans, 1)
        self.diffusion_embedding = SinusoidalPosEmb(n_chans)
        self.mlp = nn.Sequential(
            nn.Linear(n_chans, n_chans * 4),
            Mish(),
            nn.Linear(n_chans * 4, n_chans)
        )
        self.residual_layers = nn.ModuleList([
            ResidualBlock(
                encoder_hidden=n_hidden,
                residual_channels=n_chans,
                dilation=1
            )
            for i in range(n_layers)
        ])
        self.skip_projection = Conv1d(n_chans, n_chans, 1)
        self.output_projection = Conv1d(n_chans, in_dims, 1)
        nn.init.zeros_(self.output_projection.weight)

    def forward(self, spec, diffusion_step, cond):
        """
        :param spec: [B, 1, M, T]
        :param diffusion_step: [B, 1]
        :param cond: [B, M, T]
        :return:
        """
        x = spec.squeeze(1)
        x = self.input_projection(x)  # [B, residual_channel, T]

        x = F.relu(x)
        diffusion_step = self.diffusion_embedding(diffusion_step)
        diffusion_step = self.mlp(diffusion_step)
        skip = []
        for layer in self.residual_layers:
            x, skip_connection = layer(x, cond, diffusion_step)
            skip.append(skip_connection)

        x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers))
        x = self.skip_projection(x)
        x = F.relu(x)
        x = self.output_projection(x)  # [B, mel_bins, T]
        return x[:, None, :, :]
@ -0,0 +1,15 @@
import os
import shutil


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))
@ -0,0 +1,434 @@
import os
import json
from .env import AttrDict
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from .utils import init_weights, get_padding

LRELU_SLOPE = 0.1


def load_model(model_path, device='cuda'):
    h = load_config(model_path)

    generator = Generator(h).to(device)

    cp_dict = torch.load(model_path, map_location=device)
    generator.load_state_dict(cp_dict['generator'])
    generator.eval()
    generator.remove_weight_norm()
    del cp_dict
    return generator, h


def load_config(model_path):
    config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
    with open(config_file) as f:
        data = f.read()

    json_config = json.loads(data)
    h = AttrDict(json_config)
    return h


class ResBlock1(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            xt = c(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class SineGen(torch.nn.Module):
    """ Definition of sine generator
    SineGen(samp_rate, harmonic_num = 0,
            sine_amp = 0.1, noise_std = 0.003,
            voiced_threshold = 0,
            flag_for_pulse=False)
    samp_rate: sampling rate in Hz
    harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SinGen is used inside PulseGen (default False)
    Note: when flag_for_pulse is True, the first time step of a voiced
    segment is always sin(np.pi) or cos(0)
    """

    def __init__(self, samp_rate, harmonic_num=0,
                 sine_amp=0.1, noise_std=0.003,
                 voiced_threshold=0):
        super(SineGen, self).__init__()
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.harmonic_num = harmonic_num
        self.dim = self.harmonic_num + 1
        self.sampling_rate = samp_rate
        self.voiced_threshold = voiced_threshold

    def _f02uv(self, f0):
        # generate uv signal
        uv = torch.ones_like(f0)
        uv = uv * (f0 > self.voiced_threshold)
        return uv

    @torch.no_grad()
    def forward(self, f0, upp):
        """ sine_tensor, uv = forward(f0)
        input F0: tensor(batchsize=1, length, dim=1)
            f0 for unvoiced steps should be 0
        output sine_tensor: tensor(batchsize=1, length, dim)
        output uv: tensor(batchsize=1, length, 1)
        """
        f0 = f0.unsqueeze(-1)
        fn = torch.multiply(f0, torch.arange(1, self.dim + 1, device=f0.device).reshape((1, 1, -1)))
        rad_values = (fn / self.sampling_rate) % 1  # the % 1 here means the n_har products cannot be optimized away in post-processing
        rand_ini = torch.rand(fn.shape[0], fn.shape[2], device=fn.device)
        rand_ini[:, 0] = 0
        rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
        is_half = rad_values.dtype is not torch.float32
        tmp_over_one = torch.cumsum(rad_values.double(), 1)  # % 1  # the % 1 here means the following cumsum cannot be optimized further
        if is_half:
            tmp_over_one = tmp_over_one.half()
        else:
            tmp_over_one = tmp_over_one.float()
        tmp_over_one *= upp
        tmp_over_one = F.interpolate(
            tmp_over_one.transpose(2, 1), scale_factor=upp,
            mode='linear', align_corners=True
        ).transpose(2, 1)
        rad_values = F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
        tmp_over_one %= 1
        tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
        cumsum_shift = torch.zeros_like(rad_values)
        cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
        rad_values = rad_values.double()
        cumsum_shift = cumsum_shift.double()
        sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
        if is_half:
            sine_waves = sine_waves.half()
        else:
            sine_waves = sine_waves.float()
        sine_waves = sine_waves * self.sine_amp
        return sine_waves


class SourceModuleHnNSF(torch.nn.Module):
    """ SourceModule for hn-nsf
    SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0)
    sampling_rate: sampling_rate in Hz
    harmonic_num: number of harmonic above F0 (default: 0)
    sine_amp: amplitude of sine source signal (default: 0.1)
    add_noise_std: std of additive Gaussian noise (default: 0.003)
        note that amplitude of noise in unvoiced is decided
        by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length 1)
    uv (batchsize, length, 1)
    """

    def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
                 add_noise_std=0.003, voiced_threshod=0):
        super(SourceModuleHnNSF, self).__init__()

        self.sine_amp = sine_amp
        self.noise_std = add_noise_std

        # to produce sine waveforms
        self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
                                 sine_amp, add_noise_std, voiced_threshod)

        # to merge source harmonics into a single excitation
        self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = torch.nn.Tanh()

    def forward(self, x, upp):
        sine_wavs = self.l_sin_gen(x, upp)
        sine_merge = self.l_tanh(self.l_linear(sine_wavs))
        return sine_merge


class Generator(torch.nn.Module):
    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.m_source = SourceModuleHnNSF(
            sampling_rate=h.sampling_rate,
            harmonic_num=8
        )
        self.noise_convs = nn.ModuleList()
        self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            c_cur = h.upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(weight_norm(
                ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)),
                                k, u, padding=(k - u) // 2)))
            if i + 1 < len(h.upsample_rates):  #
                stride_f0 = int(np.prod(h.upsample_rates[i + 1:]))
                self.noise_convs.append(Conv1d(
                    1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
            else:
                self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
        self.resblocks = nn.ModuleList()
        ch = h.upsample_initial_channel
        for i in range(len(self.ups)):
            ch //= 2
            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)
        self.upp = int(np.prod(h.upsample_rates))

    def forward(self, x, f0):
        har_source = self.m_source(f0, self.upp).transpose(1, 2)
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, LRELU_SLOPE)
            x = self.ups[i](x)
            x_source = self.noise_convs[i](har_source)
            x = x + x_source
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, periods=None):
        super(MultiPeriodDiscriminator, self).__init__()
        self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
        self.discriminators = nn.ModuleList()
        for period in self.periods:
            self.discriminators.append(DiscriminatorP(period))

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
            norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
            norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):
    def __init__(self):
        super(MultiScaleDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        self.meanpools = nn.ModuleList([
            AvgPool1d(4, 2, padding=2),
            AvgPool1d(4, 2, padding=2)
        ])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                y = self.meanpools[i - 1](y)
                y_hat = self.meanpools[i - 1](y_hat)
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss += torch.mean(torch.abs(rl - gl))

    return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = torch.mean((1 - dr) ** 2)
        g_loss = torch.mean(dg ** 2)
        loss += (r_loss + g_loss)
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        l = torch.mean((1 - dg) ** 2)
        gen_losses.append(l)
        loss += l

    return loss, gen_losses
@ -0,0 +1,125 @@
|
||||
import os
|
||||
os.environ["LRU_CACHE_CAPACITY"] = "3"
|
||||
import torch
|
||||
import torch.utils.data
|
||||
import numpy as np
|
||||
import librosa
|
||||
from librosa.filters import mel as librosa_mel_fn
|
||||
import soundfile as sf
|
||||
import torch.nn.functional as F
|
||||
|
||||
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
|
||||
sampling_rate = None
|
||||
try:
|
||||
data, sampling_rate = sf.read(full_path, always_2d=True)# than soundfile.
|
||||
except Exception as ex:
|
||||
print(f"'{full_path}' failed to load.\nException:")
|
||||
print(ex)
|
||||
if return_empty_on_exception:
|
||||
return [], sampling_rate or target_sr or 48000
|
||||
else:
|
||||
raise Exception(ex)
|
||||
|
||||
if len(data.shape) > 1:
|
||||
data = data[:, 0]
|
||||
assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
|
||||
|
||||
if np.issubdtype(data.dtype, np.integer): # if audio data is type int
|
||||
max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX
|
||||
else: # if audio data is type fp32
|
||||
max_mag = max(np.amax(data), -np.amin(data))
|
||||
max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
|
||||
|
||||
data = torch.FloatTensor(data.astype(np.float32))/max_mag
|
||||
|
||||
if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
|
||||
return [], sampling_rate or target_sr or 48000
|
||||
if target_sr is not None and sampling_rate != target_sr:
|
||||
data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
|
||||
sampling_rate = target_sr
|
||||
|
||||
return data, sampling_rate
|
||||
|
||||
def dynamic_range_compression(x, C=1, clip_val=1e-5):
|
||||
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
|
||||
|
||||
def dynamic_range_decompression(x, C=1):
|
||||
return np.exp(x) / C
|
||||
|
||||
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
|
||||
return torch.log(torch.clamp(x, min=clip_val) * C)
|
||||
|
||||
def dynamic_range_decompression_torch(x, C=1):
|
||||
return torch.exp(x) / C
|
||||
|
||||
class STFT():
|
||||
def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
|
||||
self.target_sr = sr
|
||||
|
||||
self.n_mels = n_mels
|
||||
self.n_fft = n_fft
|
||||
self.win_size = win_size
|
||||
self.hop_length = hop_length
|
||||
self.fmin = fmin
|
||||
self.fmax = fmax
|
||||
self.clip_val = clip_val
|
||||
self.mel_basis = {}
|
||||
self.hann_window = {}
|
||||
|
||||
def get_mel(self, y, keyshift=0, speed=1, center=False):
|
||||
sampling_rate = self.target_sr
|
||||
n_mels = self.n_mels
|
||||
n_fft = self.n_fft
|
||||
win_size = self.win_size
|
||||
hop_length = self.hop_length
|
||||
fmin = self.fmin
|
||||
fmax = self.fmax
|
||||
clip_val = self.clip_val
|
||||
|
||||
factor = 2 ** (keyshift / 12)
|
||||
n_fft_new = int(np.round(n_fft * factor))
|
||||
win_size_new = int(np.round(win_size * factor))
|
||||
hop_length_new = int(np.round(hop_length * speed))
|
||||
|
||||
if torch.min(y) < -1.:
|
||||
print('min value is ', torch.min(y))
|
||||
if torch.max(y) > 1.:
|
||||
print('max value is ', torch.max(y))
|
||||
|
||||
mel_basis_key = str(fmax)+'_'+str(y.device)
|
||||
if mel_basis_key not in self.mel_basis:
|
||||
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
|
||||
self.mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
|
||||
|
||||
keyshift_key = str(keyshift)+'_'+str(y.device)
|
||||
if keyshift_key not in self.hann_window:
|
||||
self.hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
|
||||
|
||||
pad_left = (win_size_new - hop_length_new) //2
|
||||
pad_right = max((win_size_new- hop_length_new + 1) //2, win_size_new - y.size(-1) - pad_left)
|
||||
if pad_right < y.size(-1):
|
||||
mode = 'reflect'
|
||||
else:
|
||||
mode = 'constant'
|
||||
y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode = mode)
|
||||
y = y.squeeze(1)
|
||||
|
||||
spec = torch.stft(y, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=self.hann_window[keyshift_key],
|
||||
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
|
||||
spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
|
||||
if keyshift != 0:
|
||||
size = n_fft // 2 + 1
|
||||
resize = spec.size(1)
|
||||
if resize < size:
|
||||
spec = F.pad(spec, (0, 0, 0, size-resize))
|
||||
spec = spec[:, :size, :] * win_size / win_size_new
|
||||
spec = torch.matmul(self.mel_basis[mel_basis_key], spec)
|
||||
spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
|
||||
return spec
|
||||
|
||||
def __call__(self, audiopath):
|
||||
audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
|
||||
spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
|
||||
return spect
|
||||
|
||||
stft = STFT()
|
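A short usage sketch for the helpers above; the wav path is a placeholder. The module-level `stft` instance maps a file straight to a log-compressed mel spectrogram, while `get_mel` also accepts key-shift and speed factors.

if __name__ == "__main__":
    # file -> (n_mels, frames) log-mel spectrogram
    mel = stft("example.wav")
    print(mel.shape)

    # in-memory path: load at the target rate, then shift the analysis up two semitones
    audio, sr = load_wav_to_torch("example.wav", target_sr=stft.target_sr)
    mel_shifted = stft.get_mel(audio.unsqueeze(0), keyshift=2)  # (1, n_mels, frames)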
@ -0,0 +1,68 @@
|
||||
import glob
|
||||
import os
|
||||
import matplotlib
|
||||
import torch
|
||||
from torch.nn.utils import weight_norm
|
||||
matplotlib.use("Agg")
|
||||
import matplotlib.pylab as plt
|
||||
|
||||
|
||||
def plot_spectrogram(spectrogram):
|
||||
fig, ax = plt.subplots(figsize=(10, 2))
|
||||
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
|
||||
interpolation='none')
|
||||
plt.colorbar(im, ax=ax)
|
||||
|
||||
fig.canvas.draw()
|
||||
plt.close()
|
||||
|
||||
return fig
|
||||
|
||||
|
||||
def init_weights(m, mean=0.0, std=0.01):
|
||||
classname = m.__class__.__name__
|
||||
if classname.find("Conv") != -1:
|
||||
m.weight.data.normal_(mean, std)
|
||||
|
||||
|
||||
def apply_weight_norm(m):
|
||||
classname = m.__class__.__name__
|
||||
if classname.find("Conv") != -1:
|
||||
weight_norm(m)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
|
||||
return int((kernel_size*dilation - dilation)/2)
|
||||
|
||||
|
||||
def load_checkpoint(filepath, device):
|
||||
assert os.path.isfile(filepath)
|
||||
print("Loading '{}'".format(filepath))
|
||||
checkpoint_dict = torch.load(filepath, map_location=device)
|
||||
print("Complete.")
|
||||
return checkpoint_dict
|
||||
|
||||
|
||||
def save_checkpoint(filepath, obj):
|
||||
print("Saving checkpoint to {}".format(filepath))
|
||||
torch.save(obj, filepath)
|
||||
print("Complete.")
|
||||
|
||||
|
||||
def del_old_checkpoints(cp_dir, prefix, n_models=2):
|
||||
pattern = os.path.join(cp_dir, prefix + '????????')
|
||||
cp_list = glob.glob(pattern) # get checkpoint paths
|
||||
cp_list = sorted(cp_list)# sort by iter
|
||||
if len(cp_list) > n_models: # if more than n_models models are found
|
||||
for cp in cp_list[:-n_models]:  # delete the oldest models, keeping the latest n_models
|
||||
open(cp, 'w').close()# empty file contents
|
||||
os.unlink(cp)# delete file (move to trash when using Colab)
|
||||
|
||||
|
||||
def scan_checkpoint(cp_dir, prefix):
|
||||
pattern = os.path.join(cp_dir, prefix + '????????')
|
||||
cp_list = glob.glob(pattern)
|
||||
if len(cp_list) == 0:
|
||||
return None
|
||||
return sorted(cp_list)[-1]
|
||||
|
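A minimal sketch of the save/scan/load cycle these helpers implement; the directory, prefix and eight-digit step suffix are placeholders that only need to match the `prefix + '????????'` glob used above.

cp_dir = "checkpoints"                              # placeholder directory
os.makedirs(cp_dir, exist_ok=True)
save_checkpoint(os.path.join(cp_dir, "g_00001000"), {"step": 1000})
del_old_checkpoints(cp_dir, "g_", n_models=2)       # keep only the two newest
latest = scan_checkpoint(cp_dir, "g_")
if latest is not None:
    state = load_checkpoint(latest, "cpu")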
@ -0,0 +1,4 @@
|
||||
Modules in this folder are from https://github.com/CNChTu/Diffusion-SVC at commit ae4120a2b6399ed5657b16dc702b57220fe4a295.
|
||||
|
||||
|
||||
|
@ -0,0 +1,165 @@
|
||||
import librosa
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
|
||||
class Slicer:
|
||||
def __init__(self,
|
||||
sr: int,
|
||||
threshold: float = -40.,
|
||||
min_length: int = 5000,
|
||||
min_interval: int = 300,
|
||||
hop_size: int = 20,
|
||||
max_sil_kept: int = 5000):
|
||||
if not min_length >= min_interval >= hop_size:
|
||||
raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
|
||||
if not max_sil_kept >= hop_size:
|
||||
raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
|
||||
min_interval = sr * min_interval / 1000
|
||||
self.threshold = 10 ** (threshold / 20.)
|
||||
self.hop_size = round(sr * hop_size / 1000)
|
||||
self.win_size = min(round(min_interval), 4 * self.hop_size)
|
||||
self.min_length = round(sr * min_length / 1000 / self.hop_size)
|
||||
self.min_interval = round(min_interval / self.hop_size)
|
||||
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
|
||||
|
||||
def _apply_slice(self, waveform, begin, end):
|
||||
if len(waveform.shape) > 1:
|
||||
return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
|
||||
else:
|
||||
return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
|
||||
|
||||
# @timeit
|
||||
def slice(self, waveform):
|
||||
if len(waveform.shape) > 1:
|
||||
samples = librosa.to_mono(waveform)
|
||||
else:
|
||||
samples = waveform
|
||||
if samples.shape[0] <= self.min_length:
|
||||
return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
|
||||
rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
|
||||
sil_tags = []
|
||||
silence_start = None
|
||||
clip_start = 0
|
||||
for i, rms in enumerate(rms_list):
|
||||
# Keep looping while frame is silent.
|
||||
if rms < self.threshold:
|
||||
# Record start of silent frames.
|
||||
if silence_start is None:
|
||||
silence_start = i
|
||||
continue
|
||||
# Keep looping while frame is not silent and silence start has not been recorded.
|
||||
if silence_start is None:
|
||||
continue
|
||||
# Clear recorded silence start if interval is not enough or clip is too short
|
||||
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
|
||||
need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
|
||||
if not is_leading_silence and not need_slice_middle:
|
||||
silence_start = None
|
||||
continue
|
||||
# Need slicing. Record the range of silent frames to be removed.
|
||||
if i - silence_start <= self.max_sil_kept:
|
||||
pos = rms_list[silence_start: i + 1].argmin() + silence_start
|
||||
if silence_start == 0:
|
||||
sil_tags.append((0, pos))
|
||||
else:
|
||||
sil_tags.append((pos, pos))
|
||||
clip_start = pos
|
||||
elif i - silence_start <= self.max_sil_kept * 2:
|
||||
pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
|
||||
pos += i - self.max_sil_kept
|
||||
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
|
||||
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
|
||||
if silence_start == 0:
|
||||
sil_tags.append((0, pos_r))
|
||||
clip_start = pos_r
|
||||
else:
|
||||
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
|
||||
clip_start = max(pos_r, pos)
|
||||
else:
|
||||
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
|
||||
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
|
||||
if silence_start == 0:
|
||||
sil_tags.append((0, pos_r))
|
||||
else:
|
||||
sil_tags.append((pos_l, pos_r))
|
||||
clip_start = pos_r
|
||||
silence_start = None
|
||||
# Deal with trailing silence.
|
||||
total_frames = rms_list.shape[0]
|
||||
if silence_start is not None and total_frames - silence_start >= self.min_interval:
|
||||
silence_end = min(total_frames, silence_start + self.max_sil_kept)
|
||||
pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
|
||||
sil_tags.append((pos, total_frames + 1))
|
||||
# Apply and return slices.
|
||||
if len(sil_tags) == 0:
|
||||
return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
|
||||
else:
|
||||
chunks = []
|
||||
# The first silent segment does not start at the beginning, so add the leading voiced segment
|
||||
if sil_tags[0][0]:
|
||||
chunks.append(
|
||||
{"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
|
||||
for i in range(0, len(sil_tags)):
|
||||
# Mark the voiced segment (skip the first one)
|
||||
if i:
|
||||
chunks.append({"slice": False,
|
||||
"split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
|
||||
# Mark every silent segment
|
||||
chunks.append({"slice": True,
|
||||
"split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
|
||||
# The last silent segment does not reach the end, so add the trailing segment
|
||||
if sil_tags[-1][1] * self.hop_size < len(waveform):
|
||||
chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
|
||||
chunk_dict = {}
|
||||
for i in range(len(chunks)):
|
||||
chunk_dict[str(i)] = chunks[i]
|
||||
return chunk_dict
|
||||
|
||||
|
||||
def cut(audio_path, db_thresh=-30, min_len=5000, flask_mode=False, flask_sr=None):
|
||||
if not flask_mode:
|
||||
audio, sr = librosa.load(audio_path, sr=None)
|
||||
else:
|
||||
audio = audio_path
|
||||
sr = flask_sr
|
||||
slicer = Slicer(
|
||||
sr=sr,
|
||||
threshold=db_thresh,
|
||||
min_length=min_len
|
||||
)
|
||||
chunks = slicer.slice(audio)
|
||||
return chunks
|
||||
|
||||
|
||||
def chunks2audio(audio_path, chunks):
|
||||
chunks = dict(chunks)
|
||||
audio, sr = torchaudio.load(audio_path)
|
||||
if len(audio.shape) == 2 and audio.shape[1] >= 2:
|
||||
audio = torch.mean(audio, dim=0).unsqueeze(0)
|
||||
audio = audio.cpu().numpy()[0]
|
||||
result = []
|
||||
for k, v in chunks.items():
|
||||
tag = v["split_time"].split(",")
|
||||
if tag[0] != tag[1]:
|
||||
result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
|
||||
return result, sr
|
||||
|
||||
|
||||
def split(audio, sample_rate, hop_size, db_thresh=-40, min_len=5000):
|
||||
slicer = Slicer(
|
||||
sr=sample_rate,
|
||||
threshold=db_thresh,
|
||||
min_length=min_len)
|
||||
chunks = dict(slicer.slice(audio))
|
||||
result = []
|
||||
for k, v in chunks.items():
|
||||
tag = v["split_time"].split(",")
|
||||
if tag[0] != tag[1]:
|
||||
start_frame = int(int(tag[0]) // hop_size)
|
||||
end_frame = int(int(tag[1]) // hop_size)
|
||||
if end_frame > start_frame:
|
||||
result.append((
|
||||
start_frame,
|
||||
audio[int(start_frame * hop_size): int(end_frame * hop_size)]))
|
||||
return result
|
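A usage sketch for the slicing helpers above; the input path is a placeholder. `cut` returns the split table and `chunks2audio` materialises it into (is_silence, samples) pairs.

if __name__ == "__main__":
    chunks = cut("input.wav", db_thresh=-40, min_len=5000)
    segments, sr = chunks2audio("input.wav", chunks)
    voiced = [samples for is_silence, samples in segments if not is_silence]
    print(f"{len(voiced)} voiced segments at {sr} Hz")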
@ -0,0 +1,808 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn as nn
|
||||
import pyworld as pw
|
||||
import parselmouth
|
||||
import torchcrepe
|
||||
import librosa
|
||||
import fsspec
|
||||
from tqdm import tqdm
|
||||
from transformers import HubertModel, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
|
||||
from fairseq import checkpoint_utils
|
||||
from encoder.hubert.model import HubertSoft
|
||||
from encoder.speaker_encoder.model import SpeakerEncoder as TTSSpeakerEncoder
|
||||
import scipy.signal
|
||||
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
|
||||
from torchaudio.transforms import Resample
|
||||
|
||||
CREPE_RESAMPLE_KERNEL = {}
|
||||
|
||||
|
||||
class SpeakerEncoder:
|
||||
def __init__(self, speaker_encoder, speaker_encoder_config, speaker_encoder_ckpt, encoder_sample_rate,
|
||||
device='cuda',
|
||||
use_torchaudio=False):
|
||||
self.use_torchaudio = use_torchaudio
|
||||
self.encoder_sample_rate = encoder_sample_rate
|
||||
self.device = device
|
||||
self.resample_kernel = {}
|
||||
if speaker_encoder == "ge2e":
|
||||
self.encoder = GE2E(speaker_encoder_config, speaker_encoder_ckpt, device=device)
|
||||
else:
|
||||
raise ValueError(f" [x] Unknown speaker encoder: {speaker_encoder}")
|
||||
|
||||
def __call__(self, audio=None, audio_t=None,
|
||||
sample_rate=44100): # if use torchaudio, audio_t must be a tensor; else audio must be a np
|
||||
audio_res = None
|
||||
if sample_rate == self.encoder_sample_rate:
|
||||
if self.use_torchaudio and (audio_t is not None):
|
||||
audio_res = audio_t.cpu().numpy().squeeze(0)
|
||||
else:
|
||||
if audio is not None:
|
||||
audio_res = audio
|
||||
else:
|
||||
key_str = str(sample_rate)
|
||||
if self.use_torchaudio and (audio_t is not None):
|
||||
if key_str not in self.resample_kernel:
|
||||
self.resample_kernel[key_str] = Resample(sample_rate, self.encoder_sample_rate,
|
||||
lowpass_filter_width=128).to(self.device)
|
||||
audio_res = self.resample_kernel[key_str](audio_t).cpu().numpy().squeeze(0)
|
||||
else:
|
||||
if audio is not None:
|
||||
audio_res = librosa.resample(audio, orig_sr=sample_rate, target_sr=self.encoder_sample_rate)
|
||||
assert audio_res is not None
|
||||
return self.encoder(audio_res)
|
||||
|
||||
def mean_spk_emb_from_wav_list(self, audio_list, sr_list):
|
||||
assert len(audio_list) == len(sr_list)
|
||||
batch_spk_emb = None
|
||||
print("Get mean spk_emb from audio_list")
|
||||
for index in tqdm(range(len(audio_list))):
|
||||
audio = audio_list[index]
|
||||
sample_rate = sr_list[index]
|
||||
f_len = int(50 * len(audio) / sample_rate) # 50f/s is for sr=16000,hop_size=320
|
||||
spk_emb = self.__call__(audio=audio, sample_rate=sample_rate)
|
||||
spk_emb = np.tile(spk_emb, (f_len, 1))
|
||||
if batch_spk_emb is None:
|
||||
batch_spk_emb = spk_emb
|
||||
else:
|
||||
batch_spk_emb = np.concatenate([spk_emb, batch_spk_emb], axis=0)
|
||||
return np.mean(batch_spk_emb, axis=0)
|
||||
|
||||
def mean_spk_emb_from_path_list(self, path_list):
|
||||
batch_spk_emb = None
|
||||
print("Get mean spk_emb from path_list")
|
||||
for path in tqdm(path_list):
|
||||
audio, sample_rate = librosa.load(path, sr=None)
|
||||
f_len = int(50 * len(audio) / sample_rate) # 50f/s is for sr=16000,hop_size=320
|
||||
spk_emb = self.__call__(audio=audio, sample_rate=sample_rate)
|
||||
spk_emb = np.tile(spk_emb, (f_len, 1))
|
||||
if batch_spk_emb is None:
|
||||
batch_spk_emb = spk_emb
|
||||
else:
|
||||
batch_spk_emb = np.concatenate([spk_emb, batch_spk_emb], axis=0)
|
||||
return np.mean(batch_spk_emb, axis=0)
|
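A hedged sketch of extracting a mean GE2E speaker embedding with the wrapper above; the config/checkpoint paths and reference files are placeholders.

spk_encoder = SpeakerEncoder(
    speaker_encoder="ge2e",
    speaker_encoder_config="pretrain/ge2e/config.json",   # placeholder
    speaker_encoder_ckpt="pretrain/ge2e/best_model.pth",  # placeholder
    encoder_sample_rate=16000,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
mean_emb = spk_encoder.mean_spk_emb_from_path_list(["ref1.wav", "ref2.wav"])
print(mean_emb.shape)   # one embedding vector averaged over all frames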
||||
|
||||
|
||||
class GE2E:
|
||||
def __init__(self, config_path, ckpt_path, device='cuda'):
|
||||
import json5
|
||||
with open(config_path) as f:
|
||||
self.config = json5.load(f)
|
||||
# load model
|
||||
self.model = TTSSpeakerEncoder(
|
||||
self.config['model']["input_dim"],
|
||||
self.config['model']["proj_dim"],
|
||||
self.config['model']["lstm_dim"],
|
||||
self.config['model']["num_lstm_layers"],
|
||||
)
|
||||
with fsspec.open(ckpt_path, "rb") as f:
|
||||
state = torch.load(f, map_location=device)
|
||||
self.model.load_state_dict(state["model"])
|
||||
self.model = self.model.to(device)
|
||||
self.model.eval()
|
||||
|
||||
self.preemphasis = self.config["audio"]["preemphasis"]
|
||||
self.do_amp_to_db_mel = True
|
||||
self.fft_size = self.config["audio"]["fft_size"]
|
||||
self.hop_length = self.config["audio"]["hop_length"]
|
||||
self.win_length = self.config["audio"]["win_length"]
|
||||
self.signal_norm = self.config['audio']['signal_norm']
|
||||
self.num_mels = self.config["audio"]["num_mels"]
|
||||
self.ref_level_db = self.config["audio"]['ref_level_db']
|
||||
self.min_level_db = self.config["audio"]['min_level_db']
|
||||
self.symmetric_norm = self.config["audio"]['symmetric_norm']
|
||||
self.clip_norm = self.config["audio"]['clip_norm']
|
||||
self.max_norm = self.config["audio"]['max_norm']
|
||||
self.stft_pad_mode = 'reflect'
|
||||
self.spec_gain = 20.0
|
||||
self.base = 10
|
||||
self.device = device
|
||||
mel_basis = librosa.filters.mel(
|
||||
sr=self.config["audio"]["sample_rate"], n_fft=self.config["audio"]['fft_size'],
|
||||
n_mels=self.num_mels, fmin=self.config["audio"]['mel_fmin'],
|
||||
fmax=self.config["audio"]['mel_fmax']
|
||||
)
|
||||
self.mel_basis = torch.from_numpy(mel_basis).float()
|
||||
|
||||
def __call__(self, audio, use_old_infer=True):
|
||||
y = audio
|
||||
if self.preemphasis != 0:
|
||||
y = scipy.signal.lfilter([1, -self.preemphasis], [1], y)
|
||||
D = librosa.stft(
|
||||
y=y,
|
||||
n_fft=self.fft_size, hop_length=self.hop_length, win_length=self.win_length, pad_mode=self.stft_pad_mode,
|
||||
window="hann", center=True)
|
||||
D = np.abs(D)
|
||||
D = np.dot(self.mel_basis, D)
|
||||
if self.base == 10:
|
||||
spec = self.spec_gain * np.log10(np.maximum(1e-5, D))
|
||||
else:
|
||||
spec = self.spec_gain * np.log(np.maximum(1e-5, D))
|
||||
spec = self.normalize(spec).astype(np.float32)
|
||||
spec = torch.from_numpy(spec.T)
|
||||
spec = spec.to(self.device)
|
||||
spec = spec.unsqueeze(0)
|
||||
if use_old_infer:
|
||||
spk_emb = self.compute_embedding_old(spec).detach().cpu().numpy()
|
||||
else:
|
||||
spk_emb = self.model.compute_embedding(spec).detach().cpu().numpy()
|
||||
return spk_emb.squeeze()
|
||||
|
||||
def normalize(self, S) -> np.ndarray:
|
||||
S = S.copy()
|
||||
if self.signal_norm:
|
||||
S -= self.ref_level_db
|
||||
S_norm = (S - self.min_level_db) / (-self.min_level_db)
|
||||
if self.symmetric_norm:
|
||||
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
|
||||
if self.clip_norm:
|
||||
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
|
||||
return S_norm
|
||||
else:
|
||||
S_norm = self.max_norm * S_norm
|
||||
if self.clip_norm:
|
||||
S_norm = np.clip(S_norm, 0, self.max_norm)
|
||||
return S_norm
|
||||
else:
|
||||
return S
|
||||
|
||||
def compute_embedding_old(self, x, num_frames=250, num_eval=10, return_mean=True):
|
||||
max_len = x.shape[1]
|
||||
|
||||
if max_len < num_frames:
|
||||
num_frames = max_len
|
||||
|
||||
offsets = np.linspace(0, max_len - num_frames, num=num_eval)
|
||||
|
||||
frames_batch = []
|
||||
for offset in offsets:
|
||||
offset = int(offset)
|
||||
end_offset = int(offset + num_frames)
|
||||
frames = x[:, offset:end_offset]
|
||||
frames_batch.append(frames)
|
||||
|
||||
frames_batch = torch.cat(frames_batch, dim=0)
|
||||
embeddings = self.model.inference(frames_batch)
|
||||
|
||||
if return_mean:
|
||||
embeddings = torch.mean(embeddings, dim=0, keepdim=True)
|
||||
|
||||
return embeddings
|
||||
|
||||
|
||||
class F0_Extractor:
|
||||
def __init__(self, f0_extractor, sample_rate=44100, hop_size=512, f0_min=65, f0_max=800,
|
||||
block_size=None, model_sampling_rate=None):
|
||||
self.block_size = block_size
|
||||
self.model_sampling_rate = model_sampling_rate
|
||||
self.f0_extractor = f0_extractor
|
||||
self.sample_rate = sample_rate
|
||||
self.hop_size = hop_size
|
||||
self.f0_min = f0_min
|
||||
self.f0_max = f0_max
|
||||
self.transformer_f0 = None
|
||||
if f0_extractor == 'crepe':
|
||||
key_str = str(sample_rate)
|
||||
if key_str not in CREPE_RESAMPLE_KERNEL:
|
||||
CREPE_RESAMPLE_KERNEL[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
|
||||
self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
|
||||
if (self.block_size is not None) or (self.model_sampling_rate is not None):
|
||||
assert (self.block_size is not None) and (self.model_sampling_rate is not None)
|
||||
self.hop_size_follow_input = True
|
||||
else:
|
||||
self.hop_size_follow_input = False
|
||||
|
||||
def extract(self, audio, uv_interp=False, device=None, silence_front=0, sr=None): # audio: 1d numpy array
|
||||
if sr is not None:
|
||||
assert self.hop_size_follow_input
|
||||
self.hop_size = self.block_size * sr / self.model_sampling_rate
|
||||
if (self.f0_extractor == 'crepe') and (sr != self.sample_rate):
|
||||
key_str = str(sr)
|
||||
if key_str not in CREPE_RESAMPLE_KERNEL:
|
||||
CREPE_RESAMPLE_KERNEL[key_str] = Resample(sr, 16000, lowpass_filter_width=128)
|
||||
self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
|
||||
self.sample_rate = sr
|
||||
|
||||
# extractor start time
|
||||
raw_audio = audio
|
||||
n_frames = int(len(audio) // self.hop_size) + 1
|
||||
|
||||
start_frame = int(silence_front * self.sample_rate / self.hop_size)
|
||||
real_silence_front = start_frame * self.hop_size / self.sample_rate
|
||||
audio = audio[int(np.round(real_silence_front * self.sample_rate)):]
|
||||
|
||||
# extract f0 using parselmouth
|
||||
if self.f0_extractor == 'parselmouth':
|
||||
f0 = parselmouth.Sound(audio, self.sample_rate).to_pitch_ac(
|
||||
time_step=self.hop_size / self.sample_rate,
|
||||
voicing_threshold=0.6,
|
||||
pitch_floor=self.f0_min,
|
||||
pitch_ceiling=self.f0_max).selected_array['frequency']
|
||||
pad_size = start_frame + (int(len(audio) // self.hop_size) - len(f0) + 1) // 2
|
||||
f0 = np.pad(f0, (pad_size, n_frames - len(f0) - pad_size))
|
||||
|
||||
# extract f0 using dio
|
||||
elif self.f0_extractor == 'dio':
|
||||
_f0, t = pw.dio(
|
||||
audio.astype('double'),
|
||||
self.sample_rate,
|
||||
f0_floor=self.f0_min,
|
||||
f0_ceil=self.f0_max,
|
||||
channels_in_octave=2,
|
||||
frame_period=(1000 * self.hop_size / self.sample_rate))
|
||||
f0 = pw.stonemask(audio.astype('double'), _f0, t, self.sample_rate)
|
||||
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
|
||||
|
||||
# extract f0 using harvest
|
||||
elif self.f0_extractor == 'harvest':
|
||||
f0, _ = pw.harvest(
|
||||
audio.astype('double'),
|
||||
self.sample_rate,
|
||||
f0_floor=self.f0_min,
|
||||
f0_ceil=self.f0_max,
|
||||
frame_period=(1000 * self.hop_size / self.sample_rate))
|
||||
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
|
||||
|
||||
# extract f0 using crepe
|
||||
elif self.f0_extractor == 'crepe':
|
||||
if device is None:
|
||||
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
resample_kernel = self.resample_kernel.to(device)
|
||||
wav16k_torch = resample_kernel(torch.FloatTensor(audio).unsqueeze(0).to(device))
|
||||
|
||||
f0, pd = torchcrepe.predict(wav16k_torch, 16000, 80, self.f0_min, self.f0_max, pad=True, model='full',
|
||||
batch_size=512, device=device, return_periodicity=True)
|
||||
pd = median_pool_1d(pd, 4)
|
||||
f0 = torchcrepe.threshold.At(0.05)(f0, pd)
|
||||
f0 = masked_avg_pool_1d(f0, 4)
|
||||
|
||||
f0 = f0.squeeze(0).cpu().numpy()
|
||||
f0 = np.array(
|
||||
[f0[int(min(int(np.round(n * self.hop_size / self.sample_rate / 0.005)), len(f0) - 1))] for n in
|
||||
range(n_frames - start_frame)])
|
||||
f0 = np.pad(f0, (start_frame, 0))
|
||||
|
||||
elif self.f0_extractor == "transformer_f0":
|
||||
if self.transformer_f0 is None:
|
||||
from transformer_f0.model import TransformerF0Infer
|
||||
self.transformer_f0 = TransformerF0Infer(model_path='exp/f0_test_genshin/model_540000.pt')
|
||||
# raw_audio = audio
|
||||
f0 = self.transformer_f0(audio=raw_audio, sr=self.sample_rate)
|
||||
# f0 = f0.transpose(1, 2)
|
||||
# f0 = torch.nn.functional.interpolate(f0, size=int(n_frames), mode='nearest')
|
||||
# f0 = f0.transpose(1, 2)
|
||||
f0 = f0.squeeze().cpu().numpy()
|
||||
# f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
|
||||
else:
|
||||
raise ValueError(f" [x] Unknown f0 extractor: {self.f0_extractor}")
|
||||
|
||||
# interpolate the unvoiced f0
|
||||
if uv_interp:
|
||||
uv = f0 == 0
|
||||
if len(f0[~uv]) > 0:
|
||||
f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
|
||||
f0[f0 < self.f0_min] = self.f0_min
|
||||
return f0
|
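A short sketch of f0 extraction with the class above, using WORLD's harvest at the default 44.1 kHz / 512-sample hop; the file path is a placeholder.

audio, _ = librosa.load("input.wav", sr=44100)           # placeholder path
f0_extractor = F0_Extractor("harvest", sample_rate=44100, hop_size=512)
f0 = f0_extractor.extract(audio, uv_interp=True)         # unvoiced frames interpolated
print(f0.shape)                                           # (n_frames,)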
||||
|
||||
|
||||
class Volume_Extractor:
|
||||
def __init__(self, hop_size=512, block_size=None, model_sampling_rate=None):
|
||||
self.block_size = block_size
|
||||
self.model_sampling_rate = model_sampling_rate
|
||||
self.hop_size = hop_size
|
||||
if (self.block_size is not None) or (self.model_sampling_rate is not None):
|
||||
assert (self.block_size is not None) and (self.model_sampling_rate is not None)
|
||||
self.hop_size_follow_input = True
|
||||
else:
|
||||
self.hop_size_follow_input = False
|
||||
|
||||
def extract(self, audio, sr=None): # audio: 1d numpy array
|
||||
if sr is not None:
|
||||
assert self.hop_size_follow_input
|
||||
self.hop_size = self.block_size * sr / self.model_sampling_rate
|
||||
n_frames = int(len(audio) // self.hop_size) + 1
|
||||
audio2 = audio ** 2
|
||||
audio2 = np.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode='reflect')
|
||||
volume = np.array(
|
||||
[np.mean(audio2[int(n * self.hop_size): int((n + 1) * self.hop_size)]) for n in range(n_frames)])
|
||||
volume = np.sqrt(volume)
|
||||
'''
|
||||
if isinstance(audio, torch.Tensor):
|
||||
n_frames = int(audio.size(-1) // self.hop_size) + 1
|
||||
audio2 = audio ** 2
|
||||
audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)),
|
||||
mode='reflect')
|
||||
audio_frame = torch.nn.functional.unfold(audio2[:, None, None, :], (1, int(self.hop_size)),
|
||||
stride=int(self.hop_size))[:, :, :n_frames]
|
||||
volume = audio_frame.mean(dim=1)[0]
|
||||
volume = torch.sqrt(volume).squeeze().cpu().numpy()
|
||||
else:
|
||||
n_frames = int(len(audio) // self.hop_size) + 1
|
||||
audio2 = audio ** 2
|
||||
audio2 = np.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode='reflect')
|
||||
volume = np.array(
|
||||
[np.mean(audio2[int(n * self.hop_size): int((n + 1) * self.hop_size)]) for n in range(n_frames)])
|
||||
volume = np.sqrt(volume)
|
||||
'''
|
||||
return volume
|
||||
|
||||
def get_mask_from_volume(self, volume, threhold=-60.0, device='cpu'):
|
||||
mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
|
||||
mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
|
||||
mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
|
||||
mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
|
||||
mask = upsample(mask, self.block_size).squeeze(-1)
|
||||
return mask
|
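A companion sketch for the volume extractor; the block size and model sampling rate are illustrative values that enable the follow-input hop size and the mask upsampling.

audio, _ = librosa.load("input.wav", sr=44100)            # placeholder path
volume_extractor = Volume_Extractor(hop_size=512, block_size=512, model_sampling_rate=44100)
volume = volume_extractor.extract(audio, sr=44100)         # frame-wise RMS
mask = volume_extractor.get_mask_from_volume(volume, threhold=-60.0, device="cpu")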
||||
|
||||
class Units_Encoder:
|
||||
def __init__(self, encoder, encoder_ckpt, encoder_sample_rate=16000, encoder_hop_size=320, device=None,
|
||||
cnhubertsoft_gate=10, units_forced_mode='nearest'):
|
||||
if device is None:
|
||||
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
self.device = device
|
||||
|
||||
if cnhubertsoft_gate is None:
|
||||
cnhubertsoft_gate = 10
|
||||
if units_forced_mode is None:
|
||||
units_forced_mode = 'left'
|
||||
self.units_forced_mode = units_forced_mode
|
||||
|
||||
is_loaded_encoder = False
|
||||
if encoder == 'hubertsoft':
|
||||
self.model = Audio2HubertSoft(encoder_ckpt).to(device)
|
||||
is_loaded_encoder = True
|
||||
if encoder == 'hubertbase':
|
||||
self.model = Audio2HubertBase(encoder_ckpt, device=device)
|
||||
is_loaded_encoder = True
|
||||
if encoder == 'hubertbase768':
|
||||
self.model = Audio2HubertBase768(encoder_ckpt, device=device)
|
||||
is_loaded_encoder = True
|
||||
if encoder == 'hubertbase768l12':
|
||||
self.model = Audio2HubertBase768L12(encoder_ckpt, device=device)
|
||||
is_loaded_encoder = True
|
||||
if encoder == 'hubertlarge1024l24':
|
||||
self.model = Audio2HubertLarge1024L24(encoder_ckpt, device=device)
|
||||
is_loaded_encoder = True
|
||||
if encoder == 'contentvec':
|
||||
self.model = Audio2ContentVec(encoder_ckpt, device=device)
|
||||
is_loaded_encoder = True
|
||||
if encoder == 'contentvec768':
|
||||
self.model = Audio2ContentVec768(encoder_ckpt, device=device)
|
||||
is_loaded_encoder = True
|
||||
if encoder == 'contentvec768l12':
|
||||
self.model = Audio2ContentVec768L12(encoder_ckpt, device=device)
|
||||
is_loaded_encoder = True
|
||||
if encoder == 'cnhubertsoftfish':
|
||||
self.model = CNHubertSoftFish(encoder_ckpt, device=device, gate_size=cnhubertsoft_gate)
|
||||
is_loaded_encoder = True
|
||||
if encoder in ('wav2vec2', 'wav2vec2-xlsr-53-espeak-cv-ft'):
|
||||
self.model = Wav2Vec2(encoder_ckpt, device=device)
|
||||
is_loaded_encoder = True
|
||||
if not is_loaded_encoder:
|
||||
raise ValueError(f" [x] Unknown units encoder: {encoder}")
|
||||
print(f"Units Forced Mode:{self.units_forced_mode}")
|
||||
|
||||
if self.units_forced_mode == 'rfa512to441':
|
||||
encoder_sample_rate = encoder_sample_rate * 441 / 512
|
||||
if self.units_forced_mode == 'rfa441to512':
|
||||
encoder_sample_rate = encoder_sample_rate * 512 / 441
|
||||
|
||||
self.resample_kernel = {}
|
||||
self.encoder_sample_rate = encoder_sample_rate
|
||||
self.encoder_hop_size = encoder_hop_size
|
||||
|
||||
def encode(self,
|
||||
audio, # B, T
|
||||
sample_rate,
|
||||
hop_size,
|
||||
padding_mask=None):
|
||||
|
||||
# resample
|
||||
if self.units_forced_mode not in ('rfa441to512', 'rfa512to441'):
|
||||
if sample_rate == self.encoder_sample_rate:
|
||||
audio_res = audio
|
||||
else:
|
||||
key_str = str(sample_rate)
|
||||
if key_str not in self.resample_kernel:
|
||||
self.resample_kernel[key_str] = Resample(sample_rate, self.encoder_sample_rate,
|
||||
lowpass_filter_width=128).to(self.device)
|
||||
audio_res = self.resample_kernel[key_str](audio)
|
||||
else:
|
||||
if isinstance(audio, np.ndarray):
|
||||
_audio = audio
|
||||
else:
|
||||
_audio = audio.cpu().numpy()
|
||||
audio_res = librosa.resample(_audio, orig_sr=sample_rate, target_sr=self.encoder_sample_rate)
|
||||
audio_res = torch.from_numpy(audio_res).to(self.device)
|
||||
|
||||
# encode
|
||||
if audio_res.size(-1) < 400:
|
||||
audio_res = torch.nn.functional.pad(audio_res, (0, 400 - audio_res.size(-1)))  # pad the resampled audio up to the encoder's minimum length
|
||||
units = self.model(audio_res, padding_mask=padding_mask)
|
||||
|
||||
# alignment
|
||||
if self.units_forced_mode == 'left':
|
||||
n_frames = audio.size(-1) // hop_size + 1
|
||||
ratio = (hop_size / sample_rate) / (self.encoder_hop_size / self.encoder_sample_rate)
|
||||
index = torch.clamp(torch.round(ratio * torch.arange(n_frames).to(self.device)).long(), max=units.size(1) - 1)
|
||||
units_aligned = torch.gather(units, 1, index.unsqueeze(0).unsqueeze(-1).repeat([1, 1, units.size(-1)]))
|
||||
|
||||
elif self.units_forced_mode == 'nearest':
|
||||
n_frames = int(audio.size(-1) // hop_size + 1)
|
||||
units = units.transpose(1, 2)
|
||||
units_aligned = torch.nn.functional.interpolate(units, size=int(n_frames), mode='nearest')
|
||||
units_aligned = units_aligned.transpose(1, 2)
|
||||
|
||||
elif self.units_forced_mode in ('rfa441to512', 'rfa512to441'):
|
||||
n_frames = int(audio.size(-1) // hop_size + 1)
|
||||
units = units.transpose(1, 2)
|
||||
units_aligned = torch.nn.functional.interpolate(units, size=int(n_frames), mode='nearest')
|
||||
units_aligned = units_aligned.transpose(1, 2)
|
||||
|
||||
else:
|
||||
raise ValueError(f'Unknown units_forced_mode: {self.units_forced_mode}')
|
||||
return units_aligned
|
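A hedged sketch of turning raw audio into aligned units with the encoder wrapper above; the ContentVec checkpoint path is a placeholder and the hop size stands in for the synthesizer's block size.

device = "cuda" if torch.cuda.is_available() else "cpu"
units_encoder = Units_Encoder(
    "contentvec768l12",
    "pretrain/contentvec/checkpoint_best_legacy_500.pt",   # placeholder checkpoint
    encoder_sample_rate=16000,
    encoder_hop_size=320,
    device=device,
)
audio, _ = librosa.load("input.wav", sr=44100)             # placeholder path
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(device)   # (1, T)
units = units_encoder.encode(audio_t, sample_rate=44100, hop_size=512)
print(units.shape)                                          # (1, n_frames, 768)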
||||
|
||||
|
||||
class Audio2HubertSoft(torch.nn.Module):
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320):
|
||||
super().__init__()
|
||||
print(' [Encoder Model] HuBERT Soft')
|
||||
self.hubert = HubertSoft()
|
||||
print(' [Loading] ' + path)
|
||||
checkpoint = torch.load(path)
|
||||
consume_prefix_in_state_dict_if_present(checkpoint, "module.")
|
||||
self.hubert.load_state_dict(checkpoint)
|
||||
self.hubert.eval()
|
||||
|
||||
def forward(self, audio, padding_mask=None): # B, T
|
||||
with torch.inference_mode():
|
||||
units = self.hubert.units(audio.unsqueeze(1))
|
||||
return units
|
||||
|
||||
|
||||
class Audio2ContentVec():
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
|
||||
self.device = device
|
||||
print(' [Encoder Model] Content Vec')
|
||||
print(' [Loading] ' + path)
|
||||
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
|
||||
self.hubert = self.models[0]
|
||||
self.hubert = self.hubert.to(self.device)
|
||||
self.hubert.eval()
|
||||
|
||||
def __call__(self, audio, padding_mask=None): # B, T
|
||||
# wav_tensor = torch.from_numpy(audio).to(self.device)
|
||||
wav_tensor = audio
|
||||
feats = wav_tensor.view(1, -1)
|
||||
if padding_mask is None:
|
||||
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
|
||||
else:
|
||||
padding_mask = padding_mask.bool()
|
||||
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
|
||||
inputs = {
|
||||
"source": feats.to(wav_tensor.device),
|
||||
"padding_mask": padding_mask.to(wav_tensor.device),
|
||||
"output_layer": 9, # layer 9
|
||||
}
|
||||
with torch.no_grad():
|
||||
logits = self.hubert.extract_features(**inputs)
|
||||
feats = self.hubert.final_proj(logits[0])
|
||||
units = feats # .transpose(2, 1)
|
||||
return units
|
||||
|
||||
|
||||
class Audio2ContentVec768():
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
|
||||
self.device = device
|
||||
print(' [Encoder Model] Content Vec')
|
||||
print(' [Loading] ' + path)
|
||||
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
|
||||
self.hubert = self.models[0]
|
||||
self.hubert = self.hubert.to(self.device)
|
||||
self.hubert.eval()
|
||||
|
||||
def __call__(self, audio, padding_mask=None): # B, T
|
||||
# wav_tensor = torch.from_numpy(audio).to(self.device)
|
||||
wav_tensor = audio
|
||||
feats = wav_tensor.view(1, -1)
|
||||
if padding_mask is None:
|
||||
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
|
||||
else:
|
||||
padding_mask = padding_mask.bool()
|
||||
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
|
||||
inputs = {
|
||||
"source": feats.to(wav_tensor.device),
|
||||
"padding_mask": padding_mask.to(wav_tensor.device),
|
||||
"output_layer": 9, # layer 9
|
||||
}
|
||||
with torch.no_grad():
|
||||
logits = self.hubert.extract_features(**inputs)
|
||||
feats = logits[0]
|
||||
units = feats # .transpose(2, 1)
|
||||
return units
|
||||
|
||||
|
||||
class Audio2ContentVec768L12():
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
|
||||
self.device = device
|
||||
print(' [Encoder Model] Content Vec')
|
||||
print(' [Loading] ' + path)
|
||||
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
|
||||
self.hubert = self.models[0]
|
||||
self.hubert = self.hubert.to(self.device)
|
||||
self.hubert.eval()
|
||||
|
||||
def __call__(self, audio, padding_mask=None): # B, T
|
||||
# wav_tensor = torch.from_numpy(audio).to(self.device)
|
||||
wav_tensor = audio
|
||||
feats = wav_tensor.view(1, -1)
|
||||
if padding_mask is None:
|
||||
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
|
||||
else:
|
||||
padding_mask = padding_mask.bool()
|
||||
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
|
||||
inputs = {
|
||||
"source": feats.to(wav_tensor.device),
|
||||
"padding_mask": padding_mask.to(wav_tensor.device),
|
||||
"output_layer": 12, # layer 12
|
||||
}
|
||||
with torch.no_grad():
|
||||
logits = self.hubert.extract_features(**inputs)
|
||||
feats = logits[0]
|
||||
units = feats # .transpose(2, 1)
|
||||
return units
|
||||
|
||||
|
||||
class CNHubertSoftFish(torch.nn.Module):
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu', gate_size=10):
|
||||
super().__init__()
|
||||
self.device = device
|
||||
print(' [Encoder Model] CN Hubert Soft fish')
|
||||
print(' [Loading] ' + path)
|
||||
self.gate_size = gate_size
|
||||
|
||||
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
|
||||
"./pretrain/TencentGameMate/chinese-hubert-base")
|
||||
self.model = HubertModel.from_pretrained("./pretrain/TencentGameMate/chinese-hubert-base")
|
||||
self.proj = torch.nn.Sequential(torch.nn.Dropout(0.1), torch.nn.Linear(768, 256))
|
||||
# self.label_embedding = nn.Embedding(128, 256)
|
||||
|
||||
state_dict = torch.load(path, map_location=device)
|
||||
self.load_state_dict(state_dict)
|
||||
|
||||
@torch.no_grad()
|
||||
def forward(self, audio, padding_mask=None): # B, T
|
||||
input_values = self.feature_extractor(
|
||||
audio, sampling_rate=16000, return_tensors="pt"
|
||||
).input_values
|
||||
input_values = input_values.to(self.model.device)
|
||||
|
||||
return self._forward(input_values[0])
|
||||
|
||||
@torch.no_grad()
|
||||
def _forward(self, input_values):
|
||||
features = self.model(input_values)
|
||||
features = self.proj(features.last_hidden_state)
|
||||
|
||||
# Top-k gating
|
||||
topk, indices = torch.topk(features, self.gate_size, dim=2)
|
||||
features = torch.zeros_like(features).scatter(2, indices, topk)
|
||||
features = features / features.sum(2, keepdim=True)
|
||||
|
||||
return features.to(self.device) # .transpose(1, 2)
|
||||
|
||||
|
||||
class Audio2HubertBase():
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
|
||||
self.device = device
|
||||
print(' [Encoder Model] HuBERT Base')
|
||||
print(' [Loading] ' + path)
|
||||
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
|
||||
self.hubert = self.models[0]
|
||||
self.hubert = self.hubert.to(self.device)
|
||||
self.hubert = self.hubert.float()
|
||||
self.hubert.eval()
|
||||
|
||||
def __call__(self, audio, padding_mask=None): # B, T
|
||||
with torch.no_grad():
|
||||
if padding_mask is None:
|
||||
padding_mask = torch.BoolTensor(audio.shape).fill_(False)
|
||||
else:
|
||||
padding_mask = padding_mask.bool()
|
||||
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
|
||||
inputs = {
|
||||
"source": audio.to(self.device),
|
||||
"padding_mask": padding_mask.to(self.device),
|
||||
"output_layer": 9, # layer 9
|
||||
}
|
||||
logits = self.hubert.extract_features(**inputs)
|
||||
units = self.hubert.final_proj(logits[0])
|
||||
return units
|
||||
|
||||
|
||||
class Audio2HubertBase768():
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
|
||||
self.device = device
|
||||
print(' [Encoder Model] HuBERT Base')
|
||||
print(' [Loading] ' + path)
|
||||
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
|
||||
self.hubert = self.models[0]
|
||||
self.hubert = self.hubert.to(self.device)
|
||||
self.hubert = self.hubert.float()
|
||||
self.hubert.eval()
|
||||
|
||||
def __call__(self, audio, padding_mask=None): # B, T
|
||||
with torch.no_grad():
|
||||
if padding_mask is None:
|
||||
padding_mask = torch.BoolTensor(audio.shape).fill_(False)
|
||||
else:
|
||||
padding_mask = padding_mask.bool()
|
||||
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
|
||||
inputs = {
|
||||
"source": audio.to(self.device),
|
||||
"padding_mask": padding_mask.to(self.device),
|
||||
"output_layer": 9, # layer 9
|
||||
}
|
||||
logits = self.hubert.extract_features(**inputs)
|
||||
units = logits[0]
|
||||
return units
|
||||
|
||||
|
||||
class Audio2HubertBase768L12():
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
|
||||
self.device = device
|
||||
print(' [Encoder Model] HuBERT Base')
|
||||
print(' [Loading] ' + path)
|
||||
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
|
||||
self.hubert = self.models[0]
|
||||
self.hubert = self.hubert.to(self.device)
|
||||
self.hubert = self.hubert.float()
|
||||
self.hubert.eval()
|
||||
|
||||
def __call__(self, audio, padding_mask=None): # B, T
|
||||
with torch.no_grad():
|
||||
if padding_mask is None:
|
||||
padding_mask = torch.BoolTensor(audio.shape).fill_(False)
|
||||
else:
|
||||
padding_mask = padding_mask.bool()
|
||||
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
|
||||
inputs = {
|
||||
"source": audio.to(self.device),
|
||||
"padding_mask": padding_mask.to(self.device),
|
||||
"output_layer": 12, # layer 12
|
||||
}
|
||||
logits = self.hubert.extract_features(**inputs)
|
||||
units = logits[0]
|
||||
return units
|
||||
|
||||
|
||||
class Audio2HubertLarge1024L24():
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
|
||||
self.device = device
|
||||
print(' [Encoder Model] HuBERT Large')
|
||||
print(' [Loading] ' + path)
|
||||
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
|
||||
self.hubert = self.models[0]
|
||||
self.hubert = self.hubert.to(self.device)
|
||||
self.hubert = self.hubert.float()
|
||||
self.hubert.eval()
|
||||
|
||||
def __call__(self, audio, padding_mask=None): # B, T
|
||||
with torch.no_grad():
|
||||
if padding_mask is None:
|
||||
padding_mask = torch.BoolTensor(audio.shape).fill_(False)
|
||||
else:
|
||||
padding_mask = padding_mask.bool()
|
||||
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
|
||||
inputs = {
|
||||
"source": audio.to(self.device),
|
||||
"padding_mask": padding_mask.to(self.device),
|
||||
"output_layer": 24, # layer 24
|
||||
}
|
||||
logits = self.hubert.extract_features(**inputs)
|
||||
units = logits[0]
|
||||
return units
|
||||
|
||||
|
||||
class Wav2Vec2:
|
||||
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
|
||||
self.device = device
|
||||
self.model = Wav2Vec2ForCTC.from_pretrained(path)
|
||||
self.model.eval()
|
||||
self.model.to(device)
|
||||
|
||||
def __call__(self, audio, padding_mask=None): # B, T
|
||||
with torch.no_grad():
|
||||
logits = self.model(audio).logits
|
||||
return logits
|
||||
|
||||
|
||||
class DotDict(dict):
|
||||
def __getattr__(*args):
|
||||
val = dict.get(*args)
|
||||
return DotDict(val) if type(val) is dict else val
|
||||
|
||||
__setattr__ = dict.__setitem__
|
||||
__delattr__ = dict.__delitem__
|
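A one-line illustration of the attribute-style access DotDict provides over nested configuration dictionaries.

cfg = DotDict({"model": {"n_layers": 20, "use_pitch_aug": True}})
print(cfg.model.n_layers)   # 20 — nested dicts are wrapped on access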
||||
|
||||
|
||||
def masked_avg_pool_1d(x, kernel_size):
|
||||
x = x.unsqueeze(1)
|
||||
x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
|
||||
mask = ~torch.isnan(x)
|
||||
masked_x = torch.where(mask, x, torch.zeros_like(x))
|
||||
ones_kernel = torch.ones(x.size(1), 1, kernel_size, device=x.device)
|
||||
|
||||
# Perform sum pooling
|
||||
sum_pooled = F.conv1d(
|
||||
masked_x,
|
||||
ones_kernel,
|
||||
stride=1,
|
||||
padding=0,
|
||||
groups=x.size(1),
|
||||
)
|
||||
|
||||
# Count the non-masked (valid) elements in each pooling window
|
||||
valid_count = F.conv1d(
|
||||
mask.float(),
|
||||
ones_kernel,
|
||||
stride=1,
|
||||
padding=0,
|
||||
groups=x.size(1),
|
||||
)
|
||||
valid_count = valid_count.clamp(min=1) # Avoid division by zero
|
||||
|
||||
# Perform masked average pooling
|
||||
avg_pooled = sum_pooled / valid_count
|
||||
|
||||
return avg_pooled.squeeze(1)
|
||||
|
||||
|
||||
def median_pool_1d(x, kernel_size):
|
||||
x = x.unsqueeze(1)
|
||||
x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
|
||||
x = x.squeeze(1)
|
||||
x = x.unfold(1, kernel_size, 1)
|
||||
x, _ = torch.sort(x, dim=-1)
|
||||
return x[:, :, (kernel_size - 1) // 2]
|
||||
|
||||
|
||||
def upsample(signal, factor):
|
||||
signal = signal.permute(0, 2, 1)
|
||||
signal = nn.functional.interpolate(torch.cat((signal, signal[:, :, -1:]), 2), size=signal.shape[-1] * factor + 1,
|
||||
mode='linear', align_corners=True)
|
||||
signal = signal[:, :, :-1]
|
||||
return signal.permute(0, 2, 1)
|
||||
|
||||
|
||||
def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
|
||||
result = np.zeros(idx + b.shape[0])
|
||||
fade_len = a.shape[0] - idx
|
||||
np.copyto(dst=result[:idx], src=a[:idx])
|
||||
k = np.linspace(0, 1.0, num=fade_len, endpoint=True)
|
||||
result[idx: a.shape[0]] = (1 - k) * a[idx:] + k * b[: fade_len]
|
||||
np.copyto(dst=result[a.shape[0]:], src=b[fade_len:])
|
||||
return result
|
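A small sketch of stitching two consecutive chunks with cross_fade; the chunk lengths and overlap are illustrative.

sr = 44100
a = np.random.randn(sr).astype(np.float32)     # previous chunk (1 s)
b = np.random.randn(sr).astype(np.float32)     # next chunk (1 s)
idx = sr - sr // 4                              # fade length = len(a) - idx = 0.25 s
out = cross_fade(a, b, idx)
print(out.shape)                                # (idx + len(b),)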
@ -0,0 +1,80 @@
|
||||
import os
|
||||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
import pickle
|
||||
import torch
|
||||
from pathlib import Path
|
||||
|
||||
def train_index(path):
|
||||
import faiss
|
||||
# from: RVC https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
|
||||
# collect the list of unit files
|
||||
listdir_res = []
|
||||
for file in os.listdir(path):
|
||||
listdir_res.append(os.path.join(path, file))
|
||||
npys = []
|
||||
# load the files
|
||||
print(" [INFO] Loading the Units files...")
|
||||
for name in tqdm(sorted(listdir_res)):
|
||||
phone = np.load(name)
|
||||
npys.append(phone)
|
||||
# main processing: build and train the index
|
||||
big_npy = np.concatenate(npys, 0)
|
||||
big_npy_idx = np.arange(big_npy.shape[0])
|
||||
np.random.shuffle(big_npy_idx)
|
||||
big_npy = big_npy[big_npy_idx]
|
||||
n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
|
||||
index = faiss.index_factory(big_npy.shape[1], "IVF%s,Flat" % n_ivf)
|
||||
index_ivf = faiss.extract_index_ivf(index) #
|
||||
index_ivf.nprobe = 1
|
||||
index.train(big_npy)
|
||||
batch_size_add = 8192
|
||||
print(" [INFO] Training the Units indexes...")
|
||||
for i in tqdm(range(0, big_npy.shape[0], batch_size_add)):
|
||||
index.add(big_npy[i: i + batch_size_add])
|
||||
return index
|
||||
|
||||
|
||||
class UnitsIndexer:
|
||||
def __init__(self, exp_path):
|
||||
exp_path = Path(exp_path)
|
||||
self.model = None
|
||||
self.exp_path = exp_path
|
||||
self.spk_id = -1
|
||||
self.active = False
|
||||
self.big_all_npy = None
|
||||
|
||||
def load(self, spk_id=1, exp_path=None):
|
||||
if (exp_path is not None) and os.path.samefile(self.exp_path, Path(exp_path)):
|
||||
exp_path = Path(exp_path)
|
||||
self.exp_path = exp_path
|
||||
index_pkl_path = os.path.join(self.exp_path, 'units_index', f'spk{spk_id}.pkl')
|
||||
if not os.path.isfile(index_pkl_path):
|
||||
self.active = False
|
||||
print(f" [WARNING] No such file as {index_pkl_path}, Disable Units Indexer.")
|
||||
else:
|
||||
import faiss
|
||||
self.spk_id = spk_id
|
||||
self.active = True
|
||||
with open(index_pkl_path, "rb") as f:
|
||||
self.model = pickle.load(f)[str(spk_id)]
|
||||
self.big_all_npy = self.model.reconstruct_n(0, self.model.ntotal)
|
||||
print(f" [INFO] Successfully load Units Indexer from {index_pkl_path}.")
|
||||
|
||||
def __call__(self, units_t, spk_id=1, ratio=1):
|
||||
if self.spk_id != spk_id:
|
||||
self.load(spk_id=spk_id)
|
||||
if self.active:
|
||||
units = units_t.squeeze().to('cpu').numpy()
|
||||
# print(" [INFO] Starting feature retrieval...")
|
||||
score, ix = self.model.search(units, k=8)
|
||||
weight = np.square(1 / score)
|
||||
weight /= weight.sum(axis=1, keepdims=True)
|
||||
npy = np.sum(self.big_all_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
|
||||
units = ratio * npy + (1 - ratio) * units
|
||||
units_t = torch.from_numpy(units).unsqueeze(0).float().to(units_t.device)
|
||||
# print(f" [INFO] End feature retrieval...Ratio is {ratio}.")
|
||||
return units_t
|
||||
else:
|
||||
print(f" [WARNING] Units Indexer is not active, disable units index.")
|
||||
return units_t
|
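A hedged end-to-end sketch: build a faiss index from per-file unit dumps, store it in the `units_index/spk{spk_id}.pkl` layout that `UnitsIndexer.load` reads, then blend retrieved features at inference time. All paths and the dummy tensor are placeholders.

exp_path = "exp/my_model"                                     # placeholder experiment dir
index = train_index(os.path.join(exp_path, "units"))          # folder of per-file .npy units
os.makedirs(os.path.join(exp_path, "units_index"), exist_ok=True)
with open(os.path.join(exp_path, "units_index", "spk1.pkl"), "wb") as f:
    pickle.dump({"1": index}, f)                              # keyed by str(spk_id)

indexer = UnitsIndexer(exp_path)
units_t = torch.randn(1, 100, 768)                            # dummy units (1, frames, dim); dim must match the index
units_mixed = indexer(units_t, spk_id=1, ratio=0.5)           # 50% retrieved features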
258
server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
Normal file
258
server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
Normal file
@ -0,0 +1,258 @@
|
||||
import numpy as np
|
||||
from typing import Any
|
||||
import math
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch.cuda.amp import autocast
|
||||
from Exceptions import (
|
||||
DeviceCannotSupportHalfPrecisionException,
|
||||
DeviceChangingException,
|
||||
HalfPrecisionChangingException,
|
||||
NotEnoughDataExtimateF0,
|
||||
)
|
||||
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
|
||||
from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono
|
||||
|
||||
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
|
||||
|
||||
|
||||
class Pipeline(object):
|
||||
embedder: Embedder
|
||||
inferencer: Inferencer
|
||||
pitchExtractor: PitchExtractor
|
||||
|
||||
index: Any | None
|
||||
big_npy: Any | None
|
||||
# feature: Any | None
|
||||
|
||||
targetSR: int
|
||||
device: torch.device
|
||||
isHalf: bool
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
embedder: Embedder,
|
||||
inferencer: Inferencer,
|
||||
pitchExtractor: PitchExtractor,
|
||||
index: Any | None,
|
||||
# feature: Any | None,
|
||||
targetSR,
|
||||
device,
|
||||
isHalf,
|
||||
):
|
||||
self.embedder = embedder
|
||||
self.inferencer = inferencer
|
||||
self.pitchExtractor = pitchExtractor
|
||||
print("GENERATE INFERENCER", self.inferencer)
|
||||
print("GENERATE EMBEDDER", self.embedder)
|
||||
print("GENERATE PITCH EXTRACTOR", self.pitchExtractor)
|
||||
|
||||
self.index = index
|
||||
self.big_npy = index.reconstruct_n(0, index.ntotal) if index is not None else None
|
||||
# self.feature = feature
|
||||
|
||||
self.targetSR = targetSR
|
||||
self.device = device
|
||||
self.isHalf = isHalf
|
||||
|
||||
self.sr = 16000
|
||||
self.window = 160
|
||||
|
||||
def getPipelineInfo(self):
|
||||
inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
|
||||
embedderInfo = self.embedder.getEmbedderInfo()
|
||||
pitchExtractorInfo = self.pitchExtractor.getPitchExtractorInfo()
|
||||
return {"inferencer": inferencerInfo, "embedder": embedderInfo, "pitchExtractor": pitchExtractorInfo, "isHalf": self.isHalf}
|
||||
|
||||
def setPitchExtractor(self, pitchExtractor: PitchExtractor):
|
||||
self.pitchExtractor = pitchExtractor
|
||||
|
||||
def exec(
|
||||
self,
|
||||
sid,
|
||||
audio, # torch.tensor [n]
|
||||
pitchf, # np.array [m]
|
||||
feature, # np.array [m, feat]
|
||||
f0_up_key,
|
||||
index_rate,
|
||||
if_f0,
|
||||
silence_front,
|
||||
embOutputLayer,
|
||||
useFinalProj,
|
||||
repeat,
|
||||
protect=0.5,
|
||||
out_size=None,
|
||||
):
|
||||
# The input arrives at a 16000 Hz sampling rate; everything from here on is processed at 16000 Hz.
|
||||
|
||||
search_index = self.index is not None and self.big_npy is not None and index_rate != 0
|
||||
# self.t_pad = self.sr * repeat  # 1 second
|
||||
# self.t_pad_tgt = self.targetSR * repeat  # 1 second; trimmed from the output (which is produced at the model's sampling rate)
|
||||
audio = audio.unsqueeze(0)
|
||||
|
||||
quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr  # the reflect padding must be smaller than the original length
|
||||
|
||||
self.t_pad = round(self.sr * quality_padding_sec)  # audio added before and after
|
||||
self.t_pad_tgt = round(self.targetSR * quality_padding_sec)  # padding added before and after; trimmed from the output (at the model's sampling rate)
|
||||
audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
|
||||
p_len = audio_pad.shape[0] // self.window
|
||||
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
||||
|
||||
# When RVC Quality is on, turn silence_front off.
|
||||
silence_front = silence_front if repeat == 0 else 0
|
||||
pitchf = pitchf if repeat == 0 else np.zeros(p_len)
|
||||
out_size = out_size if repeat == 0 else None
|
||||
|
||||
# pitch detection
|
||||
try:
|
||||
if if_f0 == 1:
|
||||
pitch, pitchf = self.pitchExtractor.extract(
|
||||
audio_pad,
|
||||
pitchf,
|
||||
f0_up_key,
|
||||
self.sr,
|
||||
self.window,
|
||||
silence_front=silence_front,
|
||||
)
|
||||
# pitch = pitch[:p_len]
|
||||
# pitchf = pitchf[:p_len]
|
||||
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
||||
pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
|
||||
else:
|
||||
pitch = None
|
||||
pitchf = None
|
||||
except IndexError:
|
||||
# print(e)
|
||||
raise NotEnoughDataExtimateF0()
|
||||
|
||||
# adjust tensor shape/dtype
|
||||
feats = audio_pad
|
||||
if feats.dim() == 2: # double channels
|
||||
feats = feats.mean(-1)
|
||||
assert feats.dim() == 1, feats.dim()
|
||||
feats = feats.view(1, -1)
|
||||
|
||||
# embedding
|
||||
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
|
||||
with autocast(enabled=self.isHalf):
|
||||
try:
|
||||
feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
|
||||
if torch.isnan(feats).all():
|
||||
raise DeviceCannotSupportHalfPrecisionException()
|
||||
except RuntimeError as e:
|
||||
if "HALF" in e.__str__().upper():
|
||||
raise HalfPrecisionChangingException()
|
||||
elif "same device" in e.__str__():
|
||||
raise DeviceChangingException()
|
||||
else:
|
||||
raise e
|
||||
if protect < 0.5 and search_index:
|
||||
feats0 = feats.clone()
|
||||
|
||||
# Index - feature extraction
|
||||
# if self.index is not None and self.feature is not None and index_rate != 0:
|
||||
if search_index:
|
||||
npy = feats[0].cpu().numpy()
|
||||
# apply silent front for index search
|
||||
npyOffset = math.floor(silence_front * 16000) // 360
|
||||
npy = npy[npyOffset:]
|
||||
|
||||
if self.isHalf is True:
|
||||
npy = npy.astype("float32")
|
||||
|
||||
# TODO: make k configurable
|
||||
k = 1
|
||||
if k == 1:
|
||||
_, ix = self.index.search(npy, 1)
|
||||
npy = self.big_npy[ix.squeeze()]
|
||||
else:
|
||||
score, ix = self.index.search(npy, k=8)
|
||||
weight = np.square(1 / score)
|
||||
weight /= weight.sum(axis=1, keepdims=True)
|
||||
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
|
||||
|
||||
# recover silent front
|
||||
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:]
|
||||
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
|
||||
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||
if protect < 0.5 and search_index:
|
||||
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||
|
||||
# adjust pitch length
|
||||
p_len = audio_pad.shape[0] // self.window
|
||||
if feats.shape[1] < p_len:
|
||||
p_len = feats.shape[1]
|
||||
if pitch is not None and pitchf is not None:
|
||||
pitch = pitch[:, :p_len]
|
||||
pitchf = pitchf[:, :p_len]
|
||||
|
||||
feats_len = feats.shape[1]
|
||||
if pitch is not None and pitchf is not None:
|
||||
pitch = pitch[:, -feats_len:]
|
||||
pitchf = pitchf[:, -feats_len:]
|
||||
p_len = torch.tensor([feats_len], device=self.device).long()
|
||||
|
||||
# When pitch estimation fails (pitchf=0), blend in the pre-retrieval features
|
||||
# The way pitchff is built is questionable, but it follows the upstream implementation, so keep it as is.
|
||||
# https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
|
||||
if protect < 0.5 and search_index:
|
||||
pitchff = pitchf.clone()
|
||||
pitchff[pitchf > 0] = 1
|
||||
pitchff[pitchf < 1] = protect
|
||||
pitchff = pitchff.unsqueeze(-1)
|
||||
feats = feats * pitchff + feats0 * (1 - pitchff)
|
||||
feats = feats.to(feats0.dtype)
|
||||
p_len = torch.tensor([p_len], device=self.device).long()
|
||||
|
||||
# apply silent front for inference
|
||||
if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
|
||||
npyOffset = math.floor(silence_front * 16000) // 360
|
||||
feats = feats[:, npyOffset * 2 :, :] # NOQA
|
||||
|
||||
feats_len = feats.shape[1]
|
||||
if pitch is not None and pitchf is not None:
|
||||
pitch = pitch[:, -feats_len:]
|
||||
pitchf = pitchf[:, -feats_len:]
|
||||
p_len = torch.tensor([feats_len], device=self.device).long()
|
||||
|
||||
# run inference
|
||||
try:
|
||||
with torch.no_grad():
|
||||
with autocast(enabled=self.isHalf):
|
||||
audio1 = (
|
||||
torch.clip(
|
||||
self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
|
||||
-1.0,
|
||||
1.0,
|
||||
)
|
||||
* 32767.5
|
||||
).data.to(dtype=torch.int16)
|
||||
except RuntimeError as e:
|
||||
if "HALF" in e.__str__().upper():
|
||||
print("11", e)
|
||||
raise HalfPrecisionChangingException()
|
||||
else:
|
||||
raise e
|
||||
|
||||
feats_buffer = feats.squeeze(0).detach().cpu()
|
||||
if pitchf is not None:
|
||||
pitchf_buffer = pitchf.squeeze(0).detach().cpu()
|
||||
else:
|
||||
pitchf_buffer = None
|
||||
|
||||
del p_len, padding_mask, pitch, pitchf, feats
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# inferで出力されるサンプリングレートはモデルのサンプリングレートになる。
|
||||
# pipelineに(入力されるときはhubertように16k)
|
||||
if self.t_pad_tgt != 0:
|
||||
offset = self.t_pad_tgt
|
||||
end = -1 * self.t_pad_tgt
|
||||
audio1 = audio1[offset:end]
|
||||
|
||||
del sid
|
||||
torch.cuda.empty_cache()
|
||||
return audio1, pitchf_buffer, feats_buffer
|
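The k=8 branch above performs a weighted top-k lookup in a faiss index and blends the result back into the hubert features via index_rate. A standalone sketch of just that retrieval step is given below; the dimension, the random feature bank, and the blend ratio are stand-ins (assumptions), not values taken from this commit.

# Standalone sketch of the weighted top-k index retrieval used above (illustration only,
# not part of this commit). big_npy plays the role of self.big_npy; feats stands in for
# the hubert feature frames.
import faiss
import numpy as np

dim = 256  # feature dimension (assumption)
big_npy = np.random.rand(1000, dim).astype("float32")  # feature bank built at training time
index = faiss.IndexFlatL2(dim)
index.add(big_npy)

feats = np.random.rand(50, dim).astype("float32")  # frames to convert

score, ix = index.search(feats, 8)           # top-8 neighbours per frame
weight = np.square(1 / score)                # inverse squared distance
weight /= weight.sum(axis=1, keepdims=True)  # normalize weights per frame
retrieved = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

index_rate = 0.75                            # blend ratio (assumption)
mixed = retrieved * index_rate + feats * (1 - index_rate)
print(mixed.shape)  # (50, 256)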
@ -0,0 +1,51 @@
import os
import traceback
import faiss
from data.ModelSlot import DiffusionSVCModelSlot, RVCModelSlot

from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
from voice_changer.RVC.pipeline.Pipeline import Pipeline
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager


def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
dev = DeviceManager.get_instance().getDevice(gpu)
half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)

# # Create inferencer
# try:
# inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelSlot.modelFile, gpu)
# except Exception as e:
# print("[Voice Changer] exception! loading inferencer", e)
# traceback.print_exc()

# # Create embedder
# try:
# embedder = EmbedderManager.getEmbedder(
# modelSlot.embedder,
# # emmbedderFilename,
# half,
# dev,
# )
# except Exception as e:
# print("[Voice Changer] exception! loading embedder", e)
# traceback.print_exc()

# # pitchExtractor
# pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)


# pipeline = Pipeline(
# embedder,
# inferencer,
# pitchExtractor,
# index,
# modelSlot.samplingRate,
# dev,
# half,
# )

# return pipeline
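The new createPipeline above is still a stub: every construction step is commented out and nothing is returned yet. For orientation, a hedged sketch of what the scaffold might look like once those steps are enabled is given below; it only mirrors the commented-out lines, and passing None for the index is an assumption, since index handling is not part of this commit.

# Hypothetical, uncommented form of the scaffold above (assumption, not this commit's code).
def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
    dev = DeviceManager.get_instance().getDevice(gpu)
    half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)

    # Inferencer
    try:
        inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelSlot.modelFile, gpu)
    except Exception as e:
        print("[Voice Changer] exception! loading inferencer", e)
        traceback.print_exc()

    # Embedder
    try:
        embedder = EmbedderManager.getEmbedder(modelSlot.embedder, half, dev)
    except Exception as e:
        print("[Voice Changer] exception! loading embedder", e)
        traceback.print_exc()

    # PitchExtractor
    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)

    # The index argument is not wired up in this commit; None is a placeholder (assumption).
    return Pipeline(embedder, inferencer, pitchExtractor, None, modelSlot.samplingRate, dev, half)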
@ -1,6 +1,6 @@
from typing import Any, Union, cast

from const import TMP_DIR, ModelType
from const import TMP_DIR
import torch
import os
import traceback
@ -71,7 +71,6 @@ class VoiceChanger:
self.crossfadeSize = 0 # calculated

self.voiceChanger: VoiceChangerModel | None = None
self.modelType: ModelType | None = None
self.params = params
self.gpu_num = torch.cuda.device_count()
self.prev_audio = np.zeros(4096)
@ -84,10 +83,7 @@ class VoiceChanger:
self.voiceChanger = model

def getModelType(self):
if self.modelType is not None:
return {"status": "OK", "vc": self.modelType}
else:
return {"status": "OK", "vc": "none"}
return {"status": "OK", "vc": "-----"}

def get_info(self):
data = asdict(self.settings)
@ -9,7 +9,7 @@ from voice_changer.Local.ServerDevice import ServerDevice, ServerDeviceCallbacks
from voice_changer.ModelSlotManager import ModelSlotManager
from voice_changer.RVC.RVCModelMerger import RVCModelMerger
from voice_changer.VoiceChanger import VoiceChanger
from const import STORED_SETTING_FILE, UPLOAD_DIR, ModelType
from const import STORED_SETTING_FILE, UPLOAD_DIR
from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams
from voice_changer.utils.ModelMerger import MergeElement, ModelMergerRequest
from voice_changer.utils.VoiceChangerModel import AudioInOut
@ -165,6 +165,11 @@ class VoiceChangerManager(ServerDeviceCallbacks):

slotInfo = DDSP_SVCModelSlotGenerator.loadModel(params)
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
elif params.voiceChangerType == "Diffusion-SVC":
from voice_changer.DiffusionSVC.DiffusionSVCModelSlotGenerator import DiffusionSVCModelSlotGenerator

slotInfo = DiffusionSVCModelSlotGenerator.loadModel(params)
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
print("params", params)

def get_info(self):
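The new elif branch above is reached when the client uploads a Diffusion-SVC model (a .ptc file registered with kind "diffusionSVCModel"). A rough example of such a load request is sketched below; the constructor fields of LoadModelParams and LoadModelParamFile shown here are assumptions inferred from how params is used in this hunk, not taken from this commit.

# Hypothetical Diffusion-SVC load request (field names are assumptions).
from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams

params = LoadModelParams(
    voiceChangerType="Diffusion-SVC",
    slot=3,
    files=[LoadModelParamFile(name="my_model.ptc", kind="diffusionSVCModel", dir="")],
    params="{}",
)
# The branch above would then route this to DiffusionSVCModelSlotGenerator.loadModel(params).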
@ -232,6 +237,13 @@ class VoiceChangerManager(ServerDeviceCallbacks):
self.voiceChangerModel = DDSP_SVC(self.params, slotInfo)
self.voiceChanger = VoiceChanger(self.params)
self.voiceChanger.setModel(self.voiceChangerModel)
elif slotInfo.voiceChangerType == "Diffusion-SVC":
print("................Diffusion-SVC")
from voice_changer.DiffusionSVC.DiffusionSVC import DiffusionSVC

self.voiceChangerModel = DiffusionSVC(self.params, slotInfo)
self.voiceChanger = VoiceChanger(self.params)
self.voiceChanger.setModel(self.voiceChangerModel)
else:
print(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")
if hasattr(self, "voiceChangerModel"):
@ -267,9 +279,6 @@ class VoiceChangerManager(ServerDeviceCallbacks):
print("Voice Change is not loaded. Did you load a correct model?")
return np.zeros(1).astype(np.int16), []

def switchModelType(self, modelType: ModelType):
return self.voiceChanger.switchModelType(modelType)

def getModelType(self):
return self.voiceChanger.getModelType()

@ -18,6 +18,7 @@ LoadModelParamFileKind: TypeAlias = Literal[
"ddspSvcModelConfig",
"ddspSvcDiffusion",
"ddspSvcDiffusionConfig",
"diffusionSVCModel",
]