import diffusion svc core

This commit is contained in:
w-okada 2023-07-13 00:59:48 +09:00
parent ad013bf4d3
commit 9c829ac91a
33 changed files with 5384 additions and 73 deletions

View File

@ -1 +1,10 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>Voice Changer Client Demo</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
</body>
</html>

File diff suppressed because one or more lines are too long

View File

@ -1,31 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

View File

@ -78,6 +78,9 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
!!setting.files.find(x => { return x.kind == "ddspSvcDiffusion" }) &&
!!setting.files.find(x => { return x.kind == "ddspSvcDiffusionConfig" })
return enough
} else if (setting.voiceChangerType == "Diffusion-SVC") {
const enough = !!setting.files.find(x => { return x.kind == "diffusionSVCModel" })
return enough
}
return false
}
@ -132,6 +135,8 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
rows.push(generateFileRow(uploadSetting!, "Model", "ddspSvcModel", ["pth", "pt"], "model/"))
rows.push(generateFileRow(uploadSetting!, "Config(diff)", "ddspSvcDiffusionConfig", ["yaml"], "diff/"))
rows.push(generateFileRow(uploadSetting!, "Model(diff)", "ddspSvcDiffusion", ["pth", "pt"], "diff/"))
} else if(vcType == "Diffusion-SVC") {
rows.push(generateFileRow(uploadSetting!, "Model", "diffusionSVCModel", ["ptc"]))
}
return rows
}

View File

@ -9,7 +9,8 @@ export const VoiceChangerType = {
"MMVCv13": "MMVCv13",
"so-vits-svc-40": "so-vits-svc-40",
"DDSP-SVC": "DDSP-SVC",
"RVC": "RVC"
"RVC": "RVC",
"Diffusion-SVC":"Diffusion-SVC"
} as const
export type VoiceChangerType = typeof VoiceChangerType[keyof typeof VoiceChangerType]

View File

@ -27,6 +27,8 @@ export const ModelFileKind = {
"ddspSvcDiffusion": "ddspSvcDiffusion",
"ddspSvcDiffusionConfig": "ddspSvcDiffusionConfig",
"diffusionSVCModel": "diffusionSVCModel",
} as const
export type ModelFileKind = typeof ModelFileKind[keyof typeof ModelFileKind]

View File

@ -11,14 +11,7 @@ VoiceChangerType: TypeAlias = Literal[
"so-vits-svc-40",
"DDSP-SVC",
"RVC",
]
ModelType: TypeAlias = Literal[
"MMVCv15",
"MMVCv13",
"so-vits-svc-40",
"DDSP-SVC",
"RVC",
"Diffusion-SVC"
]
STORED_SETTING_FILE = "stored_setting.json"

View File

@ -102,7 +102,22 @@ class DDSPSVCModelSlot(ModelSlot):
speakers: dict = field(default_factory=lambda: {1: "user"})
ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot]
@dataclass
class DiffusionSVCModelSlot(ModelSlot):
voiceChangerType: VoiceChangerType = "Diffusion-SVC"
modelFile: str = ""
isONNX: bool = False
modelType: str = "combo"
dstId: int = 1
sampleId: str = ""
defaultTune: int = 0
kstep: int = 100
speakers: dict = field(default_factory=lambda: {1: "user"})
embedder: EmbedderType = "hubert_base"
ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot]
def loadSlotInfo(model_dir: str, slotIndex: int) -> ModelSlots:
@ -122,6 +137,8 @@ def loadSlotInfo(model_dir: str, slotIndex: int) -> ModelSlots:
return SoVitsSvc40ModelSlot(**jsonDict)
elif slotInfo.voiceChangerType == "DDSP-SVC":
return DDSPSVCModelSlot(**jsonDict)
elif slotInfo.voiceChangerType == "Diffusion-SVC":
return DiffusionSVCModelSlot(**jsonDict)
else:
return ModelSlot()
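
The loadSlotInfo() dispatch above restores a DiffusionSVCModelSlot from the stored slot JSON. A minimal sketch of that round trip (not part of this commit), assuming the slot metadata lives in a per-slot JSON file; the field values are illustrative:

import json
from dataclasses import asdict

slot = DiffusionSVCModelSlot(modelFile="model.ptc", kstep=200)
jsonDict = json.loads(json.dumps(asdict(slot)))   # what the stored slot JSON would contain
restored = DiffusionSVCModelSlot(**jsonDict)      # same pattern as the branch in loadSlotInfo()
assert restored.kstep == 200 and restored.voiceChangerType == "Diffusion-SVC"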

View File

@ -9,7 +9,7 @@ from fastapi import UploadFile, File, Form
from restapi.mods.FileUploader import upload_file, concat_file_chunks
from voice_changer.VoiceChangerManager import VoiceChangerManager
from const import MODEL_DIR, UPLOAD_DIR, ModelType
from const import MODEL_DIR, UPLOAD_DIR
from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams
@ -27,8 +27,6 @@ class MMVC_Rest_Fileuploader:
self.router.add_api_route("/concat_uploaded_file", self.post_concat_uploaded_file, methods=["POST"])
self.router.add_api_route("/update_settings", self.post_update_settings, methods=["POST"])
self.router.add_api_route("/load_model", self.post_load_model, methods=["POST"])
self.router.add_api_route("/model_type", self.post_model_type, methods=["POST"])
self.router.add_api_route("/model_type", self.get_model_type, methods=["GET"])
self.router.add_api_route("/onnx", self.get_onnx, methods=["GET"])
self.router.add_api_route("/merge_model", self.post_merge_models, methods=["POST"])
self.router.add_api_route("/update_model_default", self.post_update_model_default, methods=["POST"])
@ -97,22 +95,6 @@ class MMVC_Rest_Fileuploader:
except Exception as e:
print("[Voice Changer] post_load_model ex:", e)
def post_model_type(self, modelType: ModelType = Form(...)):
try:
info = self.voiceChangerManager.switchModelType(modelType)
json_compatible_item_data = jsonable_encoder(info)
return JSONResponse(content=json_compatible_item_data)
except Exception as e:
print("[Voice Changer] post_model_type ex:", e)
def get_model_type(self):
try:
info = self.voiceChangerManager.getModelType()
json_compatible_item_data = jsonable_encoder(info)
return JSONResponse(content=json_compatible_item_data)
except Exception as e:
print("[Voice Changer] get_model_type ex:", e)
def get_onnx(self):
try:
info = self.voiceChangerManager.export2onnx()

View File

@ -0,0 +1,219 @@
# import sys
# import os
from dataclasses import asdict
import numpy as np
import torch
import torchaudio
from data.ModelSlot import DiffusionSVCModelSlot
from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.pipeline.Pipeline import Pipeline
from Exceptions import DeviceCannotSupportHalfPrecisionException
class DiffusionSVC(VoiceChangerModel):
def __init__(self, params: VoiceChangerParams, slotInfo: DiffusionSVCModelSlot):
print("[Voice Changer] [DiffusionSVC] Creating instance ")
self.deviceManager = DeviceManager.get_instance()
EmbedderManager.initialize(params)
PitchExtractorManager.initialize(params)
self.settings = DiffusionSVCSettings()
self.params = params
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
self.pipeline: Pipeline | None = None
self.audio_buffer: AudioInOut | None = None
self.pitchf_buffer: PitchfInOut | None = None
self.feature_buffer: FeatureInOut | None = None
self.prevVol = 0.0
self.slotInfo = slotInfo
self.initialize()
def initialize(self):
print("[Voice Changer] [DiffusionSVC] Initializing... ")
# Create the pipeline
self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector)
# Other settings
self.settings.tran = self.slotInfo.defaultTune
self.settings.dstId = self.slotInfo.dstId
self.settings.kstep = self.slotInfo.kstep
print("[Voice Changer] [DiffusionSVC] Initializing... done")
def update_settings(self, key: str, val: int | float | str):
print("[Voice Changer][RVC]: update_settings", key, val)
if key in self.settings.intData:
setattr(self.settings, key, int(val))
if key == "gpu":
self.deviceManager.setForceTensor(False)
self.initialize()
elif key in self.settings.floatData:
setattr(self.settings, key, float(val))
elif key in self.settings.strData:
setattr(self.settings, key, str(val))
if key == "f0Detector" and self.pipeline is not None:
pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
self.pipeline.setPitchExtractor(pitchExtractor)
else:
return False
return True
def get_info(self):
data = asdict(self.settings)
if self.pipeline is not None:
pipelineInfo = self.pipeline.getPipelineInfo()
data["pipelineInfo"] = pipelineInfo
return data
def get_processing_sampling_rate(self):
return self.slotInfo.samplingRate
def generate_input(
self,
newData: AudioInOut,
inputSize: int,
crossfadeSize: int,
solaSearchFrame: int = 0,
):
newData = newData.astype(np.float32) / 32768.0  # Audio arrives at the RVC model's sampling rate; extraDataLength, crossfade, etc. are handled at the same SR (★1)
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
if self.audio_buffer is not None:
# Concatenate with the past data
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
if self.slotInfo.f0:
self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
else:
self.audio_buffer = newData
if self.slotInfo.f0:
self.pitchf_buffer = np.zeros(new_feature_length)
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
if convertSize % 128 != 0:  # pad so the model's output hop size does not truncate the data
convertSize = convertSize + (128 - (convertSize % 128))
outSize = convertSize - self.settings.extraConvertSize
# If the buffer has not filled up yet, pad with zeros
if self.audio_buffer.shape[0] < convertSize:
self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
if self.slotInfo.f0:
self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])
convertOffset = -1 * convertSize
featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
self.audio_buffer = self.audio_buffer[convertOffset:]  # keep only the portion to be converted
if self.slotInfo.f0:
self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
self.feature_buffer = self.feature_buffer[featureOffset:]
# Crop only the output portion and check its volume. (TODO: fade out gradually instead)
cropOffset = -1 * (inputSize + crossfadeSize)
cropEnd = -1 * (crossfadeSize)
crop = self.audio_buffer[cropOffset:cropEnd]
vol = np.sqrt(np.square(crop).mean())
vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol
return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize)
def inference(self, data):
audio = data[0]
pitchf = data[1]
feature = data[2]
convertSize = data[3]
vol = data[4]
outSize = data[5]
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)
if self.pipeline is not None:
device = self.pipeline.device
else:
device = torch.device("cpu")
audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
repeat = 1 if self.settings.rvcQuality else 0
sid = self.settings.dstId
f0_up_key = self.settings.tran
index_rate = self.settings.indexRatio
protect = self.settings.protect
if_f0 = 1 if self.slotInfo.f0 else 0
embOutputLayer = self.slotInfo.embOutputLayer
useFinalProj = self.slotInfo.useFinalProj
try:
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
sid,
audio,
pitchf,
feature,
f0_up_key,
index_rate,
if_f0,
self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # extraDataSize in seconds, processed at the RVC model's sampling rate (★1)
embOutputLayer,
useFinalProj,
repeat,
protect,
outSize
)
result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
return result
except DeviceCannotSupportHalfPrecisionException as e: # NOQA
print("[Device Manager] Device cannot support half precision. Fallback to float....")
self.deviceManager.setForceTensor(True)
self.initialize()
# raise e
return
def __del__(self):
del self.pipeline
def export2onnx(self):
modelSlot = self.slotInfo
if modelSlot.isONNX:
print("[Voice Changer] export2onnx, No pyTorch filepath.")
return {"status": "ng", "path": ""}
output_file_simple = export2onnx(self.settings.gpu, modelSlot)
return {
"status": "ok",
"path": f"/tmp/{output_file_simple}",
"filename": output_file_simple,
}
def get_model_current(self):
return [
{
"key": "defaultTune",
"val": self.settings.tran,
},
{
"key": "defaultIndexRatio",
"val": self.settings.indexRatio,
},
{
"key": "defaultProtect",
"val": self.settings.protect,
},
]
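
A worked sketch (not from this commit) of the buffer arithmetic in generate_input() above: convertSize is rounded up to a multiple of 128 (the model's output hop size) and the pitch/feature buffers run at 100 frames per second of model-rate audio. The sizes and the 44100 Hz model sampling rate are illustrative assumptions:

samplingRate = 44100
inputSize, crossfadeSize, solaSearchFrame, extraConvertSize = 4410, 2205, 441, 4096

convertSize = inputSize + crossfadeSize + solaSearchFrame + extraConvertSize   # 11152
if convertSize % 128 != 0:                      # round up to the output hop size
    convertSize += 128 - (convertSize % 128)    # -> 11264
outSize = convertSize - extraConvertSize        # -> 7168

feature_frames = convertSize * 100 // samplingRate   # pitch/feature buffers: 100 frames/s
print(convertSize, outSize, feature_frames)          # 11264 7168 25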

View File

@ -0,0 +1,162 @@
import os
from const import EnumInferenceTypes
from dataclasses import asdict
import torch
import onnxruntime
import json
from data.ModelSlot import DiffusionSVCModelSlot, ModelSlot, RVCModelSlot
from voice_changer.utils.LoadModelParams import LoadModelParams
from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator
class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
@classmethod
def loadModel(cls, props: LoadModelParams):
slotInfo: DiffusionSVCModelSlot = DiffusionSVCModelSlot()
for file in props.files:
if file.kind == "diffusionSVCModel":
slotInfo.modelFile = file.name
slotInfo.defaultTune = 0
slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
slotInfo.iconFile = "/assets/icons/noimage.png"
# if slotInfo.isONNX:
# slotInfo = cls._setInfoByONNX(slotInfo)
# else:
# slotInfo = cls._setInfoByPytorch(slotInfo)
return slotInfo
@classmethod
def _setInfoByPytorch(cls, slot: ModelSlot):
cpt = torch.load(slot.modelFile, map_location="cpu")
config_len = len(cpt["config"])
version = cpt.get("version", "v1")
slot = RVCModelSlot(**asdict(slot))
if version == "voras_beta":
slot.f0 = True if cpt["f0"] == 1 else False
slot.modelType = EnumInferenceTypes.pyTorchVoRASbeta.value
slot.embChannels = 768
slot.embOutputLayer = cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
slot.useFinalProj = False
slot.embedder = cpt["embedder_name"]
if slot.embedder.endswith("768"):
slot.embedder = slot.embedder[:-3]
# if slot.embedder == "hubert":
# slot.embedder = "hubert"
# elif slot.embedder == "contentvec":
# slot.embedder = "contentvec"
# elif slot.embedder == "hubert_jp":
# slot.embedder = "hubert_jp"
else:
raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder")
elif config_len == 18:
# Original RVC
slot.f0 = True if cpt["f0"] == 1 else False
version = cpt.get("version", "v1")
if version is None or version == "v1":
slot.modelType = EnumInferenceTypes.pyTorchRVC.value if slot.f0 else EnumInferenceTypes.pyTorchRVCNono.value
slot.embChannels = 256
slot.embOutputLayer = 9
slot.useFinalProj = True
slot.embedder = "hubert_base"
print("[Voice Changer] Official Model(pyTorch) : v1")
else:
slot.modelType = EnumInferenceTypes.pyTorchRVCv2.value if slot.f0 else EnumInferenceTypes.pyTorchRVCv2Nono.value
slot.embChannels = 768
slot.embOutputLayer = 12
slot.useFinalProj = False
slot.embedder = "hubert_base"
print("[Voice Changer] Official Model(pyTorch) : v2")
else:
# DDPN RVC
slot.f0 = True if cpt["f0"] == 1 else False
slot.modelType = EnumInferenceTypes.pyTorchWebUI.value if slot.f0 else EnumInferenceTypes.pyTorchWebUINono.value
slot.embChannels = cpt["config"][17]
slot.embOutputLayer = cpt["embedder_output_layer"] if "embedder_output_layer" in cpt else 9
if slot.embChannels == 256:
slot.useFinalProj = True
else:
slot.useFinalProj = False
# Display DDPN model info
if slot.embChannels == 256 and slot.embOutputLayer == 9 and slot.useFinalProj is True:
print("[Voice Changer] DDPN Model(pyTorch) : Official v1 like")
elif slot.embChannels == 768 and slot.embOutputLayer == 12 and slot.useFinalProj is False:
print("[Voice Changer] DDPN Model(pyTorch): Official v2 like")
else:
print(f"[Voice Changer] DDPN Model(pyTorch): ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
slot.embedder = cpt["embedder_name"]
if slot.embedder.endswith("768"):
slot.embedder = slot.embedder[:-3]
if "speaker_info" in cpt.keys():
for k, v in cpt["speaker_info"].items():
slot.speakers[int(k)] = str(v)
slot.samplingRate = cpt["config"][-1]
del cpt
return slot
@classmethod
def _setInfoByONNX(cls, slot: ModelSlot):
tmp_onnx_session = onnxruntime.InferenceSession(slot.modelFile, providers=["CPUExecutionProvider"])
modelmeta = tmp_onnx_session.get_modelmeta()
try:
slot = RVCModelSlot(**asdict(slot))
metadata = json.loads(modelmeta.custom_metadata_map["metadata"])
# slot.modelType = metadata["modelType"]
slot.embChannels = metadata["embChannels"]
slot.embOutputLayer = metadata["embOutputLayer"] if "embOutputLayer" in metadata else 9
slot.useFinalProj = metadata["useFinalProj"] if "useFinalProj" in metadata else True if slot.embChannels == 256 else False
if slot.embChannels == 256:
slot.useFinalProj = True
else:
slot.useFinalProj = False
# Display ONNX model info
if slot.embChannels == 256 and slot.embOutputLayer == 9 and slot.useFinalProj is True:
print("[Voice Changer] ONNX Model: Official v1 like")
elif slot.embChannels == 768 and slot.embOutputLayer == 12 and slot.useFinalProj is False:
print("[Voice Changer] ONNX Model: Official v2 like")
else:
print(f"[Voice Changer] ONNX Model: ch:{slot.embChannels}, L:{slot.embOutputLayer}, FP:{slot.useFinalProj}")
if "embedder" not in metadata:
slot.embedder = "hubert_base"
else:
slot.embedder = metadata["embedder"]
slot.f0 = metadata["f0"]
slot.modelType = EnumInferenceTypes.onnxRVC.value if slot.f0 else EnumInferenceTypes.onnxRVCNono.value
slot.samplingRate = metadata["samplingRate"]
slot.deprecated = False
except Exception as e:
slot.modelType = EnumInferenceTypes.onnxRVC.value
slot.embChannels = 256
slot.embedder = "hubert_base"
slot.f0 = True
slot.samplingRate = 48000
slot.deprecated = True
print("[Voice Changer] setInfoByONNX", e)
print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
print("[Voice Changer] This onnxfie is depricated. Please regenerate onnxfile.")
print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
del tmp_onnx_session
return slot
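
A small sketch (not part of this commit) of the loadModel() flow above for a combo checkpoint. SimpleNamespace stands in for LoadModelParams and only mimics the props.files fields that loadModel() actually reads; the file name is made up:

from types import SimpleNamespace

props = SimpleNamespace(files=[SimpleNamespace(kind="diffusionSVCModel", name="model.ptc")])
slot = DiffusionSVCModelSlotGenerator.loadModel(props)
print(slot.name, slot.isONNX)   # -> model False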

View File

@ -0,0 +1,32 @@
from dataclasses import dataclass, field
@dataclass
class DiffusionSVCSettings:
gpu: int = 0
dstId: int = 0
f0Detector: str = "harvest" # dio or harvest
tran: int = 12
silentThreshold: float = 0.00001
extraConvertSize: int = 1024 * 4
kstep: int = 100
silenceFront: int = 1 # 0:off, 1:on
modelSamplingRate: int = 44100
speakers: dict[str, int] = field(default_factory=lambda: {})
# isHalf: int = 1 # 0:off, 1:on
# enableDirectML: int = 0 # 0:off, 1:on
# Only the mutable settings are listed below
intData = [
"gpu",
"dstId",
"tran",
"extraConvertSize",
"kstep",
"silenceFront",
]
floatData = ["silentThreshold"]
strData = ["f0Detector"]

View File

@ -0,0 +1,35 @@
import torch
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
class RVCInferencer(Inferencer):
def loadModel(self, file: str, gpu: int):
self.setProps("DiffusionSVCCombo", file, True, gpu)
dev = DeviceManager.get_instance().getDevice(gpu)
isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
cpt = torch.load(file, map_location="cpu")
model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
model.eval()
model.load_state_dict(cpt["weight"], strict=False)
model = model.to(dev)
if isHalf:
model = model.half()
self.model = model
return self
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)

View File

@ -0,0 +1,480 @@
import numpy as np
import time
import os
import torch
import torch.nn.functional
from torchaudio.transforms import Resample
from tqdm import tqdm
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import load_model_vocoder, load_model_vocoder_from_combo
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.tools.slicer import split
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.tools.units_index import UnitsIndexer
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.tools.tools import F0_Extractor, Volume_Extractor, Units_Encoder, SpeakerEncoder, cross_fade
class DiffusionSVC:
def __init__(self, device=None):
if device is not None:
self.device = device
else:
self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.model_path = None
self.model = None
self.vocoder = None
self.args = None
# Feature extractors
self.units_encoder = None
self.f0_extractor = None
self.f0_model = None
self.f0_min = None
self.f0_max = None
self.volume_extractor = None
self.speaker_encoder = None
self.spk_emb_dict = None
self.resample_dict_16000 = {}
self.units_indexer = None
self.naive_model_path = None
self.naive_model = None
self.naive_model_args = None
self.use_combo_model = False
def load_model(self, model_path, f0_model=None, f0_min=None, f0_max=None):
if ('1234' + model_path)[-4:] == '.ptc':  # padding guards the 4-char slice for short paths
self.use_combo_model = True
self.model_path = model_path
self.naive_model_path = model_path
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(model_path,
device=self.device)
self.model = diff_model
self.args = diff_args
self.naive_model = naive_model
self.naive_model_args = naive_args
self.vocoder = vocoder
else:
self.model_path = model_path
self.model, self.vocoder, self.args = load_model_vocoder(model_path, device=self.device)
self.units_encoder = Units_Encoder(
self.args.data.encoder,
self.args.data.encoder_ckpt,
self.args.data.encoder_sample_rate,
self.args.data.encoder_hop_size,
cnhubertsoft_gate=self.args.data.cnhubertsoft_gate,
device=self.device,
units_forced_mode=self.args.data.units_forced_mode
)
self.volume_extractor = Volume_Extractor(
hop_size=512,
block_size=self.args.data.block_size,
model_sampling_rate=self.args.data.sampling_rate
)
self.load_f0_extractor(f0_model=f0_model, f0_min=f0_min, f0_max=f0_max)
if self.args.model.use_speaker_encoder:
self.speaker_encoder = SpeakerEncoder(
self.args.data.speaker_encoder,
self.args.data.speaker_encoder_config,
self.args.data.speaker_encoder_ckpt,
self.args.data.speaker_encoder_sample_rate,
device=self.device
)
path_spk_emb_dict = os.path.join(os.path.split(model_path)[0], 'spk_emb_dict.npy')
self.set_spk_emb_dict(path_spk_emb_dict)
self.units_indexer = UnitsIndexer(os.path.split(model_path)[0])
def flush(self, model_path=None, f0_model=None, f0_min=None, f0_max=None, naive_model_path=None):
assert (model_path is not None) or (naive_model_path is not None)
# flush model if changed
if ((self.model_path != model_path) or (self.f0_model != f0_model)
or (self.f0_min != f0_min) or (self.f0_max != f0_max)):
self.load_model(model_path, f0_model=f0_model, f0_min=f0_min, f0_max=f0_max)
if (self.naive_model_path != naive_model_path) and (naive_model_path is not None):
self.load_naive_model(naive_model_path)
# check args if use naive
if self.naive_model is not None:
if self.naive_model_args.data.encoder != self.args.data.encoder:
raise ValueError("encoder of Naive Model and Diffusion Model are different")
if self.naive_model_args.model.n_spk != self.args.model.n_spk:
raise ValueError("n_spk of Naive Model and Diffusion Model are different")
if bool(self.naive_model_args.model.use_speaker_encoder) != bool(self.args.model.use_speaker_encoder):
raise ValueError("use_speaker_encoder of Naive Model and Diffusion Model are different")
if self.naive_model_args.vocoder.type != self.args.vocoder.type:
raise ValueError("vocoder of Naive Model and Diffusion Model are different")
if self.naive_model_args.data.block_size != self.args.data.block_size:
raise ValueError("block_size of Naive Model and Diffusion Model are different")
if self.naive_model_args.data.sampling_rate != self.args.data.sampling_rate:
raise ValueError("sampling_rate of Naive Model and Diffusion Model are different")
def flush_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
if (f0_model != self.f0_model) and (f0_model is not None):
self.load_f0_extractor(f0_model)
def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
self.f0_model = f0_model if (f0_model is not None) else self.args.data.f0_extractor
self.f0_min = f0_min if (f0_min is not None) else self.args.data.f0_min
self.f0_max = f0_max if (f0_max is not None) else self.args.data.f0_max
self.f0_model = f0_model
self.f0_extractor = F0_Extractor(
f0_extractor=self.f0_model,
sample_rate=44100,
hop_size=512,
f0_min=self.f0_min,
f0_max=self.f0_max,
block_size=self.args.data.block_size,
model_sampling_rate=self.args.data.sampling_rate
)
def load_naive_model(self, naive_model_path):
self.naive_model_path = naive_model_path
model, _, args = load_model_vocoder(naive_model_path, device=self.device, loaded_vocoder=self.vocoder)
self.naive_model = model
self.naive_model_args = args
print(f" [INFO] Load naive model from {naive_model_path}")
@torch.no_grad()
def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
aug_shift=0, spk_emb=None):
# spk_id
spk_emb_dict = None
if self.args.model.use_speaker_encoder: # with speaker encoder
spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
# without speaker encoder
else:
spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.device)
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.device)
out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, infer=True,
spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
return out_spec
def set_spk_emb_dict(self, spk_emb_dict_or_path):  # load from a path, or set directly
if spk_emb_dict_or_path is None:
return None
if isinstance(spk_emb_dict_or_path, dict):
self.spk_emb_dict = spk_emb_dict_or_path
print(f" [INFO] Load spk_emb_dict from {spk_emb_dict_or_path}")
else:
self.spk_emb_dict = np.load(spk_emb_dict_or_path, allow_pickle=True).item()
print(f" [INFO] Load spk_emb_dict from {spk_emb_dict_or_path}")
@torch.no_grad()
def encode_units(self, audio, sr=44100, padding_mask=None):
assert self.units_encoder is not None
hop_size = self.args.data.block_size * sr / self.args.data.sampling_rate
return self.units_encoder.encode(audio, sr, hop_size, padding_mask=padding_mask)
@torch.no_grad()
def extract_f0(self, audio, key=0, sr=44100, silence_front=0):
assert self.f0_extractor is not None
f0 = self.f0_extractor.extract(audio, uv_interp=True, device=self.device, silence_front=silence_front, sr=sr)
f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
f0 = f0 * 2 ** (float(key) / 12)
return f0
@torch.no_grad()
def extract_volume_and_mask(self, audio, sr=44100, threhold=-60.0):
assert self.volume_extractor is not None
volume = self.volume_extractor.extract(audio, sr)
mask = self.volume_extractor.get_mask_from_volume(volume, threhold=threhold, device=self.device)
volume = torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
return volume, mask
@torch.no_grad()
def extract_mel(self, audio, sr=44100):
assert sr == 44100
mel = self.vocoder.extract(audio, self.args.data.sampling_rate)
return mel
@torch.no_grad()
def encode_spk(self, audio, sr=44100):
assert self.speaker_encoder is not None
return self.speaker_encoder(audio=audio, sample_rate=sr)
@torch.no_grad()
def encode_spk_from_path(self, path):  # read a pre-extracted speaker embedding from path (must be a .npy file), or extract one from audio (path may be a file or a directory)
if path is None:
return None
assert self.speaker_encoder is not None
if (('122333444455555' + path)[-4:] == '.npy') and os.path.isfile(path):  # padding guards the 4-char slice for short paths
spk_emb = np.load(path)
else:
if os.path.isfile(path):
path_list = [path]
else:
path_list = os.listdir(path)
for _index in range(len(path_list)):
path_list[_index] = os.path.join(path, path_list[_index])
spk_emb = self.speaker_encoder.mean_spk_emb_from_path_list(path_list)
return spk_emb
def pre_spk_emb(self, spk_id, spk_mix_dict, units_len, spk_emb):
spk_emb_dict = self.spk_emb_dict
if (spk_mix_dict is not None) or (spk_emb is None):
assert spk_emb_dict is not None
if spk_emb is None:
spk_emb = spk_emb_dict[str(spk_id)]
# pad and to device
spk_emb = np.tile(spk_emb, (units_len, 1))
spk_emb = torch.from_numpy(spk_emb).float().to(self.device)
return spk_mix_dict, spk_emb
@torch.no_grad()
def mel2wav(self, mel, f0, start_frame=0):
if start_frame == 0:
return self.vocoder.infer(mel, f0)
else: # for realtime speedup
mel = mel[:, start_frame:, :]
f0 = f0[:, start_frame:, :]
out_wav = self.vocoder.infer(mel, f0)
return torch.nn.functional.pad(out_wav, (start_frame * self.vocoder.vocoder_hop_size, 0))
@torch.no_grad()  # core inference: normalizes inputs to tensors and works only with mel
def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None):
if self.args.model.k_step_max is not None:
if k_step is None:
raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
if k_step > int(self.args.model.k_step_max):
raise ValueError(f"k_step must <= k_step_max of Shallow Diffusion Model")
if gt_spec is None:
raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from "
"input mel or output of naive model")
print(f' [INFO] k_step_max is {self.args.model.k_step_max}.')
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.device)
# spk_id
spk_emb_dict = None
if self.args.model.use_speaker_encoder: # with speaker encoder
spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
# without speaker encoder
else:
spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.device)
if k_step is not None:
print(f' [INFO] get k_step, do shallow diffusion {k_step} steps')
else:
print(' [INFO] Doing the full 1000-step diffusion (no k_step given)')
print(f" [INFO] method:{method}; infer_speedup:{infer_speedup}")
return self.model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
@torch.no_grad()  # like __call__, but also runs the vocoder to output a waveform
def infer(self, units, f0, volume, gt_spec=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None):
if k_step is not None:
if self.naive_model is not None:
gt_spec = self.naive_model_call(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, spk_emb=spk_emb)
print(f" [INFO] get mel from naive model out.")
assert gt_spec is not None
if self.naive_model is None:
print(f" [INFO] get mel from input wav.")
if input(" [WARN] You are attempting shallow diffusion "
"on the mel of the input source,"
" Please enter 'gt_mel' to continue") != 'gt_mel':
raise ValueError("Please understand what you're doing")
k_step = int(k_step)
gt_spec = gt_spec
else:
gt_spec = None
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
return self.mel2wav(out_mel, f0)
@torch.no_grad()  # inference optimized for real-time shallow diffusion; padding can be trimmed to save compute
def infer_for_realtime(self, units, f0, volume, audio_t=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None, silence_front=0, diff_jump_silence_front=False):
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
if diff_jump_silence_front:
if audio_t is not None:
audio_t = audio_t[:, start_frame * self.vocoder.vocoder_hop_size:]
f0 = f0[:, start_frame:, :]
units = units[:, start_frame:, :]
volume = volume[:, start_frame:, :]
if k_step is not None:
assert audio_t is not None
k_step = int(k_step)
gt_spec = self.vocoder.extract(audio_t, self.args.data.sampling_rate)
# If frames come up short, re-enable this line: gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
else:
gt_spec = None
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
if diff_jump_silence_front:
out_wav = self.mel2wav(out_mel, f0)
else:
out_wav = self.mel2wav(out_mel, f0, start_frame=start_frame)
return out_wav
@torch.no_grad()  # inference from audio without slicing
def infer_from_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None, threhold=-60, index_ratio=0):
units = self.encode_units(audio, sr)
if index_ratio > 0:
units = self.units_indexer(units_t=units, spk_id=spk_id, ratio=index_ratio)
f0 = self.extract_f0(audio, key=key, sr=sr)
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
if k_step is not None:
assert 0 < int(k_step) <= 1000
k_step = int(k_step)
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
gt_spec = self.vocoder.extract(audio_t, sr)
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
else:
gt_spec = None
output = self.infer(units, f0, volume, gt_spec=gt_spec, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
output *= mask
return output.squeeze().cpu().numpy(), self.args.data.sampling_rate
@torch.no_grad()  # inference from audio with slicing
def infer_from_long_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None,
threhold=-60, threhold_for_split=-40, min_len=5000, index_ratio=0):
hop_size = self.args.data.block_size * sr / self.args.data.sampling_rate
segments = split(audio, sr, hop_size, db_thresh=threhold_for_split, min_len=min_len)
print(f' [INFO] Extract f0 volume and mask: Use {self.f0_model}, start...')
_f0_start_time = time.time()
f0 = self.extract_f0(audio, key=key, sr=sr)
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
_f0_end_time = time.time()
_f0_used_time = _f0_end_time - _f0_start_time
print(f' [INFO] Extract f0 volume and mask: Done. Use time:{_f0_used_time}')
if k_step is not None:
assert 0 < int(k_step) <= 1000
k_step = int(k_step)
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
gt_spec = self.vocoder.extract(audio_t, sr)
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
else:
gt_spec = None
result = np.zeros(0)
current_length = 0
for segment in tqdm(segments):
start_frame = segment[0]
seg_input = torch.from_numpy(segment[1]).float().unsqueeze(0).to(self.device)
seg_units = self.units_encoder.encode(seg_input, sr, hop_size)
if index_ratio > 0:
seg_units = self.units_indexer(units_t=seg_units, spk_id=spk_id, ratio=index_ratio)
seg_f0 = f0[:, start_frame: start_frame + seg_units.size(1), :]
seg_volume = volume[:, start_frame: start_frame + seg_units.size(1), :]
if gt_spec is not None:
seg_gt_spec = gt_spec[:, start_frame: start_frame + seg_units.size(1), :]
else:
seg_gt_spec = None
seg_output = self.infer(seg_units, seg_f0, seg_volume, gt_spec=seg_gt_spec, spk_id=spk_id,
spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
_left = start_frame * self.args.data.block_size
_right = (start_frame + seg_units.size(1)) * self.args.data.block_size
seg_output *= mask[:, _left:_right]
seg_output = seg_output.squeeze().cpu().numpy()
silent_length = round(start_frame * self.args.data.block_size) - current_length
if silent_length >= 0:
result = np.append(result, np.zeros(silent_length))
result = np.append(result, seg_output)
else:
result = cross_fade(result, seg_output, current_length + silent_length)
current_length = current_length + silent_length + len(seg_output)
return result, self.args.data.sampling_rate
@torch.no_grad()  # inference optimized for real time; padding can be trimmed to save compute
def infer_from_audio_for_realtime(self, audio, sr, key, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None, silence_front=0, diff_jump_silence_front=False, threhold=-60,
index_ratio=0, use_hubert_mask=False):
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
if self.naive_model is None:
print(" [INFO] No combo_model or naive_model, diffusion without shallow-model.")
else:
assert k_step is not None
print(" [INFO] Shallow Diffusion mode!")
key_str = str(sr)
if key_str not in self.resample_dict_16000:
self.resample_dict_16000[key_str] = Resample(sr, 16000, lowpass_filter_width=128).to(self.device)
if int(sr) != 16000:
audio_t_16k = self.resample_dict_16000[key_str](audio_t)
else:
audio_t_16k = audio_t
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
if use_hubert_mask:
mask16k = mask.clone().unsqueeze(0).unsqueeze(0)
mask16k = torch.nn.functional.interpolate(mask16k, size=tuple(audio_t_16k.shape), mode='nearest')
mask16k = ~(mask16k.squeeze(0).squeeze(0).bool())
else:
mask16k = None
units = self.encode_units(audio_t_16k, sr=16000, padding_mask=mask16k)
if index_ratio > 0:
units = self.units_indexer(units_t=units, spk_id=spk_id, ratio=index_ratio)
f0 = self.extract_f0(audio, key=key, sr=sr, silence_front=silence_front)
if diff_jump_silence_front:
audio_t = audio_t[:, start_frame * self.vocoder.vocoder_hop_size:]
f0 = f0[:, start_frame:, :]
units = units[:, start_frame:, :]
volume = volume[:, start_frame:, :]
if k_step is not None:
k_step = int(k_step)
if (k_step >= 1000) or (k_step <= 0):
k_step = 300
print(f" [WARN] k_step must < 1000 and > 0, now set to {k_step}")
if self.args.model.k_step_max is not None:
k_step_max = int(self.args.model.k_step_max)
if k_step > k_step_max:
print(f" [WARN] k_step must <= k_step_max={k_step_max}, not k_step set to{k_step_max}.")
k_step = k_step_max
if int(k_step/infer_speedup) < 3:
infer_speedup = int(k_step/4)
print(f" [WARN] diffusion step must > 4 (3 when qndm), not set to{infer_speedup}")
if self.naive_model is not None:
gt_spec = self.naive_model_call(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, spk_emb=spk_emb)
else:
gt_spec = self.vocoder.extract(audio_t, self.args.data.sampling_rate)
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
else:
gt_spec = None
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
if diff_jump_silence_front:
out_wav = self.mel2wav(out_mel, f0)
else:
out_wav = self.mel2wav(out_mel, f0, start_frame=start_frame)
out_wav *= mask
return out_wav.squeeze(), self.args.data.sampling_rate
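
An offline usage sketch of the class above (not part of this commit). It only calls methods defined here (load_model, infer_from_long_audio); the combo checkpoint name, the harvest f0 extractor, the f0 range, and the use of soundfile for I/O are assumptions:

import soundfile as sf

svc = DiffusionSVC(device='cuda')
svc.load_model('model.ptc', f0_model='harvest', f0_min=65, f0_max=1200)

audio, sr = sf.read('input.wav', dtype='float32')     # mono float32 assumed
out, out_sr = svc.infer_from_long_audio(
    audio, sr=sr, key=0, spk_id=1,
    infer_speedup=10, method='dpm-solver', k_step=300)
sf.write('output.wav', out, out_sr)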

View File

@ -0,0 +1,386 @@
from collections import deque
from functools import partial
from inspect import isfunction
import torch.nn.functional as F
import numpy as np
import torch
from torch import nn
from tqdm import tqdm
def exists(x):
return x is not None
def default(val, d):
if exists(val):
return val
return d() if isfunction(d) else d
def extract(a, t, x_shape):
b, *_ = t.shape
out = a.gather(-1, t)
return out.reshape(b, *((1,) * (len(x_shape) - 1)))
def noise_like(shape, device, repeat=False):
repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
noise = lambda: torch.randn(shape, device=device)
return repeat_noise() if repeat else noise()
def linear_beta_schedule(timesteps, max_beta=0.02):
"""
linear schedule
"""
betas = np.linspace(1e-4, max_beta, timesteps)
return betas
def cosine_beta_schedule(timesteps, s=0.008):
"""
cosine schedule
as proposed in https://openreview.net/forum?id=-NEXDKk8gZ
"""
steps = timesteps + 1
x = np.linspace(0, steps, steps)
alphas_cumprod = np.cos(((x / steps) + s) / (1 + s) * np.pi * 0.5) ** 2
alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
return np.clip(betas, a_min=0, a_max=0.999)
beta_schedule = {
"cosine": cosine_beta_schedule,
"linear": linear_beta_schedule,
}
class GaussianDiffusion(nn.Module):
def __init__(self,
denoise_fn,
out_dims=128,
timesteps=1000,
k_step=1000,
max_beta=0.02,
spec_min=-12,
spec_max=2):
super().__init__()
self.denoise_fn = denoise_fn
self.out_dims = out_dims
betas = beta_schedule['linear'](timesteps, max_beta=max_beta)
alphas = 1. - betas
alphas_cumprod = np.cumprod(alphas, axis=0)
alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
timesteps, = betas.shape
self.num_timesteps = int(timesteps)
self.k_step = k_step
self.noise_list = deque(maxlen=4)
to_torch = partial(torch.tensor, dtype=torch.float32)
self.register_buffer('betas', to_torch(betas))
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
# calculations for diffusion q(x_t | x_{t-1}) and others
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
# calculations for posterior q(x_{t-1} | x_t, x_0)
posterior_variance = betas * (1. - alphas_cumprod_prev) / (1. - alphas_cumprod)
# above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
self.register_buffer('posterior_variance', to_torch(posterior_variance))
# below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
self.register_buffer('posterior_mean_coef1', to_torch(
betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
self.register_buffer('posterior_mean_coef2', to_torch(
(1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
self.register_buffer('spec_min', torch.FloatTensor([spec_min])[None, None, :out_dims])
self.register_buffer('spec_max', torch.FloatTensor([spec_max])[None, None, :out_dims])
def q_mean_variance(self, x_start, t):
mean = extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
variance = extract(1. - self.alphas_cumprod, t, x_start.shape)
log_variance = extract(self.log_one_minus_alphas_cumprod, t, x_start.shape)
return mean, variance, log_variance
def predict_start_from_noise(self, x_t, t, noise):
return (
extract(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
extract(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
)
def q_posterior(self, x_start, x_t, t):
posterior_mean = (
extract(self.posterior_mean_coef1, t, x_t.shape) * x_start +
extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
)
posterior_variance = extract(self.posterior_variance, t, x_t.shape)
posterior_log_variance_clipped = extract(self.posterior_log_variance_clipped, t, x_t.shape)
return posterior_mean, posterior_variance, posterior_log_variance_clipped
def p_mean_variance(self, x, t, cond):
noise_pred = self.denoise_fn(x, t, cond=cond)
x_recon = self.predict_start_from_noise(x, t=t, noise=noise_pred)
x_recon.clamp_(-1., 1.)
model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
return model_mean, posterior_variance, posterior_log_variance
@torch.no_grad()
def p_sample(self, x, t, cond, clip_denoised=True, repeat_noise=False):
b, *_, device = *x.shape, x.device
model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, cond=cond)
noise = noise_like(x.shape, device, repeat_noise)
# no noise when t == 0
nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
@torch.no_grad()
def p_sample_ddim(self, x, t, interval, cond):
a_t = extract(self.alphas_cumprod, t, x.shape)
a_prev = extract(self.alphas_cumprod, torch.max(t - interval, torch.zeros_like(t)), x.shape)
noise_pred = self.denoise_fn(x, t, cond=cond)
x_prev = a_prev.sqrt() * (x / a_t.sqrt() + (((1 - a_prev) / a_prev).sqrt()-((1 - a_t) / a_t).sqrt()) * noise_pred)
return x_prev
@torch.no_grad()
def p_sample_plms(self, x, t, interval, cond, clip_denoised=True, repeat_noise=False):
"""
Use the PLMS method from
[Pseudo Numerical Methods for Diffusion Models on Manifolds](https://arxiv.org/abs/2202.09778).
"""
def get_x_pred(x, noise_t, t):
a_t = extract(self.alphas_cumprod, t, x.shape)
a_prev = extract(self.alphas_cumprod, torch.max(t - interval, torch.zeros_like(t)), x.shape)
a_t_sq, a_prev_sq = a_t.sqrt(), a_prev.sqrt()
x_delta = (a_prev - a_t) * ((1 / (a_t_sq * (a_t_sq + a_prev_sq))) * x - 1 / (
a_t_sq * (((1 - a_prev) * a_t).sqrt() + ((1 - a_t) * a_prev).sqrt())) * noise_t)
x_pred = x + x_delta
return x_pred
noise_list = self.noise_list
noise_pred = self.denoise_fn(x, t, cond=cond)
if len(noise_list) == 0:
x_pred = get_x_pred(x, noise_pred, t)
noise_pred_prev = self.denoise_fn(x_pred, max(t - interval, 0), cond=cond)
noise_pred_prime = (noise_pred + noise_pred_prev) / 2
elif len(noise_list) == 1:
noise_pred_prime = (3 * noise_pred - noise_list[-1]) / 2
elif len(noise_list) == 2:
noise_pred_prime = (23 * noise_pred - 16 * noise_list[-1] + 5 * noise_list[-2]) / 12
else:
noise_pred_prime = (55 * noise_pred - 59 * noise_list[-1] + 37 * noise_list[-2] - 9 * noise_list[-3]) / 24
x_prev = get_x_pred(x, noise_pred_prime, t)
noise_list.append(noise_pred)
return x_prev
def q_sample(self, x_start, t, noise=None):
noise = default(noise, lambda: torch.randn_like(x_start))
return (
extract(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
extract(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise
)
def p_losses(self, x_start, t, cond, noise=None, loss_type='l2'):
noise = default(noise, lambda: torch.randn_like(x_start))
x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
x_recon = self.denoise_fn(x_noisy, t, cond)
if loss_type == 'l1':
loss = (noise - x_recon).abs().mean()
elif loss_type == 'l2':
loss = F.mse_loss(noise, x_recon)
else:
raise NotImplementedError()
return loss
def forward(self,
condition,
gt_spec=None,
infer=True,
infer_speedup=10,
method='dpm-solver',
k_step=None,
use_tqdm=True):
"""
conditional diffusion; uses the fastspeech2 encoder output as the condition
"""
cond = condition.transpose(1, 2)
b, device = condition.shape[0], condition.device
if not infer:
spec = self.norm_spec(gt_spec)
if k_step is None:
t_max = self.k_step
else:
t_max = k_step
t = torch.randint(0, t_max, (b,), device=device).long()
norm_spec = spec.transpose(1, 2)[:, None, :, :] # [B, 1, M, T]
return self.p_losses(norm_spec, t, cond=cond)
else:
shape = (cond.shape[0], 1, self.out_dims, cond.shape[2])
if gt_spec is None or k_step is None:
t = self.k_step
x = torch.randn(shape, device=device)
else:
t = k_step
norm_spec = self.norm_spec(gt_spec)
norm_spec = norm_spec.transpose(1, 2)[:, None, :, :]
x = self.q_sample(x_start=norm_spec, t=torch.tensor([t - 1], device=device).long())
if method is not None and infer_speedup > 1:
if method == 'dpm-solver':
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
# 1. Define the noise schedule.
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
# 2. Convert your discrete-time `model` to the continuous-time
# noise prediction model. Here is an example for a diffusion model
# `model` with the noise prediction type ("noise") .
def my_wrapper(fn):
def wrapped(x, t, **kwargs):
ret = fn(x, t, **kwargs)
if use_tqdm:
self.bar.update(1)
return ret
return wrapped
model_fn = model_wrapper(
my_wrapper(self.denoise_fn),
noise_schedule,
model_type="noise", # or "x_start" or "v" or "score"
model_kwargs={"cond": cond}
)
# 3. Define dpm-solver and sample by singlestep DPM-Solver.
# (We recommend singlestep DPM-Solver for unconditional sampling)
# You can adjust the `steps` to balance the computation
# costs and the sample quality.
dpm_solver = DPM_Solver(model_fn, noise_schedule, algorithm_type="dpmsolver++")
steps = t // infer_speedup
if use_tqdm:
self.bar = tqdm(desc="sample time step", total=steps)
x = dpm_solver.sample(
x,
steps=steps,
order=2,
skip_type="time_uniform",
method="multistep",
)
if use_tqdm:
self.bar.close()
elif method == 'unipc':
from .uni_pc import NoiseScheduleVP, model_wrapper, UniPC
# 1. Define the noise schedule.
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
# 2. Convert your discrete-time `model` to the continuous-time
# noise prediction model. Here is an example for a diffusion model
# `model` with the noise prediction type ("noise") .
def my_wrapper(fn):
def wrapped(x, t, **kwargs):
ret = fn(x, t, **kwargs)
if use_tqdm:
self.bar.update(1)
return ret
return wrapped
model_fn = model_wrapper(
my_wrapper(self.denoise_fn),
noise_schedule,
model_type="noise", # or "x_start" or "v" or "score"
model_kwargs={"cond": cond}
)
# 3. Define uni_pc and sample by multistep UniPC.
# You can adjust the `steps` to balance the computation
# costs and the sample quality.
uni_pc = UniPC(model_fn, noise_schedule, variant='bh2')
steps = t // infer_speedup
if use_tqdm:
self.bar = tqdm(desc="sample time step", total=steps)
x = uni_pc.sample(
x,
steps=steps,
order=2,
skip_type="time_uniform",
method="multistep",
)
if use_tqdm:
self.bar.close()
elif method == 'pndm':
self.noise_list = deque(maxlen=4)
if use_tqdm:
for i in tqdm(
reversed(range(0, t, infer_speedup)), desc='sample time step',
total=t // infer_speedup,
):
x = self.p_sample_plms(
x, torch.full((b,), i, device=device, dtype=torch.long),
infer_speedup, cond=cond
)
else:
for i in reversed(range(0, t, infer_speedup)):
x = self.p_sample_plms(
x, torch.full((b,), i, device=device, dtype=torch.long),
infer_speedup, cond=cond
)
elif method == 'ddim':
if use_tqdm:
for i in tqdm(
reversed(range(0, t, infer_speedup)), desc='sample time step',
total=t // infer_speedup,
):
x = self.p_sample_ddim(
x, torch.full((b,), i, device=device, dtype=torch.long),
infer_speedup, cond=cond
)
else:
for i in reversed(range(0, t, infer_speedup)):
x = self.p_sample_ddim(
x, torch.full((b,), i, device=device, dtype=torch.long),
infer_speedup, cond=cond
)
else:
raise NotImplementedError(method)
else:
if use_tqdm:
for i in tqdm(reversed(range(0, t)), desc='sample time step', total=t):
x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
else:
for i in reversed(range(0, t)):
x = self.p_sample(x, torch.full((b,), i, device=device, dtype=torch.long), cond)
x = x.squeeze(1).transpose(1, 2) # [B, T, M]
return self.denorm_spec(x)
def norm_spec(self, x):
return (x - self.spec_min) / (self.spec_max - self.spec_min) * 2 - 1
def denorm_spec(self, x):
return (x + 1) / 2 * (self.spec_max - self.spec_min) + self.spec_min
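
A worked sketch (not from this commit) of the linear beta schedule above and the closed-form forward process q(x_t | x_0) = N(sqrt(a_bar_t) * x_0, (1 - a_bar_t) * I) that q_sample() implements; the step count and frame size are illustrative:

import numpy as np

timesteps, max_beta = 1000, 0.02
betas = np.linspace(1e-4, max_beta, timesteps)        # same as linear_beta_schedule()
alphas_cumprod = np.cumprod(1.0 - betas)              # a_bar_t

t = 299                                               # i.e. 300 diffusion steps
x0 = np.random.randn(128)                             # stand-in for a normalized mel frame
noise = np.random.randn(128)
xt = np.sqrt(alphas_cumprod[t]) * x0 + np.sqrt(1.0 - alphas_cumprod[t]) * noise

print(round(float(np.sqrt(alphas_cumprod[t])), 2))    # ~0.63: signal scale left after 300 steps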

View File

@ -0,0 +1,122 @@
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from torch.nn.utils import weight_norm
from .pcmer import PCmer
class Unit2MelNaive(nn.Module):
def __init__(
self,
input_channel,
n_spk,
use_pitch_aug=False,
out_dims=128,
n_layers=3,
n_chans=256,
n_hidden=None, # deprecated, unused
use_speaker_encoder=False,
speaker_encoder_out_channels=256,
use_full_siren=False,
l2reg_loss=0
):
super().__init__()
self.l2reg_loss = l2reg_loss if (l2reg_loss is not None) else 0
self.f0_embed = nn.Linear(1, n_chans)
self.volume_embed = nn.Linear(1, n_chans)
if use_pitch_aug:
self.aug_shift_embed = nn.Linear(1, n_chans, bias=False)
else:
self.aug_shift_embed = None
self.n_spk = n_spk
self.use_speaker_encoder = use_speaker_encoder
if use_speaker_encoder:
self.spk_embed = nn.Linear(speaker_encoder_out_channels, n_chans, bias=False)
else:
if n_spk is not None and n_spk > 1:
self.spk_embed = nn.Embedding(n_spk, n_chans)
# conv in stack
self.stack = nn.Sequential(
nn.Conv1d(input_channel, n_chans, 3, 1, 1),
nn.GroupNorm(4, n_chans),
nn.LeakyReLU(),
nn.Conv1d(n_chans, n_chans, 3, 1, 1))
# transformer
if use_full_siren:
from .pcmer_siren_full import PCmer as PCmerfs
self.decoder = PCmerfs(
num_layers=n_layers,
num_heads=8,
dim_model=n_chans,
dim_keys=n_chans,
dim_values=n_chans,
residual_dropout=0.1,
attention_dropout=0.1)
else:
self.decoder = PCmer(
num_layers=n_layers,
num_heads=8,
dim_model=n_chans,
dim_keys=n_chans,
dim_values=n_chans,
residual_dropout=0.1,
attention_dropout=0.1)
self.norm = nn.LayerNorm(n_chans)
# out
self.n_out = out_dims
self.dense_out = weight_norm(
nn.Linear(n_chans, self.n_out))
def forward(self, units, f0, volume, spk_id=None, spk_mix_dict=None, aug_shift=None,
gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None, spk_emb_dict=None):
'''
input:
B x n_frames x n_unit
return:
dict of B x n_frames x feat
'''
x = self.stack(units.transpose(1,2)).transpose(1,2)
x = x + self.f0_embed((1+ f0 / 700).log()) + self.volume_embed(volume)
if self.use_speaker_encoder:
if spk_mix_dict is not None:
assert spk_emb_dict is not None
for k, v in spk_mix_dict.items():
spk_id_torch = spk_emb_dict[str(k)]
spk_id_torch = np.tile(spk_id_torch, (len(units), 1))
spk_id_torch = torch.from_numpy(spk_id_torch).float().to(units.device)
x = x + v * self.spk_embed(spk_id_torch)
else:
x = x + self.spk_embed(spk_emb)
else:
if self.n_spk is not None and self.n_spk > 1:
if spk_mix_dict is not None:
for k, v in spk_mix_dict.items():
spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
x = x + v * self.spk_embed(spk_id_torch - 1)
else:
x = x + self.spk_embed(spk_id - 1)
if self.aug_shift_embed is not None and aug_shift is not None:
x = x + self.aug_shift_embed(aug_shift / 5)
x = self.decoder(x)
x = self.norm(x)
x = self.dense_out(x)
if not infer:
x = F.mse_loss(x, gt_spec)
if self.l2reg_loss > 0:
x = x + l2_regularization(model=self, l2_alpha=self.l2reg_loss)
return x
def l2_regularization(model, l2_alpha):
l2_loss = []
for module in model.modules():
if type(module) is nn.Conv2d:
l2_loss.append((module.weight ** 2).sum() / 2.0)
return l2_alpha * sum(l2_loss)
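
A shape sketch (not part of this commit) for the naive model's forward pass, matching the docstring above (B x n_frames x n_unit in, B x n_frames x out_dims mel out). The 768 unit channels, 2 speakers and 50 frames are illustrative assumptions, and the PCmer module from the neighbouring file must be importable:

import torch

model = Unit2MelNaive(input_channel=768, n_spk=2, out_dims=128, n_layers=3, n_chans=256)
units = torch.randn(1, 50, 768)                       # B x n_frames x n_unit
f0 = torch.full((1, 50, 1), 220.0)                    # Hz
volume = torch.rand(1, 50, 1)
spk_id = torch.LongTensor([[1]])

with torch.no_grad():
    mel = model(units, f0, volume, spk_id=spk_id, infer=True)
print(mel.shape)                                      # torch.Size([1, 50, 128])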

View File

@ -0,0 +1,380 @@
import torch
from torch import nn
import math
from functools import partial
from einops import rearrange, repeat
from local_attention import LocalAttention
import torch.nn.functional as F
#import fast_transformers.causal_product.causal_product_cuda
def softmax_kernel(data, *, projection_matrix, is_query, normalize_data=True, eps=1e-4, device = None):
b, h, *_ = data.shape
# (batch size, head, length, model_dim)
# normalize model dim
data_normalizer = (data.shape[-1] ** -0.25) if normalize_data else 1.
# ratio = 1/sqrt(number of random features); projection_matrix.shape[0] --> 266
ratio = (projection_matrix.shape[0] ** -0.5)
projection = repeat(projection_matrix, 'j d -> b h j d', b = b, h = h)
projection = projection.type_as(data)
#data_dash = w^T x
data_dash = torch.einsum('...id,...jd->...ij', (data_normalizer * data), projection)
# diag_data = D**2
diag_data = data ** 2
diag_data = torch.sum(diag_data, dim=-1)
diag_data = (diag_data / 2.0) * (data_normalizer ** 2)
diag_data = diag_data.unsqueeze(dim=-1)
#print ()
if is_query:
data_dash = ratio * (
torch.exp(data_dash - diag_data -
torch.max(data_dash, dim=-1, keepdim=True).values) + eps)
else:
data_dash = ratio * (
torch.exp(data_dash - diag_data + eps))#- torch.max(data_dash)) + eps)
return data_dash.type_as(data)
def orthogonal_matrix_chunk(cols, qr_uniform_q = False, device = None):
unstructured_block = torch.randn((cols, cols), device = device)
q, r = torch.linalg.qr(unstructured_block.cpu(), mode='reduced')
q, r = map(lambda t: t.to(device), (q, r))
# proposed by @Parskatt
# to make sure Q is uniform https://arxiv.org/pdf/math-ph/0609050.pdf
if qr_uniform_q:
d = torch.diag(r, 0)
q *= d.sign()
return q.t()
def exists(val):
return val is not None
def empty(tensor):
return tensor.numel() == 0
def default(val, d):
return val if exists(val) else d
def cast_tuple(val):
return (val,) if not isinstance(val, tuple) else val
class PCmer(nn.Module):
"""The encoder that is used in the Transformer model."""
def __init__(self,
num_layers,
num_heads,
dim_model,
dim_keys,
dim_values,
residual_dropout,
attention_dropout):
super().__init__()
self.num_layers = num_layers
self.num_heads = num_heads
self.dim_model = dim_model
self.dim_values = dim_values
self.dim_keys = dim_keys
self.residual_dropout = residual_dropout
self.attention_dropout = attention_dropout
self._layers = nn.ModuleList([_EncoderLayer(self) for _ in range(num_layers)])
# METHODS ########################################################################################################
def forward(self, phone, mask=None):
# apply all layers to the input
for (i, layer) in enumerate(self._layers):
phone = layer(phone, mask)
# provide the final sequence
return phone
# ==================================================================================================================== #
# CLASS _ E N C O D E R L A Y E R #
# ==================================================================================================================== #
class _EncoderLayer(nn.Module):
"""One layer of the encoder.
Attributes:
attn: (:class:`mha.MultiHeadAttention`): The attention mechanism that is used to read the input sequence.
feed_forward (:class:`ffl.FeedForwardLayer`): The feed-forward layer on top of the attention mechanism.
"""
def __init__(self, parent: PCmer):
"""Creates a new instance of ``_EncoderLayer``.
Args:
parent (Encoder): The encoder that the layers is created for.
"""
super().__init__()
self.conformer = ConformerConvModule(parent.dim_model)
self.norm = nn.LayerNorm(parent.dim_model)
self.dropout = nn.Dropout(parent.residual_dropout)
# selfatt -> fastatt: performer!
self.attn = SelfAttention(dim = parent.dim_model,
heads = parent.num_heads,
causal = False)
# METHODS ########################################################################################################
def forward(self, phone, mask=None):
# compute attention sub-layer
phone = phone + (self.attn(self.norm(phone), mask=mask))
phone = phone + (self.conformer(phone))
return phone
def calc_same_padding(kernel_size):
pad = kernel_size // 2
return (pad, pad - (kernel_size + 1) % 2)
# helper classes
class Swish(nn.Module):
def forward(self, x):
return x * x.sigmoid()
class Transpose(nn.Module):
def __init__(self, dims):
super().__init__()
assert len(dims) == 2, 'dims must be a tuple of two dimensions'
self.dims = dims
def forward(self, x):
return x.transpose(*self.dims)
class GLU(nn.Module):
def __init__(self, dim):
super().__init__()
self.dim = dim
def forward(self, x):
out, gate = x.chunk(2, dim=self.dim)
return out * gate.sigmoid()
class DepthWiseConv1d(nn.Module):
def __init__(self, chan_in, chan_out, kernel_size, padding):
super().__init__()
self.padding = padding
self.conv = nn.Conv1d(chan_in, chan_out, kernel_size, groups = chan_in)
def forward(self, x):
x = F.pad(x, self.padding)
return self.conv(x)
class ConformerConvModule(nn.Module):
def __init__(
self,
dim,
causal = False,
expansion_factor = 2,
kernel_size = 31,
dropout = 0.):
super().__init__()
inner_dim = dim * expansion_factor
padding = calc_same_padding(kernel_size) if not causal else (kernel_size - 1, 0)
self.net = nn.Sequential(
nn.LayerNorm(dim),
Transpose((1, 2)),
nn.Conv1d(dim, inner_dim * 2, 1),
GLU(dim=1),
DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding),
#nn.BatchNorm1d(inner_dim) if not causal else nn.Identity(),
Swish(),
nn.Conv1d(inner_dim, dim, 1),
Transpose((1, 2)),
nn.Dropout(dropout)
)
def forward(self, x):
return self.net(x)
def linear_attention(q, k, v):
if v is None:
#print (k.size(), q.size())
out = torch.einsum('...ed,...nd->...ne', k, q)
return out
else:
k_cumsum = k.sum(dim = -2)
#k_cumsum = k.sum(dim = -2)
D_inv = 1. / (torch.einsum('...nd,...d->...n', q, k_cumsum.type_as(q)) + 1e-8)
context = torch.einsum('...nd,...ne->...de', k, v)
#print ("TRUEEE: ", context.size(), q.size(), D_inv.size())
out = torch.einsum('...de,...nd,...n->...ne', context, q, D_inv)
return out
def gaussian_orthogonal_random_matrix(nb_rows, nb_columns, scaling = 0, qr_uniform_q = False, device = None):
nb_full_blocks = int(nb_rows / nb_columns)
#print (nb_full_blocks)
block_list = []
for _ in range(nb_full_blocks):
q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device)
block_list.append(q)
        # block_list[n] is an orthogonal matrix ... (model_dim * model_dim)
#print (block_list[0].size(), torch.einsum('...nd,...nd->...n', block_list[0], torch.roll(block_list[0],1,1)))
#print (nb_rows, nb_full_blocks, nb_columns)
remaining_rows = nb_rows - nb_full_blocks * nb_columns
#print (remaining_rows)
if remaining_rows > 0:
q = orthogonal_matrix_chunk(nb_columns, qr_uniform_q = qr_uniform_q, device = device)
#print (q[:remaining_rows].size())
block_list.append(q[:remaining_rows])
final_matrix = torch.cat(block_list)
if scaling == 0:
multiplier = torch.randn((nb_rows, nb_columns), device = device).norm(dim = 1)
elif scaling == 1:
multiplier = math.sqrt((float(nb_columns))) * torch.ones((nb_rows,), device = device)
else:
raise ValueError(f'Invalid scaling {scaling}')
return torch.diag(multiplier) @ final_matrix
class FastAttention(nn.Module):
def __init__(self, dim_heads, nb_features = None, ortho_scaling = 0, causal = False, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, no_projection = False):
super().__init__()
nb_features = default(nb_features, int(dim_heads * math.log(dim_heads)))
self.dim_heads = dim_heads
self.nb_features = nb_features
self.ortho_scaling = ortho_scaling
self.create_projection = partial(gaussian_orthogonal_random_matrix, nb_rows = self.nb_features, nb_columns = dim_heads, scaling = ortho_scaling, qr_uniform_q = qr_uniform_q)
projection_matrix = self.create_projection()
self.register_buffer('projection_matrix', projection_matrix)
self.generalized_attention = generalized_attention
self.kernel_fn = kernel_fn
# if this is turned on, no projection will be used
# queries and keys will be softmax-ed as in the original efficient attention paper
self.no_projection = no_projection
self.causal = causal
if causal:
try:
import fast_transformers.causal_product.causal_product_cuda
self.causal_linear_fn = partial(causal_linear_attention)
except ImportError:
print('unable to import cuda code for auto-regressive Performer. will default to the memory inefficient non-cuda version')
self.causal_linear_fn = causal_linear_attention_noncuda
@torch.no_grad()
def redraw_projection_matrix(self):
projections = self.create_projection()
self.projection_matrix.copy_(projections)
del projections
def forward(self, q, k, v):
device = q.device
if self.no_projection:
q = q.softmax(dim = -1)
k = torch.exp(k) if self.causal else k.softmax(dim = -2)
elif self.generalized_attention:
create_kernel = partial(generalized_kernel, kernel_fn = self.kernel_fn, projection_matrix = self.projection_matrix, device = device)
q, k = map(create_kernel, (q, k))
else:
create_kernel = partial(softmax_kernel, projection_matrix = self.projection_matrix, device = device)
q = create_kernel(q, is_query = True)
k = create_kernel(k, is_query = False)
attn_fn = linear_attention if not self.causal else self.causal_linear_fn
if v is None:
out = attn_fn(q, k, None)
return out
else:
out = attn_fn(q, k, v)
return out
class SelfAttention(nn.Module):
def __init__(self, dim, causal = False, heads = 8, dim_head = 64, local_heads = 0, local_window_size = 256, nb_features = None, feature_redraw_interval = 1000, generalized_attention = False, kernel_fn = nn.ReLU(), qr_uniform_q = False, dropout = 0., no_projection = False):
super().__init__()
assert dim % heads == 0, 'dimension must be divisible by number of heads'
dim_head = default(dim_head, dim // heads)
inner_dim = dim_head * heads
self.fast_attention = FastAttention(dim_head, nb_features, causal = causal, generalized_attention = generalized_attention, kernel_fn = kernel_fn, qr_uniform_q = qr_uniform_q, no_projection = no_projection)
self.heads = heads
self.global_heads = heads - local_heads
self.local_attn = LocalAttention(window_size = local_window_size, causal = causal, autopad = True, dropout = dropout, look_forward = int(not causal), rel_pos_emb_config = (dim_head, local_heads)) if local_heads > 0 else None
#print (heads, nb_features, dim_head)
#name_embedding = torch.zeros(110, heads, dim_head, dim_head)
#self.name_embedding = nn.Parameter(name_embedding, requires_grad=True)
self.to_q = nn.Linear(dim, inner_dim)
self.to_k = nn.Linear(dim, inner_dim)
self.to_v = nn.Linear(dim, inner_dim)
self.to_out = nn.Linear(inner_dim, dim)
self.dropout = nn.Dropout(dropout)
@torch.no_grad()
def redraw_projection_matrix(self):
self.fast_attention.redraw_projection_matrix()
#torch.nn.init.zeros_(self.name_embedding)
#print (torch.sum(self.name_embedding))
def forward(self, x, context = None, mask = None, context_mask = None, name=None, inference=False, **kwargs):
b, n, _, h, gh = *x.shape, self.heads, self.global_heads
cross_attend = exists(context)
context = default(context, x)
context_mask = default(context_mask, mask) if not cross_attend else context_mask
#print (torch.sum(self.name_embedding))
q, k, v = self.to_q(x), self.to_k(context), self.to_v(context)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
(q, lq), (k, lk), (v, lv) = map(lambda t: (t[:, :gh], t[:, gh:]), (q, k, v))
attn_outs = []
#print (name)
#print (self.name_embedding[name].size())
if not empty(q):
if exists(context_mask):
global_mask = context_mask[:, None, :, None]
v.masked_fill_(~global_mask, 0.)
if cross_attend:
pass
#print (torch.sum(self.name_embedding))
#out = self.fast_attention(q,self.name_embedding[name],None)
#print (torch.sum(self.name_embedding[...,-1:]))
else:
out = self.fast_attention(q, k, v)
attn_outs.append(out)
if not empty(lq):
assert not cross_attend, 'local attention is not compatible with cross attention'
out = self.local_attn(lq, lk, lv, input_mask = mask)
attn_outs.append(out)
out = torch.cat(attn_outs, dim = 1)
out = rearrange(out, 'b h n d -> b n (h d)')
out = self.to_out(out)
return self.dropout(out)
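if __name__ == "__main__":
    # Minimal smoke test (added for illustration; not part of the upstream module).
    # It mirrors how the naive decoder builds its PCmer: an encoder stack whose
    # forward pass maps a [batch, n_frames, dim_model] tensor to the same shape.
    encoder = PCmer(num_layers=2,
                    num_heads=8,
                    dim_model=64,
                    dim_keys=64,
                    dim_values=64,
                    residual_dropout=0.1,
                    attention_dropout=0.1)
    phone = torch.randn(1, 100, 64)
    print(encoder(phone).shape)  # torch.Size([1, 100, 64])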

View File

@@ -0,0 +1,178 @@
import os
import yaml
import torch
import torch.nn as nn
import numpy as np
from .diffusion import GaussianDiffusion
from .wavenet import WaveNet
from .vocoder import Vocoder
from .naive.naive import Unit2MelNaive
class DotDict(dict):
def __getattr__(*args):
val = dict.get(*args)
return DotDict(val) if type(val) is dict else val
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
def load_model_vocoder(
model_path,
device='cpu',
loaded_vocoder=None):
config_file = os.path.join(os.path.split(model_path)[0], 'config.yaml')
with open(config_file, "r") as config:
args = yaml.safe_load(config)
args = DotDict(args)
# load vocoder
if loaded_vocoder is None:
vocoder = Vocoder(args.vocoder.type, args.vocoder.ckpt, device=device)
else:
vocoder = loaded_vocoder
# load model
model = load_svc_model(args=args, vocoder_dimension=vocoder.dimension)
print(' [Loading] ' + model_path)
ckpt = torch.load(model_path, map_location=torch.device(device))
model.to(device)
model.load_state_dict(ckpt['model'])
model.eval()
return model, vocoder, args
def load_model_vocoder_from_combo(combo_model_path, device='cpu'):
read_dict = torch.load(combo_model_path, map_location=torch.device(device))
# args
diff_args = DotDict(read_dict["diff_config_dict"])
naive_args = DotDict(read_dict["naive_config_dict"])
# vocoder
vocoder = Vocoder(diff_args.vocoder.type, diff_args.vocoder.ckpt, device=device)
# diff_model
print(' [Loading] ' + combo_model_path)
diff_model = load_svc_model(args=diff_args, vocoder_dimension=vocoder.dimension)
diff_model.to(device)
diff_model.load_state_dict(read_dict["diff_model"]['model'])
diff_model.eval()
# naive_model
naive_model = load_svc_model(args=naive_args, vocoder_dimension=vocoder.dimension)
naive_model.to(device)
naive_model.load_state_dict(read_dict["naive_model"]['model'])
naive_model.eval()
return diff_model, diff_args, naive_model, naive_args, vocoder
def load_svc_model(args, vocoder_dimension):
if args.model.type == 'Diffusion':
model = Unit2Mel(
args.data.encoder_out_channels,
args.model.n_spk,
args.model.use_pitch_aug,
vocoder_dimension,
args.model.n_layers,
args.model.n_chans,
args.model.n_hidden,
use_speaker_encoder=args.model.use_speaker_encoder,
speaker_encoder_out_channels=args.data.speaker_encoder_out_channels)
elif args.model.type == 'Naive':
model = Unit2MelNaive(
args.data.encoder_out_channels,
args.model.n_spk,
args.model.use_pitch_aug,
vocoder_dimension,
args.model.n_layers,
args.model.n_chans,
use_speaker_encoder=args.model.use_speaker_encoder,
speaker_encoder_out_channels=args.data.speaker_encoder_out_channels)
elif args.model.type == 'NaiveFS':
model = Unit2MelNaive(
args.data.encoder_out_channels,
args.model.n_spk,
args.model.use_pitch_aug,
vocoder_dimension,
args.model.n_layers,
args.model.n_chans,
use_speaker_encoder=args.model.use_speaker_encoder,
speaker_encoder_out_channels=args.data.speaker_encoder_out_channels,
use_full_siren=True,
l2reg_loss=args.model.l2_reg_loss)
else:
raise ("Unknow model")
return model
class Unit2Mel(nn.Module):
def __init__(
self,
input_channel,
n_spk,
use_pitch_aug=False,
out_dims=128,
n_layers=20,
n_chans=384,
n_hidden=256,
use_speaker_encoder=False,
speaker_encoder_out_channels=256):
super().__init__()
self.unit_embed = nn.Linear(input_channel, n_hidden)
self.f0_embed = nn.Linear(1, n_hidden)
self.volume_embed = nn.Linear(1, n_hidden)
if use_pitch_aug:
self.aug_shift_embed = nn.Linear(1, n_hidden, bias=False)
else:
self.aug_shift_embed = None
self.n_spk = n_spk
self.use_speaker_encoder = use_speaker_encoder
if use_speaker_encoder:
self.spk_embed = nn.Linear(speaker_encoder_out_channels, n_hidden, bias=False)
else:
if n_spk is not None and n_spk > 1:
self.spk_embed = nn.Embedding(n_spk, n_hidden)
# diffusion
self.decoder = GaussianDiffusion(WaveNet(out_dims, n_layers, n_chans, n_hidden), out_dims=out_dims)
def forward(self, units, f0, volume, spk_id=None, spk_mix_dict=None, aug_shift=None,
gt_spec=None, infer=True, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None, spk_emb_dict=None):
'''
input:
B x n_frames x n_unit
return:
dict of B x n_frames x feat
'''
x = self.unit_embed(units) + self.f0_embed((1 + f0 / 700).log()) + self.volume_embed(volume)
if self.use_speaker_encoder:
if spk_mix_dict is not None:
assert spk_emb_dict is not None
for k, v in spk_mix_dict.items():
spk_id_torch = spk_emb_dict[str(k)]
spk_id_torch = np.tile(spk_id_torch, (len(units), 1))
spk_id_torch = torch.from_numpy(spk_id_torch).float().to(units.device)
x = x + v * self.spk_embed(spk_id_torch)
else:
x = x + self.spk_embed(spk_emb)
else:
if self.n_spk is not None and self.n_spk > 1:
if spk_mix_dict is not None:
for k, v in spk_mix_dict.items():
spk_id_torch = torch.LongTensor(np.array([[k]])).to(units.device)
x = x + v * self.spk_embed(spk_id_torch - 1)
else:
x = x + self.spk_embed(spk_id - 1)
if self.aug_shift_embed is not None and aug_shift is not None:
x = x + self.aug_shift_embed(aug_shift / 5)
x = self.decoder(x, gt_spec=gt_spec, infer=infer, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm)
return x
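if __name__ == "__main__":
    # Illustrative sketch only: "path/to/combo_model.ptc" is a placeholder, not a file
    # shipped with this commit. A combined checkpoint stores the diffusion and naive
    # models plus their configs, and load_model_vocoder_from_combo() unpacks all of them.
    diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(
        "path/to/combo_model.ptc", device="cpu")
    print(diff_args.model.type, naive_args.model.type)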

View File

@@ -0,0 +1,96 @@
import torch
from nsf_hifigan.nvSTFT import STFT
from nsf_hifigan.models import load_model, load_config
from torchaudio.transforms import Resample
class Vocoder:
def __init__(self, vocoder_type, vocoder_ckpt, device=None):
if device is None:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.device = device
if vocoder_type == 'nsf-hifigan':
self.vocoder = NsfHifiGAN(vocoder_ckpt, device=device)
elif vocoder_type == 'nsf-hifigan-log10':
self.vocoder = NsfHifiGANLog10(vocoder_ckpt, device=device)
else:
raise ValueError(f" [x] Unknown vocoder: {vocoder_type}")
self.resample_kernel = {}
self.vocoder_sample_rate = self.vocoder.sample_rate()
self.vocoder_hop_size = self.vocoder.hop_size()
self.dimension = self.vocoder.dimension()
def extract(self, audio, sample_rate, keyshift=0):
# resample
if sample_rate == self.vocoder_sample_rate:
audio_res = audio
else:
key_str = str(sample_rate)
if key_str not in self.resample_kernel:
self.resample_kernel[key_str] = Resample(sample_rate, self.vocoder_sample_rate,
lowpass_filter_width=128).to(self.device)
audio_res = self.resample_kernel[key_str](audio)
# extract
mel = self.vocoder.extract(audio_res, keyshift=keyshift) # B, n_frames, bins
return mel
def infer(self, mel, f0):
f0 = f0[:, :mel.size(1), 0] # B, n_frames
audio = self.vocoder(mel, f0)
return audio
class NsfHifiGAN(torch.nn.Module):
def __init__(self, model_path, device=None):
super().__init__()
if device is None:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.device = device
self.model_path = model_path
self.model = None
self.h = load_config(model_path)
self.stft = STFT(
self.h.sampling_rate,
self.h.num_mels,
self.h.n_fft,
self.h.win_size,
self.h.hop_size,
self.h.fmin,
self.h.fmax)
def sample_rate(self):
return self.h.sampling_rate
def hop_size(self):
return self.h.hop_size
def dimension(self):
return self.h.num_mels
def extract(self, audio, keyshift=0):
mel = self.stft.get_mel(audio, keyshift=keyshift).transpose(1, 2) # B, n_frames, bins
return mel
def forward(self, mel, f0):
if self.model is None:
print('| Load HifiGAN: ', self.model_path)
self.model, self.h = load_model(self.model_path, device=self.device)
with torch.no_grad():
c = mel.transpose(1, 2)
audio = self.model(c, f0)
return audio
class NsfHifiGANLog10(NsfHifiGAN):
def forward(self, mel, f0):
if self.model is None:
print('| Load HifiGAN: ', self.model_path)
self.model, self.h = load_model(self.model_path, device=self.device)
with torch.no_grad():
c = 0.434294 * mel.transpose(1, 2)
audio = self.model(c, f0)
return audio
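if __name__ == "__main__":
    # Illustrative sketch only: the checkpoint path below is a placeholder and must
    # point at an nsf-hifigan model whose config.json sits next to it.
    # extract() resamples to the vocoder rate and returns a mel of [B, n_frames, n_mels];
    # infer() takes that mel plus an f0 curve of [B, n_frames, 1] and returns a waveform.
    vocoder = Vocoder('nsf-hifigan', 'path/to/nsf_hifigan/model', device='cpu')
    audio = 0.1 * torch.randn(1, 44100)
    mel = vocoder.extract(audio, sample_rate=44100)
    f0 = torch.full((1, mel.size(1), 1), 220.0)
    wav = vocoder.infer(mel, f0)
    print(mel.shape, wav.shape)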

View File

@@ -0,0 +1,108 @@
import math
from math import sqrt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Mish
class Conv1d(torch.nn.Conv1d):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
nn.init.kaiming_normal_(self.weight)
class SinusoidalPosEmb(nn.Module):
def __init__(self, dim):
super().__init__()
self.dim = dim
def forward(self, x):
device = x.device
half_dim = self.dim // 2
emb = math.log(10000) / (half_dim - 1)
emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
emb = x[:, None] * emb[None, :]
emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
return emb
class ResidualBlock(nn.Module):
def __init__(self, encoder_hidden, residual_channels, dilation):
super().__init__()
self.residual_channels = residual_channels
self.dilated_conv = nn.Conv1d(
residual_channels,
2 * residual_channels,
kernel_size=3,
padding=dilation,
dilation=dilation
)
self.diffusion_projection = nn.Linear(residual_channels, residual_channels)
self.conditioner_projection = nn.Conv1d(encoder_hidden, 2 * residual_channels, 1)
self.output_projection = nn.Conv1d(residual_channels, 2 * residual_channels, 1)
def forward(self, x, conditioner, diffusion_step):
diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
conditioner = self.conditioner_projection(conditioner)
y = x + diffusion_step
y = self.dilated_conv(y) + conditioner
# Using torch.split instead of torch.chunk to avoid using onnx::Slice
gate, filter = torch.split(y, [self.residual_channels, self.residual_channels], dim=1)
y = torch.sigmoid(gate) * torch.tanh(filter)
y = self.output_projection(y)
# Using torch.split instead of torch.chunk to avoid using onnx::Slice
residual, skip = torch.split(y, [self.residual_channels, self.residual_channels], dim=1)
return (x + residual) / math.sqrt(2.0), skip
class WaveNet(nn.Module):
def __init__(self, in_dims=128, n_layers=20, n_chans=384, n_hidden=256):
super().__init__()
self.input_projection = Conv1d(in_dims, n_chans, 1)
self.diffusion_embedding = SinusoidalPosEmb(n_chans)
self.mlp = nn.Sequential(
nn.Linear(n_chans, n_chans * 4),
Mish(),
nn.Linear(n_chans * 4, n_chans)
)
self.residual_layers = nn.ModuleList([
ResidualBlock(
encoder_hidden=n_hidden,
residual_channels=n_chans,
dilation=1
)
for i in range(n_layers)
])
self.skip_projection = Conv1d(n_chans, n_chans, 1)
self.output_projection = Conv1d(n_chans, in_dims, 1)
nn.init.zeros_(self.output_projection.weight)
def forward(self, spec, diffusion_step, cond):
"""
:param spec: [B, 1, M, T]
:param diffusion_step: [B, 1]
:param cond: [B, M, T]
:return:
"""
x = spec.squeeze(1)
x = self.input_projection(x) # [B, residual_channel, T]
x = F.relu(x)
diffusion_step = self.diffusion_embedding(diffusion_step)
diffusion_step = self.mlp(diffusion_step)
skip = []
for layer in self.residual_layers:
x, skip_connection = layer(x, cond, diffusion_step)
skip.append(skip_connection)
x = torch.sum(torch.stack(skip), dim=0) / sqrt(len(self.residual_layers))
x = self.skip_projection(x)
x = F.relu(x)
x = self.output_projection(x) # [B, mel_bins, T]
return x[:, None, :, :]
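if __name__ == "__main__":
    # Minimal shape check (added for illustration; not upstream code). A batch of two
    # noisy mel spectrograms [B, 1, mel_bins, T] is denoised one step, conditioned on a
    # hidden sequence [B, n_hidden, T] and one integer diffusion step per sample.
    net = WaveNet(in_dims=128, n_layers=4, n_chans=64, n_hidden=32)
    spec = torch.randn(2, 1, 128, 100)
    cond = torch.randn(2, 32, 100)
    step = torch.randint(0, 1000, (2,))
    print(net(spec, step, cond).shape)  # torch.Size([2, 1, 128, 100])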

View File

@@ -0,0 +1,15 @@
import os
import shutil
class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
def build_env(config, config_name, path):
t_path = os.path.join(path, config_name)
if config != t_path:
os.makedirs(path, exist_ok=True)
shutil.copyfile(config, os.path.join(path, config_name))
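if __name__ == "__main__":
    # Illustration only: AttrDict exposes the HiFi-GAN config with attribute access
    # (h.sampling_rate) while still behaving like a plain dict.
    h = AttrDict({"sampling_rate": 44100, "num_mels": 128})
    print(h.sampling_rate, h["num_mels"])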

View File

@@ -0,0 +1,434 @@
import os
import json
from .env import AttrDict
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from .utils import init_weights, get_padding
LRELU_SLOPE = 0.1
def load_model(model_path, device='cuda'):
h = load_config(model_path)
generator = Generator(h).to(device)
cp_dict = torch.load(model_path, map_location=device)
generator.load_state_dict(cp_dict['generator'])
generator.eval()
generator.remove_weight_norm()
del cp_dict
return generator, h
def load_config(model_path):
config_file = os.path.join(os.path.split(model_path)[0], 'config.json')
with open(config_file) as f:
data = f.read()
json_config = json.loads(data)
h = AttrDict(json_config)
return h
class ResBlock1(torch.nn.Module):
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
super(ResBlock1, self).__init__()
self.h = h
self.convs1 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
padding=get_padding(kernel_size, dilation[2])))
])
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1)))
])
self.convs2.apply(init_weights)
def forward(self, x):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE)
xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE)
xt = c2(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
class ResBlock2(torch.nn.Module):
def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
super(ResBlock2, self).__init__()
self.h = h
self.convs = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1])))
])
self.convs.apply(init_weights)
def forward(self, x):
for c in self.convs:
xt = F.leaky_relu(x, LRELU_SLOPE)
xt = c(xt)
x = xt + x
return x
def remove_weight_norm(self):
for l in self.convs:
remove_weight_norm(l)
class SineGen(torch.nn.Module):
""" Definition of sine generator
SineGen(samp_rate, harmonic_num = 0,
sine_amp = 0.1, noise_std = 0.003,
voiced_threshold = 0,
flag_for_pulse=False)
samp_rate: sampling rate in Hz
harmonic_num: number of harmonic overtones (default 0)
    sine_amp: amplitude of sine-waveform (default 0.1)
    noise_std: std of Gaussian noise (default 0.003)
    voiced_threshold: F0 threshold for U/V classification (default 0)
    flag_for_pulse: this SineGen is used inside PulseGen (default False)
Note: when flag_for_pulse is True, the first time step of a voiced
segment is always sin(np.pi) or cos(0)
"""
def __init__(self, samp_rate, harmonic_num=0,
sine_amp=0.1, noise_std=0.003,
voiced_threshold=0):
super(SineGen, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
self.harmonic_num = harmonic_num
self.dim = self.harmonic_num + 1
self.sampling_rate = samp_rate
self.voiced_threshold = voiced_threshold
def _f02uv(self, f0):
# generate uv signal
uv = torch.ones_like(f0)
uv = uv * (f0 > self.voiced_threshold)
return uv
@torch.no_grad()
def forward(self, f0, upp):
""" sine_tensor, uv = forward(f0)
input F0: tensor(batchsize=1, length, dim=1)
f0 for unvoiced steps should be 0
output sine_tensor: tensor(batchsize=1, length, dim)
output uv: tensor(batchsize=1, length, 1)
"""
f0 = f0.unsqueeze(-1)
fn = torch.multiply(f0, torch.arange(1, self.dim + 1, device=f0.device).reshape((1, 1, -1)))
        rad_values = (fn / self.sampling_rate) % 1  # the % 1 here means the n_har products cannot be optimized away in later post-processing
rand_ini = torch.rand(fn.shape[0], fn.shape[2], device=fn.device)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
is_half = rad_values.dtype is not torch.float32
        tmp_over_one = torch.cumsum(rad_values.double(), 1)  # no % 1 here: taking % 1 would prevent the later cumsum from being optimized
if is_half:
tmp_over_one = tmp_over_one.half()
else:
tmp_over_one = tmp_over_one.float()
tmp_over_one *= upp
tmp_over_one = F.interpolate(
tmp_over_one.transpose(2, 1), scale_factor=upp,
mode='linear', align_corners=True
).transpose(2, 1)
rad_values = F.interpolate(rad_values.transpose(2, 1), scale_factor=upp, mode='nearest').transpose(2, 1)
tmp_over_one %= 1
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
rad_values = rad_values.double()
cumsum_shift = cumsum_shift.double()
sine_waves = torch.sin(torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi)
if is_half:
sine_waves = sine_waves.half()
else:
sine_waves = sine_waves.float()
sine_waves = sine_waves * self.sine_amp
return sine_waves
class SourceModuleHnNSF(torch.nn.Module):
""" SourceModule for hn-nsf
SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0)
sampling_rate: sampling_rate in Hz
harmonic_num: number of harmonic above F0 (default: 0)
sine_amp: amplitude of sine source signal (default: 0.1)
add_noise_std: std of additive Gaussian noise (default: 0.003)
note that amplitude of noise in unvoiced is decided
by sine_amp
    voiced_threshold: threshold to set U/V given F0 (default: 0)
    Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
    F0_sampled (batchsize, length, 1)
    Sine_source (batchsize, length, 1)
    noise_source (batchsize, length, 1)
uv (batchsize, length, 1)
"""
def __init__(self, sampling_rate, harmonic_num=0, sine_amp=0.1,
add_noise_std=0.003, voiced_threshod=0):
super(SourceModuleHnNSF, self).__init__()
self.sine_amp = sine_amp
self.noise_std = add_noise_std
# to produce sine waveforms
self.l_sin_gen = SineGen(sampling_rate, harmonic_num,
sine_amp, add_noise_std, voiced_threshod)
# to merge source harmonics into a single excitation
self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
self.l_tanh = torch.nn.Tanh()
def forward(self, x, upp):
sine_wavs = self.l_sin_gen(x, upp)
sine_merge = self.l_tanh(self.l_linear(sine_wavs))
return sine_merge
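# Usage note (added for illustration, not upstream code): with an f0 track of shape
# (batch, n_frames), SourceModuleHnNSF(...).forward(f0, upp) returns a merged sine
# excitation of shape (batch, n_frames * upp, 1); Generator passes
# upp = int(np.prod(h.upsample_rates)) so the excitation matches the output length.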
class Generator(torch.nn.Module):
def __init__(self, h):
super(Generator, self).__init__()
self.h = h
self.num_kernels = len(h.resblock_kernel_sizes)
self.num_upsamples = len(h.upsample_rates)
self.m_source = SourceModuleHnNSF(
sampling_rate=h.sampling_rate,
harmonic_num=8
)
self.noise_convs = nn.ModuleList()
self.conv_pre = weight_norm(Conv1d(h.num_mels, h.upsample_initial_channel, 7, 1, padding=3))
resblock = ResBlock1 if h.resblock == '1' else ResBlock2
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
c_cur = h.upsample_initial_channel // (2 ** (i + 1))
self.ups.append(weight_norm(
ConvTranspose1d(h.upsample_initial_channel // (2 ** i), h.upsample_initial_channel // (2 ** (i + 1)),
k, u, padding=(k - u) // 2)))
if i + 1 < len(h.upsample_rates): #
stride_f0 = int(np.prod(h.upsample_rates[i + 1:]))
self.noise_convs.append(Conv1d(
1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2))
else:
self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
self.resblocks = nn.ModuleList()
ch = h.upsample_initial_channel
for i in range(len(self.ups)):
ch //= 2
for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
self.resblocks.append(resblock(h, ch, k, d))
self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
self.ups.apply(init_weights)
self.conv_post.apply(init_weights)
self.upp = int(np.prod(h.upsample_rates))
def forward(self, x, f0):
har_source = self.m_source(f0, self.upp).transpose(1, 2)
x = self.conv_pre(x)
for i in range(self.num_upsamples):
x = F.leaky_relu(x, LRELU_SLOPE)
x = self.ups[i](x)
x_source = self.noise_convs[i](har_source)
x = x + x_source
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i * self.num_kernels + j](x)
else:
xs += self.resblocks[i * self.num_kernels + j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
print('Removing weight norm...')
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
remove_weight_norm(self.conv_pre)
remove_weight_norm(self.conv_post)
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
])
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
def forward(self, x):
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, periods=None):
super(MultiPeriodDiscriminator, self).__init__()
self.periods = periods if periods is not None else [2, 3, 5, 7, 11]
self.discriminators = nn.ModuleList()
for period in self.periods:
self.discriminators.append(DiscriminatorP(period))
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 128, 15, 1, padding=7)),
norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
])
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
def forward(self, x):
fmap = []
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiScaleDiscriminator(torch.nn.Module):
def __init__(self):
super(MultiScaleDiscriminator, self).__init__()
self.discriminators = nn.ModuleList([
DiscriminatorS(use_spectral_norm=True),
DiscriminatorS(),
DiscriminatorS(),
])
self.meanpools = nn.ModuleList([
AvgPool1d(4, 2, padding=2),
AvgPool1d(4, 2, padding=2)
])
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
if i != 0:
y = self.meanpools[i - 1](y)
y_hat = self.meanpools[i - 1](y_hat)
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
fmap_rs.append(fmap_r)
y_d_gs.append(y_d_g)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
def feature_loss(fmap_r, fmap_g):
loss = 0
for dr, dg in zip(fmap_r, fmap_g):
for rl, gl in zip(dr, dg):
loss += torch.mean(torch.abs(rl - gl))
return loss * 2
def discriminator_loss(disc_real_outputs, disc_generated_outputs):
loss = 0
r_losses = []
g_losses = []
for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
r_loss = torch.mean((1 - dr) ** 2)
g_loss = torch.mean(dg ** 2)
loss += (r_loss + g_loss)
r_losses.append(r_loss.item())
g_losses.append(g_loss.item())
return loss, r_losses, g_losses
def generator_loss(disc_outputs):
loss = 0
gen_losses = []
for dg in disc_outputs:
l = torch.mean((1 - dg) ** 2)
gen_losses.append(l)
loss += l
return loss, gen_losses
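if __name__ == "__main__":
    # Shape-level illustration (not upstream code): random tensors stand in for real
    # and generated audio of shape [B, 1, T]; only the loss plumbing is exercised.
    mpd = MultiPeriodDiscriminator()
    y, y_hat = torch.randn(2, 1, 8192), torch.randn(2, 1, 8192)
    y_d_rs, y_d_gs, fmap_rs, fmap_gs = mpd(y, y_hat)
    loss_disc, _, _ = discriminator_loss(y_d_rs, y_d_gs)
    loss_gen, _ = generator_loss(y_d_gs)
    loss_fm = feature_loss(fmap_rs, fmap_gs)
    print(loss_disc.item(), loss_gen.item(), loss_fm.item())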

View File

@@ -0,0 +1,125 @@
import os
os.environ["LRU_CACHE_CAPACITY"] = "3"
import torch
import torch.utils.data
import numpy as np
import librosa
from librosa.filters import mel as librosa_mel_fn
import soundfile as sf
import torch.nn.functional as F
def load_wav_to_torch(full_path, target_sr=None, return_empty_on_exception=False):
sampling_rate = None
try:
        data, sampling_rate = sf.read(full_path, always_2d=True)  # read with soundfile, always as a 2-D array
except Exception as ex:
print(f"'{full_path}' failed to load.\nException:")
print(ex)
if return_empty_on_exception:
return [], sampling_rate or target_sr or 48000
else:
raise Exception(ex)
if len(data.shape) > 1:
data = data[:, 0]
assert len(data) > 2# check duration of audio file is > 2 samples (because otherwise the slice operation was on the wrong dimension)
if np.issubdtype(data.dtype, np.integer): # if audio data is type int
max_mag = -np.iinfo(data.dtype).min # maximum magnitude = min possible value of intXX
else: # if audio data is type fp32
max_mag = max(np.amax(data), -np.amin(data))
max_mag = (2**31)+1 if max_mag > (2**15) else ((2**15)+1 if max_mag > 1.01 else 1.0) # data should be either 16-bit INT, 32-bit INT or [-1 to 1] float32
data = torch.FloatTensor(data.astype(np.float32))/max_mag
if (torch.isinf(data) | torch.isnan(data)).any() and return_empty_on_exception:# resample will crash with inf/NaN inputs. return_empty_on_exception will return empty arr instead of except
return [], sampling_rate or target_sr or 48000
if target_sr is not None and sampling_rate != target_sr:
data = torch.from_numpy(librosa.core.resample(data.numpy(), orig_sr=sampling_rate, target_sr=target_sr))
sampling_rate = target_sr
return data, sampling_rate
def dynamic_range_compression(x, C=1, clip_val=1e-5):
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
def dynamic_range_decompression(x, C=1):
return np.exp(x) / C
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
return torch.log(torch.clamp(x, min=clip_val) * C)
def dynamic_range_decompression_torch(x, C=1):
return torch.exp(x) / C
class STFT():
def __init__(self, sr=22050, n_mels=80, n_fft=1024, win_size=1024, hop_length=256, fmin=20, fmax=11025, clip_val=1e-5):
self.target_sr = sr
self.n_mels = n_mels
self.n_fft = n_fft
self.win_size = win_size
self.hop_length = hop_length
self.fmin = fmin
self.fmax = fmax
self.clip_val = clip_val
self.mel_basis = {}
self.hann_window = {}
def get_mel(self, y, keyshift=0, speed=1, center=False):
sampling_rate = self.target_sr
n_mels = self.n_mels
n_fft = self.n_fft
win_size = self.win_size
hop_length = self.hop_length
fmin = self.fmin
fmax = self.fmax
clip_val = self.clip_val
factor = 2 ** (keyshift / 12)
n_fft_new = int(np.round(n_fft * factor))
win_size_new = int(np.round(win_size * factor))
hop_length_new = int(np.round(hop_length * speed))
if torch.min(y) < -1.:
print('min value is ', torch.min(y))
if torch.max(y) > 1.:
print('max value is ', torch.max(y))
mel_basis_key = str(fmax)+'_'+str(y.device)
if mel_basis_key not in self.mel_basis:
mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax)
self.mel_basis[mel_basis_key] = torch.from_numpy(mel).float().to(y.device)
keyshift_key = str(keyshift)+'_'+str(y.device)
if keyshift_key not in self.hann_window:
self.hann_window[keyshift_key] = torch.hann_window(win_size_new).to(y.device)
pad_left = (win_size_new - hop_length_new) //2
pad_right = max((win_size_new- hop_length_new + 1) //2, win_size_new - y.size(-1) - pad_left)
if pad_right < y.size(-1):
mode = 'reflect'
else:
mode = 'constant'
y = torch.nn.functional.pad(y.unsqueeze(1), (pad_left, pad_right), mode = mode)
y = y.squeeze(1)
spec = torch.stft(y, n_fft_new, hop_length=hop_length_new, win_length=win_size_new, window=self.hann_window[keyshift_key],
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
spec = torch.sqrt(spec.real.pow(2) + spec.imag.pow(2) + (1e-9))
if keyshift != 0:
size = n_fft // 2 + 1
resize = spec.size(1)
if resize < size:
spec = F.pad(spec, (0, 0, 0, size-resize))
spec = spec[:, :size, :] * win_size / win_size_new
spec = torch.matmul(self.mel_basis[mel_basis_key], spec)
spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
return spec
def __call__(self, audiopath):
audio, sr = load_wav_to_torch(audiopath, target_sr=self.target_sr)
spect = self.get_mel(audio.unsqueeze(0)).squeeze(0)
return spect
stft = STFT()
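if __name__ == "__main__":
    # Illustration only: the module-level `stft` above uses the 22.05 kHz defaults;
    # NsfHifiGAN builds its own STFT from the vocoder's config.json instead.
    audio = 0.1 * torch.randn(1, 22050)      # one second of noise at the default rate
    mel = stft.get_mel(audio)                # [1, n_mels, n_frames]
    print(mel.shape)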

View File

@@ -0,0 +1,68 @@
import glob
import os
import matplotlib
import torch
from torch.nn.utils import weight_norm
matplotlib.use("Agg")
import matplotlib.pylab as plt
def plot_spectrogram(spectrogram):
fig, ax = plt.subplots(figsize=(10, 2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
interpolation='none')
plt.colorbar(im, ax=ax)
fig.canvas.draw()
plt.close()
return fig
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def apply_weight_norm(m):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
weight_norm(m)
def get_padding(kernel_size, dilation=1):
return int((kernel_size*dilation - dilation)/2)
def load_checkpoint(filepath, device):
assert os.path.isfile(filepath)
print("Loading '{}'".format(filepath))
checkpoint_dict = torch.load(filepath, map_location=device)
print("Complete.")
return checkpoint_dict
def save_checkpoint(filepath, obj):
print("Saving checkpoint to {}".format(filepath))
torch.save(obj, filepath)
print("Complete.")
def del_old_checkpoints(cp_dir, prefix, n_models=2):
pattern = os.path.join(cp_dir, prefix + '????????')
cp_list = glob.glob(pattern) # get checkpoint paths
cp_list = sorted(cp_list)# sort by iter
if len(cp_list) > n_models: # if more than n_models models are found
        for cp in cp_list[:-n_models]:  # delete the oldest models, keeping only the latest n_models
open(cp, 'w').close()# empty file contents
os.unlink(cp)# delete file (move to trash when using Colab)
def scan_checkpoint(cp_dir, prefix):
pattern = os.path.join(cp_dir, prefix + '????????')
cp_list = glob.glob(pattern)
if len(cp_list) == 0:
return None
return sorted(cp_list)[-1]
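if __name__ == "__main__":
    # Illustration only: the directory and prefix below are placeholders.
    # scan_checkpoint() returns the newest file matching "<prefix>????????",
    # or None if the directory holds no checkpoints yet.
    latest = scan_checkpoint("path/to/checkpoints", "g_")
    if latest is not None:
        state = load_checkpoint(latest, "cpu")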

View File

@@ -0,0 +1,4 @@
Modules in this folder are taken from https://github.com/CNChTu/Diffusion-SVC at commit ae4120a2b6399ed5657b16dc702b57220fe4a295.

View File

@@ -0,0 +1,165 @@
import librosa
import torch
import torchaudio
class Slicer:
def __init__(self,
sr: int,
threshold: float = -40.,
min_length: int = 5000,
min_interval: int = 300,
hop_size: int = 20,
max_sil_kept: int = 5000):
if not min_length >= min_interval >= hop_size:
raise ValueError('The following condition must be satisfied: min_length >= min_interval >= hop_size')
if not max_sil_kept >= hop_size:
raise ValueError('The following condition must be satisfied: max_sil_kept >= hop_size')
min_interval = sr * min_interval / 1000
self.threshold = 10 ** (threshold / 20.)
self.hop_size = round(sr * hop_size / 1000)
self.win_size = min(round(min_interval), 4 * self.hop_size)
self.min_length = round(sr * min_length / 1000 / self.hop_size)
self.min_interval = round(min_interval / self.hop_size)
self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
def _apply_slice(self, waveform, begin, end):
if len(waveform.shape) > 1:
return waveform[:, begin * self.hop_size: min(waveform.shape[1], end * self.hop_size)]
else:
return waveform[begin * self.hop_size: min(waveform.shape[0], end * self.hop_size)]
# @timeit
def slice(self, waveform):
if len(waveform.shape) > 1:
samples = librosa.to_mono(waveform)
else:
samples = waveform
if samples.shape[0] <= self.min_length:
return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
rms_list = librosa.feature.rms(y=samples, frame_length=self.win_size, hop_length=self.hop_size).squeeze(0)
sil_tags = []
silence_start = None
clip_start = 0
for i, rms in enumerate(rms_list):
# Keep looping while frame is silent.
if rms < self.threshold:
# Record start of silent frames.
if silence_start is None:
silence_start = i
continue
# Keep looping while frame is not silent and silence start has not been recorded.
if silence_start is None:
continue
# Clear recorded silence start if interval is not enough or clip is too short
is_leading_silence = silence_start == 0 and i > self.max_sil_kept
need_slice_middle = i - silence_start >= self.min_interval and i - clip_start >= self.min_length
if not is_leading_silence and not need_slice_middle:
silence_start = None
continue
# Need slicing. Record the range of silent frames to be removed.
if i - silence_start <= self.max_sil_kept:
pos = rms_list[silence_start: i + 1].argmin() + silence_start
if silence_start == 0:
sil_tags.append((0, pos))
else:
sil_tags.append((pos, pos))
clip_start = pos
elif i - silence_start <= self.max_sil_kept * 2:
pos = rms_list[i - self.max_sil_kept: silence_start + self.max_sil_kept + 1].argmin()
pos += i - self.max_sil_kept
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
if silence_start == 0:
sil_tags.append((0, pos_r))
clip_start = pos_r
else:
sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
clip_start = max(pos_r, pos)
else:
pos_l = rms_list[silence_start: silence_start + self.max_sil_kept + 1].argmin() + silence_start
pos_r = rms_list[i - self.max_sil_kept: i + 1].argmin() + i - self.max_sil_kept
if silence_start == 0:
sil_tags.append((0, pos_r))
else:
sil_tags.append((pos_l, pos_r))
clip_start = pos_r
silence_start = None
# Deal with trailing silence.
total_frames = rms_list.shape[0]
if silence_start is not None and total_frames - silence_start >= self.min_interval:
silence_end = min(total_frames, silence_start + self.max_sil_kept)
pos = rms_list[silence_start: silence_end + 1].argmin() + silence_start
sil_tags.append((pos, total_frames + 1))
# Apply and return slices.
if len(sil_tags) == 0:
return {"0": {"slice": False, "split_time": f"0,{len(waveform)}"}}
else:
chunks = []
            # The first silent segment does not start at the very beginning; prepend the voiced segment before it.
if sil_tags[0][0]:
chunks.append(
{"slice": False, "split_time": f"0,{min(waveform.shape[0], sil_tags[0][0] * self.hop_size)}"})
for i in range(0, len(sil_tags)):
                # Mark the voiced segment (skipping the first one).
if i:
chunks.append({"slice": False,
"split_time": f"{sil_tags[i - 1][1] * self.hop_size},{min(waveform.shape[0], sil_tags[i][0] * self.hop_size)}"})
                # Mark every silent segment.
chunks.append({"slice": True,
"split_time": f"{sil_tags[i][0] * self.hop_size},{min(waveform.shape[0], sil_tags[i][1] * self.hop_size)}"})
            # The last silent segment does not reach the end; append the trailing voiced segment.
if sil_tags[-1][1] * self.hop_size < len(waveform):
chunks.append({"slice": False, "split_time": f"{sil_tags[-1][1] * self.hop_size},{len(waveform)}"})
chunk_dict = {}
for i in range(len(chunks)):
chunk_dict[str(i)] = chunks[i]
return chunk_dict
def cut(audio_path, db_thresh=-30, min_len=5000, flask_mode=False, flask_sr=None):
if not flask_mode:
audio, sr = librosa.load(audio_path, sr=None)
else:
audio = audio_path
sr = flask_sr
slicer = Slicer(
sr=sr,
threshold=db_thresh,
min_length=min_len
)
chunks = slicer.slice(audio)
return chunks
def chunks2audio(audio_path, chunks):
chunks = dict(chunks)
audio, sr = torchaudio.load(audio_path)
if len(audio.shape) == 2 and audio.shape[1] >= 2:
audio = torch.mean(audio, dim=0).unsqueeze(0)
audio = audio.cpu().numpy()[0]
result = []
for k, v in chunks.items():
tag = v["split_time"].split(",")
if tag[0] != tag[1]:
result.append((v["slice"], audio[int(tag[0]):int(tag[1])]))
return result, sr
def split(audio, sample_rate, hop_size, db_thresh=-40, min_len=5000):
slicer = Slicer(
sr=sample_rate,
threshold=db_thresh,
min_length=min_len)
chunks = dict(slicer.slice(audio))
result = []
for k, v in chunks.items():
tag = v["split_time"].split(",")
if tag[0] != tag[1]:
start_frame = int(int(tag[0]) // hop_size)
end_frame = int(int(tag[1]) // hop_size)
if end_frame > start_frame:
result.append((
start_frame,
audio[int(start_frame * hop_size): int(end_frame * hop_size)]))
return result
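if __name__ == "__main__":
    # Illustration only: split() yields (start_frame, segment) pairs for the non-silent
    # regions, with start_frame measured in hops of `hop_size` samples.
    import numpy as np
    sr = 44100
    audio = np.concatenate([np.zeros(sr), 0.5 * np.random.randn(2 * sr), np.zeros(sr)]).astype(np.float32)
    for start_frame, segment in split(audio, sample_rate=sr, hop_size=512):
        print(start_frame, len(segment))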

View File

@@ -0,0 +1,808 @@
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import pyworld as pw
import parselmouth
import torchcrepe
import librosa
import fsspec
from tqdm import tqdm
from transformers import HubertModel, Wav2Vec2FeatureExtractor, Wav2Vec2ForCTC
from fairseq import checkpoint_utils
from encoder.hubert.model import HubertSoft
from encoder.speaker_encoder.model import SpeakerEncoder as TTSSpeakerEncoder
import scipy.signal
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
from torchaudio.transforms import Resample
CREPE_RESAMPLE_KERNEL = {}
class SpeakerEncoder:
def __init__(self, speaker_encoder, speaker_encoder_config, speaker_encoder_ckpt, encoder_sample_rate,
device='cuda',
use_torchaudio=False):
self.use_torchaudio = use_torchaudio
self.encoder_sample_rate = encoder_sample_rate
self.device = device
self.resample_kernel = {}
if speaker_encoder == "ge2e":
self.encoder = GE2E(speaker_encoder_config, speaker_encoder_ckpt, device=device)
else:
raise ValueError(f" [x] Unknown speaker encoder: {speaker_encoder}")
def __call__(self, audio=None, audio_t=None,
                 sample_rate=44100):  # if use_torchaudio, audio_t must be a tensor; otherwise audio must be a numpy array
audio_res = None
if sample_rate == self.encoder_sample_rate:
if self.use_torchaudio and (audio_t is not None):
audio_res = audio_t.cpu().numpy().squeeze(0)
else:
if audio is not None:
audio_res = audio
else:
key_str = str(sample_rate)
if self.use_torchaudio and (audio_t is not None):
if key_str not in self.resample_kernel:
self.resample_kernel[key_str] = Resample(sample_rate, self.encoder_sample_rate,
lowpass_filter_width=128).to(self.device)
audio_res = self.resample_kernel[key_str](audio_t).cpu().numpy().squeeze(0)
else:
if audio is not None:
audio_res = librosa.resample(audio, orig_sr=sample_rate, target_sr=self.encoder_sample_rate)
assert audio_res is not None
return self.encoder(audio_res)
def mean_spk_emb_from_wav_list(self, audio_list, sr_list):
assert len(audio_list) == len(sr_list)
batch_spk_emb = None
print("Get mean spk_emb from audio_list")
for index in tqdm(range(len(audio_list))):
audio = audio_list[index]
sample_rate = sr_list[index]
            f_len = int(50 * len(audio) / sample_rate)  # 50 frames/s corresponds to sr=16000, hop_size=320
spk_emb = self.__call__(audio=audio, sample_rate=sample_rate)
spk_emb = np.tile(spk_emb, (f_len, 1))
if batch_spk_emb is None:
batch_spk_emb = spk_emb
else:
batch_spk_emb = np.concatenate([spk_emb, batch_spk_emb], axis=0)
return np.mean(batch_spk_emb, axis=0)
def mean_spk_emb_from_path_list(self, path_list):
batch_spk_emb = None
print("Get mean spk_emb from path_list")
for path in tqdm(path_list):
audio, sample_rate = librosa.load(path, sr=None)
            f_len = int(50 * len(audio) / sample_rate)  # 50 frames/s corresponds to sr=16000, hop_size=320
spk_emb = self.__call__(audio=audio, sample_rate=sample_rate)
spk_emb = np.tile(spk_emb, (f_len, 1))
if batch_spk_emb is None:
batch_spk_emb = spk_emb
else:
batch_spk_emb = np.concatenate([spk_emb, batch_spk_emb], axis=0)
return np.mean(batch_spk_emb, axis=0)
class GE2E:
def __init__(self, config_path, ckpt_path, device='cuda'):
import json5
with open(config_path) as f:
self.config = json5.load(f)
# load model
self.model = TTSSpeakerEncoder(
self.config['model']["input_dim"],
self.config['model']["proj_dim"],
self.config['model']["lstm_dim"],
self.config['model']["num_lstm_layers"],
)
with fsspec.open(ckpt_path, "rb") as f:
state = torch.load(f, map_location=device)
self.model.load_state_dict(state["model"])
self.model = self.model.to(device)
self.model.eval()
self.preemphasis = self.config["audio"]["preemphasis"]
self.do_amp_to_db_mel = True
self.fft_size = self.config["audio"]["fft_size"]
self.hop_length = self.config["audio"]["hop_length"]
self.win_length = self.config["audio"]["win_length"]
self.signal_norm = self.config['audio']['signal_norm']
self.num_mels = self.config["audio"]["num_mels"]
self.ref_level_db = self.config["audio"]['ref_level_db']
self.min_level_db = self.config["audio"]['min_level_db']
self.symmetric_norm = self.config["audio"]['symmetric_norm']
self.clip_norm = self.config["audio"]['clip_norm']
self.max_norm = self.config["audio"]['max_norm']
self.stft_pad_mode = 'reflect'
self.spec_gain = 20.0
self.base = 10
self.device = device
mel_basis = librosa.filters.mel(
sr=self.config["audio"]["sample_rate"], n_fft=self.config["audio"]['fft_size'],
n_mels=self.num_mels, fmin=self.config["audio"]['mel_fmin'],
fmax=self.config["audio"]['mel_fmax']
)
self.mel_basis = torch.from_numpy(mel_basis).float()
def __call__(self, audio, use_old_infer=True):
y = audio
if self.preemphasis != 0:
y = scipy.signal.lfilter([1, -self.preemphasis], [1], y)
D = librosa.stft(
y=y,
n_fft=self.fft_size, hop_length=self.hop_length, win_length=self.win_length, pad_mode=self.stft_pad_mode,
window="hann", center=True)
D = np.abs(D)
D = np.dot(self.mel_basis, D)
if self.base == 10:
spec = self.spec_gain * np.log10(np.maximum(1e-5, D))
else:
spec = self.spec_gain * np.log(np.maximum(1e-5, D))
spec = self.normalize(spec).astype(np.float32)
spec = torch.from_numpy(spec.T)
spec = spec.to(self.device)
spec = spec.unsqueeze(0)
if use_old_infer:
spk_emb = self.compute_embedding_old(spec).detach().cpu().numpy()
else:
spk_emb = self.model.compute_embedding(spec).detach().cpu().numpy()
return spk_emb.squeeze()
def normalize(self, S) -> np.ndarray:
S = S.copy()
if self.signal_norm:
S -= self.ref_level_db
S_norm = (S - self.min_level_db) / (-self.min_level_db)
if self.symmetric_norm:
S_norm = ((2 * self.max_norm) * S_norm) - self.max_norm
if self.clip_norm:
S_norm = np.clip(S_norm, -self.max_norm, self.max_norm)
return S_norm
else:
S_norm = self.max_norm * S_norm
if self.clip_norm:
S_norm = np.clip(S_norm, 0, self.max_norm)
return S_norm
else:
return S
def compute_embedding_old(self, x, num_frames=250, num_eval=10, return_mean=True):
max_len = x.shape[1]
if max_len < num_frames:
num_frames = max_len
offsets = np.linspace(0, max_len - num_frames, num=num_eval)
frames_batch = []
for offset in offsets:
offset = int(offset)
end_offset = int(offset + num_frames)
frames = x[:, offset:end_offset]
frames_batch.append(frames)
frames_batch = torch.cat(frames_batch, dim=0)
embeddings = self.model.inference(frames_batch)
if return_mean:
embeddings = torch.mean(embeddings, dim=0, keepdim=True)
return embeddings
class F0_Extractor:
def __init__(self, f0_extractor, sample_rate=44100, hop_size=512, f0_min=65, f0_max=800,
block_size=None, model_sampling_rate=None):
self.block_size = block_size
self.model_sampling_rate = model_sampling_rate
self.f0_extractor = f0_extractor
self.sample_rate = sample_rate
self.hop_size = hop_size
self.f0_min = f0_min
self.f0_max = f0_max
self.transformer_f0 = None
if f0_extractor == 'crepe':
key_str = str(sample_rate)
if key_str not in CREPE_RESAMPLE_KERNEL:
CREPE_RESAMPLE_KERNEL[key_str] = Resample(sample_rate, 16000, lowpass_filter_width=128)
self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
if (self.block_size is not None) or (self.model_sampling_rate is not None):
assert (self.block_size is not None) and (self.model_sampling_rate is not None)
self.hop_size_follow_input = True
else:
self.hop_size_follow_input = False
def extract(self, audio, uv_interp=False, device=None, silence_front=0, sr=None): # audio: 1d numpy array
if sr is not None:
assert self.hop_size_follow_input
self.hop_size = self.block_size * sr / self.model_sampling_rate
if (self.f0_extractor == 'crepe') and (sr != self.sample_rate):
key_str = str(sr)
if key_str not in CREPE_RESAMPLE_KERNEL:
CREPE_RESAMPLE_KERNEL[key_str] = Resample(sr, 16000, lowpass_filter_width=128)
self.resample_kernel = CREPE_RESAMPLE_KERNEL[key_str]
self.sample_rate = sr
# extractor start time
raw_audio = audio
n_frames = int(len(audio) // self.hop_size) + 1
start_frame = int(silence_front * self.sample_rate / self.hop_size)
real_silence_front = start_frame * self.hop_size / self.sample_rate
audio = audio[int(np.round(real_silence_front * self.sample_rate)):]
# extract f0 using parselmouth
if self.f0_extractor == 'parselmouth':
f0 = parselmouth.Sound(audio, self.sample_rate).to_pitch_ac(
time_step=self.hop_size / self.sample_rate,
voicing_threshold=0.6,
pitch_floor=self.f0_min,
pitch_ceiling=self.f0_max).selected_array['frequency']
pad_size = start_frame + (int(len(audio) // self.hop_size) - len(f0) + 1) // 2
f0 = np.pad(f0, (pad_size, n_frames - len(f0) - pad_size))
# extract f0 using dio
elif self.f0_extractor == 'dio':
_f0, t = pw.dio(
audio.astype('double'),
self.sample_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
channels_in_octave=2,
frame_period=(1000 * self.hop_size / self.sample_rate))
f0 = pw.stonemask(audio.astype('double'), _f0, t, self.sample_rate)
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
# extract f0 using harvest
elif self.f0_extractor == 'harvest':
f0, _ = pw.harvest(
audio.astype('double'),
self.sample_rate,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
frame_period=(1000 * self.hop_size / self.sample_rate))
f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
# extract f0 using crepe
elif self.f0_extractor == 'crepe':
if device is None:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
resample_kernel = self.resample_kernel.to(device)
wav16k_torch = resample_kernel(torch.FloatTensor(audio).unsqueeze(0).to(device))
f0, pd = torchcrepe.predict(wav16k_torch, 16000, 80, self.f0_min, self.f0_max, pad=True, model='full',
batch_size=512, device=device, return_periodicity=True)
pd = median_pool_1d(pd, 4)
f0 = torchcrepe.threshold.At(0.05)(f0, pd)
f0 = masked_avg_pool_1d(f0, 4)
f0 = f0.squeeze(0).cpu().numpy()
f0 = np.array(
[f0[int(min(int(np.round(n * self.hop_size / self.sample_rate / 0.005)), len(f0) - 1))] for n in
range(n_frames - start_frame)])
f0 = np.pad(f0, (start_frame, 0))
elif self.f0_extractor == "transformer_f0":
if self.transformer_f0 is None:
from transformer_f0.model import TransformerF0Infer
self.transformer_f0 = TransformerF0Infer(model_path='exp/f0_test_genshin/model_540000.pt')
# raw_audio = audio
f0 = self.transformer_f0(audio=raw_audio, sr=self.sample_rate)
# f0 = f0.transpose(1, 2)
# f0 = torch.nn.functional.interpolate(f0, size=int(n_frames), mode='nearest')
# f0 = f0.transpose(1, 2)
f0 = f0.squeeze().cpu().numpy()
# f0 = np.pad(f0.astype('float'), (start_frame, n_frames - len(f0) - start_frame))
else:
raise ValueError(f" [x] Unknown f0 extractor: {self.f0_extractor}")
# interpolate the unvoiced f0
if uv_interp:
uv = f0 == 0
if len(f0[~uv]) > 0:
f0[uv] = np.interp(np.where(uv)[0], np.where(~uv)[0], f0[~uv])
f0[f0 < self.f0_min] = self.f0_min
return f0
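# Editor's usage sketch (not part of the original module, never called): how `extract`
# above is typically driven. `f0_extractor` is an already constructed F0_Extractor
# (its __init__ sits above this excerpt); `audio` is a 1-D numpy waveform at `sr` Hz.
# Pass `sr` only when the extractor was built with block_size/model_sampling_rate,
# otherwise the hop_size_follow_input assertion at the top of extract() fires.
def _sketch_extract_f0(f0_extractor, audio, sr=None):
    # uv_interp=True linearly interpolates f0 over unvoiced frames, as implemented above;
    # silence_front skips analysis of the leading seconds of the input.
    return f0_extractor.extract(audio, uv_interp=True, device=None, silence_front=0, sr=sr)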
class Volume_Extractor:
def __init__(self, hop_size=512, block_size=None, model_sampling_rate=None):
self.block_size = block_size
self.model_sampling_rate = model_sampling_rate
self.hop_size = hop_size
if (self.block_size is not None) or (self.model_sampling_rate is not None):
assert (self.block_size is not None) and (self.model_sampling_rate is not None)
self.hop_size_follow_input = True
else:
self.hop_size_follow_input = False
def extract(self, audio, sr=None): # audio: 1d numpy array
if sr is not None:
assert self.hop_size_follow_input
self.hop_size = self.block_size * sr / self.model_sampling_rate
n_frames = int(len(audio) // self.hop_size) + 1
audio2 = audio ** 2
audio2 = np.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode='reflect')
volume = np.array(
[np.mean(audio2[int(n * self.hop_size): int((n + 1) * self.hop_size)]) for n in range(n_frames)])
volume = np.sqrt(volume)
'''
if isinstance(audio, torch.Tensor):
n_frames = int(audio.size(-1) // self.hop_size) + 1
audio2 = audio ** 2
audio2 = torch.nn.functional.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)),
mode='reflect')
audio_frame = torch.nn.functional.unfold(audio2[:, None, None, :], (1, int(self.hop_size)),
stride=int(self.hop_size))[:, :, :n_frames]
volume = audio_frame.mean(dim=1)[0]
volume = torch.sqrt(volume).squeeze().cpu().numpy()
else:
n_frames = int(len(audio) // self.hop_size) + 1
audio2 = audio ** 2
audio2 = np.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode='reflect')
volume = np.array(
[np.mean(audio2[int(n * self.hop_size): int((n + 1) * self.hop_size)]) for n in range(n_frames)])
volume = np.sqrt(volume)
'''
return volume
def get_mask_from_volume(self, volume, threhold=-60.0, device='cpu'):
mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
mask = upsample(mask, self.block_size).squeeze(-1)
return mask
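# Editor's usage sketch (not part of the original module, never called): frame-wise RMS
# volume and the mask derived from it. `volume_extractor` is a Volume_Extractor built with
# block_size and model_sampling_rate, which get_mask_from_volume() relies on via upsample();
# `audio` is a 1-D numpy waveform. The -60 dB threshold is the default above, not a tuned value.
def _sketch_extract_volume(volume_extractor, audio, sr=None, device='cpu'):
    volume = volume_extractor.extract(audio, sr=sr)  # shape: (n_frames,)
    mask = volume_extractor.get_mask_from_volume(volume, threhold=-60.0, device=device)
    return volume, mask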
class Units_Encoder:
def __init__(self, encoder, encoder_ckpt, encoder_sample_rate=16000, encoder_hop_size=320, device=None,
cnhubertsoft_gate=10, units_forced_mode='nearest'):
if device is None:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.device = device
if cnhubertsoft_gate is None:
cnhubertsoft_gate = 10
if units_forced_mode is None:
units_forced_mode = 'left'
self.units_forced_mode = units_forced_mode
is_loaded_encoder = False
if encoder == 'hubertsoft':
self.model = Audio2HubertSoft(encoder_ckpt).to(device)
is_loaded_encoder = True
if encoder == 'hubertbase':
self.model = Audio2HubertBase(encoder_ckpt, device=device)
is_loaded_encoder = True
if encoder == 'hubertbase768':
self.model = Audio2HubertBase768(encoder_ckpt, device=device)
is_loaded_encoder = True
if encoder == 'hubertbase768l12':
self.model = Audio2HubertBase768L12(encoder_ckpt, device=device)
is_loaded_encoder = True
if encoder == 'hubertlarge1024l24':
self.model = Audio2HubertLarge1024L24(encoder_ckpt, device=device)
is_loaded_encoder = True
if encoder == 'contentvec':
self.model = Audio2ContentVec(encoder_ckpt, device=device)
is_loaded_encoder = True
if encoder == 'contentvec768':
self.model = Audio2ContentVec768(encoder_ckpt, device=device)
is_loaded_encoder = True
if encoder == 'contentvec768l12':
self.model = Audio2ContentVec768L12(encoder_ckpt, device=device)
is_loaded_encoder = True
if encoder == 'cnhubertsoftfish':
self.model = CNHubertSoftFish(encoder_ckpt, device=device, gate_size=cnhubertsoft_gate)
is_loaded_encoder = True
if encoder in ('wav2vec2', 'wav2vec2-xlsr-53-espeak-cv-ft'):
self.model = Wav2Vec2(encoder_ckpt, device=device)
is_loaded_encoder = True
if not is_loaded_encoder:
raise ValueError(f" [x] Unknown units encoder: {encoder}")
print(f"Units Forced Mode:{self.units_forced_mode}")
if self.units_forced_mode == 'rfa512to441':
encoder_sample_rate = encoder_sample_rate * 441 / 512
if self.units_forced_mode == 'rfa441to512':
encoder_sample_rate = encoder_sample_rate * 512 / 441
self.resample_kernel = {}
self.encoder_sample_rate = encoder_sample_rate
self.encoder_hop_size = encoder_hop_size
def encode(self,
audio, # B, T
sample_rate,
hop_size,
padding_mask=None):
# resample
if self.units_forced_mode not in ('rfa441to512', 'rfa512to441'):
if sample_rate == self.encoder_sample_rate:
audio_res = audio
else:
key_str = str(sample_rate)
if key_str not in self.resample_kernel:
self.resample_kernel[key_str] = Resample(sample_rate, self.encoder_sample_rate,
lowpass_filter_width=128).to(self.device)
audio_res = self.resample_kernel[key_str](audio)
else:
if isinstance(audio, np.ndarray):
_audio = audio
else:
_audio = audio.cpu().numpy()
audio_res = librosa.resample(_audio, orig_sr=sample_rate, target_sr=self.encoder_sample_rate)
audio_res = torch.from_numpy(audio_res).to(self.device)
# encode
if audio_res.size(-1) < 400:
audio_res = torch.nn.functional.pad(audio_res, (0, 400 - audio_res.size(-1)))
units = self.model(audio_res, padding_mask=padding_mask)
# alignment
if self.units_forced_mode == 'left':
n_frames = audio.size(-1) // hop_size + 1
ratio = (hop_size / sample_rate) / (self.encoder_hop_size / self.encoder_sample_rate)
index = torch.clamp(torch.round(ratio * torch.arange(n_frames).to(self.device)).long(), max=units.size(1) - 1)
units_aligned = torch.gather(units, 1, index.unsqueeze(0).unsqueeze(-1).repeat([1, 1, units.size(-1)]))
elif self.units_forced_mode == 'nearest':
n_frames = int(audio.size(-1) // hop_size + 1)
units = units.transpose(1, 2)
units_aligned = torch.nn.functional.interpolate(units, size=int(n_frames), mode='nearest')
units_aligned = units_aligned.transpose(1, 2)
elif self.units_forced_mode in ('rfa441to512', 'rfa512to441'):
n_frames = int(audio.size(-1) // hop_size + 1)
units = units.transpose(1, 2)
units_aligned = torch.nn.functional.interpolate(units, size=int(n_frames), mode='nearest')
units_aligned = units_aligned.transpose(1, 2)
else:
raise ValueError(f'Unknown units_forced_mode: {self.units_forced_mode}')
return units_aligned
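# Editor's usage sketch (not part of the original module, never called): encode a waveform
# into units aligned to the model's frame grid. `audio_t` is a torch tensor shaped (B, T);
# the encoder name comes from the list handled in __init__ above, and the checkpoint path
# is a placeholder, not a file shipped with this repository.
def _sketch_encode_units(audio_t, sample_rate, hop_size, device=None):
    encoder = Units_Encoder('contentvec768l12', 'pretrain/checkpoint_best_legacy_500.pt',
                            encoder_sample_rate=16000, encoder_hop_size=320, device=device)
    return encoder.encode(audio_t, sample_rate, hop_size)  # (B, n_frames, feat_dim)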
class Audio2HubertSoft(torch.nn.Module):
def __init__(self, path, h_sample_rate=16000, h_hop_size=320):
super().__init__()
print(' [Encoder Model] HuBERT Soft')
self.hubert = HubertSoft()
print(' [Loading] ' + path)
checkpoint = torch.load(path)
consume_prefix_in_state_dict_if_present(checkpoint, "module.")
self.hubert.load_state_dict(checkpoint)
self.hubert.eval()
def forward(self, audio, padding_mask=None): # B, T
with torch.inference_mode():
units = self.hubert.units(audio.unsqueeze(1))
return units
class Audio2ContentVec():
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
self.device = device
print(' [Encoder Model] Content Vec')
print(' [Loading] ' + path)
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
self.hubert = self.models[0]
self.hubert = self.hubert.to(self.device)
self.hubert.eval()
def __call__(self, audio, padding_mask=None): # B, T
# wav_tensor = torch.from_numpy(audio).to(self.device)
wav_tensor = audio
feats = wav_tensor.view(1, -1)
if padding_mask is None:
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
else:
padding_mask = padding_mask.bool()
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
inputs = {
"source": feats.to(wav_tensor.device),
"padding_mask": padding_mask.to(wav_tensor.device),
"output_layer": 9, # layer 9
}
with torch.no_grad():
logits = self.hubert.extract_features(**inputs)
feats = self.hubert.final_proj(logits[0])
units = feats # .transpose(2, 1)
return units
class Audio2ContentVec768():
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
self.device = device
print(' [Encoder Model] Content Vec')
print(' [Loading] ' + path)
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
self.hubert = self.models[0]
self.hubert = self.hubert.to(self.device)
self.hubert.eval()
def __call__(self, audio, padding_mask=None): # B, T
# wav_tensor = torch.from_numpy(audio).to(self.device)
wav_tensor = audio
feats = wav_tensor.view(1, -1)
if padding_mask is None:
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
else:
padding_mask = padding_mask.bool()
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
inputs = {
"source": feats.to(wav_tensor.device),
"padding_mask": padding_mask.to(wav_tensor.device),
"output_layer": 9, # layer 9
}
with torch.no_grad():
logits = self.hubert.extract_features(**inputs)
feats = logits[0]
units = feats # .transpose(2, 1)
return units
class Audio2ContentVec768L12():
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
self.device = device
print(' [Encoder Model] Content Vec')
print(' [Loading] ' + path)
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
self.hubert = self.models[0]
self.hubert = self.hubert.to(self.device)
self.hubert.eval()
def __call__(self, audio, padding_mask=None): # B, T
# wav_tensor = torch.from_numpy(audio).to(self.device)
wav_tensor = audio
feats = wav_tensor.view(1, -1)
if padding_mask is None:
padding_mask = torch.BoolTensor(feats.shape).fill_(False)
else:
padding_mask = padding_mask.bool()
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
inputs = {
"source": feats.to(wav_tensor.device),
"padding_mask": padding_mask.to(wav_tensor.device),
"output_layer": 12, # layer 12
}
with torch.no_grad():
logits = self.hubert.extract_features(**inputs)
feats = logits[0]
units = feats # .transpose(2, 1)
return units
class CNHubertSoftFish(torch.nn.Module):
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu', gate_size=10):
super().__init__()
self.device = device
print(' [Encoder Model] CN Hubert Soft fish')
print(' [Loading] ' + path)
self.gate_size = gate_size
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"./pretrain/TencentGameMate/chinese-hubert-base")
self.model = HubertModel.from_pretrained("./pretrain/TencentGameMate/chinese-hubert-base")
self.proj = torch.nn.Sequential(torch.nn.Dropout(0.1), torch.nn.Linear(768, 256))
# self.label_embedding = nn.Embedding(128, 256)
state_dict = torch.load(path, map_location=device)
self.load_state_dict(state_dict)
@torch.no_grad()
def forward(self, audio, padding_mask=None): # B, T
input_values = self.feature_extractor(
audio, sampling_rate=16000, return_tensors="pt"
).input_values
input_values = input_values.to(self.model.device)
return self._forward(input_values[0])
@torch.no_grad()
def _forward(self, input_values):
features = self.model(input_values)
features = self.proj(features.last_hidden_state)
# Top-k gating
topk, indices = torch.topk(features, self.gate_size, dim=2)
features = torch.zeros_like(features).scatter(2, indices, topk)
features = features / features.sum(2, keepdim=True)
return features.to(self.device) # .transpose(1, 2)
class Audio2HubertBase():
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
self.device = device
print(' [Encoder Model] HuBERT Base')
print(' [Loading] ' + path)
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
self.hubert = self.models[0]
self.hubert = self.hubert.to(self.device)
self.hubert = self.hubert.float()
self.hubert.eval()
def __call__(self, audio, padding_mask=None): # B, T
with torch.no_grad():
if padding_mask is None:
padding_mask = torch.BoolTensor(audio.shape).fill_(False)
else:
padding_mask = padding_mask.bool()
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
inputs = {
"source": audio.to(self.device),
"padding_mask": padding_mask.to(self.device),
"output_layer": 9, # layer 9
}
logits = self.hubert.extract_features(**inputs)
units = self.hubert.final_proj(logits[0])
return units
class Audio2HubertBase768():
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
self.device = device
print(' [Encoder Model] HuBERT Base')
print(' [Loading] ' + path)
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
self.hubert = self.models[0]
self.hubert = self.hubert.to(self.device)
self.hubert = self.hubert.float()
self.hubert.eval()
def __call__(self, audio, padding_mask=None): # B, T
with torch.no_grad():
if padding_mask is None:
padding_mask = torch.BoolTensor(audio.shape).fill_(False)
else:
padding_mask = padding_mask.bool()
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
inputs = {
"source": audio.to(self.device),
"padding_mask": padding_mask.to(self.device),
"output_layer": 9, # layer 9
}
logits = self.hubert.extract_features(**inputs)
units = logits[0]
return units
class Audio2HubertBase768L12():
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
self.device = device
print(' [Encoder Model] HuBERT Base')
print(' [Loading] ' + path)
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
self.hubert = self.models[0]
self.hubert = self.hubert.to(self.device)
self.hubert = self.hubert.float()
self.hubert.eval()
def __call__(self, audio, padding_mask=None): # B, T
with torch.no_grad():
if padding_mask is None:
padding_mask = torch.BoolTensor(audio.shape).fill_(False)
else:
padding_mask = padding_mask.bool()
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
inputs = {
"source": audio.to(self.device),
"padding_mask": padding_mask.to(self.device),
"output_layer": 12, # layer 12
}
logits = self.hubert.extract_features(**inputs)
units = logits[0]
return units
class Audio2HubertLarge1024L24():
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
self.device = device
print(' [Encoder Model] HuBERT Large')
print(' [Loading] ' + path)
self.models, self.saved_cfg, self.task = checkpoint_utils.load_model_ensemble_and_task([path], suffix="", )
self.hubert = self.models[0]
self.hubert = self.hubert.to(self.device)
self.hubert = self.hubert.float()
self.hubert.eval()
def __call__(self, audio, padding_mask=None): # B, T
with torch.no_grad():
if padding_mask is None:
padding_mask = torch.BoolTensor(audio.shape).fill_(False)
else:
padding_mask = padding_mask.bool()
padding_mask = ~padding_mask if torch.all(padding_mask) else padding_mask
inputs = {
"source": audio.to(self.device),
"padding_mask": padding_mask.to(self.device),
"output_layer": 24, # layer 24
}
logits = self.hubert.extract_features(**inputs)
units = logits[0]
return units
class Wav2Vec2:
def __init__(self, path, h_sample_rate=16000, h_hop_size=320, device='cpu'):
self.device = device
self.model = Wav2Vec2ForCTC.from_pretrained(path)
self.model.eval()
self.model.to(device)
def __call__(self, audio, padding_mask=None): # B, T
with torch.no_grad():
logits = self.model(audio).logits
return logits
class DotDict(dict):
def __getattr__(*args):
val = dict.get(*args)
return DotDict(val) if type(val) is dict else val
__setattr__ = dict.__setitem__
__delattr__ = dict.__delitem__
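# Editor's illustration (not part of the original module, never called): DotDict exposes
# nested dicts, e.g. a parsed YAML config, through attribute access.
def _sketch_dotdict():
    args = DotDict({'data': {'sampling_rate': 44100, 'block_size': 512}})
    return args.data.sampling_rate  # -> 44100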
def masked_avg_pool_1d(x, kernel_size):
x = x.unsqueeze(1)
x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
mask = ~torch.isnan(x)
masked_x = torch.where(mask, x, torch.zeros_like(x))
ones_kernel = torch.ones(x.size(1), 1, kernel_size, device=x.device)
# Perform sum pooling
sum_pooled = F.conv1d(
masked_x,
ones_kernel,
stride=1,
padding=0,
groups=x.size(1),
)
# Count the non-masked (valid) elements in each pooling window
valid_count = F.conv1d(
mask.float(),
ones_kernel,
stride=1,
padding=0,
groups=x.size(1),
)
valid_count = valid_count.clamp(min=1) # Avoid division by zero
# Perform masked average pooling
avg_pooled = sum_pooled / valid_count
return avg_pooled.squeeze(1)
def median_pool_1d(x, kernel_size):
x = x.unsqueeze(1)
x = F.pad(x, ((kernel_size - 1) // 2, kernel_size // 2), mode="reflect")
x = x.squeeze(1)
x = x.unfold(1, kernel_size, 1)
x, _ = torch.sort(x, dim=-1)
return x[:, :, (kernel_size - 1) // 2]
def upsample(signal, factor):
signal = signal.permute(0, 2, 1)
signal = nn.functional.interpolate(torch.cat((signal, signal[:, :, -1:]), 2), size=signal.shape[-1] * factor + 1,
mode='linear', align_corners=True)
signal = signal[:, :, :-1]
return signal.permute(0, 2, 1)
def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
result = np.zeros(idx + b.shape[0])
fade_len = a.shape[0] - idx
np.copyto(dst=result[:idx], src=a[:idx])
k = np.linspace(0, 1.0, num=fade_len, endpoint=True)
result[idx: a.shape[0]] = (1 - k) * a[idx:] + k * b[: fade_len]
np.copyto(dst=result[a.shape[0]:], src=b[fade_len:])
return result
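# Editor's usage sketch (not part of the original module, never called): cross_fade is the
# overlap-add step used when stitching consecutive output chunks. `idx` is the sample index
# in `a` where `b` starts; the last (len(a) - idx) samples of `a` are linearly faded into the
# head of `b`. The chunk length and overlap below are arbitrary placeholder values.
def _sketch_cross_fade():
    sr = 44100
    a = np.random.randn(sr).astype(np.float32)   # previous chunk, 1 second
    b = np.random.randn(sr).astype(np.float32)   # next chunk, 1 second
    overlap = sr // 10                           # 100 ms cross-fade region
    out = cross_fade(a, b, idx=len(a) - overlap)
    return out                                   # len(out) == len(a) - overlap + len(b)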

View File

@ -0,0 +1,80 @@
import os
import numpy as np
from tqdm import tqdm
import pickle
import torch
from pathlib import Path
def train_index(path):
import faiss
# from: RVC https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI
# collect the list of unit feature files
listdir_res = []
for file in os.listdir(path):
listdir_res.append(os.path.join(path, file))
npys = []
# load the files
print(" [INFO] Loading the Units files...")
for name in tqdm(sorted(listdir_res)):
phone = np.load(name)
npys.append(phone)
# build and train the index
big_npy = np.concatenate(npys, 0)
big_npy_idx = np.arange(big_npy.shape[0])
np.random.shuffle(big_npy_idx)
big_npy = big_npy[big_npy_idx]
n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39)
index = faiss.index_factory(big_npy.shape[1], "IVF%s,Flat" % n_ivf)
index_ivf = faiss.extract_index_ivf(index) #
index_ivf.nprobe = 1
index.train(big_npy)
batch_size_add = 8192
print(" [INFO] Training the Units indexes...")
for i in tqdm(range(0, big_npy.shape[0], batch_size_add)):
index.add(big_npy[i: i + batch_size_add])
return index
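# Editor's usage sketch (not part of the original module, never called): build an index from
# a directory of per-frame unit .npy files and store it in the pickle layout that
# UnitsIndexer.load() below expects (units_index/spk{spk_id}.pkl holding {str(spk_id): index}).
# The directory names are placeholders.
def _sketch_train_and_save_index(units_dir='exp/my_model/units/1', spk_id=1, exp_path='exp/my_model'):
    index = train_index(units_dir)
    out_dir = os.path.join(exp_path, 'units_index')
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, f'spk{spk_id}.pkl'), 'wb') as f:
        pickle.dump({str(spk_id): index}, f)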
class UnitsIndexer:
def __init__(self, exp_path):
exp_path = Path(exp_path)
self.model = None
self.exp_path = exp_path
self.spk_id = -1
self.active = False
self.big_all_npy = None
def load(self, spk_id=1, exp_path=None):
if (exp_path is not None) and not os.path.samefile(self.exp_path, Path(exp_path)):
exp_path = Path(exp_path)
self.exp_path = exp_path
index_pkl_path = os.path.join(self.exp_path, 'units_index', f'spk{spk_id}.pkl')
if not os.path.isfile(index_pkl_path):
self.active = False
print(f" [WARNING] No such file as {index_pkl_path}, Disable Units Indexer.")
else:
import faiss
self.spk_id = spk_id
self.active = True
with open(index_pkl_path, "rb") as f:
self.model = pickle.load(f)[str(spk_id)]
self.big_all_npy = self.model.reconstruct_n(0, self.model.ntotal)
print(f" [INFO] Successfully load Units Indexer from {index_pkl_path}.")
def __call__(self, units_t, spk_id=1, ratio=1):
if self.spk_id != spk_id:
self.load(spk_id=spk_id)
if self.active:
units = units_t.squeeze().to('cpu').numpy()
# print(" [INFO] Starting feature retrieval...")
score, ix = self.model.search(units, k=8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(self.big_all_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
units = ratio * npy + (1 - ratio) * units
units_t = torch.from_numpy(units).unsqueeze(0).float().to(units_t.device)
# print(f" [INFO] End feature retrieval...Ratio is {ratio}.")
return units_t
else:
print(f" [WARNING] Units Indexer is not active, disable units index.")
return units_t
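# Editor's usage sketch (not part of the original module, never called): retrieval-based
# smoothing of encoder units. `units_t` is a torch tensor shaped (1, n_frames, feat_dim);
# ratio=1 replaces each frame with its weighted nearest neighbours from the index,
# ratio=0 leaves the units untouched. The experiment path is a placeholder.
def _sketch_units_retrieval(units_t, exp_path='exp/my_model', spk_id=1, ratio=0.5):
    indexer = UnitsIndexer(exp_path)
    return indexer(units_t, spk_id=spk_id, ratio=ratio)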

View File

@ -0,0 +1,258 @@
import numpy as np
from typing import Any
import math
import torch
import torch.nn.functional as F
from torch.cuda.amp import autocast
from Exceptions import (
DeviceCannotSupportHalfPrecisionException,
DeviceChangingException,
HalfPrecisionChangingException,
NotEnoughDataExtimateF0,
)
from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class Pipeline(object):
embedder: Embedder
inferencer: Inferencer
pitchExtractor: PitchExtractor
index: Any | None
big_npy: Any | None
# feature: Any | None
targetSR: int
device: torch.device
isHalf: bool
def __init__(
self,
embedder: Embedder,
inferencer: Inferencer,
pitchExtractor: PitchExtractor,
index: Any | None,
# feature: Any | None,
targetSR,
device,
isHalf,
):
self.embedder = embedder
self.inferencer = inferencer
self.pitchExtractor = pitchExtractor
print("GENERATE INFERENCER", self.inferencer)
print("GENERATE EMBEDDER", self.embedder)
print("GENERATE PITCH EXTRACTOR", self.pitchExtractor)
self.index = index
self.big_npy = index.reconstruct_n(0, index.ntotal) if index is not None else None
# self.feature = feature
self.targetSR = targetSR
self.device = device
self.isHalf = isHalf
self.sr = 16000
self.window = 160
def getPipelineInfo(self):
inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
embedderInfo = self.embedder.getEmbedderInfo()
pitchExtractorInfo = self.pitchExtractor.getPitchExtractorInfo()
return {"inferencer": inferencerInfo, "embedder": embedderInfo, "pitchExtractor": pitchExtractorInfo, "isHalf": self.isHalf}
def setPitchExtractor(self, pitchExtractor: PitchExtractor):
self.pitchExtractor = pitchExtractor
def exec(
self,
sid,
audio, # torch.tensor [n]
pitchf, # np.array [m]
feature, # np.array [m, feat]
f0_up_key,
index_rate,
if_f0,
silence_front,
embOutputLayer,
useFinalProj,
repeat,
protect=0.5,
out_size=None,
):
# The input arrives at a 16000 Hz sampling rate; everything below is processed at 16 kHz.
search_index = self.index is not None and self.big_npy is not None and index_rate != 0
# self.t_pad = self.sr * repeat  # 1 second
# self.t_pad_tgt = self.targetSR * repeat  # 1 second; trimmed at output time (output uses the model's sampling rate)
audio = audio.unsqueeze(0)
quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr  # the reflect padding must be smaller than the input size
self.t_pad = round(self.sr * quality_padding_sec)  # audio added before and after the input
self.t_pad_tgt = round(self.targetSR * quality_padding_sec)  # audio added before and after; trimmed again at output time (output uses the model's sampling rate)
audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
p_len = audio_pad.shape[0] // self.window
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
# When RVC Quality is on, turn silence_front off.
silence_front = silence_front if repeat == 0 else 0
pitchf = pitchf if repeat == 0 else np.zeros(p_len)
out_size = out_size if repeat == 0 else None
# pitch detection
try:
if if_f0 == 1:
pitch, pitchf = self.pitchExtractor.extract(
audio_pad,
pitchf,
f0_up_key,
self.sr,
self.window,
silence_front=silence_front,
)
# pitch = pitch[:p_len]
# pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
else:
pitch = None
pitchf = None
except IndexError:
# print(e)
raise NotEnoughDataExtimateF0()
# adjust tensor shape and dtype
feats = audio_pad
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
# embedding
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
with autocast(enabled=self.isHalf):
try:
feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
if torch.isnan(feats).all():
raise DeviceCannotSupportHalfPrecisionException()
except RuntimeError as e:
if "HALF" in e.__str__().upper():
raise HalfPrecisionChangingException()
elif "same device" in e.__str__():
raise DeviceChangingException()
else:
raise e
if protect < 0.5 and search_index:
feats0 = feats.clone()
# Index - feature retrieval
# if self.index is not None and self.feature is not None and index_rate != 0:
if search_index:
npy = feats[0].cpu().numpy()
# apply the silence_front offset for the index search
npyOffset = math.floor(silence_front * 16000) // 360
npy = npy[npyOffset:]
if self.isHalf is True:
npy = npy.astype("float32")
# TODO: make k configurable
k = 1
if k == 1:
_, ix = self.index.search(npy, 1)
npy = self.big_npy[ix.squeeze()]
else:
score, ix = self.index.search(npy, k=8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
# recover the silent front
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:]
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if protect < 0.5 and search_index:
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
# adjust pitch length
p_len = audio_pad.shape[0] // self.window
if feats.shape[1] < p_len:
p_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, :p_len]
pitchf = pitchf[:, :p_len]
feats_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
# When pitch estimation fails (pitchf == 0), blend in the pre-retrieval features.
# The way pitchff is built is questionable, but it follows the upstream implementation, so it is kept as-is.
# https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
if protect < 0.5 and search_index:
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1
pitchff[pitchf < 1] = protect
pitchff = pitchff.unsqueeze(-1)
feats = feats * pitchff + feats0 * (1 - pitchff)
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
# apply silent front for inference
if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
npyOffset = math.floor(silence_front * 16000) // 360
feats = feats[:, npyOffset * 2 :, :] # NOQA
feats_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
# run inference
try:
with torch.no_grad():
with autocast(enabled=self.isHalf):
audio1 = (
torch.clip(
self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
-1.0,
1.0,
)
* 32767.5
).data.to(dtype=torch.int16)
except RuntimeError as e:
if "HALF" in e.__str__().upper():
print("11", e)
raise HalfPrecisionChangingException()
else:
raise e
feats_buffer = feats.squeeze(0).detach().cpu()
if pitchf is not None:
pitchf_buffer = pitchf.squeeze(0).detach().cpu()
else:
pitchf_buffer = None
del p_len, padding_mask, pitch, pitchf, feats
torch.cuda.empty_cache()
# The sampling rate of the infer output is the model's sampling rate.
# The pipeline input is 16 kHz for HuBERT.
if self.t_pad_tgt != 0:
offset = self.t_pad_tgt
end = -1 * self.t_pad_tgt
audio1 = audio1[offset:end]
del sid
torch.cuda.empty_cache()
return audio1, pitchf_buffer, feats_buffer
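# Editor's usage sketch (not part of the original module, never called): one inference pass
# over a 16 kHz chunk. `pipeline` is an already constructed Pipeline; `audio16k` is a float
# torch tensor of 16 kHz samples on pipeline.device; `pitchf` / `feature` are the buffers
# returned by the previous call (or zero-filled initial buffers). The values below are
# placeholders, not recommended settings.
def _sketch_pipeline_exec(pipeline, audio16k, pitchf, feature):
    audio_out, pitchf_buf, feats_buf = pipeline.exec(
        sid=0,               # target speaker id
        audio=audio16k,
        pitchf=pitchf,
        feature=feature,
        f0_up_key=0,         # pitch shift in semitones
        index_rate=0.0,      # 0 disables faiss feature retrieval
        if_f0=1,             # 1: pitch-conditioned model
        silence_front=0.0,
        embOutputLayer=12,
        useFinalProj=False,
        repeat=0,
        protect=0.5,
    )
    return audio_out, pitchf_buf, feats_buf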

View File

@ -0,0 +1,51 @@
import os
import traceback
import faiss
from data.ModelSlot import DiffusionSVCModelSlot, RVCModelSlot
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
from voice_changer.RVC.pipeline.Pipeline import Pipeline
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
dev = DeviceManager.get_instance().getDevice(gpu)
half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
# # Inferencer creation
# try:
# inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelSlot.modelFile, gpu)
# except Exception as e:
# print("[Voice Changer] exception! loading inferencer", e)
# traceback.print_exc()
# # Embedder creation
# try:
# embedder = EmbedderManager.getEmbedder(
# modelSlot.embedder,
# # emmbedderFilename,
# half,
# dev,
# )
# except Exception as e:
# print("[Voice Changer] exception! loading embedder", e)
# traceback.print_exc()
# # pitchExtractor
# pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)
# pipeline = Pipeline(
# embedder,
# inferencer,
# pitchExtractor,
# index,
# modelSlot.samplingRate,
# dev,
# half,
# )
# return pipeline
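# NOTE (editor): the pipeline construction above is still commented out, so createPipeline
# currently only resolves the target device and half-precision flag and returns None; the
# commented block mirrors the RVC pipeline setup.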

View File

@ -1,6 +1,6 @@
from typing import Any, Union, cast
from const import TMP_DIR, ModelType
from const import TMP_DIR
import torch
import os
import traceback
@ -71,7 +71,6 @@ class VoiceChanger:
self.crossfadeSize = 0 # calculated
self.voiceChanger: VoiceChangerModel | None = None
self.modelType: ModelType | None = None
self.params = params
self.gpu_num = torch.cuda.device_count()
self.prev_audio = np.zeros(4096)
@ -84,10 +83,7 @@ class VoiceChanger:
self.voiceChanger = model
def getModelType(self):
if self.modelType is not None:
return {"status": "OK", "vc": self.modelType}
else:
return {"status": "OK", "vc": "none"}
return {"status": "OK", "vc": "-----"}
def get_info(self):
data = asdict(self.settings)

View File

@ -9,7 +9,7 @@ from voice_changer.Local.ServerDevice import ServerDevice, ServerDeviceCallbacks
from voice_changer.ModelSlotManager import ModelSlotManager
from voice_changer.RVC.RVCModelMerger import RVCModelMerger
from voice_changer.VoiceChanger import VoiceChanger
from const import STORED_SETTING_FILE, UPLOAD_DIR, ModelType
from const import STORED_SETTING_FILE, UPLOAD_DIR
from voice_changer.utils.LoadModelParams import LoadModelParamFile, LoadModelParams
from voice_changer.utils.ModelMerger import MergeElement, ModelMergerRequest
from voice_changer.utils.VoiceChangerModel import AudioInOut
@ -165,6 +165,11 @@ class VoiceChangerManager(ServerDeviceCallbacks):
slotInfo = DDSP_SVCModelSlotGenerator.loadModel(params)
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
elif params.voiceChangerType == "Diffusion-SVC":
from voice_changer.DiffusionSVC.DiffusionSVCModelSlotGenerator import DiffusionSVCModelSlotGenerator
slotInfo = DiffusionSVCModelSlotGenerator.loadModel(params)
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
print("params", params)
def get_info(self):
@ -232,6 +237,13 @@ class VoiceChangerManager(ServerDeviceCallbacks):
self.voiceChangerModel = DDSP_SVC(self.params, slotInfo)
self.voiceChanger = VoiceChanger(self.params)
self.voiceChanger.setModel(self.voiceChangerModel)
elif slotInfo.voiceChangerType == "Diffusion-SVC":
print("................Diffusion-SVC")
from voice_changer.DiffusionSVC.DiffusionSVC import DiffusionSVC
self.voiceChangerModel = DiffusionSVC(self.params, slotInfo)
self.voiceChanger = VoiceChanger(self.params)
self.voiceChanger.setModel(self.voiceChangerModel)
else:
print(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")
if hasattr(self, "voiceChangerModel"):
@ -267,9 +279,6 @@ class VoiceChangerManager(ServerDeviceCallbacks):
print("Voice Change is not loaded. Did you load a correct model?")
return np.zeros(1).astype(np.int16), []
def switchModelType(self, modelType: ModelType):
return self.voiceChanger.switchModelType(modelType)
def getModelType(self):
return self.voiceChanger.getModelType()

View File

@ -18,6 +18,7 @@ LoadModelParamFileKind: TypeAlias = Literal[
"ddspSvcModelConfig",
"ddspSvcDiffusion",
"ddspSvcDiffusionConfig",
"diffusionSVCModel",
]