WIP: diffusion svc rt badf0

This commit is contained in:
w-okada 2023-07-14 03:33:04 +09:00
parent 9c829ac91a
commit 5bf1202215
19 changed files with 1709 additions and 384 deletions

View File

@ -66,6 +66,11 @@ class EnumInferenceTypes(Enum):
onnxRVCNono = "onnxRVCNono"
DiffusionSVCInferenceType: TypeAlias = Literal[
"combo",
]
PitchExtractorType: TypeAlias = Literal[
"harvest",
"dio",

View File

@ -1,5 +1,5 @@
from typing import TypeAlias, Union
from const import MAX_SLOT_NUM, EnumInferenceTypes, EmbedderType, VoiceChangerType
from const import MAX_SLOT_NUM, DiffusionSVCInferenceType, EnumInferenceTypes, EmbedderType, VoiceChangerType
from dataclasses import dataclass, asdict, field
@ -107,7 +107,7 @@ class DiffusionSVCModelSlot(ModelSlot):
voiceChangerType: VoiceChangerType = "Diffusion-SVC"
modelFile: str = ""
isONNX: bool = False
modelType: str = "combo"
modelType: DiffusionSVCInferenceType = "combo"
dstId: int = 1
sampleId: str = ""
@ -115,6 +115,8 @@ class DiffusionSVCModelSlot(ModelSlot):
kstep: int = 100
speakers: dict = field(default_factory=lambda: {1: "user"})
embedder: EmbedderType = "hubert_base"
samplingRate: int = 44100
embChannels: int = 768
ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot]
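For reference, a minimal sketch of how the new DiffusionSVCModelSlot could be populated and serialized, assuming the dataclass fields shown above; the model file path is a placeholder, not a file from this commit:

# Sketch: build a slot for a combo Diffusion-SVC checkpoint and dump it as a dict.
slot = DiffusionSVCModelSlot(
    modelFile="model_dir/some_model.ptc",  # placeholder path
    dstId=1,
    kstep=100,
    samplingRate=44100,
    embChannels=768,
)
print(asdict(slot))  # asdict comes from the dataclasses import above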

View File

@ -52,7 +52,7 @@ class DiffusionSVC(VoiceChangerModel):
print("[Voice Changer] [DiffusionSVC] Initializing... done")
def update_settings(self, key: str, val: int | float | str):
print("[Voice Changer][RVC]: update_settings", key, val)
print("[Voice Changer][DiffusionSVC]: update_settings", key, val)
if key in self.settings.intData:
setattr(self.settings, key, int(val))
if key == "gpu":
@ -86,18 +86,17 @@ class DiffusionSVC(VoiceChangerModel):
crossfadeSize: int,
solaSearchFrame: int = 0,
):
newData = newData.astype(np.float32) / 32768.0 # Incoming audio is at the RVC model's sampling rate. extraDataLength, crossfade, etc. are processed at the same SR (★1)
newData = newData.astype(np.float32) / 32768.0 # Incoming audio is at the DiffusionSVC model's sampling rate. extraDataLength, crossfade, etc. are processed at the same SR (★1)
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate # 100 comes from hubert's hop size (16000 / 160)
if self.audio_buffer is not None:
# concatenate with the past data
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
if self.slotInfo.f0:
self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
print("^^^self.feature_buffer.shape, self.slotInfo.embChannels",self.feature_buffer.shape, self.slotInfo.embChannels)
self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
else:
self.audio_buffer = newData
if self.slotInfo.f0:
self.pitchf_buffer = np.zeros(new_feature_length)
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
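The factor of 100 above converts samples at the model's sampling rate into 16 kHz feature frames (hubert uses a 160-sample hop at 16 kHz, i.e. 100 frames per second). A worked example of the same arithmetic, with an assumed 44100 Hz model:

# Sketch of the buffer-length arithmetic (values are illustrative).
sampling_rate = 44100                      # model sampling rate of the incoming chunk
chunk_samples = 22050                      # half a second of audio
feature_frames = chunk_samples * 100 // sampling_rate
print(feature_frames)                      # -> 50, i.e. 100 feature frames per second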
@ -110,14 +109,12 @@ class DiffusionSVC(VoiceChangerModel):
# if the buffer has not accumulated enough, pad with zeros
if self.audio_buffer.shape[0] < convertSize:
self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
if self.slotInfo.f0:
self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])
convertOffset = -1 * convertSize
featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
self.audio_buffer = self.audio_buffer[convertOffset:] # extract only the portion to be converted
if self.slotInfo.f0:
self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
self.feature_buffer = self.feature_buffer[featureOffset:]
@ -145,18 +142,18 @@ class DiffusionSVC(VoiceChangerModel):
if self.pipeline is not None:
device = self.pipeline.device
else:
device = torch.device("cpu")
device = torch.device("cpu") # TODO:pipelineが存在しない場合はzeroを返してもいいかも(要確認)。
audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
repeat = 1 if self.settings.rvcQuality else 0
repeat = 0
sid = self.settings.dstId
f0_up_key = self.settings.tran
index_rate = self.settings.indexRatio
protect = self.settings.protect
index_rate = 0
protect = 0
if_f0 = 1 if self.slotInfo.f0 else 0
embOutputLayer = self.slotInfo.embOutputLayer
useFinalProj = self.slotInfo.useFinalProj
if_f0 = 1
embOutputLayer = 12
useFinalProj = False
try:
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
@ -167,14 +164,17 @@ class DiffusionSVC(VoiceChangerModel):
f0_up_key,
index_rate,
if_f0,
self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # duration in seconds of extraDataSize. Processed at the RVC model's sampling rate (★1).
self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # duration in seconds of extraConvertSize (already resampled to the model's sampling rate). Processed at the model's sampling rate (★1).
embOutputLayer,
useFinalProj,
repeat,
protect,
outSize
)
result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
# result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
result = audio_out.detach().cpu().numpy()
print("RESULT", result)
return result
except DeviceCannotSupportHalfPrecisionException as e: # NOQA

View File

@ -21,6 +21,7 @@ class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
slotInfo.iconFile = "/assets/icons/noimage.png"
slotInfo.embChannels = 768
# if slotInfo.isONNX:
# slotInfo = cls._setInfoByONNX(slotInfo)

View File

@ -1,35 +1,134 @@
import numpy as np
import torch
from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
class RVCInferencer(Inferencer):
class DiffusionSVCInferencer(Inferencer):
def __init__(self):
self.diff_model: Unit2Mel | None = None
self.naive_model: Unit2MelNaive | None = None
self.vocoder: Vocoder | None = None
def loadModel(self, file: str, gpu: int):
self.setProps("DiffusionSVCCombo", file, True, gpu)
dev = DeviceManager.get_instance().getDevice(gpu)
isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
self.dev = DeviceManager.get_instance().getDevice(gpu)
# isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
cpt = torch.load(file, map_location="cpu")
model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
self.diff_model = diff_model
self.naive_model = naive_model
self.vocoder = vocoder
self.diff_args = diff_args
print("-----------------> diff_args", diff_args)
print("-----------------> naive_args", naive_args)
model.eval()
model.load_state_dict(cpt["weight"], strict=False)
# cpt = torch.load(file, map_location="cpu")
# model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
model = model.to(dev)
if isHalf:
model = model.half()
# model.eval()
# model.load_state_dict(cpt["weight"], strict=False)
self.model = model
# model = model.to(dev)
# if isHalf:
# model = model.half()
# self.model = model
return self
def getConfig(self) -> tuple[int, int]:
model_sampling_rate = int(self.diff_args.data.sampling_rate)
model_block_size = int(self.diff_args.data.block_size)
return model_block_size, model_sampling_rate
@torch.no_grad() # most basic inference code: normalizes inputs into tensors and deals only with mel
def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None):
if self.diff_args.model.k_step_max is not None:
if k_step is None:
raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
if k_step > int(self.diff_args.model.k_step_max):
raise ValueError("k_step must <= k_step_max of Shallow Diffusion Model")
if gt_spec is None:
raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from "
"input mel or output of naive model")
print(f' [INFO] k_step_max is {self.diff_args.model.k_step_max}.')
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
# spk_id
spk_emb_dict = None
if self.diff_args.model.use_speaker_encoder: # with speaker encoder
spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
# without speaker encoder
else:
spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
if k_step is not None:
print(f' [INFO] get k_step, do shallow diffusion {k_step} steps')
else:
print(f' [INFO] Do full 1000 steps depth diffusion {k_step}')
print(f" [INFO] method:{method}; infer_speedup:{infer_speedup}")
return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
@torch.no_grad()
def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
aug_shift=0, spk_emb=None):
# spk_id
spk_emb_dict = None
if self.diff_args.model.use_speaker_encoder: # with speaker encoder
spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
# without speaker encoder
else:
spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
print("====> unit, f0, vol", units.shape, f0.shape, volume.shape)
print("====> *unit, f0, vol", units)
print("====> unit, *f0, vol", f0)
print("====> unit, f0, *vol", volume)
out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, infer=True,
spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
return out_spec
@torch.no_grad()
def mel2wav(self, mel, f0, start_frame=0):
if start_frame == 0:
return self.vocoder.infer(mel, f0)
else: # for realtime speedup
mel = mel[:, start_frame:, :]
f0 = f0[:, start_frame:, :]
out_wav = self.vocoder.infer(mel, f0)
return torch.nn.functional.pad(out_wav, (start_frame * self.vocoder.vocoder_hop_size, 0))
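When start_frame is non-zero, only the tail of the mel is vocoded and the waveform is left-padded so its length matches a full-length render. A small numeric sketch of that padding, assuming a vocoder hop size of 512:

# Sketch: left-pad a trimmed waveform back to full length (hop size is an assumption).
import torch
import torch.nn.functional as F

hop_size = 512
start_frame = 8
full = torch.zeros(1, 100 * hop_size)      # stand-in for a full-length render
tail = full[:, start_frame * hop_size:]    # what the vocoder actually produced
padded = F.pad(tail, (start_frame * hop_size, 0))
print(padded.shape == full.shape)          # -> True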
@torch.no_grad()
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor,
pitchf: torch.Tensor,
volume: torch.Tensor,
mask: torch.Tensor,
sid: torch.Tensor,
convert_length: int | None,
infer_speedup: int,
k_step: int,
silence_front: float,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
print("---------------------------------shape", feats.shape, pitch.shape, volume.shape)
gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
print("======================>>>>>gt_spec", gt_spec)
out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
print("======================>>>>>out_mel", out_mel)
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
print("======================>>>>>out_wav.shape, mask.shape", out_wav.shape, mask.shape)
out_wav *= mask
print("out_wav:::::::::::", out_wav)
return out_wav.squeeze()
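The start_frame above converts the silent leading duration into vocoder frames before trimming. A worked example of that conversion, assuming a 44100 Hz vocoder with a 512-sample hop:

# Sketch of the silence_front -> start_frame conversion (rates are illustrative).
vocoder_sample_rate = 44100
vocoder_hop_size = 512
silence_front = 0.1                        # seconds of leading silence to skip
start_frame = int(silence_front * vocoder_sample_rate / vocoder_hop_size)
print(start_frame)                         # -> 8 frames skipped before vocoding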

View File

@ -0,0 +1,50 @@
from typing import Any, Protocol
import torch
import onnxruntime
from const import DiffusionSVCInferenceType
class Inferencer(Protocol):
inferencerType: DiffusionSVCInferenceType = "combo"
file: str
isHalf: bool = True
gpu: int = 0
model: onnxruntime.InferenceSession | Any | None = None
def loadModel(self, file: str, gpu: int):
...
def getConfig(self) -> tuple[int, int]:
...
def infer(
self,
feats: torch.Tensor,
pitch_length: torch.Tensor,
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
) -> torch.Tensor:
...
def setProps(
self,
inferencerType: DiffusionSVCInferenceType,
file: str,
isHalf: bool,
gpu: int,
):
self.inferencerType = inferencerType
self.file = file
self.isHalf = isHalf
self.gpu = gpu
def getInferencerInfo(self):
return {
"inferencerType": self.inferencerType,
"file": self.file,
"isHalf": self.isHalf,
"gpu": self.gpu,
}
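Because Inferencer is a Protocol with default bodies for setProps and getInferencerInfo, a concrete class only has to supply loadModel, getConfig and infer. A minimal conforming stub, with illustrative return values:

# Sketch: a stub inferencer that subclasses the Protocol and reuses its defaults.
class DummyInferencer(Inferencer):
    def loadModel(self, file: str, gpu: int):
        self.setProps("combo", file, False, gpu)   # "combo" is the only DiffusionSVCInferenceType so far
        return self

    def getConfig(self) -> tuple[int, int]:
        return 512, 44100                          # (block_size, sampling_rate); values are illustrative

    def infer(self, feats, pitch_length, pitch, pitchf, sid):
        return feats                               # passthrough stand-in for a real model call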

View File

@ -0,0 +1,29 @@
from const import DiffusionSVCInferenceType
from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer
from voice_changer.RVC.inferencer.Inferencer import Inferencer
class InferencerManager:
currentInferencer: Inferencer | None = None
@classmethod
def getInferencer(
cls,
inferencerType: DiffusionSVCInferenceType,
file: str,
gpu: int,
) -> Inferencer:
cls.currentInferencer = cls.loadInferencer(inferencerType, file, gpu)
return cls.currentInferencer
@classmethod
def loadInferencer(
cls,
inferencerType: DiffusionSVCInferenceType,
file: str,
gpu: int,
) -> Inferencer:
if inferencerType == "combo":
return DiffusionSVCInferencer().loadModel(file, gpu)
else:
raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
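A minimal usage sketch for the manager above; the checkpoint path and GPU index are placeholders:

# Sketch: resolve a combo checkpoint to an inferencer and read its config.
inferencer = InferencerManager.getInferencer("combo", "model_dir/some_model.ptc", 0)
block_size, sampling_rate = inferencer.getConfig()
print(block_size, sampling_rate)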

View File

@ -38,8 +38,6 @@ class DiffusionSVC:
self.use_combo_model = False
def load_model(self, model_path, f0_model=None, f0_min=None, f0_max=None):
if ('1234' + model_path)[-4:] == '.ptc':
self.use_combo_model = True
self.model_path = model_path
self.naive_model_path = model_path
@ -50,9 +48,6 @@ class DiffusionSVC:
self.naive_model = naive_model
self.naive_model_args = naive_args
self.vocoder = vocoder
else:
self.model_path = model_path
self.model, self.vocoder, self.args = load_model_vocoder(model_path, device=self.device)
self.units_encoder = Units_Encoder(
self.args.data.encoder,
@ -85,33 +80,6 @@ class DiffusionSVC:
self.units_indexer = UnitsIndexer(os.path.split(model_path)[0])
def flush(self, model_path=None, f0_model=None, f0_min=None, f0_max=None, naive_model_path=None):
assert (model_path is not None) or (naive_model_path is not None)
# flush model if changed
if ((self.model_path != model_path) or (self.f0_model != f0_model)
or (self.f0_min != f0_min) or (self.f0_max != f0_max)):
self.load_model(model_path, f0_model=f0_model, f0_min=f0_min, f0_max=f0_max)
if (self.naive_model_path != naive_model_path) and (naive_model_path is not None):
self.load_naive_model(naive_model_path)
# check args if use naive
if self.naive_model is not None:
if self.naive_model_args.data.encoder != self.args.data.encoder:
raise ValueError("encoder of Naive Model and Diffusion Model are different")
if self.naive_model_args.model.n_spk != self.args.model.n_spk:
raise ValueError("n_spk of Naive Model and Diffusion Model are different")
if bool(self.naive_model_args.model.use_speaker_encoder) != bool(self.args.model.use_speaker_encoder):
raise ValueError("use_speaker_encoder of Naive Model and Diffusion Model are different")
if self.naive_model_args.vocoder.type != self.args.vocoder.type:
raise ValueError("vocoder of Naive Model and Diffusion Model are different")
if self.naive_model_args.data.block_size != self.args.data.block_size:
raise ValueError("block_size of Naive Model and Diffusion Model are different")
if self.naive_model_args.data.sampling_rate != self.args.data.sampling_rate:
raise ValueError("sampling_rate of Naive Model and Diffusion Model are different")
def flush_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
if (f0_model != self.f0_model) and (f0_model is not None):
self.load_f0_extractor(f0_model)
def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
self.f0_model = f0_model if (f0_model is not None) else self.args.data.f0_extractor
self.f0_min = f0_min if (f0_min is not None) else self.args.data.f0_min
@ -127,12 +95,6 @@ class DiffusionSVC:
model_sampling_rate=self.args.data.sampling_rate
)
def load_naive_model(self, naive_model_path):
self.naive_model_path = naive_model_path
model, _, args = load_model_vocoder(naive_model_path, device=self.device, loaded_vocoder=self.vocoder)
self.naive_model = model
self.naive_model_args = args
print(f" [INFO] Load naive model from {naive_model_path}")
@torch.no_grad()
def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
@ -265,144 +227,6 @@ class DiffusionSVC:
gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
@torch.no_grad() # unlike __call__, this also runs the vocoder and outputs a waveform
def infer(self, units, f0, volume, gt_spec=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None):
if k_step is not None:
if self.naive_model is not None:
gt_spec = self.naive_model_call(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, spk_emb=spk_emb)
print(f" [INFO] get mel from naive model out.")
assert gt_spec is not None
if self.naive_model is None:
print(f" [INFO] get mel from input wav.")
if input(" [WARN] You are attempting shallow diffusion "
"on the mel of the input source,"
" Please enter 'gt_mel' to continue") != 'gt_mel':
raise ValueError("Please understand what you're doing")
k_step = int(k_step)
gt_spec = gt_spec
else:
gt_spec = None
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
return self.mel2wav(out_mel, f0)
@torch.no_grad() # inference code optimized for realtime shallow diffusion; padding can be trimmed to save compute
def infer_for_realtime(self, units, f0, volume, audio_t=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None, silence_front=0, diff_jump_silence_front=False):
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
if diff_jump_silence_front:
if audio_t is not None:
audio_t = audio_t[:, start_frame * self.vocoder.vocoder_hop_size:]
f0 = f0[:, start_frame:, :]
units = units[:, start_frame:, :]
volume = volume[:, start_frame:, :]
if k_step is not None:
assert audio_t is not None
k_step = int(k_step)
gt_spec = self.vocoder.extract(audio_t, self.args.data.sampling_rate)
# if frames are missing, re-enable this line: gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
else:
gt_spec = None
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
if diff_jump_silence_front:
out_wav = self.mel2wav(out_mel, f0)
else:
out_wav = self.mel2wav(out_mel, f0, start_frame=start_frame)
return out_wav
@torch.no_grad() # inference from audio without slicing
def infer_from_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None, threhold=-60, index_ratio=0):
units = self.encode_units(audio, sr)
if index_ratio > 0:
units = self.units_indexer(units_t=units, spk_id=spk_id, ratio=index_ratio)
f0 = self.extract_f0(audio, key=key, sr=sr)
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
if k_step is not None:
assert 0 < int(k_step) <= 1000
k_step = int(k_step)
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
gt_spec = self.vocoder.extract(audio_t, sr)
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
else:
gt_spec = None
output = self.infer(units, f0, volume, gt_spec=gt_spec, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
output *= mask
return output.squeeze().cpu().numpy(), self.args.data.sampling_rate
@torch.no_grad() # inference from audio with slicing
def infer_from_long_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
spk_emb=None,
threhold=-60, threhold_for_split=-40, min_len=5000, index_ratio=0):
hop_size = self.args.data.block_size * sr / self.args.data.sampling_rate
segments = split(audio, sr, hop_size, db_thresh=threhold_for_split, min_len=min_len)
print(f' [INFO] Extract f0 volume and mask: Use {self.f0_model}, start...')
_f0_start_time = time.time()
f0 = self.extract_f0(audio, key=key, sr=sr)
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
_f0_end_time = time.time()
_f0_used_time = _f0_end_time - _f0_start_time
print(f' [INFO] Extract f0 volume and mask: Done. Use time:{_f0_used_time}')
if k_step is not None:
assert 0 < int(k_step) <= 1000
k_step = int(k_step)
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
gt_spec = self.vocoder.extract(audio_t, sr)
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
else:
gt_spec = None
result = np.zeros(0)
current_length = 0
for segment in tqdm(segments):
start_frame = segment[0]
seg_input = torch.from_numpy(segment[1]).float().unsqueeze(0).to(self.device)
seg_units = self.units_encoder.encode(seg_input, sr, hop_size)
if index_ratio > 0:
seg_units = self.units_indexer(units_t=seg_units, spk_id=spk_id, ratio=index_ratio)
seg_f0 = f0[:, start_frame: start_frame + seg_units.size(1), :]
seg_volume = volume[:, start_frame: start_frame + seg_units.size(1), :]
if gt_spec is not None:
seg_gt_spec = gt_spec[:, start_frame: start_frame + seg_units.size(1), :]
else:
seg_gt_spec = None
seg_output = self.infer(seg_units, seg_f0, seg_volume, gt_spec=seg_gt_spec, spk_id=spk_id,
spk_mix_dict=spk_mix_dict,
aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
use_tqdm=use_tqdm, spk_emb=spk_emb)
_left = start_frame * self.args.data.block_size
_right = (start_frame + seg_units.size(1)) * self.args.data.block_size
seg_output *= mask[:, _left:_right]
seg_output = seg_output.squeeze().cpu().numpy()
silent_length = round(start_frame * self.args.data.block_size) - current_length
if silent_length >= 0:
result = np.append(result, np.zeros(silent_length))
result = np.append(result, seg_output)
else:
result = cross_fade(result, seg_output, current_length + silent_length)
current_length = current_length + silent_length + len(seg_output)
return result, self.args.data.sampling_rate
@torch.no_grad() # inference code optimized for realtime; padding can be trimmed to save compute
def infer_from_audio_for_realtime(self, audio, sr, key, spk_id=1, spk_mix_dict=None, aug_shift=0,

View File

@ -252,7 +252,7 @@ class GaussianDiffusion(nn.Module):
if method is not None and infer_speedup > 1:
if method == 'dpm-solver':
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
# 1. Define the noise schedule.
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])

View File

@ -56,6 +56,7 @@ class Unit2MelNaive(nn.Module):
residual_dropout=0.1,
attention_dropout=0.1)
else:
print("[[[[[PCmer]]]]]")
self.decoder = PCmer(
num_layers=n_layers,
num_heads=8,
@ -83,6 +84,7 @@ class Unit2MelNaive(nn.Module):
'''
x = self.stack(units.transpose(1, 2)).transpose(1, 2)
x = x + self.f0_embed((1 + f0 / 700).log()) + self.volume_embed(volume)
print("-----------------x1>", x)
if self.use_speaker_encoder:
if spk_mix_dict is not None:
assert spk_emb_dict is not None
@ -104,9 +106,13 @@ class Unit2MelNaive(nn.Module):
if self.aug_shift_embed is not None and aug_shift is not None:
x = x + self.aug_shift_embed(aug_shift / 5)
print("-----------------x2>", x)
x = self.decoder(x)
print("-----------------x3>", x)
x = self.norm(x)
print("-----------------x4>", x)
x = self.dense_out(x)
print("-----------------x5>", x)
if not infer:
x = F.mse_loss(x, gt_spec)
if self.l2reg_loss > 0:

View File

@ -94,9 +94,12 @@ class PCmer(nn.Module):
def forward(self, phone, mask=None):
# apply all layers to the input
print("[[[[[PCmer]]]]1]", phone, mask)
for (i, layer) in enumerate(self._layers):
phone = layer(phone, mask)
# print("[[[[[PCmer]]]] 2 ]", phone)
# provide the final sequence
print("[[[[[PCmer]]]]3]", phone)
return phone
@ -136,9 +139,13 @@ class _EncoderLayer(nn.Module):
def forward(self, phone, mask=None):
# compute attention sub-layer
print("Phone:::::1:", phone)
print("Phone:::::16:", self.norm(phone))
phone = phone + (self.attn(self.norm(phone), mask=mask))
print("Phone:::::2:", phone)
phone = phone + (self.conformer(phone))
print("Phone:::::3:", phone)
return phone

View File

@ -3,10 +3,10 @@ import yaml
import torch
import torch.nn as nn
import numpy as np
from .diffusion import GaussianDiffusion
from .wavenet import WaveNet
from .vocoder import Vocoder
from .naive.naive import Unit2MelNaive
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.diffusion import GaussianDiffusion
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.wavenet import WaveNet
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
class DotDict(dict):

View File

@ -1,6 +1,6 @@
import torch
from nsf_hifigan.nvSTFT import STFT
from nsf_hifigan.models import load_model, load_config
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.nsf_hifigan.nvSTFT import STFT
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.nsf_hifigan.models import load_model, load_config
from torchaudio.transforms import Resample

View File

@ -1,7 +1,7 @@
import numpy as np
import torch
import torch.nn.functional as F
import torch.nn as nn
import pyworld as pw
import parselmouth
import torchcrepe
@ -789,15 +789,6 @@ def median_pool_1d(x, kernel_size):
x, _ = torch.sort(x, dim=-1)
return x[:, :, (kernel_size - 1) // 2]
def upsample(signal, factor):
signal = signal.permute(0, 2, 1)
signal = nn.functional.interpolate(torch.cat((signal, signal[:, :, -1:]), 2), size=signal.shape[-1] * factor + 1,
mode='linear', align_corners=True)
signal = signal[:, :, :-1]
return signal.permute(0, 2, 1)
def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
result = np.zeros(idx + b.shape[0])
fade_len = a.shape[0] - idx
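cross_fade is only shown partially in this hunk. A standalone sketch of the same kind of overlap-add, assuming a simple linear ramp (the repository's exact weighting may differ):

# Sketch: overlap-add two chunks, fading from a to b starting at sample idx of a.
import numpy as np

def linear_cross_fade(a: np.ndarray, b: np.ndarray, idx: int) -> np.ndarray:
    result = np.zeros(idx + b.shape[0])
    fade_len = a.shape[0] - idx                    # length of the overlapping region
    result[:idx] = a[:idx]                         # untouched head of a
    ramp = np.linspace(0.0, 1.0, fade_len)         # assumption: linear fade weights
    result[idx:idx + fade_len] = (1.0 - ramp) * a[idx:] + ramp * b[:fade_len]
    result[idx + fade_len:] = b[fade_len:]         # remaining tail of b
    return result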

View File

@ -1,4 +1,3 @@
import numpy as np
from typing import Any
import math
import torch
@ -10,13 +9,14 @@ from Exceptions import (
HalfPrecisionChangingException,
NotEnoughDataExtimateF0,
)
from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.common.VolumeExtractor import VolumeExtractor
class Pipeline(object):
@ -37,29 +37,30 @@ class Pipeline(object):
embedder: Embedder,
inferencer: Inferencer,
pitchExtractor: PitchExtractor,
index: Any | None,
# feature: Any | None,
# index: Any | None,
targetSR,
device,
isHalf,
):
model_block_size, model_sampling_rate = inferencer.getConfig()
self.hop_size = model_block_size * 16000 / model_sampling_rate # 16000 is the audio sampling rate; the audio is already at 16 kHz at this point.
self.volumeExtractor = VolumeExtractor(self.hop_size, model_block_size, model_sampling_rate, audio_sampling_rate=16000)
self.embedder = embedder
self.inferencer = inferencer
self.pitchExtractor = pitchExtractor
print("GENERATE INFERENCER", self.inferencer)
print("GENERATE EMBEDDER", self.embedder)
print("GENERATE PITCH EXTRACTOR", self.pitchExtractor)
self.index = index
self.big_npy = index.reconstruct_n(0, index.ntotal) if index is not None else None
# self.feature = feature
self.targetSR = targetSR
self.device = device
self.isHalf = isHalf
# self.isHalf = isHalf
self.isHalf = False
self.sr = 16000
self.window = 160
# self.sr = 16000
# self.window = 160
def getPipelineInfo(self):
inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
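The hop_size above maps the model's block size onto 16 kHz input audio. A worked example with values a Diffusion-SVC combo model commonly uses (block size 512 at 44100 Hz, both assumptions here):

# Sketch of the hop_size arithmetic (block size and sampling rate are illustrative).
model_block_size = 512
model_sampling_rate = 44100
hop_size = model_block_size * 16000 / model_sampling_rate
print(hop_size)                            # -> ~185.76 samples of 16 kHz audio per model frame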
@ -70,6 +71,13 @@ class Pipeline(object):
def setPitchExtractor(self, pitchExtractor: PitchExtractor):
self.pitchExtractor = pitchExtractor
@torch.no_grad()
def extract_volume_and_mask(self, audio, threhold):
volume = self.volumeExtractor.extract(audio)
mask = self.volumeExtractor.get_mask_from_volume(volume, threhold=threhold, device=self.device)
volume = torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
return volume, mask
def exec(
self,
sid,
@ -87,56 +95,45 @@ class Pipeline(object):
out_size=None,
):
# Incoming audio is at a 16000 Hz sampling rate; everything from here on is processed at 16000 Hz.
search_index = self.index is not None and self.big_npy is not None and index_rate != 0
# self.t_pad = self.sr * repeat # 1 second
# self.t_pad_tgt = self.targetSR * repeat # 1 second; trimming at output time (output comes out at the model's sampling rate)
audio = audio.unsqueeze(0)
quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr # the reflect padding size must be smaller than the original size
self.t_pad = round(self.sr * quality_padding_sec) # add audio before and after
self.t_pad_tgt = round(self.targetSR * quality_padding_sec) # add audio before and after; trimming at output time (output comes out at the model's sampling rate)
self.t_pad = 0
audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
p_len = audio_pad.shape[0] // self.window
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
# When RVC Quality is on, turn silence_front off.
silence_front = silence_front if repeat == 0 else 0
pitchf = pitchf if repeat == 0 else np.zeros(p_len)
out_size = out_size if repeat == 0 else None
n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
print("--------------------> n_frames:", n_frames)
volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)
print("--------------------> volume:", volume.shape)
# pitch detection
try:
if if_f0 == 1:
pitch, pitchf = self.pitchExtractor.extract(
audio_pad,
pitchf,
f0_up_key,
self.sr,
self.window,
16000, # audio sampling rate (already 16000)
# int(self.hop_size), # processing window size (512 at 44100)
int(self.hop_size), # processing window size (512 at 44100)
silence_front=silence_front,
)
# pitch = pitch[:p_len]
# pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
else:
pitch = None
pitchf = None
except IndexError:
print("--------------------> pitch11111111111111111111111111111111:", pitch[1:], pitch.shape)
pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long() # the buffer is built assuming a 160-sample window size, so trim it
pitchf = torch.tensor(pitchf[-n_frames:], device=self.device, dtype=torch.float).unsqueeze(0) # the buffer is built assuming a 160-sample window size, so trim it
except IndexError as e:
print(e)
# print(e)
raise NotEnoughDataExtimateF0()
print("--------------------> pitch:", pitch, pitch.shape)
# adjust tensor types
feats = audio_pad
if feats.dim() == 2: # double channels
feats = feats.mean(-1)
assert feats.dim() == 1, feats.dim()
feats = feats.view(1, -1)
# embedding
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
with autocast(enabled=self.isHalf):
try:
feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
@ -149,74 +146,46 @@ class Pipeline(object):
raise DeviceChangingException()
else:
raise e
if protect < 0.5 and search_index:
print("--------------------> feats1:", feats, feats.shape)
# feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
if protect < 0.5:
feats0 = feats.clone()
print("--------------------> feats2:", feats, feats.shape)
# Index - feature extraction
# if self.index is not None and self.feature is not None and index_rate != 0:
if search_index:
npy = feats[0].cpu().numpy()
# apply silent front for index search
npyOffset = math.floor(silence_front * 16000) // 360
npy = npy[npyOffset:]
# # pitch size adjustment
# p_len = audio_pad.shape[0] // self.window
# feats_len = feats.shape[1]
# if feats.shape[1] < p_len:
# p_len = feats_len
# pitch = pitch[:, :feats_len]
# pitchf = pitchf[:, :feats_len]
if self.isHalf is True:
npy = npy.astype("float32")
# pitch = pitch[:, -feats_len:]
# pitchf = pitchf[:, -feats_len:]
# p_len = torch.tensor([feats_len], device=self.device).long()
# TODO: make k adjustable
k = 1
if k == 1:
_, ix = self.index.search(npy, 1)
npy = self.big_npy[ix.squeeze()]
else:
score, ix = self.index.search(npy, k=8)
weight = np.square(1 / score)
weight /= weight.sum(axis=1, keepdims=True)
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
# recover silent front
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:]
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if protect < 0.5 and search_index:
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
# pitch size adjustment
p_len = audio_pad.shape[0] // self.window
if feats.shape[1] < p_len:
p_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, :p_len]
pitchf = pitchf[:, :p_len]
feats_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
# print("----------plen::1:", p_len)
# if pitch estimation fails (pitchf=0), blend in the pre-search features
# there are questions about how pitchff is built, but it follows the upstream implementation, so it is used as-is
# https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
if protect < 0.5 and search_index:
if protect < 0.5:
pitchff = pitchf.clone()
pitchff[pitchf > 0] = 1
pitchff[pitchf < 1] = protect
pitchff = pitchff.unsqueeze(-1)
feats = feats * pitchff + feats0 * (1 - pitchff)
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
# p_len = torch.tensor([p_len], device=self.device).long()
# apply silent front for inference
if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
npyOffset = math.floor(silence_front * 16000) // 360
feats = feats[:, npyOffset * 2 :, :] # NOQA
feats_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
# # apply silent front for inference
# if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
# npyOffset = math.floor(silence_front * 16000) // 360 # 160x2 = 360
# feats = feats[:, npyOffset * 2 :, :] # NOQA
# run inference
try:
@ -224,7 +193,16 @@ class Pipeline(object):
with autocast(enabled=self.isHalf):
audio1 = (
torch.clip(
self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
self.inferencer.infer(
feats,
pitch.unsqueeze(-1),
volume,
mask,
sid,
infer_speedup=10,
k_step=20,
silence_front=silence_front
).to(dtype=torch.float32),
-1.0,
1.0,
)
@ -243,16 +221,7 @@ class Pipeline(object):
else:
pitchf_buffer = None
del p_len, padding_mask, pitch, pitchf, feats
del pitch, pitchf, feats, sid
torch.cuda.empty_cache()
# the sampling rate of infer's output is the model's sampling rate
# input to the pipeline is 16 kHz for hubert
if self.t_pad_tgt != 0:
offset = self.t_pad_tgt
end = -1 * self.t_pad_tgt
audio1 = audio1[offset:end]
del sid
torch.cuda.empty_cache()
return audio1, pitchf_buffer, feats_buffer

View File

@ -1,51 +1,48 @@
import os
import traceback
import faiss
from data.ModelSlot import DiffusionSVCModelSlot, RVCModelSlot
from data.ModelSlot import DiffusionSVCModelSlot
from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
from voice_changer.RVC.pipeline.Pipeline import Pipeline
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
dev = DeviceManager.get_instance().getDevice(gpu)
half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
# half = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
half = False
# # create Inferencer
# try:
# inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelSlot.modelFile, gpu)
# except Exception as e:
# print("[Voice Changer] exception! loading inferencer", e)
# traceback.print_exc()
# create Inferencer
try:
inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelSlot.modelFile, gpu)
except Exception as e:
print("[Voice Changer] exception! loading inferencer", e)
traceback.print_exc()
# # create Embedder
# try:
# embedder = EmbedderManager.getEmbedder(
# modelSlot.embedder,
# # emmbedderFilename,
# half,
# dev,
# )
# except Exception as e:
# print("[Voice Changer] exception! loading embedder", e)
# traceback.print_exc()
# create Embedder
try:
embedder = EmbedderManager.getEmbedder(
modelSlot.embedder,
# emmbedderFilename,
half,
dev,
)
except Exception as e:
print("[Voice Changer] exception! loading embedder", e)
traceback.print_exc()
# # pitchExtractor
# pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)
# pitchExtractor
pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)
pipeline = Pipeline(
embedder,
inferencer,
pitchExtractor,
modelSlot.samplingRate,
dev,
half,
)
# pipeline = Pipeline(
# embedder,
# inferencer,
# pitchExtractor,
# index,
# modelSlot.samplingRate,
# dev,
# half,
# )
# return pipeline
return pipeline
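A minimal usage sketch of createPipeline; the slot contents and pitch detector are placeholders:

# Sketch: build a pipeline for a loaded Diffusion-SVC slot (values are illustrative).
slot = DiffusionSVCModelSlot(modelFile="model_dir/some_model.ptc", samplingRate=44100)
pipeline = createPipeline(slot, gpu=0, f0Detector="harvest")
print(pipeline.getPipelineInfo())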

View File

@ -11,6 +11,7 @@ class ModelSlotManager:
def __init__(self, model_dir: str):
self.model_dir = model_dir
self.modelSlots = loadAllSlotInfo(self.model_dir)
print("MODEL SLOT INFO-------------->>>>>", self.modelSlots)
@classmethod
def get_instance(cls, model_dir: str):

View File

@ -0,0 +1,41 @@
import numpy as np
import torch
import torch.nn as nn
class VolumeExtractor:
def __init__(self, hop_size: float, block_size: int, model_sampling_rate: int, audio_sampling_rate: int):
self.hop_size = hop_size
self.block_size = block_size
self.model_sampling_rate = model_sampling_rate
self.audio_sampling_rate = audio_sampling_rate
# self.hop_size = self.block_size * self.audio_sampling_rate / self.model_sampling_rate # the model's processing unit is 512 (Diffusion-SVC); align hop_size to the input sampling rate
def extract(self, audio): # audio: 1d numpy array
audio = audio.squeeze().cpu()
print("----VolExtractor2", audio.shape, self.block_size, self.model_sampling_rate, self.audio_sampling_rate, self.hop_size)
n_frames = int(len(audio) // self.hop_size) + 1
print("=======> n_frames", n_frames)
audio2 = audio ** 2
print("----VolExtractor3", audio2.shape)
audio2 = np.pad(audio2, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode='reflect')
print("----VolExtractor4", audio2.shape)
volume = np.array(
[np.mean(audio2[int(n * self.hop_size): int((n + 1) * self.hop_size)]) for n in range(n_frames)])
volume = np.sqrt(volume)
return volume
def get_mask_from_volume(self, volume, threhold=-60.0, device='cpu') -> torch.Tensor:
mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
mask = upsample(mask, self.block_size).squeeze(-1)
return mask
def upsample(signal: torch.Tensor, factor: int) -> torch.Tensor:
signal = signal.permute(0, 2, 1)
signal = nn.functional.interpolate(torch.cat((signal, signal[:, :, -1:]), 2), size=signal.shape[-1] * factor + 1, mode='linear', align_corners=True)
signal = signal[:, :, :-1]
return signal.permute(0, 2, 1)
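A minimal usage sketch for the class above, assuming a 44100 Hz model with block size 512 and 16 kHz input audio (the same kind of values Pipeline passes in, but assumptions here):

# Sketch: extract per-frame volume and a silence mask from one second of 16 kHz input.
import torch
from voice_changer.common.VolumeExtractor import VolumeExtractor

block_size, model_sr, audio_sr = 512, 44100, 16000
hop_size = block_size * audio_sr / model_sr            # ~185.76, same formula Pipeline uses
extractor = VolumeExtractor(hop_size, block_size, model_sr, audio_sr)
audio = torch.zeros(1, audio_sr)                       # one second of silence
volume = extractor.extract(audio)                      # numpy array, one value per frame
mask = extractor.get_mask_from_volume(volume, threhold=-60.0, device='cpu')
print(volume.shape, mask.shape)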