mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-01-23 21:45:00 +03:00
WIP: diffusion svc rt badf0
This commit is contained in:
parent
9c829ac91a
commit
5bf1202215
@ -66,6 +66,11 @@ class EnumInferenceTypes(Enum):
|
||||
onnxRVCNono = "onnxRVCNono"
|
||||
|
||||
|
||||
DiffusionSVCInferenceType: TypeAlias = Literal[
|
||||
"combo",
|
||||
]
|
||||
|
||||
|
||||
PitchExtractorType: TypeAlias = Literal[
|
||||
"harvest",
|
||||
"dio",
|
||||
|
@ -1,5 +1,5 @@
|
||||
from typing import TypeAlias, Union
|
||||
from const import MAX_SLOT_NUM, EnumInferenceTypes, EmbedderType, VoiceChangerType
|
||||
from const import MAX_SLOT_NUM, DiffusionSVCInferenceType, EnumInferenceTypes, EmbedderType, VoiceChangerType
|
||||
|
||||
from dataclasses import dataclass, asdict, field
|
||||
|
||||
@ -107,7 +107,7 @@ class DiffusionSVCModelSlot(ModelSlot):
|
||||
voiceChangerType: VoiceChangerType = "Diffusion-SVC"
|
||||
modelFile: str = ""
|
||||
isONNX: bool = False
|
||||
modelType: str = "combo"
|
||||
modelType: DiffusionSVCInferenceType = "combo"
|
||||
dstId: int = 1
|
||||
|
||||
sampleId: str = ""
|
||||
@ -115,6 +115,8 @@ class DiffusionSVCModelSlot(ModelSlot):
|
||||
kstep: int = 100
|
||||
speakers: dict = field(default_factory=lambda: {1: "user"})
|
||||
embedder: EmbedderType = "hubert_base"
|
||||
samplingRate: int = 44100
|
||||
embChannels: int = 768
|
||||
|
||||
|
||||
ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot]
|
||||
|
@ -52,7 +52,7 @@ class DiffusionSVC(VoiceChangerModel):
|
||||
print("[Voice Changer] [DiffusionSVC] Initializing... done")
|
||||
|
||||
def update_settings(self, key: str, val: int | float | str):
|
||||
print("[Voice Changer][RVC]: update_settings", key, val)
|
||||
print("[Voice Changer][DiffusionSVC]: update_settings", key, val)
|
||||
if key in self.settings.intData:
|
||||
setattr(self.settings, key, int(val))
|
||||
if key == "gpu":
|
||||
@ -86,18 +86,17 @@ class DiffusionSVC(VoiceChangerModel):
|
||||
crossfadeSize: int,
|
||||
solaSearchFrame: int = 0,
|
||||
):
|
||||
newData = newData.astype(np.float32) / 32768.0 # RVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1)
|
||||
newData = newData.astype(np.float32) / 32768.0 # DiffusionSVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1)
|
||||
|
||||
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
|
||||
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate # 100 は hubertのhosizeから (16000 / 160)
|
||||
if self.audio_buffer is not None:
|
||||
# 過去のデータに連結
|
||||
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
|
||||
if self.slotInfo.f0:
|
||||
self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
|
||||
print("^^^self.feature_buffer.shape, self.slotInfo.embChannels",self.feature_buffer.shape, self.slotInfo.embChannels)
|
||||
self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
|
||||
else:
|
||||
self.audio_buffer = newData
|
||||
if self.slotInfo.f0:
|
||||
self.pitchf_buffer = np.zeros(new_feature_length)
|
||||
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
|
||||
|
||||
@ -110,14 +109,12 @@ class DiffusionSVC(VoiceChangerModel):
|
||||
# バッファがたまっていない場合はzeroで補う
|
||||
if self.audio_buffer.shape[0] < convertSize:
|
||||
self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
|
||||
if self.slotInfo.f0:
|
||||
self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
|
||||
self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])
|
||||
|
||||
convertOffset = -1 * convertSize
|
||||
featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
|
||||
self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出
|
||||
if self.slotInfo.f0:
|
||||
self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
|
||||
self.feature_buffer = self.feature_buffer[featureOffset:]
|
||||
|
||||
@ -145,18 +142,18 @@ class DiffusionSVC(VoiceChangerModel):
|
||||
if self.pipeline is not None:
|
||||
device = self.pipeline.device
|
||||
else:
|
||||
device = torch.device("cpu")
|
||||
device = torch.device("cpu") # TODO:pipelineが存在しない場合はzeroを返してもいいかも(要確認)。
|
||||
audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
|
||||
audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
|
||||
repeat = 1 if self.settings.rvcQuality else 0
|
||||
repeat = 0
|
||||
sid = self.settings.dstId
|
||||
f0_up_key = self.settings.tran
|
||||
index_rate = self.settings.indexRatio
|
||||
protect = self.settings.protect
|
||||
index_rate = 0
|
||||
protect = 0
|
||||
|
||||
if_f0 = 1 if self.slotInfo.f0 else 0
|
||||
embOutputLayer = self.slotInfo.embOutputLayer
|
||||
useFinalProj = self.slotInfo.useFinalProj
|
||||
if_f0 = 1
|
||||
embOutputLayer = 12
|
||||
useFinalProj = False
|
||||
|
||||
try:
|
||||
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
|
||||
@ -167,14 +164,17 @@ class DiffusionSVC(VoiceChangerModel):
|
||||
f0_up_key,
|
||||
index_rate,
|
||||
if_f0,
|
||||
self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # extaraDataSizeの秒数。RVCのモデルのサンプリングレートで処理(★1)。
|
||||
self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # extaraConvertSize(既にモデルのサンプリングレートにリサンプリング済み)の秒数。モデルのサンプリングレートで処理(★1)。
|
||||
embOutputLayer,
|
||||
useFinalProj,
|
||||
repeat,
|
||||
protect,
|
||||
outSize
|
||||
)
|
||||
result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
|
||||
# result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
|
||||
result = audio_out.detach().cpu().numpy()
|
||||
|
||||
print("RESULT", result)
|
||||
|
||||
return result
|
||||
except DeviceCannotSupportHalfPrecisionException as e: # NOQA
|
||||
|
@ -21,6 +21,7 @@ class DiffusionSVCModelSlotGenerator(ModelSlotGenerator):
|
||||
slotInfo.isONNX = slotInfo.modelFile.endswith(".onnx")
|
||||
slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
|
||||
slotInfo.iconFile = "/assets/icons/noimage.png"
|
||||
slotInfo.embChannels = 768
|
||||
|
||||
# if slotInfo.isONNX:
|
||||
# slotInfo = cls._setInfoByONNX(slotInfo)
|
||||
|
@ -1,35 +1,134 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import Unit2Mel, load_model_vocoder_from_combo
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
|
||||
|
||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||
|
||||
|
||||
class RVCInferencer(Inferencer):
|
||||
class DiffusionSVCInferencer(Inferencer):
|
||||
def __init__(self):
|
||||
self.diff_model: Unit2Mel | None = None
|
||||
self.naive_model: Unit2MelNaive | None = None
|
||||
self.vocoder: Vocoder | None = None
|
||||
|
||||
def loadModel(self, file: str, gpu: int):
|
||||
self.setProps("DiffusionSVCCombo", file, True, gpu)
|
||||
|
||||
dev = DeviceManager.get_instance().getDevice(gpu)
|
||||
isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
|
||||
self.dev = DeviceManager.get_instance().getDevice(gpu)
|
||||
# isHalf = DeviceManager.get_instance().halfPrecisionAvailable(gpu)
|
||||
|
||||
cpt = torch.load(file, map_location="cpu")
|
||||
model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
|
||||
diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
|
||||
self.diff_model = diff_model
|
||||
self.naive_model = naive_model
|
||||
self.vocoder = vocoder
|
||||
self.diff_args = diff_args
|
||||
print("-----------------> diff_args", diff_args)
|
||||
print("-----------------> naive_args", naive_args)
|
||||
|
||||
model.eval()
|
||||
model.load_state_dict(cpt["weight"], strict=False)
|
||||
# cpt = torch.load(file, map_location="cpu")
|
||||
# model = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=isHalf)
|
||||
|
||||
model = model.to(dev)
|
||||
if isHalf:
|
||||
model = model.half()
|
||||
# model.eval()
|
||||
# model.load_state_dict(cpt["weight"], strict=False)
|
||||
|
||||
self.model = model
|
||||
# model = model.to(dev)
|
||||
# if isHalf:
|
||||
# model = model.half()
|
||||
|
||||
# self.model = model
|
||||
return self
|
||||
|
||||
def getConfig(self) -> tuple[int, int]:
    """Return ``(block_size, sampling_rate)`` read from the loaded model's data config.

    Both values come from ``self.diff_args.data`` and are coerced to ``int``.
    """
    data_cfg = self.diff_args.data
    sampling_rate = int(data_cfg.sampling_rate)
    block_size = int(data_cfg.block_size)
    return block_size, sampling_rate
|
||||
|
||||
@torch.no_grad()  # Core inference code: normalizes the inputs to tensors and only deals with mel.
def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
             gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
             spk_emb=None):
    """Run the diffusion model in inference mode and return its output.

    ``units``, ``f0`` and ``volume`` are forwarded to ``self.diff_model``
    unchanged (presumably frame-aligned tensors -- TODO confirm shapes
    against the caller).

    Args:
        spk_id: Speaker id. Without a speaker encoder it is wrapped into a
            1x1 long tensor on ``self.dev``.
        spk_mix_dict: Optional speaker mix, forwarded to the model.
        aug_shift: Scalar, wrapped into a 1x1 float tensor on ``self.dev``.
        gt_spec: Reference spectrogram; required when the model defines
            ``k_step_max``.
        infer_speedup, method, use_tqdm: Forwarded to the model.
        k_step: Number of shallow-diffusion steps; required (and bounded by
            ``k_step_max``) when the model defines ``k_step_max``.
        spk_emb: Optional speaker embedding, forwarded to the model.

    Raises:
        ValueError: If the model defines ``k_step_max`` and ``k_step`` is
            missing or too large, or ``gt_spec`` is missing.
    """
    model_args = self.diff_args.model

    # A model with k_step_max set only supports shallow diffusion, so it
    # needs both a step count and a starting spectrogram.
    if model_args.k_step_max is not None:
        if k_step is None:
            raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
        if k_step > int(model_args.k_step_max):
            raise ValueError("k_step must <= k_step_max of Shallow Diffusion Model")
        if gt_spec is None:
            raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from "
                             "input mel or output of naive model")
        print(f' [INFO] k_step_max is {model_args.k_step_max}.')

    aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)

    # spk_id
    spk_emb_dict = None
    if model_args.use_speaker_encoder:  # with speaker encoder
        spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
    else:  # without speaker encoder
        spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)

    if k_step is not None:
        print(f' [INFO] get k_step, do shallow diffusion {k_step} steps')
    else:
        # k_step is always None here, so it is not interpolated into the message.
        print(' [INFO] Do full 1000 steps depth diffusion')
    print(f" [INFO] method:{method}; infer_speedup:{infer_speedup}")
    return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
                           gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method,
                           k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
|
||||
|
||||
@torch.no_grad()
def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
                     aug_shift=0, spk_emb=None):
    """Run the naive model in inference mode and return its output spectrogram.

    ``units``, ``f0`` and ``volume`` are forwarded to ``self.naive_model``
    unchanged. Without a speaker encoder, ``spk_id`` is wrapped into a 1x1
    long tensor on ``self.dev``; ``aug_shift`` is always wrapped into a 1x1
    float tensor on ``self.dev``.
    """
    # spk_id
    spk_emb_dict = None
    if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
        spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
    else:  # without speaker encoder
        spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
    aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
    # The per-call dumps of the full input tensors were debug residue and are
    # not emitted: printing whole tensors on every call stalls realtime use.
    return self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
                            aug_shift=aug_shift, infer=True,
                            spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
|
||||
|
||||
@torch.no_grad()
def mel2wav(self, mel, f0, start_frame=0):
    """Convert a mel spectrogram to a waveform with ``self.vocoder``.

    When ``start_frame`` is non-zero, the first ``start_frame`` frames of
    ``mel`` and ``f0`` are dropped before vocoding (realtime speed-up) and the
    result is left-padded with ``start_frame * vocoder_hop_size`` zeros so the
    output length matches the untrimmed case.
    """
    if start_frame != 0:
        trimmed_mel = mel[:, start_frame:, :]
        trimmed_f0 = f0[:, start_frame:, :]
        wav = self.vocoder.infer(trimmed_mel, trimmed_f0)
        lead = start_frame * self.vocoder.vocoder_hop_size
        return torch.nn.functional.pad(wav, (lead, 0))
    return self.vocoder.infer(mel, f0)
|
||||
|
||||
@torch.no_grad()
|
||||
def infer(
|
||||
self,
|
||||
feats: torch.Tensor,
|
||||
pitch_length: torch.Tensor,
|
||||
pitch: torch.Tensor,
|
||||
pitchf: torch.Tensor,
|
||||
volume: torch.Tensor,
|
||||
mask: torch.Tensor,
|
||||
sid: torch.Tensor,
|
||||
convert_length: int | None,
|
||||
infer_speedup: int,
|
||||
k_step: int,
|
||||
silence_front: float,
|
||||
) -> torch.Tensor:
|
||||
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)
|
||||
print("---------------------------------shape", feats.shape, pitch.shape, volume.shape)
|
||||
gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
|
||||
print("======================>>>>>gt_spec", gt_spec)
|
||||
out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
|
||||
print("======================>>>>>out_mel", out_mel)
|
||||
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
|
||||
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
|
||||
|
||||
print("======================>>>>>out_wav.shape, mask.shape", out_wav.shape, mask.shape)
|
||||
out_wav *= mask
|
||||
print("out_wav:::::::::::", out_wav)
|
||||
return out_wav.squeeze()
|
||||
|
50
server/voice_changer/DiffusionSVC/inferencer/Inferencer.py
Normal file
50
server/voice_changer/DiffusionSVC/inferencer/Inferencer.py
Normal file
@ -0,0 +1,50 @@
|
||||
from typing import Any, Protocol
|
||||
import torch
|
||||
import onnxruntime
|
||||
|
||||
from const import DiffusionSVCInferenceType
|
||||
|
||||
|
||||
class Inferencer(Protocol):
    """Structural interface for Diffusion-SVC inferencers.

    Declares the attributes and methods an inferencer exposes, plus two
    concrete helpers (``setProps`` / ``getInferencerInfo``) that store and
    report the properties the inferencer was loaded with.
    """

    inferencerType: DiffusionSVCInferenceType = "combo"
    # No default: only available after setProps() has been called.
    file: str
    isHalf: bool = True
    gpu: int = 0

    # Backing model object; None until an implementation assigns one.
    model: onnxruntime.InferenceSession | Any | None = None

    def loadModel(self, file: str, gpu: int):
        """Load the model stored in ``file`` for the device selected by ``gpu``."""
        ...

    def getConfig(self) -> tuple[int, int]:
        """Return ``(block_size, sampling_rate)`` of the loaded model."""
        ...

    def infer(
        self,
        feats: torch.Tensor,
        pitch_length: torch.Tensor,
        pitch: torch.Tensor,
        pitchf: torch.Tensor,
        volume: torch.Tensor,
        mask: torch.Tensor,
        sid: torch.Tensor,
        convert_length: int | None,
        infer_speedup: int,
        k_step: int,
        silence_front: float,
    ) -> torch.Tensor:
        """Convert the given features to a waveform tensor.

        The parameter list mirrors ``DiffusionSVCInferencer.infer`` so that
        class structurally satisfies this protocol.
        NOTE(review): ``silence_front`` looks like a duration in seconds of
        leading audio that may be skipped -- confirm against the pipeline.
        """
        ...

    def setProps(
        self,
        inferencerType: DiffusionSVCInferenceType,
        file: str,
        isHalf: bool,
        gpu: int,
    ):
        """Record the properties this inferencer was loaded with."""
        self.inferencerType = inferencerType
        self.file = file
        self.isHalf = isHalf
        self.gpu = gpu

    def getInferencerInfo(self):
        """Return the properties stored by ``setProps`` as a plain dict."""
        return {
            "inferencerType": self.inferencerType,
            "file": self.file,
            "isHalf": self.isHalf,
            "gpu": self.gpu,
        }
|
@ -0,0 +1,29 @@
|
||||
from const import DiffusionSVCInferenceType
|
||||
from voice_changer.DiffusionSVC.inferencer.DiffusionSVCInferencer import DiffusionSVCInferencer
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
|
||||
|
||||
class InferencerManager:
    """Creates Diffusion-SVC inferencers and remembers the most recent one."""

    # Last inferencer returned by getInferencer(); None before the first call.
    currentInferencer: Inferencer | None = None

    @classmethod
    def getInferencer(
        cls,
        inferencerType: DiffusionSVCInferenceType,
        file: str,
        gpu: int,
    ) -> Inferencer:
        """Load an inferencer, store it on the class, and return it."""
        loaded = cls.loadInferencer(inferencerType, file, gpu)
        cls.currentInferencer = loaded
        return loaded

    @classmethod
    def loadInferencer(
        cls,
        inferencerType: DiffusionSVCInferenceType,
        file: str,
        gpu: int,
    ) -> Inferencer:
        """Build and load the inferencer matching ``inferencerType``.

        Raises:
            RuntimeError: If ``inferencerType`` is anything other than "combo".
        """
        if inferencerType != "combo":
            raise RuntimeError("[Voice Changer] Inferencer not found", inferencerType)
        return DiffusionSVCInferencer().loadModel(file, gpu)
|
@ -38,8 +38,6 @@ class DiffusionSVC:
|
||||
self.use_combo_model = False
|
||||
|
||||
def load_model(self, model_path, f0_model=None, f0_min=None, f0_max=None):
|
||||
|
||||
if ('1234' + model_path)[-4:] == '.ptc':
|
||||
self.use_combo_model = True
|
||||
self.model_path = model_path
|
||||
self.naive_model_path = model_path
|
||||
@ -50,9 +48,6 @@ class DiffusionSVC:
|
||||
self.naive_model = naive_model
|
||||
self.naive_model_args = naive_args
|
||||
self.vocoder = vocoder
|
||||
else:
|
||||
self.model_path = model_path
|
||||
self.model, self.vocoder, self.args = load_model_vocoder(model_path, device=self.device)
|
||||
|
||||
self.units_encoder = Units_Encoder(
|
||||
self.args.data.encoder,
|
||||
@ -85,33 +80,6 @@ class DiffusionSVC:
|
||||
|
||||
self.units_indexer = UnitsIndexer(os.path.split(model_path)[0])
|
||||
|
||||
def flush(self, model_path=None, f0_model=None, f0_min=None, f0_max=None, naive_model_path=None):
|
||||
assert (model_path is not None) or (naive_model_path is not None)
|
||||
# flush model if changed
|
||||
if ((self.model_path != model_path) or (self.f0_model != f0_model)
|
||||
or (self.f0_min != f0_min) or (self.f0_max != f0_max)):
|
||||
self.load_model(model_path, f0_model=f0_model, f0_min=f0_min, f0_max=f0_max)
|
||||
if (self.naive_model_path != naive_model_path) and (naive_model_path is not None):
|
||||
self.load_naive_model(naive_model_path)
|
||||
# check args if use naive
|
||||
if self.naive_model is not None:
|
||||
if self.naive_model_args.data.encoder != self.args.data.encoder:
|
||||
raise ValueError("encoder of Naive Model and Diffusion Model are different")
|
||||
if self.naive_model_args.model.n_spk != self.args.model.n_spk:
|
||||
raise ValueError("n_spk of Naive Model and Diffusion Model are different")
|
||||
if bool(self.naive_model_args.model.use_speaker_encoder) != bool(self.args.model.use_speaker_encoder):
|
||||
raise ValueError("use_speaker_encoder of Naive Model and Diffusion Model are different")
|
||||
if self.naive_model_args.vocoder.type != self.args.vocoder.type:
|
||||
raise ValueError("vocoder of Naive Model and Diffusion Model are different")
|
||||
if self.naive_model_args.data.block_size != self.args.data.block_size:
|
||||
raise ValueError("block_size of Naive Model and Diffusion Model are different")
|
||||
if self.naive_model_args.data.sampling_rate != self.args.data.sampling_rate:
|
||||
raise ValueError("sampling_rate of Naive Model and Diffusion Model are different")
|
||||
|
||||
def flush_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
|
||||
if (f0_model != self.f0_model) and (f0_model is not None):
|
||||
self.load_f0_extractor(f0_model)
|
||||
|
||||
def load_f0_extractor(self, f0_model, f0_min=None, f0_max=None):
|
||||
self.f0_model = f0_model if (f0_model is not None) else self.args.data.f0_extractor
|
||||
self.f0_min = f0_min if (f0_min is not None) else self.args.data.f0_min
|
||||
@ -127,12 +95,6 @@ class DiffusionSVC:
|
||||
model_sampling_rate=self.args.data.sampling_rate
|
||||
)
|
||||
|
||||
def load_naive_model(self, naive_model_path):
|
||||
self.naive_model_path = naive_model_path
|
||||
model, _, args = load_model_vocoder(naive_model_path, device=self.device, loaded_vocoder=self.vocoder)
|
||||
self.naive_model = model
|
||||
self.naive_model_args = args
|
||||
print(f" [INFO] Load naive model from {naive_model_path}")
|
||||
|
||||
@torch.no_grad()
|
||||
def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
|
||||
@ -265,144 +227,6 @@ class DiffusionSVC:
|
||||
gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
|
||||
|
||||
@torch.no_grad() # 比__call__多了声码器代码,输出波形
|
||||
def infer(self, units, f0, volume, gt_spec=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None):
|
||||
if k_step is not None:
|
||||
if self.naive_model is not None:
|
||||
gt_spec = self.naive_model_call(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
|
||||
aug_shift=aug_shift, spk_emb=spk_emb)
|
||||
print(f" [INFO] get mel from naive model out.")
|
||||
assert gt_spec is not None
|
||||
if self.naive_model is None:
|
||||
print(f" [INFO] get mel from input wav.")
|
||||
if input(" [WARN] You are attempting shallow diffusion "
|
||||
"on the mel of the input source,"
|
||||
" Please enter 'gt_mel' to continue") != 'gt_mel':
|
||||
raise ValueError("Please understand what you're doing")
|
||||
k_step = int(k_step)
|
||||
gt_spec = gt_spec
|
||||
else:
|
||||
gt_spec = None
|
||||
|
||||
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
|
||||
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
return self.mel2wav(out_mel, f0)
|
||||
|
||||
@torch.no_grad() # 为实时浅扩散优化的推理代码,可以切除pad省算力
|
||||
def infer_for_realtime(self, units, f0, volume, audio_t=None, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None, silence_front=0, diff_jump_silence_front=False):
|
||||
|
||||
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
|
||||
|
||||
if diff_jump_silence_front:
|
||||
if audio_t is not None:
|
||||
audio_t = audio_t[:, start_frame * self.vocoder.vocoder_hop_size:]
|
||||
f0 = f0[:, start_frame:, :]
|
||||
units = units[:, start_frame:, :]
|
||||
volume = volume[:, start_frame:, :]
|
||||
|
||||
if k_step is not None:
|
||||
assert audio_t is not None
|
||||
k_step = int(k_step)
|
||||
gt_spec = self.vocoder.extract(audio_t, self.args.data.sampling_rate)
|
||||
# 如果缺帧再开这行gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
|
||||
else:
|
||||
gt_spec = None
|
||||
|
||||
out_mel = self.__call__(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift,
|
||||
gt_spec=gt_spec, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
|
||||
if diff_jump_silence_front:
|
||||
out_wav = self.mel2wav(out_mel, f0)
|
||||
else:
|
||||
out_wav = self.mel2wav(out_mel, f0, start_frame=start_frame)
|
||||
return out_wav
|
||||
|
||||
@torch.no_grad() # 不切片从音频推理代码
|
||||
def infer_from_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None, threhold=-60, index_ratio=0):
|
||||
units = self.encode_units(audio, sr)
|
||||
if index_ratio > 0:
|
||||
units = self.units_indexer(units_t=units, spk_id=spk_id, ratio=index_ratio)
|
||||
f0 = self.extract_f0(audio, key=key, sr=sr)
|
||||
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
|
||||
if k_step is not None:
|
||||
assert 0 < int(k_step) <= 1000
|
||||
k_step = int(k_step)
|
||||
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
|
||||
gt_spec = self.vocoder.extract(audio_t, sr)
|
||||
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
|
||||
else:
|
||||
gt_spec = None
|
||||
output = self.infer(units, f0, volume, gt_spec=gt_spec, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
|
||||
aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
output *= mask
|
||||
return output.squeeze().cpu().numpy(), self.args.data.sampling_rate
|
||||
|
||||
@torch.no_grad() # 切片从音频推理代码
|
||||
def infer_from_long_audio(self, audio, sr=44100, key=0, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
|
||||
spk_emb=None,
|
||||
threhold=-60, threhold_for_split=-40, min_len=5000, index_ratio=0):
|
||||
|
||||
hop_size = self.args.data.block_size * sr / self.args.data.sampling_rate
|
||||
segments = split(audio, sr, hop_size, db_thresh=threhold_for_split, min_len=min_len)
|
||||
|
||||
print(f' [INFO] Extract f0 volume and mask: Use {self.f0_model}, start...')
|
||||
_f0_start_time = time.time()
|
||||
f0 = self.extract_f0(audio, key=key, sr=sr)
|
||||
volume, mask = self.extract_volume_and_mask(audio, sr, threhold=float(threhold))
|
||||
_f0_end_time = time.time()
|
||||
_f0_used_time = _f0_end_time - _f0_start_time
|
||||
print(f' [INFO] Extract f0 volume and mask: Done. Use time:{_f0_used_time}')
|
||||
|
||||
if k_step is not None:
|
||||
assert 0 < int(k_step) <= 1000
|
||||
k_step = int(k_step)
|
||||
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
|
||||
gt_spec = self.vocoder.extract(audio_t, sr)
|
||||
gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
|
||||
else:
|
||||
gt_spec = None
|
||||
|
||||
result = np.zeros(0)
|
||||
current_length = 0
|
||||
for segment in tqdm(segments):
|
||||
start_frame = segment[0]
|
||||
seg_input = torch.from_numpy(segment[1]).float().unsqueeze(0).to(self.device)
|
||||
seg_units = self.units_encoder.encode(seg_input, sr, hop_size)
|
||||
if index_ratio > 0:
|
||||
seg_units = self.units_indexer(units_t=seg_units, spk_id=spk_id, ratio=index_ratio)
|
||||
seg_f0 = f0[:, start_frame: start_frame + seg_units.size(1), :]
|
||||
seg_volume = volume[:, start_frame: start_frame + seg_units.size(1), :]
|
||||
if gt_spec is not None:
|
||||
seg_gt_spec = gt_spec[:, start_frame: start_frame + seg_units.size(1), :]
|
||||
else:
|
||||
seg_gt_spec = None
|
||||
seg_output = self.infer(seg_units, seg_f0, seg_volume, gt_spec=seg_gt_spec, spk_id=spk_id,
|
||||
spk_mix_dict=spk_mix_dict,
|
||||
aug_shift=aug_shift, infer_speedup=infer_speedup, method=method, k_step=k_step,
|
||||
use_tqdm=use_tqdm, spk_emb=spk_emb)
|
||||
_left = start_frame * self.args.data.block_size
|
||||
_right = (start_frame + seg_units.size(1)) * self.args.data.block_size
|
||||
seg_output *= mask[:, _left:_right]
|
||||
seg_output = seg_output.squeeze().cpu().numpy()
|
||||
silent_length = round(start_frame * self.args.data.block_size) - current_length
|
||||
if silent_length >= 0:
|
||||
result = np.append(result, np.zeros(silent_length))
|
||||
result = np.append(result, seg_output)
|
||||
else:
|
||||
result = cross_fade(result, seg_output, current_length + silent_length)
|
||||
current_length = current_length + silent_length + len(seg_output)
|
||||
|
||||
return result, self.args.data.sampling_rate
|
||||
|
||||
@torch.no_grad() # 为实时优化的推理代码,可以切除pad省算力
|
||||
def infer_from_audio_for_realtime(self, audio, sr, key, spk_id=1, spk_mix_dict=None, aug_shift=0,
|
||||
|
@ -252,7 +252,7 @@ class GaussianDiffusion(nn.Module):
|
||||
|
||||
if method is not None and infer_speedup > 1:
|
||||
if method == 'dpm-solver':
|
||||
from .dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.dpm_solver_pytorch import NoiseScheduleVP, model_wrapper, DPM_Solver
|
||||
# 1. Define the noise schedule.
|
||||
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=self.betas[:t])
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -56,6 +56,7 @@ class Unit2MelNaive(nn.Module):
|
||||
residual_dropout=0.1,
|
||||
attention_dropout=0.1)
|
||||
else:
|
||||
print("[[[[[PCmer]]]]]")
|
||||
self.decoder = PCmer(
|
||||
num_layers=n_layers,
|
||||
num_heads=8,
|
||||
@ -83,6 +84,7 @@ class Unit2MelNaive(nn.Module):
|
||||
'''
|
||||
x = self.stack(units.transpose(1, 2)).transpose(1, 2)
|
||||
x = x + self.f0_embed((1 + f0 / 700).log()) + self.volume_embed(volume)
|
||||
print("-----------------x1>", x)
|
||||
if self.use_speaker_encoder:
|
||||
if spk_mix_dict is not None:
|
||||
assert spk_emb_dict is not None
|
||||
@ -104,9 +106,13 @@ class Unit2MelNaive(nn.Module):
|
||||
if self.aug_shift_embed is not None and aug_shift is not None:
|
||||
x = x + self.aug_shift_embed(aug_shift / 5)
|
||||
|
||||
print("-----------------x2>", x)
|
||||
x = self.decoder(x)
|
||||
print("-----------------x3>", x)
|
||||
x = self.norm(x)
|
||||
print("-----------------x4>", x)
|
||||
x = self.dense_out(x)
|
||||
print("-----------------x5>", x)
|
||||
if not infer:
|
||||
x = F.mse_loss(x, gt_spec)
|
||||
if self.l2reg_loss > 0:
|
||||
|
@ -94,9 +94,12 @@ class PCmer(nn.Module):
|
||||
def forward(self, phone, mask=None):
    """Apply every layer in ``self._layers`` to ``phone`` in order.

    Each layer is called as ``layer(phone, mask)`` and its result feeds the
    next layer; the output of the last layer is returned. With no layers the
    input is returned unchanged.
    """
    # Debug dumps of the full tensors are intentionally not emitted here:
    # this runs on every inference step.
    for layer in self._layers:
        phone = layer(phone, mask)
    # provide the final sequence
    return phone
|
||||
|
||||
|
||||
@ -136,9 +139,13 @@ class _EncoderLayer(nn.Module):
|
||||
def forward(self, phone, mask=None):
    """Apply the attention and conformer sub-layers, each with a residual add.

    First adds ``self.attn(self.norm(phone), mask=mask)`` to ``phone``, then
    adds ``self.conformer(phone)`` to that result, and returns it.
    """
    # compute attention sub-layer (norm is evaluated exactly once per call)
    phone = phone + self.attn(self.norm(phone), mask=mask)

    # conformer sub-layer
    phone = phone + self.conformer(phone)

    return phone
|
||||
|
||||
|
@ -3,10 +3,10 @@ import yaml
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
from .diffusion import GaussianDiffusion
|
||||
from .wavenet import WaveNet
|
||||
from .vocoder import Vocoder
|
||||
from .naive.naive import Unit2MelNaive
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.diffusion import GaussianDiffusion
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.wavenet import WaveNet
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.naive.naive import Unit2MelNaive
|
||||
|
||||
|
||||
class DotDict(dict):
|
||||
|
@ -1,6 +1,6 @@
|
||||
import torch
|
||||
from nsf_hifigan.nvSTFT import STFT
|
||||
from nsf_hifigan.models import load_model, load_config
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.nsf_hifigan.nvSTFT import STFT
|
||||
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.nsf_hifigan.models import load_model, load_config
|
||||
from torchaudio.transforms import Resample
|
||||
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import torch.nn as nn
|
||||
|
||||
import pyworld as pw
|
||||
import parselmouth
|
||||
import torchcrepe
|
||||
@ -789,15 +789,6 @@ def median_pool_1d(x, kernel_size):
|
||||
x, _ = torch.sort(x, dim=-1)
|
||||
return x[:, :, (kernel_size - 1) // 2]
|
||||
|
||||
|
||||
def upsample(signal, factor):
|
||||
signal = signal.permute(0, 2, 1)
|
||||
signal = nn.functional.interpolate(torch.cat((signal, signal[:, :, -1:]), 2), size=signal.shape[-1] * factor + 1,
|
||||
mode='linear', align_corners=True)
|
||||
signal = signal[:, :, :-1]
|
||||
return signal.permute(0, 2, 1)
|
||||
|
||||
|
||||
def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
|
||||
result = np.zeros(idx + b.shape[0])
|
||||
fade_len = a.shape[0] - idx
|
||||
|
@ -1,4 +1,3 @@
|
||||
import numpy as np
|
||||
from typing import Any
|
||||
import math
|
||||
import torch
|
||||
@ -10,13 +9,14 @@ from Exceptions import (
|
||||
HalfPrecisionChangingException,
|
||||
NotEnoughDataExtimateF0,
|
||||
)
|
||||
from voice_changer.DiffusionSVC.inferencer.Inferencer import Inferencer
|
||||
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
|
||||
from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono
|
||||
|
||||
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
|
||||
from voice_changer.common.VolumeExtractor import VolumeExtractor
|
||||
|
||||
|
||||
class Pipeline(object):
|
||||
@ -37,29 +37,30 @@ class Pipeline(object):
|
||||
embedder: Embedder,
|
||||
inferencer: Inferencer,
|
||||
pitchExtractor: PitchExtractor,
|
||||
index: Any | None,
|
||||
# feature: Any | None,
|
||||
# index: Any | None,
|
||||
targetSR,
|
||||
device,
|
||||
isHalf,
|
||||
):
|
||||
model_block_size, model_sampling_rate = inferencer.getConfig()
|
||||
self.hop_size = model_block_size * 16000 / model_sampling_rate # 16000はオーディオのサンプルレート。この時点で16Kになっている。
|
||||
|
||||
self.volumeExtractor = VolumeExtractor(self.hop_size, model_block_size, model_sampling_rate, audio_sampling_rate=16000)
|
||||
self.embedder = embedder
|
||||
|
||||
self.inferencer = inferencer
|
||||
self.pitchExtractor = pitchExtractor
|
||||
print("GENERATE INFERENCER", self.inferencer)
|
||||
print("GENERATE EMBEDDER", self.embedder)
|
||||
print("GENERATE PITCH EXTRACTOR", self.pitchExtractor)
|
||||
|
||||
self.index = index
|
||||
self.big_npy = index.reconstruct_n(0, index.ntotal) if index is not None else None
|
||||
# self.feature = feature
|
||||
|
||||
self.targetSR = targetSR
|
||||
self.device = device
|
||||
self.isHalf = isHalf
|
||||
# self.isHalf = isHalf
|
||||
self.isHalf = False
|
||||
|
||||
self.sr = 16000
|
||||
self.window = 160
|
||||
# self.sr = 16000
|
||||
# self.window = 160
|
||||
|
||||
def getPipelineInfo(self):
|
||||
inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
|
||||
@ -70,6 +71,13 @@ class Pipeline(object):
|
||||
def setPitchExtractor(self, pitchExtractor: PitchExtractor):
    """Replace the pitch extractor used by this pipeline.

    Args:
        pitchExtractor: the extractor instance to store on ``self.pitchExtractor``.
    """
    self.pitchExtractor = pitchExtractor
|
||||
|
||||
@torch.no_grad()
def extract_volume_and_mask(self, audio, threhold):
    """Compute the volume envelope of ``audio`` and the matching mask.

    Args:
        audio: audio buffer handed straight to ``self.volumeExtractor.extract``.
        threhold: threshold forwarded to ``get_mask_from_volume`` (the
            parameter name keeps the spelling used by that method).

    Returns:
        ``(volume, mask)``: ``volume`` is a float tensor on ``self.device``
        with a leading and a trailing singleton axis added; ``mask`` is
        whatever ``get_mask_from_volume`` returns.
    """
    extractor = self.volumeExtractor
    volume_np = extractor.extract(audio)
    mask = extractor.get_mask_from_volume(volume_np, threhold=threhold, device=self.device)
    volume = torch.from_numpy(volume_np).float().to(self.device)
    # (frames,) -> (1, frames, 1)
    volume = volume.unsqueeze(-1).unsqueeze(0)
    return volume, mask
|
||||
|
||||
def exec(
|
||||
self,
|
||||
sid,
|
||||
@ -87,56 +95,45 @@ class Pipeline(object):
|
||||
out_size=None,
|
||||
):
|
||||
# 16000のサンプリングレートで入ってきている。以降この世界は16000で処理。
|
||||
|
||||
search_index = self.index is not None and self.big_npy is not None and index_rate != 0
|
||||
# self.t_pad = self.sr * repeat # 1秒
|
||||
# self.t_pad_tgt = self.targetSR * repeat # 1秒 出力時のトリミング(モデルのサンプリングで出力される)
|
||||
audio = audio.unsqueeze(0)
|
||||
|
||||
quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr # padding(reflect)のサイズは元のサイズより小さい必要がある。
|
||||
|
||||
self.t_pad = round(self.sr * quality_padding_sec) # 前後に音声を追加
|
||||
self.t_pad_tgt = round(self.targetSR * quality_padding_sec) # 前後に音声を追加 出力時のトリミング(モデルのサンプリングで出力される)
|
||||
self.t_pad = 0
|
||||
audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
|
||||
p_len = audio_pad.shape[0] // self.window
|
||||
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
||||
|
||||
# RVC QualityがOnのときにはsilence_frontをオフに。
|
||||
silence_front = silence_front if repeat == 0 else 0
|
||||
pitchf = pitchf if repeat == 0 else np.zeros(p_len)
|
||||
out_size = out_size if repeat == 0 else None
|
||||
n_frames = int(audio_pad.size(-1) // self.hop_size + 1)
|
||||
print("--------------------> n_frames:", n_frames)
|
||||
|
||||
volume, mask = self.extract_volume_and_mask(audio, threhold=-60.0)
|
||||
print("--------------------> volume:", volume.shape)
|
||||
# ピッチ検出
|
||||
try:
|
||||
if if_f0 == 1:
|
||||
pitch, pitchf = self.pitchExtractor.extract(
|
||||
audio_pad,
|
||||
pitchf,
|
||||
f0_up_key,
|
||||
self.sr,
|
||||
self.window,
|
||||
16000, # 音声のサンプリングレート(既に16000)
|
||||
# int(self.hop_size), # 処理のwindowサイズ (44100における512)
|
||||
int(self.hop_size), # 処理のwindowサイズ (44100における512)
|
||||
silence_front=silence_front,
|
||||
)
|
||||
# pitch = pitch[:p_len]
|
||||
# pitchf = pitchf[:p_len]
|
||||
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
||||
pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
|
||||
else:
|
||||
pitch = None
|
||||
pitchf = None
|
||||
except IndexError:
|
||||
print("--------------------> pitch11111111111111111111111111111111:", pitch[1:], pitch.shape)
|
||||
|
||||
pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long() # 160window sizeを前提にバッファを作っているので切る。
|
||||
pitchf = torch.tensor(pitchf[-n_frames:], device=self.device, dtype=torch.float).unsqueeze(0) # 160window sizeを前提にバッファを作っているので切る。
|
||||
except IndexError as e:
|
||||
print(e)
|
||||
# print(e)
|
||||
raise NotEnoughDataExtimateF0()
|
||||
|
||||
print("--------------------> pitch:", pitch, pitch.shape)
|
||||
|
||||
# tensor型調整
|
||||
feats = audio_pad
|
||||
if feats.dim() == 2: # double channels
|
||||
feats = feats.mean(-1)
|
||||
assert feats.dim() == 1, feats.dim()
|
||||
feats = feats.view(1, -1)
|
||||
|
||||
# embedding
|
||||
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
|
||||
with autocast(enabled=self.isHalf):
|
||||
try:
|
||||
feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
|
||||
@ -149,74 +146,46 @@ class Pipeline(object):
|
||||
raise DeviceChangingException()
|
||||
else:
|
||||
raise e
|
||||
if protect < 0.5 and search_index:
|
||||
|
||||
print("--------------------> feats1:", feats, feats.shape)
|
||||
|
||||
# feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||
feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
|
||||
|
||||
if protect < 0.5:
|
||||
feats0 = feats.clone()
|
||||
print("--------------------> feats2:", feats, feats.shape)
|
||||
|
||||
# Index - feature抽出
|
||||
# if self.index is not None and self.feature is not None and index_rate != 0:
|
||||
if search_index:
|
||||
npy = feats[0].cpu().numpy()
|
||||
# apply silent front for indexsearch
|
||||
npyOffset = math.floor(silence_front * 16000) // 360
|
||||
npy = npy[npyOffset:]
|
||||
# # ピッチサイズ調整
|
||||
# p_len = audio_pad.shape[0] // self.window
|
||||
# feats_len = feats.shape[1]
|
||||
# if feats.shape[1] < p_len:
|
||||
# p_len = feats_len
|
||||
# pitch = pitch[:, :feats_len]
|
||||
# pitchf = pitchf[:, :feats_len]
|
||||
|
||||
if self.isHalf is True:
|
||||
npy = npy.astype("float32")
|
||||
# pitch = pitch[:, -feats_len:]
|
||||
# pitchf = pitchf[:, -feats_len:]
|
||||
# p_len = torch.tensor([feats_len], device=self.device).long()
|
||||
|
||||
# TODO: kは調整できるようにする
|
||||
k = 1
|
||||
if k == 1:
|
||||
_, ix = self.index.search(npy, 1)
|
||||
npy = self.big_npy[ix.squeeze()]
|
||||
else:
|
||||
score, ix = self.index.search(npy, k=8)
|
||||
weight = np.square(1 / score)
|
||||
weight /= weight.sum(axis=1, keepdims=True)
|
||||
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
|
||||
|
||||
# recover silient font
|
||||
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:]
|
||||
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
|
||||
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||
if protect < 0.5 and search_index:
|
||||
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||
|
||||
# ピッチサイズ調整
|
||||
p_len = audio_pad.shape[0] // self.window
|
||||
if feats.shape[1] < p_len:
|
||||
p_len = feats.shape[1]
|
||||
if pitch is not None and pitchf is not None:
|
||||
pitch = pitch[:, :p_len]
|
||||
pitchf = pitchf[:, :p_len]
|
||||
|
||||
feats_len = feats.shape[1]
|
||||
if pitch is not None and pitchf is not None:
|
||||
pitch = pitch[:, -feats_len:]
|
||||
pitchf = pitchf[:, -feats_len:]
|
||||
p_len = torch.tensor([feats_len], device=self.device).long()
|
||||
# print("----------plen::1:", p_len)
|
||||
|
||||
# pitchの推定が上手くいかない(pitchf=0)場合、検索前の特徴を混ぜる
|
||||
# pitchffの作り方の疑問はあるが、本家通りなので、このまま使うことにする。
|
||||
# https://github.com/w-okada/voice-changer/pull/276#issuecomment-1571336929
|
||||
if protect < 0.5 and search_index:
|
||||
if protect < 0.5:
|
||||
pitchff = pitchf.clone()
|
||||
pitchff[pitchf > 0] = 1
|
||||
pitchff[pitchf < 1] = protect
|
||||
pitchff = pitchff.unsqueeze(-1)
|
||||
feats = feats * pitchff + feats0 * (1 - pitchff)
|
||||
feats = feats.to(feats0.dtype)
|
||||
p_len = torch.tensor([p_len], device=self.device).long()
|
||||
# p_len = torch.tensor([p_len], device=self.device).long()
|
||||
|
||||
# apply silent front for inference
|
||||
if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
|
||||
npyOffset = math.floor(silence_front * 16000) // 360
|
||||
feats = feats[:, npyOffset * 2 :, :] # NOQA
|
||||
|
||||
feats_len = feats.shape[1]
|
||||
if pitch is not None and pitchf is not None:
|
||||
pitch = pitch[:, -feats_len:]
|
||||
pitchf = pitchf[:, -feats_len:]
|
||||
p_len = torch.tensor([feats_len], device=self.device).long()
|
||||
# # apply silent front for inference
|
||||
# if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
|
||||
# npyOffset = math.floor(silence_front * 16000) // 360 # 160x2 = 360
|
||||
# feats = feats[:, npyOffset * 2 :, :] # NOQA
|
||||
|
||||
# 推論実行
|
||||
try:
|
||||
@ -224,7 +193,16 @@ class Pipeline(object):
|
||||
with autocast(enabled=self.isHalf):
|
||||
audio1 = (
|
||||
torch.clip(
|
||||
self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)[0][0, 0].to(dtype=torch.float32),
|
||||
self.inferencer.infer(
|
||||
feats,
|
||||
pitch.unsqueeze(-1),
|
||||
volume,
|
||||
mask,
|
||||
sid,
|
||||
infer_speedup=10,
|
||||
k_step=20,
|
||||
silence_front=silence_front
|
||||
).to(dtype=torch.float32),
|
||||
-1.0,
|
||||
1.0,
|
||||
)
|
||||
@ -243,16 +221,7 @@ class Pipeline(object):
|
||||
else:
|
||||
pitchf_buffer = None
|
||||
|
||||
del p_len, padding_mask, pitch, pitchf, feats
|
||||
del pitch, pitchf, feats, sid
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
# inferで出力されるサンプリングレートはモデルのサンプリングレートになる。
|
||||
# pipelineに(入力されるときはhubertように16k)
|
||||
if self.t_pad_tgt != 0:
|
||||
offset = self.t_pad_tgt
|
||||
end = -1 * self.t_pad_tgt
|
||||
audio1 = audio1[offset:end]
|
||||
|
||||
del sid
|
||||
torch.cuda.empty_cache()
|
||||
return audio1, pitchf_buffer, feats_buffer
|
||||
|
@ -1,51 +1,48 @@
|
||||
import os
|
||||
import traceback
|
||||
import faiss
|
||||
from data.ModelSlot import DiffusionSVCModelSlot, RVCModelSlot
|
||||
from data.ModelSlot import DiffusionSVCModelSlot
|
||||
from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
|
||||
from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
|
||||
|
||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
|
||||
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
|
||||
from voice_changer.RVC.pipeline.Pipeline import Pipeline
|
||||
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
|
||||
|
||||
|
||||
def createPipeline(modelSlot: DiffusionSVCModelSlot, gpu: int, f0Detector: str):
    """Build the Diffusion-SVC inference pipeline for one model slot.

    Args:
        modelSlot: slot whose ``modelType``, ``modelFile``, ``embedder`` and
            ``samplingRate`` fields select and configure the components.
        gpu: device index handed to ``DeviceManager``, ``InferencerManager``
            and ``PitchExtractorManager``.
        f0Detector: name of the pitch extractor to instantiate.

    Returns:
        A ``Pipeline`` wired with the embedder, inferencer and pitch extractor.

    Raises:
        Exception: whatever the inferencer or embedder loader raised. The
            error is logged and re-raised; previously it was swallowed and
            surfaced later as an ``UnboundLocalError`` when the missing
            component was passed to ``Pipeline``.
    """
    dev = DeviceManager.get_instance().getDevice(gpu)
    # Half precision is forced off here; the availability query that used to
    # feed this value was dead code because it was overwritten immediately.
    half = False

    # Create the inferencer.
    try:
        inferencer = InferencerManager.getInferencer(modelSlot.modelType, modelSlot.modelFile, gpu)
    except Exception as e:
        print("[Voice Changer] exception! loading inferencer", e)
        traceback.print_exc()
        raise

    # Create the embedder.
    try:
        embedder = EmbedderManager.getEmbedder(
            modelSlot.embedder,
            half,
            dev,
        )
    except Exception as e:
        print("[Voice Changer] exception! loading embedder", e)
        traceback.print_exc()
        raise

    # Create the pitch extractor.
    pitchExtractor = PitchExtractorManager.getPitchExtractor(f0Detector, gpu)

    pipeline = Pipeline(
        embedder,
        inferencer,
        pitchExtractor,
        modelSlot.samplingRate,
        dev,
        half,
    )

    return pipeline
|
||||
|
||||
|
@ -11,6 +11,7 @@ class ModelSlotManager:
|
||||
def __init__(self, model_dir: str):
    """Load every model slot found under ``model_dir``.

    Args:
        model_dir: directory passed to ``loadAllSlotInfo``.
    """
    self.model_dir = model_dir
    slots = loadAllSlotInfo(model_dir)
    self.modelSlots = slots
    print("MODEL SLOT INFO-------------->>>>>", slots)
|
||||
|
||||
@classmethod
|
||||
def get_instance(cls, model_dir: str):
|
||||
|
41
server/voice_changer/common/VolumeExtractor.py
Normal file
41
server/voice_changer/common/VolumeExtractor.py
Normal file
@ -0,0 +1,41 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class VolumeExtractor:
    """Frame-wise RMS volume extraction and silence masking for an audio buffer."""

    def __init__(self, hop_size: float, block_size: int, model_sampling_rate: int, audio_sampling_rate: int):
        """Store the framing parameters.

        Args:
            hop_size: frame length in input-audio samples; may be fractional.
                NOTE(review): callers appear to compute it as
                ``block_size * audio_sampling_rate / model_sampling_rate``;
                confirm before deriving it here.
            block_size: upsampling factor applied to the mask in
                ``get_mask_from_volume``.
            model_sampling_rate: stored for reference; not used by this class.
            audio_sampling_rate: stored for reference; not used by this class.
        """
        self.hop_size = hop_size
        self.block_size = block_size
        self.model_sampling_rate = model_sampling_rate
        self.audio_sampling_rate = audio_sampling_rate

    def extract(self, audio) -> np.ndarray:
        """Return the RMS volume of each hop-sized frame of ``audio``.

        Args:
            audio: mono audio as a ``torch.Tensor`` (any device) or a NumPy
                array. Singleton dimensions are squeezed away.

        Returns:
            1-D array with ``len(audio) // hop_size + 1`` RMS values.
        """
        # Convert explicitly instead of relying on np.pad to coerce a tensor,
        # which fails for tensors that require grad and rejects ndarray input.
        if isinstance(audio, torch.Tensor):
            audio = audio.detach().cpu().numpy()
        # atleast_1d keeps a single-sample buffer usable after squeeze().
        samples = np.atleast_1d(np.squeeze(np.asarray(audio)))

        n_frames = int(len(samples) // self.hop_size) + 1
        power = samples ** 2
        # Half a hop of reflected padding on each side so frames are centred
        # and the final partial frame has data.
        power = np.pad(power, (int(self.hop_size // 2), int((self.hop_size + 1) // 2)), mode='reflect')
        # hop_size may be fractional, so frame edges are rounded per frame.
        volume = np.array(
            [np.mean(power[int(n * self.hop_size): int((n + 1) * self.hop_size)]) for n in range(n_frames)])
        return np.sqrt(volume)

    def get_mask_from_volume(self, volume, threhold=-60.0, device='cpu') -> torch.Tensor:
        """Build a sample-rate mask marking frames louder than a threshold.

        Args:
            volume: 1-D NumPy array of per-frame volumes, as returned by
                ``extract``.
            threhold: threshold in dB, converted to linear amplitude with
                ``10 ** (dB / 20)``. The misspelt name is kept because callers
                pass it by keyword.
            device: torch device the mask is moved to.

        Returns:
            Float tensor of shape ``(1, len(volume) * block_size)``.
        """
        mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
        # Edge-pad by 4, then take a 9-frame running max: an audible frame
        # also opens the mask for its four neighbours on each side.
        mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
        mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
        mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
        mask = upsample(mask, self.block_size).squeeze(-1)
        return mask
|
||||
|
||||
|
||||
def upsample(signal: torch.Tensor, factor: int) -> torch.Tensor:
    """Stretch a (batch, frames, channels) tensor by ``factor`` along frames.

    Uses linear interpolation with ``align_corners=True``. The final frame is
    repeated once before interpolating and the surplus output sample is
    removed, giving exactly ``frames * factor`` output steps.

    Args:
        signal: 3-D tensor laid out as (batch, frames, channels).
        factor: integer upsampling factor.

    Returns:
        Tensor of shape (batch, frames * factor, channels).
    """
    channels_first = signal.permute(0, 2, 1)
    last_frame = channels_first[:, :, -1:]
    extended = torch.cat((channels_first, last_frame), 2)
    target_len = channels_first.shape[-1] * factor + 1
    stretched = nn.functional.interpolate(extended, size=target_len, mode='linear', align_corners=True)
    trimmed = stretched[:, :, :-1]
    return trimmed.permute(0, 2, 1)
|
Loading…
Reference in New Issue
Block a user