mirror of https://github.com/w-okada/voice-changer.git (synced 2025-03-13 19:34:02 +03:00)
Experimental: remove torchaudio from rvc
This commit is contained in:
parent 3d2f5ad0da · commit cacd127c76
@@ -113,18 +113,18 @@ class DiffusionSVCInferencer(Inferencer):
         infer_speedup: int,
         silence_front: float,
     ) -> torch.Tensor:
-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
             # gt_spec = self.vocoder.extract(audio_t, 16000)
             # gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
 
         # print("[ ----Timer::1: ]", t.secs)
 
-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
 
         # print("[ ----Timer::2: ]", t.secs)
-        with Timer("pre-process") as t:  # NOQA
+        with Timer("pre-process", False) as t:  # NOQA
             if self.vocoder_onnx is None:
                 start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
                 out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
@@ -81,23 +81,6 @@ class Pipeline(object):
 
     @torch.no_grad()
     def extract_volume_and_mask(self, audio: torch.Tensor, threshold: float):
-        '''
-        with Timer("[VolumeExt np]") as t:
-            for i in range(100):
-                volume = self.volumeExtractor.extract(audio)
-        time_np = t.secs
-        with Timer("[VolumeExt pt]") as t:
-            for i in range(100):
-                volume_t = self.volumeExtractor.extract_t(audio)
-        time_pt = t.secs
-
-        print("[Volume np]:", volume)
-        print("[Volume pt]:", volume_t)
-        print("[Perform]:", time_np, time_pt)
-        # -> [Perform]: 0.030178070068359375 0.005780220031738281 (RTX4090)
-        # -> [Perform]: 0.029046058654785156 0.0025115013122558594 (CPU i9 13900KF)
-        # ---> For a computation this small, Torch on the CPU may be faster?
-        '''
         volume_t = self.volumeExtractor.extract_t(audio)
         mask = self.volumeExtractor.get_mask_from_volume_t(volume_t, self.inferencer_block_size, threshold=threshold)
         volume = volume_t.unsqueeze(-1).unsqueeze(0)
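The docstring deleted above preserves a micro-benchmark: over 100 iterations, the torch volume extractor (extract_t) beat the numpy one both on GPU (RTX4090) and on CPU (i9 13900KF), which is why only the torch path remains. A standalone sketch of that style of comparison, assuming a generic per-frame RMS volume (the repo's VolumeExtractor internals are not shown in this diff):

import time

import numpy as np
import torch

audio_np = np.random.randn(16000).astype(np.float32)  # 1 s of 16 kHz audio (illustrative)
audio_pt = torch.from_numpy(audio_np)

start = time.time()
for _ in range(100):
    vol_np = np.sqrt((audio_np.reshape(-1, 160) ** 2).mean(axis=1))  # numpy RMS per 160-sample hop
time_np = time.time() - start

start = time.time()
for _ in range(100):
    vol_pt = audio_pt.reshape(-1, 160).pow(2).mean(dim=1).sqrt()  # same RMS in torch, on CPU
time_pt = time.time() - start

print("[Perform]:", time_np, time_pt)  # mirrors the output format of the deleted benchmark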
@@ -119,7 +102,7 @@ class Pipeline(object):
         protect=0.5
     ):
         # print("---------- pipe line --------------------")
-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
             audio16k = self.resamplerIn(audio_t)
             volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
@@ -127,7 +110,7 @@ class Pipeline(object):
             n_frames = int(audio16k.size(-1) // self.hop_size + 1)
         # print("[Timer::1: ]", t.secs)
 
-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             # pitch detection
             try:
                 # pitch = self.pitchExtractor.extract(
@@ -157,7 +140,7 @@ class Pipeline(object):
             feats = feats.view(1, -1)
         # print("[Timer::2: ]", t.secs)
 
-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
 
             # embedding
             with autocast(enabled=self.isHalf):
@@ -175,7 +158,7 @@ class Pipeline(object):
             feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
         # print("[Timer::3: ]", t.secs)
 
-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             # run inference
             try:
                 with torch.no_grad():
@@ -206,7 +189,7 @@ class Pipeline(object):
                     raise e
         # print("[Timer::4: ]", t.secs)
 
-        with Timer("pre-process") as t:  # NOQA
+        with Timer("pre-process", False) as t:  # NOQA
             feats_buffer = feats.squeeze(0).detach().cpu()
             if pitch is not None:
                 pitch_buffer = pitch.squeeze(0).detach().cpu()
@@ -122,7 +122,7 @@ class RVCr2(VoiceChangerModel):
 
         if convertSize % 160 != 0:  # compensate for the truncation caused by the model's output hop size
             convertSize = convertSize + (160 - (convertSize % 160))
-        outSize = convertSize - extra_frame
+        outSize = int(((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate)
 
         # pad with zeros when the buffer has not accumulated enough samples
         if self.audio_buffer.shape[0] < convertSize:
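The new line computes outSize at the model's output sampling rate up front, instead of deriving it from the 16 kHz sample count after inference (the old rescaling line is removed in a later hunk). A worked example with hypothetical values:

convertSize = 8160    # input samples at 16 kHz, already padded to a multiple of 160
extra_frame = 4800    # leading context samples that are trimmed from the output
samplingRate = 40000  # hypothetical slotInfo.samplingRate of the RVC model
outSize = int(((convertSize - extra_frame) / 16000) * samplingRate)
print(outSize)        # 8400 samples: (3360 / 16000) s * 40000 Hz = 0.21 s of output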
@@ -193,6 +193,7 @@ class RVCr2(VoiceChangerModel):
         embOutputLayer = self.slotInfo.embOutputLayer
         useFinalProj = self.slotInfo.useFinalProj
 
+
         try:
             audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
                 sid,
@@ -202,14 +203,16 @@ class RVCr2(VoiceChangerModel):
                 f0_up_key,
                 index_rate,
                 if_f0,
-                self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # extraDataSize in seconds, handled at the RVC model's sampling rate (★1)
+                # 0,
+                self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.,  # extraDataSize in seconds, computed at the input sampling rate
                 embOutputLayer,
                 useFinalProj,
                 repeat,
-                protect
+                protect,
+                outSize
             )
-            outSize = outSize // 16000 * self.slotInfo.samplingRate
-            result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
+            # result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
+            result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
 
             result = cast(
                 AudioInOut,
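The silence_front argument handed to pipeline.exec now divides extraConvertSize by self.inputSampleRate rather than self.slotInfo.samplingRate, matching its new comment ("computed at the input sampling rate"); the two divisors only agree when the input and model rates coincide. A worked comparison with hypothetical values:

extraConvertSize = 4096  # extra context, counted in samples at the input rate
inputSampleRate = 48000  # hypothetical device input rate
modelSampleRate = 40000  # hypothetical slotInfo.samplingRate
print(extraConvertSize / modelSampleRate)  # 0.1024 s   (old divisor, mismatched rate)
print(extraConvertSize / inputSampleRate)  # ~0.08533 s (new divisor, input rate)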
@@ -90,7 +90,10 @@ class Pipeline(object):
         protect=0.5,
+        out_size=None,
     ):
-        with Timer("main-process") as t:
+        # print(f"pipeline exec input, audio:{audio.shape}, pitchf:{pitchf.shape}, feature:{feature.shape}")
+        # print(f"pipeline exec input, silence_front:{silence_front}, out_size:{out_size}")
 
+        with Timer("main-process", False) as t:  # NOQA
             # input arrives at a 16000 Hz sampling rate; from here on everything is processed at 16000
             search_index = self.index is not None and self.big_npy is not None and index_rate != 0
             # self.t_pad = self.sr * repeat  # 1 second
@@ -241,6 +244,7 @@ class Pipeline(object):
                     raise e
 
+            feats_buffer = feats.squeeze(0).detach().cpu()
 
             if pitchf is not None:
                 pitchf_buffer = pitchf.squeeze(0).detach().cpu()
             else:
@@ -258,6 +262,7 @@ class Pipeline(object):
 
             del sid
         # torch.cuda.empty_cache()
+        # print("EXEC AVERAGE:", t.avrSecs)
         return audio1, pitchf_buffer, feats_buffer
 
     def __del__(self):
@@ -208,12 +208,13 @@ class VoiceChangerV2(VoiceChangerIF):
         block_frame = receivedData.shape[0]
         crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
         self._generate_strength(crossfade_frame)
         # data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
 
         audio = self.voiceChanger.inference(
             receivedData,
             crossfade_frame=crossfade_frame,
             sola_search_frame=sola_search_frame
         )
 
         if hasattr(self, "sola_buffer") is True:
             np.set_printoptions(threshold=10000)
             audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
@@ -1,15 +1,43 @@
 import time
+import inspect
 
+
 class Timer(object):
-    def __init__(self, title: str):
+    storedSecs = {}  # Class variable
+
+    def __init__(self, title: str, enable: bool = True):
         self.title = title
+        self.enable = enable
         self.secs = 0
         self.msecs = 0
+        self.avrSecs = 0
+
+        if self.enable is False:
+            return
+
+        self.maxStores = 10
+
+        current_frame = inspect.currentframe()
+        caller_frame = inspect.getouterframes(current_frame, 2)
+        frame = caller_frame[1]
+        filename = frame.filename
+        line_number = frame.lineno
+        self.key = f"{title}_{filename}_{line_number}"
+        if self.key not in self.storedSecs:
+            self.storedSecs[self.key] = []
 
     def __enter__(self):
+        if self.enable is False:
+            return
         self.start = time.time()
         return self
 
     def __exit__(self, *_):
+        if self.enable is False:
+            return
         self.end = time.time()
         self.secs = self.end - self.start
         self.msecs = self.secs * 1000  # millisecs
+        self.storedSecs[self.key].append(self.secs)
+        self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores:]
+        self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
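A minimal usage sketch of the reworked Timer (the import path is assumed; this diff does not show the module's file name). storedSecs is shared class-wide, each call site gets its own key built from the title plus the caller's file and line, and avrSecs averages up to the last maxStores = 10 runs at that site. Note that with enable=False, __enter__ returns None, which is why the disabled call sites above carry # NOQA markers:

from Timer import Timer  # hypothetical import; adjust to the module's real location


def work():
    sum(range(100_000))  # stand-in workload


with Timer("pre-process") as t:
    work()
print(t.secs, t.msecs, t.avrSecs)  # filled in on exit; avrSecs averages recent runs here

with Timer("pre-process", False):  # timing disabled: no bookkeeping, near-zero overhead
    work()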