Experimental: remove torchaudio from rvc

w-okada 2023-07-28 00:41:19 +09:00
parent 3d2f5ad0da
commit cacd127c76
6 changed files with 53 additions and 33 deletions

View File

@@ -113,18 +113,18 @@ class DiffusionSVCInferencer(Inferencer):
infer_speedup: int,
silence_front: float,
) -> torch.Tensor:
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
# gt_spec = self.vocoder.extract(audio_t, 16000)
# gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
# print("[ ----Timer::1: ]", t.secs)
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
# print("[ ----Timer::2: ]", t.secs)
with Timer("pre-process") as t: # NOQA
with Timer("pre-process", False) as t: # NOQA
if self.vocoder_onnx is None:
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
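The start_frame computation above converts the leading-silence duration from seconds into vocoder mel frames: seconds times the sample rate gives samples, and dividing by the hop size gives frames. A minimal sketch of that conversion, using assumed example values for the vocoder's sample rate and hop size:

# Hypothetical values for illustration; the real ones come from self.vocoder.
vocoder_sample_rate = 44100  # samples per second (assumed)
vocoder_hop_size = 512       # samples per mel frame (assumed)
silence_front = 0.1          # seconds of leading silence to skip

# seconds -> samples -> frames; int() truncates to a whole frame index
start_frame = int(silence_front * vocoder_sample_rate / vocoder_hop_size)
print(start_frame)  # 0.1 * 44100 / 512 = 8.61... -> 8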

View File

@@ -81,23 +81,6 @@ class Pipeline(object):
@torch.no_grad()
def extract_volume_and_mask(self, audio: torch.Tensor, threshold: float):
'''
with Timer("[VolumeExt np]") as t:
for i in range(100):
volume = self.volumeExtractor.extract(audio)
time_np = t.secs
with Timer("[VolumeExt pt]") as t:
for i in range(100):
volume_t = self.volumeExtractor.extract_t(audio)
time_pt = t.secs
print("[Volume np]:", volume)
print("[Volume pt]:", volume_t)
print("[Perform]:", time_np, time_pt)
# -> [Perform]: 0.030178070068359375 0.005780220031738281 (RTX4090)
# -> [Perform]: 0.029046058654785156 0.0025115013122558594 (CPU i9 13900KF)
        # ---> for processing on this scale, Torch on the CPU is faster
'''
volume_t = self.volumeExtractor.extract_t(audio)
mask = self.volumeExtractor.get_mask_from_volume_t(volume_t, self.inferencer_block_size, threshold=threshold)
volume = volume_t.unsqueeze(-1).unsqueeze(0)
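The deleted docstring preserved a micro-benchmark: 100 volume extractions took about 0.030 s on the NumPy path versus 0.006 s on the Torch path on an RTX 4090 host (0.029 s vs 0.0025 s on a CPU i9 13900KF), which is why only extract_t survives. A minimal sketch of that kind of comparison, with a stand-in extractor since the project's volumeExtractor is not shown here:

import time
import numpy as np
import torch

class DummyVolumeExtractor:
    # stand-in for the project's volumeExtractor (hypothetical)
    def extract(self, audio: torch.Tensor) -> np.ndarray:
        a = audio.numpy()  # NumPy path: round-trip through ndarray
        return np.sqrt(np.mean(a ** 2, axis=-1))

    def extract_t(self, audio: torch.Tensor) -> torch.Tensor:
        return torch.sqrt(torch.mean(audio ** 2, dim=-1))  # stays in Torch

def bench(fn, arg, n=100):
    # total wall-clock seconds for n repeated calls
    start = time.time()
    for _ in range(n):
        fn(arg)
    return time.time() - start

audio = torch.zeros(1, 16000)  # one second of dummy audio at 16 kHz
ext = DummyVolumeExtractor()
print("np:", bench(ext.extract, audio), "pt:", bench(ext.extract_t, audio))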
@@ -119,7 +102,7 @@ class Pipeline(object):
protect=0.5
):
# print("---------- pipe line --------------------")
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
audio16k = self.resamplerIn(audio_t)
volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
@@ -127,7 +110,7 @@ class Pipeline(object):
n_frames = int(audio16k.size(-1) // self.hop_size + 1)
# print("[Timer::1: ]", t.secs)
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
            # pitch detection
try:
# pitch = self.pitchExtractor.extract(
@@ -157,7 +140,7 @@ class Pipeline(object):
feats = feats.view(1, -1)
# print("[Timer::2: ]", t.secs)
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
# embedding
with autocast(enabled=self.isHalf):
@@ -175,7 +158,7 @@ class Pipeline(object):
feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
# print("[Timer::3: ]", t.secs)
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
            # run inference
try:
with torch.no_grad():
@@ -206,7 +189,7 @@ class Pipeline(object):
raise e
# print("[Timer::4: ]", t.secs)
with Timer("pre-process") as t: # NOQA
with Timer("pre-process", False) as t: # NOQA
feats_buffer = feats.squeeze(0).detach().cpu()
if pitch is not None:
pitch_buffer = pitch.squeeze(0).detach().cpu()

View File

@@ -122,7 +122,7 @@ class RVCr2(VoiceChangerModel):
        if convertSize % 160 != 0:  # truncation occurs at the model's output hop size, so pad to compensate.
convertSize = convertSize + (160 - (convertSize % 160))
outSize = convertSize - extra_frame
outSize = int(((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate)
        # if the buffer has not filled up yet, pad with zeros
if self.audio_buffer.shape[0] < convertSize:
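The rewritten outSize line changes units: convertSize - extra_frame is a sample count in the 16000 Hz processing domain, and scaling by samplingRate / 16000 converts it to the model's output rate up front, replacing the later integer-division conversion this commit also removes. A small worked sketch with an assumed 48000 Hz model rate:

samplingRate = 48000   # model output rate (assumed example)
convertSize = 48160    # samples at 16 kHz, including the extra frame
extra_frame = 32000

# pad convertSize up to a multiple of the 160-sample hop, as in the diff
if convertSize % 160 != 0:
    convertSize = convertSize + (160 - (convertSize % 160))

out_16k = convertSize - extra_frame  # old behavior: 16160 samples at 16 kHz
outSize = int(((convertSize - extra_frame) / 16000) * samplingRate)
print(out_16k, outSize)  # 16160, 48480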
@@ -193,6 +193,7 @@ class RVCr2(VoiceChangerModel):
embOutputLayer = self.slotInfo.embOutputLayer
useFinalProj = self.slotInfo.useFinalProj
try:
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
sid,
@@ -202,14 +203,16 @@ class RVCr2(VoiceChangerModel):
f0_up_key,
index_rate,
if_f0,
                self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # seconds of extra data; computed at the RVC model's sampling rate (★1).
# 0,
                self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.,  # seconds of extra data; computed at the input sampling rate
embOutputLayer,
useFinalProj,
repeat,
protect
protect,
outSize
)
outSize = outSize // 16000 * self.slotInfo.samplingRate
result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
# result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
result = cast(
AudioInOut,

View File

@@ -90,7 +90,10 @@ class Pipeline(object):
protect=0.5,
out_size=None,
):
with Timer("main-process") as t:
# print(f"pipeline exec input, audio:{audio.shape}, pitchf:{pitchf.shape}, feature:{feature.shape}")
# print(f"pipeline exec input, silence_front:{silence_front}, out_size:{out_size}")
with Timer("main-process", False) as t: # NOQA
            # Input arrives at a 16000 Hz sampling rate; everything from here on is processed at 16000 Hz.
search_index = self.index is not None and self.big_npy is not None and index_rate != 0
            # self.t_pad = self.sr * repeat  # 1 second
@@ -241,6 +244,7 @@ class Pipeline(object):
raise e
feats_buffer = feats.squeeze(0).detach().cpu()
if pitchf is not None:
pitchf_buffer = pitchf.squeeze(0).detach().cpu()
else:
@@ -258,6 +262,7 @@ class Pipeline(object):
del sid
# torch.cuda.empty_cache()
# print("EXEC AVERAGE:", t.avrSecs)
return audio1, pitchf_buffer, feats_buffer
def __del__(self):

View File

@@ -208,12 +208,13 @@ class VoiceChangerV2(VoiceChangerIF):
block_frame = receivedData.shape[0]
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
self._generate_strength(crossfade_frame)
# data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
audio = self.voiceChanger.inference(
receivedData,
crossfade_frame=crossfade_frame,
sola_search_frame=sola_search_frame
)
if hasattr(self, "sola_buffer") is True:
np.set_printoptions(threshold=10000)
audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
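The audio_offset line selects the tail of the converted signal that SOLA needs: the newest block plus the crossfade and search regions in front of it, addressed with a negative index. A minimal slicing sketch with assumed frame sizes:

import numpy as np

block_frame = 4096       # samples in the newest block (assumed)
crossfade_frame = 1024   # crossfade overlap (assumed)
sola_search_frame = 256  # SOLA search window (assumed)

audio = np.arange(16000, dtype=np.float32)  # dummy converted audio

audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
tail = audio[audio_offset:]  # last search + crossfade + block samples
print(tail.shape)  # (5376,)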

View File

@@ -1,15 +1,43 @@
import time
import inspect
class Timer(object):
def __init__(self, title: str):
storedSecs = {} # Class variable
    def __init__(self, title: str, enable: bool = True):
        self.title = title
        self.enable = enable
self.secs = 0
self.msecs = 0
self.avrSecs = 0
if self.enable is False:
return
self.maxStores = 10
current_frame = inspect.currentframe()
caller_frame = inspect.getouterframes(current_frame, 2)
frame = caller_frame[1]
filename = frame.filename
line_number = frame.lineno
self.key = f"{title}_{filename}_{line_number}"
if self.key not in self.storedSecs:
self.storedSecs[self.key] = []
def __enter__(self):
        if self.enable is False:
            return self  # return self even when disabled so "with ... as t" still binds a Timer
self.start = time.time()
return self
def __exit__(self, *_):
if self.enable is False:
return
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs
self.storedSecs[self.key].append(self.secs)
self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores:]
self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
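The reworked Timer keys a rolling history by call site (title plus the caller's file and line, recovered via inspect), keeps the last maxStores samples per key in the class-level storedSecs dict, and exposes their mean as avrSecs; passing enable=False turns the whole thing into a no-op. A minimal usage sketch, assuming the Timer class above is in scope:

import time

# each pass through the same "with" line appends one sample to the same key
for _ in range(3):
    with Timer("demo") as t:
        time.sleep(0.01)  # stand-in for real work
    print(f"last={t.secs:.4f}s avg={t.avrSecs:.4f}s")

# a disabled timer skips all bookkeeping
with Timer("demo", False):
    time.sleep(0.01)  # not measured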