diff --git a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
index 557db7f2..eb80fae7 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
@@ -113,18 +113,18 @@ class DiffusionSVCInferencer(Inferencer):
         infer_speedup: int,
         silence_front: float,
     ) -> torch.Tensor:
-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
             # gt_spec = self.vocoder.extract(audio_t, 16000)
             # gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
         # print("[ ----Timer::1: ]", t.secs)

-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
         # print("[ ----Timer::2: ]", t.secs)

-        with Timer("pre-process") as t:  # NOQA
+        with Timer("pre-process", False) as t:  # NOQA
             if self.vocoder_onnx is None:
                 start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
                 out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
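Note on the hunks above: every call site now passes `False` as `Timer`'s second positional argument, so the timing scaffolding stays in the code but is dormant at runtime. A minimal sketch of re-enabling one block for ad-hoc profiling; the `time.sleep` is a hypothetical stand-in for the model call, and `avrSecs` comes from the reworked `server/voice_changer/utils/Timer.py` at the end of this patch (assumes `server/` is on `sys.path`):

```python
import time

from voice_changer.utils.Timer import Timer

# Flip the second argument to True to measure just this block.
with Timer("pre-process", True) as t:
    time.sleep(0.01)  # hypothetical stand-in for naive_model_call / mel2wav
print(t.secs, t.avrSecs)  # last duration and rolling average of recent runs
```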
diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
index eb78d7fb..ed3ef8dc 100644
--- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
+++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
@@ -81,23 +81,6 @@ class Pipeline(object):

     @torch.no_grad()
     def extract_volume_and_mask(self, audio: torch.Tensor, threshold: float):
-        '''
-        with Timer("[VolumeExt np]") as t:
-            for i in range(100):
-                volume = self.volumeExtractor.extract(audio)
-            time_np = t.secs
-        with Timer("[VolumeExt pt]") as t:
-            for i in range(100):
-                volume_t = self.volumeExtractor.extract_t(audio)
-            time_pt = t.secs
-
-        print("[Volume np]:", volume)
-        print("[Volume pt]:", volume_t)
-        print("[Perform]:", time_np, time_pt)
-        # -> [Perform]: 0.030178070068359375 0.005780220031738281 (RTX4090)
-        # -> [Perform]: 0.029046058654785156 0.0025115013122558594 (CPU i9 13900KF)
-        # ---> For this amount of work, Torch on the CPU may actually be faster?
-        '''
         volume_t = self.volumeExtractor.extract_t(audio)
         mask = self.volumeExtractor.get_mask_from_volume_t(volume_t, self.inferencer_block_size, threshold=threshold)
         volume = volume_t.unsqueeze(-1).unsqueeze(0)
@@ -119,7 +102,7 @@ class Pipeline(object):
                 protect=0.5
         ):
         # print("---------- pipe line --------------------")
-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
             audio16k = self.resamplerIn(audio_t)
             volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
@@ -127,7 +110,7 @@ class Pipeline(object):
             n_frames = int(audio16k.size(-1) // self.hop_size + 1)
         # print("[Timer::1: ]", t.secs)

-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             # pitch detection
             try:
                 # pitch = self.pitchExtractor.extract(
@@ -157,7 +140,7 @@ class Pipeline(object):
             feats = feats.view(1, -1)
         # print("[Timer::2: ]", t.secs)

-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:

             # embedding
             with autocast(enabled=self.isHalf):
@@ -175,7 +158,7 @@ class Pipeline(object):
             feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
         # print("[Timer::3: ]", t.secs)

-        with Timer("pre-process") as t:
+        with Timer("pre-process", False) as t:
             # run inference
             try:
                 with torch.no_grad():
@@ -206,7 +189,7 @@ class Pipeline(object):
                     raise e
         # print("[Timer::4: ]", t.secs)

-        with Timer("pre-process") as t:  # NOQA
+        with Timer("pre-process", False) as t:  # NOQA
             feats_buffer = feats.squeeze(0).detach().cpu()
             if pitch is not None:
                 pitch_buffer = pitch.squeeze(0).detach().cpu()
diff --git a/server/voice_changer/RVC/RVCr2.py b/server/voice_changer/RVC/RVCr2.py
index 10e64335..419bd119 100644
--- a/server/voice_changer/RVC/RVCr2.py
+++ b/server/voice_changer/RVC/RVCr2.py
@@ -122,7 +122,7 @@ class RVCr2(VoiceChangerModel):
         if convertSize % 160 != 0:  # Compensate for truncation at the model's output hop size.
             convertSize = convertSize + (160 - (convertSize % 160))

-        outSize = convertSize - extra_frame
+        outSize = int(((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate)

         # If the buffer has not accumulated enough samples, pad with zeros
         if self.audio_buffer.shape[0] < convertSize:
@@ -193,6 +193,7 @@ class RVCr2(VoiceChangerModel):
         embOutputLayer = self.slotInfo.embOutputLayer
         useFinalProj = self.slotInfo.useFinalProj
+
         try:
             audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
                 sid,
@@ -202,14 +203,16 @@ class RVCr2(VoiceChangerModel):
                 f0_up_key,
                 index_rate,
                 if_f0,
-                self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # extra data size in seconds, computed at the RVC model's sampling rate (★1)
+                # 0,
+                self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.,  # extra data size in seconds, computed at the input sampling rate
                 embOutputLayer,
                 useFinalProj,
                 repeat,
-                protect
+                protect,
+                outSize
             )
-            outSize = outSize // 16000 * self.slotInfo.samplingRate
-            result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
+            # result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
+            result = audio_out.detach().cpu().numpy() * np.sqrt(vol)

             result = cast(
                 AudioInOut,
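Why the `outSize` change in `RVCr2.py` above matters: the old code kept `outSize` in 16 kHz samples and rescaled it after inference with `outSize // 16000 * samplingRate`, and that integer floor division collapses anything shorter than one second to zero, trimming the whole output away. Computing the target sample count up front and passing it into `pipeline.exec` avoids the truncation. A small illustrative check (the 40000 Hz model rate is an assumed example value):

```python
convert_size = 8000  # 0.5 s of audio at the 16 kHz processing rate
extra_frame = 0
model_sr = 40000     # assumed example value for slotInfo.samplingRate

old = (convert_size - extra_frame) // 16000 * model_sr        # 0: floor division wipes out the frame count
new = int(((convert_size - extra_frame) / 16000) * model_sr)  # 20000 samples, as intended
print(old, new)
```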
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index fac98260..f1f7cd0a 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -90,7 +90,10 @@ class Pipeline(object):
         protect=0.5,
         out_size=None,
     ):
-        with Timer("main-process") as t:
+        # print(f"pipeline exec input, audio:{audio.shape}, pitchf:{pitchf.shape}, feature:{feature.shape}")
+        # print(f"pipeline exec input, silence_front:{silence_front}, out_size:{out_size}")
+
+        with Timer("main-process", False) as t:  # NOQA
             # Input arrives at a 16000 Hz sampling rate; from here on everything runs at 16000.
             search_index = self.index is not None and self.big_npy is not None and index_rate != 0
             # self.t_pad = self.sr * repeat  # 1 second
@@ -241,6 +244,7 @@ class Pipeline(object):
                     raise e

             feats_buffer = feats.squeeze(0).detach().cpu()
+
             if pitchf is not None:
                 pitchf_buffer = pitchf.squeeze(0).detach().cpu()
             else:
@@ -258,6 +262,7 @@ class Pipeline(object):
             del sid
             # torch.cuda.empty_cache()

+        # print("EXEC AVERAGE:", t.avrSecs)
         return audio1, pitchf_buffer, feats_buffer

     def __del__(self):
diff --git a/server/voice_changer/VoiceChangerV2.py b/server/voice_changer/VoiceChangerV2.py
index ba151bbe..0623cb43 100644
--- a/server/voice_changer/VoiceChangerV2.py
+++ b/server/voice_changer/VoiceChangerV2.py
@@ -208,12 +208,13 @@ class VoiceChangerV2(VoiceChangerIF):
         block_frame = receivedData.shape[0]
         crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
         self._generate_strength(crossfade_frame)
-        # data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
+
         audio = self.voiceChanger.inference(
             receivedData,
             crossfade_frame=crossfade_frame,
             sola_search_frame=sola_search_frame
         )
+
         if hasattr(self, "sola_buffer") is True:
             np.set_printoptions(threshold=10000)
             audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
diff --git a/server/voice_changer/utils/Timer.py b/server/voice_changer/utils/Timer.py
index 43d7591f..d73169d4 100644
--- a/server/voice_changer/utils/Timer.py
+++ b/server/voice_changer/utils/Timer.py
@@ -1,15 +1,43 @@
 import time
+import inspect


 class Timer(object):
-    def __init__(self, title: str):
+    storedSecs = {}  # class-level store shared across instances
+
+    def __init__(self, title: str, enable: bool = True):
         self.title = title
+        self.enable = enable
+        self.secs = 0
+        self.msecs = 0
+        self.avrSecs = 0
+
+        if self.enable is False:
+            return
+
+        self.maxStores = 10
+
+        # Key the history on title plus caller file and line, so timers that
+        # share a title in different places keep separate statistics.
+        current_frame = inspect.currentframe()
+        caller_frame = inspect.getouterframes(current_frame, 2)
+        frame = caller_frame[1]
+        filename = frame.filename
+        line_number = frame.lineno
+        self.key = f"{title}_{filename}_{line_number}"
+        if self.key not in self.storedSecs:
+            self.storedSecs[self.key] = []

     def __enter__(self):
+        if self.enable is False:
+            return self
         self.start = time.time()
         return self

     def __exit__(self, *_):
+        if self.enable is False:
+            return
         self.end = time.time()
         self.secs = self.end - self.start
         self.msecs = self.secs * 1000  # millisecs
+        self.storedSecs[self.key].append(self.secs)
+        self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores:]
+        self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
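The reworked `Timer` keys its history on the title plus the caller's file and line (via `inspect`), so two blocks that share the title `"pre-process"` still accumulate separate rolling averages over the last `maxStores = 10` runs. A self-contained sketch of the same bookkeeping with illustrative names, for readers who want the idea without the class:

```python
import time
from collections import defaultdict, deque

# Each timing site keeps only its last 10 durations, mirroring storedSecs/maxStores.
history = defaultdict(lambda: deque(maxlen=10))

def record(site: str, secs: float) -> float:
    """Store one measurement and return the site's rolling average."""
    history[site].append(secs)
    return sum(history[site]) / len(history[site])

start = time.time()
time.sleep(0.01)  # stand-in for the timed section
print(record("pre-process", time.time() - start))
```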