Experimental: remove torchaudio from rvc

w-okada 2023-07-28 00:41:19 +09:00
parent 3d2f5ad0da
commit cacd127c76
6 changed files with 53 additions and 33 deletions

View File

@@ -113,18 +113,18 @@ class DiffusionSVCInferencer(Inferencer):
infer_speedup: int,
silence_front: float,
) -> torch.Tensor:
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
# gt_spec = self.vocoder.extract(audio_t, 16000)
# gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
# print("[ ----Timer::1: ]", t.secs)
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
# print("[ ----Timer::2: ]", t.secs)
with Timer("pre-process") as t: # NOQA
with Timer("pre-process", False) as t: # NOQA
if self.vocoder_onnx is None:
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
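The start_frame computation above converts the leading-silence duration from seconds into vocoder mel frames: seconds times the sample rate gives samples, and dividing by the hop size gives frames. A minimal sketch of that conversion, using assumed example values for the vocoder's sample rate and hop size:

# Hypothetical values for illustration; the real ones come from self.vocoder.
vocoder_sample_rate = 44100  # samples per second (assumed)
vocoder_hop_size = 512       # samples per mel frame (assumed)
silence_front = 0.1          # seconds of leading silence to skip

# seconds -> samples -> frames; int() truncates to a whole frame index
start_frame = int(silence_front * vocoder_sample_rate / vocoder_hop_size)
print(start_frame)  # 0.1 * 44100 / 512 = 8.61... -> 8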

View File

@@ -81,23 +81,6 @@ class Pipeline(object):
@torch.no_grad()
def extract_volume_and_mask(self, audio: torch.Tensor, threshold: float):
'''
with Timer("[VolumeExt np]") as t:
for i in range(100):
volume = self.volumeExtractor.extract(audio)
time_np = t.secs
with Timer("[VolumeExt pt]") as t:
for i in range(100):
volume_t = self.volumeExtractor.extract_t(audio)
time_pt = t.secs
print("[Volume np]:", volume)
print("[Volume pt]:", volume_t)
print("[Perform]:", time_np, time_pt)
# -> [Perform]: 0.030178070068359375 0.005780220031738281 (RTX4090)
# -> [Perform]: 0.029046058654785156 0.0025115013122558594 (CPU i9 13900KF)
        # ---> for processing on this scale, Torch on the CPU is faster
'''
volume_t = self.volumeExtractor.extract_t(audio)
mask = self.volumeExtractor.get_mask_from_volume_t(volume_t, self.inferencer_block_size, threshold=threshold)
volume = volume_t.unsqueeze(-1).unsqueeze(0)
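The deleted docstring preserved a micro-benchmark: 100 volume extractions took about 0.030 s on the NumPy path versus 0.006 s on the Torch path on an RTX 4090 host (0.029 s vs 0.0025 s on a CPU i9 13900KF), which is why only extract_t survives. A minimal sketch of that kind of comparison, with a stand-in extractor since the project's volumeExtractor is not shown here:

import time
import numpy as np
import torch

class DummyVolumeExtractor:
    # stand-in for the project's volumeExtractor (hypothetical)
    def extract(self, audio: torch.Tensor) -> np.ndarray:
        a = audio.numpy()  # NumPy path: round-trip through ndarray
        return np.sqrt(np.mean(a ** 2, axis=-1))

    def extract_t(self, audio: torch.Tensor) -> torch.Tensor:
        return torch.sqrt(torch.mean(audio ** 2, dim=-1))  # stays in Torch

def bench(fn, arg, n=100):
    # total wall-clock seconds for n repeated calls
    start = time.time()
    for _ in range(n):
        fn(arg)
    return time.time() - start

audio = torch.zeros(1, 16000)  # one second of dummy audio at 16 kHz
ext = DummyVolumeExtractor()
print("np:", bench(ext.extract, audio), "pt:", bench(ext.extract_t, audio))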
@@ -119,7 +102,7 @@ class Pipeline(object):
protect=0.5
):
# print("---------- pipe line --------------------")
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
audio16k = self.resamplerIn(audio_t)
volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
@@ -127,7 +110,7 @@ class Pipeline(object):
n_frames = int(audio16k.size(-1) // self.hop_size + 1)
# print("[Timer::1: ]", t.secs)
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
            # pitch detection
try:
# pitch = self.pitchExtractor.extract(
@@ -157,7 +140,7 @@ class Pipeline(object):
feats = feats.view(1, -1)
# print("[Timer::2: ]", t.secs)
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
# embedding
with autocast(enabled=self.isHalf):
@@ -175,7 +158,7 @@ class Pipeline(object):
feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
# print("[Timer::3: ]", t.secs)
with Timer("pre-process") as t:
with Timer("pre-process", False) as t:
            # run inference
try:
with torch.no_grad():
@@ -206,7 +189,7 @@ class Pipeline(object):
raise e
# print("[Timer::4: ]", t.secs)
with Timer("pre-process") as t: # NOQA
with Timer("pre-process", False) as t: # NOQA
feats_buffer = feats.squeeze(0).detach().cpu()
if pitch is not None:
pitch_buffer = pitch.squeeze(0).detach().cpu()

View File

@@ -122,7 +122,7 @@ class RVCr2(VoiceChangerModel):
        if convertSize % 160 != 0:  # truncation occurs at the model's output hop size, so pad to compensate.
convertSize = convertSize + (160 - (convertSize % 160))
outSize = convertSize - extra_frame
outSize = int(((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate)
        # if the buffer has not filled up yet, pad with zeros
if self.audio_buffer.shape[0] < convertSize:
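The rewritten outSize line changes units: convertSize - extra_frame is a sample count in the 16000 Hz processing domain, and scaling by samplingRate / 16000 converts it to the model's output rate up front, replacing the later integer-division conversion this commit also removes. A small worked sketch with an assumed 48000 Hz model rate:

samplingRate = 48000   # model output rate (assumed example)
convertSize = 48160    # samples at 16 kHz, including the extra frame
extra_frame = 32000

# pad convertSize up to a multiple of the 160-sample hop, as in the diff
if convertSize % 160 != 0:
    convertSize = convertSize + (160 - (convertSize % 160))

out_16k = convertSize - extra_frame  # old behavior: 16160 samples at 16 kHz
outSize = int(((convertSize - extra_frame) / 16000) * samplingRate)
print(out_16k, outSize)  # 16160, 48480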
@@ -193,6 +193,7 @@ class RVCr2(VoiceChangerModel):
embOutputLayer = self.slotInfo.embOutputLayer
useFinalProj = self.slotInfo.useFinalProj
try:
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
sid,
@@ -202,14 +203,16 @@ class RVCr2(VoiceChangerModel):
f0_up_key,
index_rate,
if_f0,
                self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0.,  # seconds of extra data; computed at the RVC model's sampling rate (★1).
# 0,
                self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.,  # seconds of extra data; computed at the input sampling rate
embOutputLayer,
useFinalProj,
repeat,
protect
protect,
outSize
)
outSize = outSize // 16000 * self.slotInfo.samplingRate
result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
# result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
result = cast(
AudioInOut,

View File

@@ -90,7 +90,10 @@ class Pipeline(object):
protect=0.5,
out_size=None,
):
with Timer("main-process") as t:
# print(f"pipeline exec input, audio:{audio.shape}, pitchf:{pitchf.shape}, feature:{feature.shape}")
# print(f"pipeline exec input, silence_front:{silence_front}, out_size:{out_size}")
with Timer("main-process", False) as t: # NOQA
            # Input arrives at a 16000 Hz sampling rate; everything from here on is processed at 16000 Hz.
search_index = self.index is not None and self.big_npy is not None and index_rate != 0
            # self.t_pad = self.sr * repeat  # 1 second
@@ -241,6 +244,7 @@ class Pipeline(object):
raise e
feats_buffer = feats.squeeze(0).detach().cpu()
if pitchf is not None:
pitchf_buffer = pitchf.squeeze(0).detach().cpu()
else:
@@ -258,6 +262,7 @@ class Pipeline(object):
del sid
# torch.cuda.empty_cache()
# print("EXEC AVERAGE:", t.avrSecs)
return audio1, pitchf_buffer, feats_buffer
def __del__(self):

View File

@@ -208,12 +208,13 @@ class VoiceChangerV2(VoiceChangerIF):
block_frame = receivedData.shape[0]
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
self._generate_strength(crossfade_frame)
# data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
audio = self.voiceChanger.inference(
receivedData,
crossfade_frame=crossfade_frame,
sola_search_frame=sola_search_frame
)
if hasattr(self, "sola_buffer") is True:
np.set_printoptions(threshold=10000)
audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
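The audio_offset line selects the tail of the converted signal that SOLA needs: the newest block plus the crossfade and search regions in front of it, addressed with a negative index. A minimal slicing sketch with assumed frame sizes:

import numpy as np

block_frame = 4096       # samples in the newest block (assumed)
crossfade_frame = 1024   # crossfade overlap (assumed)
sola_search_frame = 256  # SOLA search window (assumed)

audio = np.arange(16000, dtype=np.float32)  # dummy converted audio

audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
tail = audio[audio_offset:]  # last search + crossfade + block samples
print(tail.shape)  # (5376,)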

View File

@@ -1,15 +1,43 @@
import time
import inspect
class Timer(object):
def __init__(self, title: str):
storedSecs = {} # Class variable
    def __init__(self, title: str, enable: bool = True):
        self.title = title
        self.enable = enable
self.secs = 0
self.msecs = 0
self.avrSecs = 0
if self.enable is False:
return
self.maxStores = 10
current_frame = inspect.currentframe()
caller_frame = inspect.getouterframes(current_frame, 2)
frame = caller_frame[1]
filename = frame.filename
line_number = frame.lineno
self.key = f"{title}_{filename}_{line_number}"
if self.key not in self.storedSecs:
self.storedSecs[self.key] = []
def __enter__(self):
        if self.enable is False:
            return self  # return self even when disabled so "with ... as t" still binds a Timer
self.start = time.time()
return self
def __exit__(self, *_):
if self.enable is False:
return
self.end = time.time()
self.secs = self.end - self.start
self.msecs = self.secs * 1000 # millisecs
self.storedSecs[self.key].append(self.secs)
self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores:]
self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
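The reworked Timer keys a rolling history by call site (title plus the caller's file and line, recovered via inspect), keeps the last maxStores samples per key in the class-level storedSecs dict, and exposes their mean as avrSecs; passing enable=False turns the whole thing into a no-op. A minimal usage sketch, assuming the Timer class above is in scope:

import time

# each pass through the same "with" line appends one sample to the same key
for _ in range(3):
    with Timer("demo") as t:
        time.sleep(0.01)  # stand-in for real work
    print(f"last={t.secs:.4f}s avg={t.avrSecs:.4f}s")

# a disabled timer skips all bookkeeping
with Timer("demo", False):
    time.sleep(0.01)  # not measured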