optimize convert

wataru 2023-06-02 23:33:46 +09:00
parent bfeef443f5
commit 2e96f8072a
3 changed files with 63 additions and 7 deletions


@@ -280,8 +280,11 @@ class RVC:
crossfadeSize: int,
solaSearchFrame: int = 0,
):
newData = newData.astype(np.float32) / 32768.0
newData = (
newData.astype(np.float32) / 32768.0
) # The input arrives at the RVC model's sampling rate. extraDataLength, crossfade, etc. are also processed at the same SR (★1)
print("newData", newData.shape, crossfadeSize, solaSearchFrame)
if self.audio_buffer is not None:
# Concatenate with the previously buffered data
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
@@ -292,8 +295,10 @@ class RVC:
inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
)
print("convertSize1", convertSize)
if convertSize % 128 != 0: # Round up to the model's output hop size, which would otherwise truncate the tail.
convertSize = convertSize + (128 - (convertSize % 128))
print("convertSize2", convertSize)
convertOffset = -1 * convertSize
self.audio_buffer = self.audio_buffer[convertOffset:] # Extract only the portion to be converted
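For reference, a minimal sketch of the rounding done above, assuming the 128-sample output hop used in this hunk:

# Round convertSize up to the next multiple of the model's output hop size (128 here),
# so nothing is lost to truncation inside the model.
def round_up_to_hop(convert_size: int, hop: int = 128) -> int:
    if convert_size % hop != 0:
        convert_size += hop - (convert_size % hop)
    return convert_size

# e.g. round_up_to_hop(4410) == 4480 and round_up_to_hop(4480) == 4480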
@@ -314,6 +319,7 @@ class RVC:
vol = torch.sqrt(torch.square(crop).mean()).detach().cpu().numpy()
vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol
print("inf0 : ", audio_buffer.shape, convertSize)
return (audio_buffer, convertSize, vol)
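The vol value computed above is an RMS level that a later hunk compares against silentThreshold to skip conversion of silent blocks. A small sketch of that gate (names taken from the diff, the rest assumed):

import numpy as np

# RMS level of the cropped block; blocks below the silent threshold are not
# converted at all and the caller returns zeros instead.
def rms_volume(crop: np.ndarray) -> float:
    return float(np.sqrt(np.square(crop).mean()))

def is_silent(crop: np.ndarray, silent_threshold: float) -> bool:
    return rms_volume(crop) < silent_threshold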
@@ -341,6 +347,7 @@ class RVC:
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16)
print("inf1 : ", audio.shape)
audio = torchaudio.functional.resample(
audio, self.settings.modelSamplingRate, 16000, rolloff=0.99
)
@@ -360,7 +367,8 @@ class RVC:
f0_up_key,
index_rate,
if_f0,
self.settings.extraConvertSize / self.settings.modelSamplingRate,
self.settings.extraConvertSize
/ self.settings.modelSamplingRate, # Length of the extra data in seconds. Processed at the RVC model's sampling rate (★1).
embOutputLayer,
useFinalProj,
repeat,
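The changed argument above passes extraConvertSize converted to seconds, because this buffer is still at the model's sampling rate while the pipeline below runs at 16 kHz (★1). A minimal sketch of that conversion together with the 16 kHz resample from the previous hunk (values assumed):

import torch
import torchaudio

modelSamplingRate = 40000                    # assumed model SR
extraConvertSize = 8192                      # assumed extra context, in samples at the model SR
audio = torch.zeros(modelSamplingRate)       # dummy 1-second buffer at the model SR

# Express the extra context as a duration so the 16 kHz pipeline can reapply it (★1).
silence_front_sec = extraConvertSize / modelSamplingRate   # 8192 / 40000 = 0.2048 s

# The pipeline itself runs at 16 kHz, so the audio is resampled before inference.
audio16k = torchaudio.functional.resample(audio, modelSamplingRate, 16000, rolloff=0.99)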


@@ -1,6 +1,6 @@
import numpy as np
from typing import Any
import math
import torch
import torch.nn.functional as F
from Exceptions import (
@@ -90,7 +90,7 @@ class Pipeline(object):
)
self.t_pad = self.sr * repeat
self.t_pad_tgt = self.targetSR * repeat
print("Audio Feature1", audio.shape) # 16000のサンプリングレートで入ってきている。以降この世界は16000で処理。
audio_pad = F.pad(
audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect"
).squeeze(0)
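A small sketch of the reflect padding applied above; since t_pad = sr * repeat, repeat is effectively a pad length in seconds (value assumed):

import torch
import torch.nn.functional as F

sr = 16000
repeat = 1                                   # assumed; t_pad = sr * repeat samples per side
t_pad = sr * repeat

audio = torch.zeros(2 * sr)                  # dummy 2-second block at 16 kHz
# Reflect-pad both ends so the model sees mirrored context at the block edges
# instead of hard cuts; the pad must be shorter than the block for reflect mode.
audio_pad = F.pad(audio.unsqueeze(0), (t_pad, t_pad), mode="reflect").squeeze(0)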
@@ -130,9 +130,21 @@ class Pipeline(object):
feats = feats.view(1, -1)
# embedding
print("audio feature", feats.shape)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
try:
# testFeat = feats.clone()
# while True:
# print("embedding audio;", testFeat.shape)
# testFeatOut = self.embedder.extractFeatures(
# testFeat, embOutputLayer, useFinalProj
# )
# testFeat = testFeat[:, 1:]
# print("embedding vector;", testFeatOut.shape)
print("embedding audio;", feats.shape)
feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
print("embedding vector;", feats.shape)
except RuntimeError as e:
if "HALF" in e.__str__().upper():
raise HalfPrecisionChangingException()
@@ -147,6 +159,20 @@ class Pipeline(object):
# if self.index is not None and self.feature is not None and index_rate != 0:
if search_index:
npy = feats[0].cpu().numpy()
print("npy shape", npy.shape, npy.shape[0] * 16000)
npyOffset = math.floor(silence_front * 16000) // 360
print(
"npyOffset",
silence_front,
self.targetSR,
(silence_front * self.targetSR),
npyOffset,
)
npy = npy[npyOffset:]
print(
"npy trimmed shape",
npy.shape,
)
if self.isHalf is True:
npy = npy.astype("float32")
# D, I = self.index.search(npy, 1)
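This is the heart of the optimization: the leading silence_front seconds are only context, so their feature frames are dropped before the comparatively expensive index search. A worked sketch of the offset arithmetic, using the 360-sample frame stride from the hunk and an assumed silence_front:

import math

silence_front = 0.2048        # assumed length of the leading context, in seconds
sample_rate = 16000           # this part of the pipeline runs at 16 kHz

# seconds -> samples -> feature frames (one frame per 360 samples with the stride above);
# those frames are sliced off so the index search only covers audio that will be output.
npyOffset = math.floor(silence_front * sample_rate) // 360   # 3276 // 360 = 9 frames
# npy = npy[npyOffset:]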
@@ -156,6 +182,7 @@ class Pipeline(object):
k = 1
if k == 1:
_, ix = self.index.search(npy, 1)
print("ix shape", ix.shape)
npy = self.big_npy[ix.squeeze()]
else:
score, ix = self.index.search(npy, k=8)
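The hunk is cut off after the k=8 search. For context, a hedged sketch of the two retrieval modes in the usual RVC style, where big_npy holds the feature rows behind the faiss index and closer neighbours get larger weights:

import faiss
import numpy as np

big_npy = np.random.rand(1000, 256).astype("float32")   # training features behind the index
index = faiss.IndexFlatL2(256)
index.add(big_npy)
npy = np.random.rand(50, 256).astype("float32")          # one query row per feature frame
k = 8

if k == 1:
    _, ix = index.search(npy, 1)                          # nearest training feature per frame
    npy = big_npy[ix.squeeze()]
else:
    score, ix = index.search(npy, k=8)                    # top-8 neighbours per frame
    weight = np.square(1 / score)                         # smaller distance -> larger weight
    weight /= weight.sum(axis=1, keepdims=True)
    npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)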
@@ -166,6 +193,11 @@ class Pipeline(object):
if self.isHalf is True:
npy = npy.astype("float16")
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]), npy])
print(
"npy last shape",
npy.shape,
)
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
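After the search, the skipped frames are put back as zero rows so the retrieved features line up with the embedder output again, and the two are mixed by index_rate, which is exactly what the last three lines above do. A small self-contained version of that blend (shapes assumed):

import numpy as np
import torch

npyOffset = 9
index_rate = 0.75                                   # assumed blend ratio
npy = np.random.rand(41, 256).astype("float32")     # retrieved features, silent frames removed
feats = torch.rand(1, 50, 256)                      # embedder output (9 skipped + 41 kept frames)

# Re-insert zero rows for the skipped silent frames so npy matches feats in length again.
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype="float32"), npy])

# Linear blend: index_rate of the retrieved features, (1 - index_rate) of the originals.
feats = torch.from_numpy(npy).unsqueeze(0) * index_rate + (1 - index_rate) * feats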
@@ -195,6 +227,22 @@ class Pipeline(object):
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
npyOffset = math.floor(silence_front * 16000) // 360
print(
"npy last shape2",
feats.shape,
)
feats = feats[:, npyOffset * 2 :, :]
feats_len = feats.shape[1]
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
print(
"npy last shape3",
feats.shape,
feats_len,
)
# Run inference
try:
with torch.no_grad():
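Just before inference the same offset is applied again, doubled (the feature sequence has presumably been upsampled 2x relative to the embedder frames by this point, as is common in RVC pipelines), and pitch/pitchf are cut from the end to the same length so every input stays aligned. A hedged numeric sketch:

import math
import torch

silence_front = 0.2048
npyOffset = math.floor(silence_front * 16000) // 360        # 9 embedder frames
feats = torch.rand(1, 100, 256)                             # assumed 2x-upsampled feature sequence
pitch = torch.zeros(1, 100, dtype=torch.long)
pitchf = torch.zeros(1, 100)

feats = feats[:, npyOffset * 2:, :]                         # drop the 18 leading frames
feats_len = feats.shape[1]                                  # 82 frames remain
pitch = pitch[:, -feats_len:]                               # keep the trailing 82 pitch frames
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len]).long()                    # sequence length passed to the generator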


@@ -435,7 +435,7 @@ class VoiceChanger:
raise RuntimeError("Voice Changer is not selected.")
processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
print("original frame", receivedData.shape[0])
# Pre-processing
with Timer("pre-process") as t:
if self.settings.inputSampleRate != processing_sampling_rate:
@@ -453,6 +453,7 @@ class VoiceChanger:
sola_search_frame = int(0.012 * processing_sampling_rate)
# sola_search_frame = 0
block_frame = newData.shape[0]
print("block frame", newData.shape[0])
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
self._generate_strength(crossfade_frame)
@@ -472,8 +473,7 @@ class VoiceChanger:
sola_search_frame + crossfade_frame + block_frame
)
audio = audio[audio_offset:]
a = 0
audio = audio[a:]
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
cor_nom = np.convolve(
audio[: crossfade_frame + sola_search_frame],
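The final hunk is cut off mid-expression in this view. For context, a hedged sketch of the SOLA offset search it is starting, in the style of the DDSP-SVC / RVC-WebUI implementations cited in the comment (sola_buffer is assumed to be the crossfade tail kept from the previous output block):

import numpy as np

crossfade_frame = 1024
sola_search_frame = 512
sola_buffer = np.random.rand(crossfade_frame).astype(np.float32)           # tail of previous block
audio = np.random.rand(crossfade_frame + sola_search_frame + 4096).astype(np.float32)

# Cross-correlate the head of the new block with the previous tail, normalise by local
# energy, and take the shift that aligns the two waveforms best.
head = audio[: crossfade_frame + sola_search_frame]
cor_nom = np.convolve(head, np.flip(sola_buffer), "valid")
cor_den = np.sqrt(np.convolve(head ** 2, np.ones(crossfade_frame), "valid") + 1e-3)
sola_offset = int(np.argmax(cor_nom / cor_den))

# The new block is shifted by sola_offset and then crossfaded into the previous tail.
audio = audio[sola_offset:]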