skipdiffusion
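Add a "skip diff" toggle for Diffusion-SVC. When the toggle is active, inference bypasses the dpm-solver diffusion pass and feeds the naive model's mel spectrogram (gt_spec) straight to the vocoder, presumably trading some conversion quality for speed. The flag is exposed as a button in the client UI, carried through the server settings as skipDiffusion (0:off, 1:on), and now defaults to on.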

commit e18138b5d6
parent 6d4c138821
Author: w-okada
Date:   2023-08-06 04:50:42 +09:00
8 changed files with 78 additions and 16 deletions

File diff suppressed because one or more lines are too long


@@ -23,6 +23,26 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
         return <></>;
     }
+    const skipDiffusionClass = serverSetting.serverSetting.skipDiffusion == 0 ? "character-area-toggle-button" : "character-area-toggle-button-active";
+    const skipDiffRow = (
+        <div className="character-area-control">
+            <div className="character-area-control-title">Boost</div>
+            <div className="character-area-control-field">
+                <div className="character-area-buttons">
+                    <div
+                        className={skipDiffusionClass}
+                        onClick={() => {
+                            serverSetting.updateServerSettings({ ...serverSetting.serverSetting, skipDiffusion: serverSetting.serverSetting.skipDiffusion == 0 ? 1 : 0 });
+                        }}
+                    >
+                        skip diff
+                    </div>
+                </div>
+            </div>
+        </div>
+    );
     const skipValues = getDivisors(serverSetting.serverSetting.kStep);
     skipValues.pop();
@@ -82,6 +102,7 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
     );
     return (
         <>
+            {skipDiffRow}
            {kStepRow}
            {speedUpRow}
         </>


@@ -1338,6 +1338,7 @@ body {
     .character-area-control {
         display: flex;
         gap: 3px;
+        align-items: center;
         .character-area-control-buttons {
             display: flex;
             flex-direction: row;
@@ -1405,6 +1406,35 @@ body {
         .character-area-button:hover {
             border: solid 2px #faa;
         }
+        .character-area-toggle-button {
+            border: solid 2px #999;
+            color: white;
+            background: #666;
+            cursor: pointer;
+            font-size: 0.8rem;
+            border-radius: 5px;
+            height: 1.2rem;
+            padding-left: 2px;
+            padding-right: 2px;
+        }
+        .character-area-toggle-button:hover {
+            border: solid 2px #faa;
+        }
+        .character-area-toggle-button-active {
+            border: solid 2px #999;
+            color: white;
+            background: #844;
+            cursor: pointer;
+            font-size: 0.8rem;
+            border-radius: 5px;
+            height: 1.2rem;
+            padding-left: 2px;
+            padding-right: 2px;
+        }
     }
 }
}


@@ -124,6 +124,7 @@ export const ServerSettingKey = {
     "threshold": "threshold",
     "speedUp": "speedUp",
+    "skipDiffusion": "skipDiffusion",
     "inputSampleRate": "inputSampleRate",
     "enableDirectML": "enableDirectML",
@@ -186,7 +187,7 @@ export type VoiceChangerServerSetting = {
     threshold: number // DDSP-SVC
     speedUp: number // Diffusion-SVC
+    skipDiffusion: number // Diffusion-SVC 0:off, 1:on
     inputSampleRate: InputSampleRate
     enableDirectML: number
@@ -403,6 +404,7 @@ export const DefaultServerSetting: ServerInfo = {
     threshold: -45,
     speedUp: 10,
+    skipDiffusion: 1,
     enableDirectML: 0,
     //


@@ -180,7 +180,8 @@ class DiffusionSVC(VoiceChangerModel):
                 silenceFrontSec,
                 embOutputLayer,
                 useFinalProj,
-                protect
+                protect,
+                skip_diffusion=self.settings.skipDiffusion,
             )
             result = audio_out.detach().cpu().numpy()
             return result


@@ -13,7 +13,7 @@ class DiffusionSVCSettings:
     kStep: int = 20
     speedUp: int = 10
-    skipDiffusion: int = 0  # 0:off, 1:on
+    skipDiffusion: int = 1  # 0:off, 1:on
     silenceFront: int = 1  # 0:off, 1:on
     modelSamplingRate: int = 44100
@@ -30,6 +30,7 @@ class DiffusionSVCSettings:
         "kStep",
         "speedUp",
         "silenceFront",
+        "skipDiffusion",
     ]
     floatData = ["silentThreshold"]
     strData = ["f0Detector"]


@@ -112,25 +112,27 @@ class DiffusionSVCInferencer(Inferencer):
         k_step: int,
         infer_speedup: int,
         silence_front: float,
+        skip_diffusion: bool = True,
     ) -> torch.Tensor:
         with Timer("pre-process", False) as t:
             gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
             # gt_spec = self.vocoder.extract(audio_t, 16000)
             # gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
         # print("[ ----Timer::1: ]", t.secs)
-        with Timer("pre-process", False) as t:
-            out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+        with Timer("pre-process", True) as t:
+            if skip_diffusion == 0:
+                out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+                gt_spec = out_mel
-        print("[ ----Timer::2: ]", t.secs)
+        # print("[ ----Timer::2: ]", t.secs)
         with Timer("pre-process", False) as t:  # NOQA
             if self.vocoder_onnx is None:
                 start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
-                out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+                out_wav = self.mel2wav(gt_spec, pitch, start_frame=start_frame)
                 out_wav *= mask
             else:
-                out_wav = self.vocoder_onnx.infer(out_mel, pitch, silence_front, mask)
+                out_wav = self.vocoder_onnx.infer(gt_spec, pitch, silence_front, mask)
         # print("[ ----Timer::3: ]", t.secs)
         return out_wav.squeeze()
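For orientation, the control flow this hunk produces, condensed into a standalone sketch (signatures simplified, Timer bookkeeping omitted; not a drop-in excerpt from the repo): the naive model always produces gt_spec, and the dpm-solver refinement runs only when skip_diffusion is 0.

# Condensed sketch of the new inference path.
def mel_pipeline(inferencer, feats, pitch, volume, sid, k_step, infer_speedup, skip_diffusion=1):
    # Always run the cheap naive model to get a first mel estimate.
    gt_spec = inferencer.naive_model_call(feats, pitch, volume, spk_id=sid,
                                          spk_mix_dict=None, aug_shift=0, spk_emb=None)
    if skip_diffusion == 0:
        # Only when skipping is off: refine the mel with the diffusion sampler.
        gt_spec = inferencer(feats, pitch, volume, spk_id=sid, spk_mix_dict=None,
                             aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup,
                             method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
    return gt_spec  # handed to mel2wav or the ONNX vocoder either way

Note that the comparison is `== 0` against the integer skipDiffusion flag from the settings, even though the parameter is annotated bool; passing True therefore skips the diffusion pass.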


@@ -21,11 +21,16 @@ class Inferencer(Protocol):
     def infer(
         self,
-        audio_t: torch.Tensor,
         feats: torch.Tensor,
-        pitch_length: torch.Tensor,
-        pitch: torch.Tensor | None,
-        pitchf: torch.Tensor | None,
+        pitch: torch.Tensor,
+        volume: torch.Tensor,
+        mask: torch.Tensor,
         sid: torch.Tensor,
+        k_step: int,
+        infer_speedup: int,
+        silence_front: float,
+        skip_diffusion: bool = True,
     ) -> torch.Tensor:
         ...