w-okada 2023-10-09 12:15:03 +09:00
parent 2b9a497eb0
commit 782d003a91
8 changed files with 209 additions and 22 deletions

.gitignore (vendored): 3 additions
View File

@ -36,6 +36,9 @@ server/memo.md
client/lib/dist
client/lib/worklet/dist
client/demo/public/models
client/demo/dist/models
client/demo/src/001_provider/backup
# client/demo/dist/ # kept for the demo
docker/cudnn/

View File

@ -1,11 +1,15 @@
const path = require("path");
const HtmlWebpackPlugin = require("html-webpack-plugin");
const CopyPlugin = require("copy-webpack-plugin");
const webpack = require("webpack");
module.exports = {
    mode: "production",
    entry: "./src/000_index.tsx",
    resolve: {
        extensions: [".ts", ".tsx", ".js"],
        fallback: {
            buffer: require.resolve("buffer/"),
        },
    },
    module: {
        rules: [
@ -29,7 +33,7 @@ module.exports = {
                test: /\.css$/,
                use: ["style-loader", { loader: "css-loader", options: { importLoaders: 1 } }, "postcss-loader"],
            },
            { test: /\.json$/, type: "asset/inline" },
        ],
    },
    output: {
@ -37,6 +41,9 @@ module.exports = {
        path: path.resolve(__dirname, "dist"),
    },
    plugins: [
        new webpack.ProvidePlugin({
            Buffer: ["buffer", "Buffer"],
        }),
        new HtmlWebpackPlugin({
            template: path.resolve(__dirname, "public/index.html"),
            filename: "./index.html",
@ -47,5 +54,5 @@ module.exports = {
        new CopyPlugin({
            patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
        }),
    ]
    ],
};
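Context for the additions above: webpack 5 no longer ships polyfills for Node core modules, so the resolve.fallback entry for buffer together with the ProvidePlugin shim is what makes Buffer usable from browser code. A minimal sketch of the consuming side (illustrative only, not part of this commit):

import { Buffer } from "buffer"; // resolved to the npm "buffer" package via the fallback above

// With the ProvidePlugin entry, the explicit import can even be omitted;
// webpack injects the polyfill wherever Buffer is referenced.
const riffHeader = Buffer.from("RIFF", "ascii");
console.log(riffHeader.length); // 4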

View File

@ -12,7 +12,7 @@ export type VoiceChangerWorkletListener = {
};
export type InternalCallback = {
    processAudio: (data: Uint8Array) => Uint8Array;
    processAudio: (data: Uint8Array) => Promise<Uint8Array>;
};
export class VoiceChangerWorkletNode extends AudioWorkletNode {
@ -224,18 +224,23 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
            downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
        }
        // Float to Int16
        const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
        const dataView = new DataView(arrayBuffer);
        for (let i = 0; i < downsampledBuffer.length; i++) {
            let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
            s = s < 0 ? s * 0x8000 : s * 0x7fff;
            dataView.setInt16(i * 2, s, true);
        // Float to Int16 (for the internal protocol, the data stays as float.)
        if (this.setting.protocol != "internal") {
            const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
            const dataView = new DataView(arrayBuffer);
            for (let i = 0; i < downsampledBuffer.length; i++) {
                let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
                s = s < 0 ? s * 0x8000 : s * 0x7fff;
                dataView.setInt16(i * 2, s, true);
            }
            // Buffering
            this.requestChunks.push(arrayBuffer);
        } else {
            // internal
            // console.log("downsampledBuffer.buffer", downsampledBuffer.buffer);
            this.requestChunks.push(downsampledBuffer.buffer);
        }
        // Buffering
        this.requestChunks.push(arrayBuffer);
        //// If the request buffer does not yet hold the number of chunks to send per request, stop processing here.
        if (this.requestChunks.length < this.setting.inputChunkNum) {
            return;
@ -290,7 +295,10 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
            this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED, `[AudioWorkletNode] internal audio process callback is not initialized`);
            return;
        }
        const res = this.internalCallback.processAudio(newBuffer);
        const res = await this.internalCallback.processAudio(newBuffer);
        if (res.length == 0) {
            return;
        }
        if (this.outputNode != null) {
            this.outputNode.postReceivedVoice(res.buffer);
        } else {
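Context for the hunks above: the InternalCallback type now returns a Promise, the node awaits it, and a zero-length result is treated as "nothing to play". A minimal sketch of a conforming callback (illustrative only; fakeInference is a hypothetical stand-in for the real conversion step):

type InternalCallback = {
    processAudio: (data: Uint8Array) => Promise<Uint8Array>;
};

// Hypothetical async stand-in for the actual voice conversion.
const fakeInference = async (data: Uint8Array): Promise<Uint8Array> => data;

const callback: InternalCallback = {
    processAudio: async (data) => {
        const converted = await fakeInference(data);
        // Returning a zero-length array makes the node skip output for this chunk (see the res.length check above).
        return converted;
    },
};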

View File

@ -42,6 +42,7 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
    private isRecording = false;
    playBuffer: Float32Array[] = [];
    unpushedF32Data: Float32Array = new Float32Array(0);
    /**
     * @constructor
     */
@ -105,11 +106,16 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
        }
        const f32Data = request.voice;
        const chunkNum = f32Data.length / this.BLOCK_SIZE;
        const concatedF32Data = new Float32Array(this.unpushedF32Data.length + f32Data.length);
        concatedF32Data.set(this.unpushedF32Data);
        concatedF32Data.set(f32Data, this.unpushedF32Data.length);
        const chunkNum = Math.floor(concatedF32Data.length / this.BLOCK_SIZE);
        for (let i = 0; i < chunkNum; i++) {
            const block = f32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
            const block = concatedF32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
            this.playBuffer.push(block);
        }
        this.unpushedF32Data = concatedF32Data.slice(chunkNum * this.BLOCK_SIZE);
    }
    pushData = (inputData: Float32Array) => {
@ -133,10 +139,10 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
        }
        if (this.playBuffer.length === 0) {
            // console.log("[worklet] no play buffer")
            // console.log("[worklet] no play buffer");
            return true;
        }
        // console.log("[worklet] play buffer");
        //// Skip when silence has continued for a certain period.
        // let voice: Float32Array | undefined
        // while (true) {
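Context for the hunk above: incoming voice data is now prepended with any samples left over from the previous message, chunked into whole blocks, and the remainder is kept in unpushedF32Data, so nothing is dropped when the payload length is not a multiple of BLOCK_SIZE. The same carry-over logic in isolation (a sketch; the 128-sample block size is an assumption matching the Web Audio render quantum):

const BLOCK_SIZE = 128; // assumed render-quantum size
let carry = new Float32Array(0);

const chunkWithCarry = (incoming: Float32Array): Float32Array[] => {
    // Prepend the leftover samples from the previous call.
    const merged = new Float32Array(carry.length + incoming.length);
    merged.set(carry);
    merged.set(incoming, carry.length);
    // Emit only whole blocks and keep the remainder for next time.
    const blocks: Float32Array[] = [];
    const n = Math.floor(merged.length / BLOCK_SIZE);
    for (let i = 0; i < n; i++) {
        blocks.push(merged.slice(i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE));
    }
    carry = merged.slice(n * BLOCK_SIZE);
    return blocks;
};

// With an empty carry and 300 incoming samples this yields 2 blocks and keeps 44 samples for the next call.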

View File

@ -40,6 +40,10 @@ class MMVC_SocketIOApp:
"filename": f"{getFrontendPath()}/assets/buymeacoffee.png",
"content_type": "image/png",
},
"/ort-wasm-simd.wasm": {
"filename": f"{getFrontendPath()}/ort-wasm-simd.wasm",
"content_type": "application/wasm",
},
"": f"{getFrontendPath()}",
"/": f"{getFrontendPath()}/index.html",
},
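The new static-file entry above serves onnxruntime-web's ort-wasm-simd.wasm with the application/wasm content type, which is what the browser's streaming WebAssembly APIs require to compile the module straight from the HTTP response. A sketch of the client side (illustrative only):

// Compiles the module while it downloads; this only works when the server
// responds with Content-Type: application/wasm, as configured above.
const module = await WebAssembly.compileStreaming(fetch("/ort-wasm-simd.wasm"));
console.log(WebAssembly.Module.imports(module).length);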

View File

@ -93,9 +93,9 @@ class Pipeline(object):
                pitch = None
                pitchf = None
        except IndexError as e:  # NOQA
            # print(e)
            # import traceback
            # traceback.print_exc()
            print(e)
            import traceback
            traceback.print_exc()
            raise NotEnoughDataExtimateF0()
        return pitch, pitchf

View File

@ -21,7 +21,6 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
        )
    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        n_frames = int(len(audio) // window) + 1
        start_frame = int(silence_front * sr / window)
        real_silence_front = start_frame * window / sr

View File

@ -0,0 +1,160 @@
# For experiments. Not used.
import torch
from torchcrepe.model import Crepe
import os

CENTS_PER_BIN = 20
PITCH_BINS = 360


class TorchCrepe2(torch.nn.Module):
    def __init__(self, model='full'):
        super().__init__()

        self.crepe = Crepe(model)
        file = os.path.join(os.path.dirname(__file__), f'{model}.pth')
        self.crepe.load_state_dict(torch.load(file, map_location="cpu"))
        self.crepe = self.crepe.to(torch.device("cpu"))
        self.crepe.eval()

        self.sample_rate = 16000
        self.hop_length = 160
        self.window_size = 1024

    def forward(self, audio, f0_min: int, f0_max: int):
        # total_frames = 1 + int(audio.size(1) // self.hop_length)
        audio = torch.nn.functional.pad(audio, (self.window_size // 2, self.window_size // 2))
        # batch_size = total_frames
        start = 0
        end = audio.size(1)

        # Chunk
        frames = torch.nn.functional.unfold(
            audio[:, None, None, start:end],
            kernel_size=(1, self.window_size),
            stride=(1, self.hop_length))
        frames = frames.transpose(1, 2).reshape(-1, self.window_size)

        # Place on device
        # frames = frames.to(device)

        # Mean-center
        frames -= frames.mean(dim=1, keepdim=True)

        # Scale
        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
                            frames.std(dim=1, keepdim=True))

        probabilities = self.crepe(frames.to(torch.device("cpu")))
        probabilities = probabilities.reshape(
            audio.size(0), -1, PITCH_BINS).transpose(1, 2)

        minidx = frequency_to_bins(torch.tensor(f0_min))
        maxidx = frequency_to_bins(torch.tensor(f0_max), torch.ceil)

        probabilities[:, :minidx] = -float('inf')
        probabilities[:, maxidx:] = -float('inf')

        bins, pitch = weighted_argmax(probabilities)

        return pitch, periodicity(probabilities, bins)


def weighted_argmax(logits):
    """Sample observations using weighted sum near the argmax"""
    # Find center of analysis window
    bins = logits.argmax(dim=1)

    # Find bounds of analysis window
    start = torch.max(torch.tensor(0, device=logits.device), bins - 4)
    end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5)

    # Mask out everything outside of window
    for batch in range(logits.size(0)):
        for time in range(logits.size(2)):
            logits[batch, :start[batch, time], time] = -float('inf')
            logits[batch, end[batch, time]:, time] = -float('inf')

    # Construct weights
    if not hasattr(weighted_argmax, 'weights'):
        weights = bins_to_cents(torch.arange(360))
        weighted_argmax.weights = weights[None, :, None]

    # Ensure devices are the same (no-op if they are)
    weighted_argmax.weights = weighted_argmax.weights.to(logits.device)

    # Convert to probabilities
    with torch.no_grad():
        probs = torch.sigmoid(logits)

    # Apply weights
    cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1)

    # Convert to frequency in Hz
    return bins, cents_to_frequency(cents)


def bins_to_cents(bins):
    """Converts pitch bins to cents"""
    cents = CENTS_PER_BIN * bins + 1997.3794084376191

    # Trade quantization error for noise
    return dither(cents)


def dither(cents):
    """Dither the predicted pitch in cents to remove quantization error"""
    # noise = scipy.stats.triang.rvs(c=0.5,
    #                                loc=-CENTS_PER_BIN,
    #                                scale=2 * CENTS_PER_BIN,
    #                                size=cents.size())

    # Torch rewrite of the triangular distribution. The value is exact only when c=0.5; otherwise it is an approximation.
    c = 0.5
    loc = -CENTS_PER_BIN
    scale = 2 * CENTS_PER_BIN
    u = torch.rand(cents.size())
    # f = (c - u) / (scale / 2) if u < c else (u - c) / (scale / 2)
    f = torch.where(u < c, (c - u) / (scale / 2), (u - c) / (scale / 2))
    noise = 2 * scale * ((1 - f.abs()) ** 0.5) + loc
    mask = u >= c
    noise[mask] = 2 * (scale - noise[mask])
    return cents + cents.new_tensor(noise)


def cents_to_frequency(cents):
    """Converts cents to frequency in Hz"""
    return 10 * 2 ** (cents / 1200)


def periodicity(probabilities, bins):
    """Computes the periodicity from the network output and pitch bins"""
    # shape=(batch * time / hop_length, 360)
    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)

    # shape=(batch * time / hop_length, 1)
    bins_stacked = bins.reshape(-1, 1).to(torch.int64)

    # Use maximum logit over pitch bins as periodicity
    periodicity = probs_stacked.gather(1, bins_stacked)

    # shape=(batch, time / hop_length)
    return periodicity.reshape(probabilities.size(0), probabilities.size(2))


def cents_to_bins(cents, quantize_fn=torch.floor):
    """Converts cents to pitch bins"""
    bins = (cents - 1997.3794084376191) / CENTS_PER_BIN
    return quantize_fn(bins).int()


def frequency_to_bins(frequency, quantize_fn=torch.floor):
    """Convert frequency in Hz to pitch bins"""
    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)


def frequency_to_cents(frequency):
    """Convert frequency in Hz to cents"""
    return 1200 * torch.log2(frequency / 10.)
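The conversion helpers at the end follow torchcrepe's convention of 20-cent pitch bins referenced to 10 Hz; in equation form (matching CENTS_PER_BIN = 20 and the 1997.3794084376191 offset above, with floor as the default quantizer):

\mathrm{cents}(f) = 1200 \log_2\!\left(\frac{f}{10\,\mathrm{Hz}}\right), \qquad
f(\mathrm{cents}) = 10 \cdot 2^{\mathrm{cents}/1200}, \qquad
\mathrm{bin}(\mathrm{cents}) = \left\lfloor \frac{\mathrm{cents} - 1997.3794084376191}{20} \right\rfloor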