Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-02-02 16:23:58 +03:00)
commit 782d003a91 (parent 2b9a497eb0)

    update
.gitignore (vendored) | 3 +++
@@ -36,6 +36,9 @@ server/memo.md
 
 client/lib/dist
 client/lib/worklet/dist
+client/demo/public/models
+client/demo/dist/models
+client/demo/src/001_provider/backup
 # client/demo/dist/ # keep for the demo
 
 docker/cudnn/
@@ -1,11 +1,15 @@
 const path = require("path");
 const HtmlWebpackPlugin = require("html-webpack-plugin");
 const CopyPlugin = require("copy-webpack-plugin");
+const webpack = require("webpack");
 module.exports = {
     mode: "production",
     entry: "./src/000_index.tsx",
     resolve: {
         extensions: [".ts", ".tsx", ".js"],
+        fallback: {
+            buffer: require.resolve("buffer/"),
+        },
     },
     module: {
         rules: [
@@ -29,7 +33,7 @@ module.exports = {
                 test: /\.css$/,
                 use: ["style-loader", { loader: "css-loader", options: { importLoaders: 1 } }, "postcss-loader"],
             },
+            { test: /\.json$/, type: "asset/inline" },
         ],
     },
     output: {
@@ -37,6 +41,9 @@ module.exports = {
         path: path.resolve(__dirname, "dist"),
     },
     plugins: [
+        new webpack.ProvidePlugin({
+            Buffer: ["buffer", "Buffer"],
+        }),
         new HtmlWebpackPlugin({
             template: path.resolve(__dirname, "public/index.html"),
             filename: "./index.html",
@@ -47,5 +54,5 @@ module.exports = {
         new CopyPlugin({
             patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
         }),
-    ]
+    ],
 };
@@ -12,7 +12,7 @@ export type VoiceChangerWorkletListener = {
 };
 
 export type InternalCallback = {
-    processAudio: (data: Uint8Array) => Uint8Array;
+    processAudio: (data: Uint8Array) => Promise<Uint8Array>;
 };
 
 export class VoiceChangerWorkletNode extends AudioWorkletNode {
@@ -224,18 +224,23 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
             downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
         }
 
-        // Float to Int16
-        const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
-        const dataView = new DataView(arrayBuffer);
-        for (let i = 0; i < downsampledBuffer.length; i++) {
-            let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
-            s = s < 0 ? s * 0x8000 : s * 0x7fff;
-            dataView.setInt16(i * 2, s, true);
-        }
-
-        // Buffering
-        this.requestChunks.push(arrayBuffer);
+        // Float to Int16 (for the "internal" protocol, keep the data as float)
+        if (this.setting.protocol != "internal") {
+            const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
+            const dataView = new DataView(arrayBuffer);
+            for (let i = 0; i < downsampledBuffer.length; i++) {
+                let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
+                s = s < 0 ? s * 0x8000 : s * 0x7fff;
+                dataView.setInt16(i * 2, s, true);
+            }
+            // Buffering
+            this.requestChunks.push(arrayBuffer);
+        } else {
+            // internal
+            // console.log("downsampledBuffer.buffer", downsampledBuffer.buffer);
+            this.requestChunks.push(downsampledBuffer.buffer);
+        }
 
         //// Stop here if the request buffer does not yet hold the configured number of chunks.
         if (this.requestChunks.length < this.setting.inputChunkNum) {
             return;
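The Int16 branch above maps each float sample in [-1, 1] to a signed 16-bit little-endian integer, scaling negatives by 0x8000 and positives by 0x7fff so that both endpoints stay in range. A numpy sketch of the same conversion (illustrative only; float_block stands in for downsampledBuffer):

import numpy as np

def float32_to_int16le(float_block: np.ndarray) -> bytes:
    # Same mapping as the worklet's DataView loop: clamp to [-1, 1],
    # scale negatives by 0x8000 and positives by 0x7fff, pack little-endian.
    s = np.clip(float_block, -1.0, 1.0)
    scaled = np.where(s < 0, s * 0x8000, s * 0x7fff)
    return scaled.astype("<i2").tobytes()  # "<i2" = little-endian int16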
@@ -290,7 +295,10 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
                 this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED, `[AudioWorkletNode] internal audio process callback is not initialized`);
                 return;
             }
-            const res = this.internalCallback.processAudio(newBuffer);
+            const res = await this.internalCallback.processAudio(newBuffer);
+            if (res.length == 0) {
+                return;
+            }
             if (this.outputNode != null) {
                 this.outputNode.postReceivedVoice(res.buffer);
             } else {
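processAudio is now awaited, and an empty result is skipped rather than forwarded to the output node. A hypothetical Python analogue of such an awaitable callback; convert() is a stand-in for the real, blocking voice-conversion call:

import asyncio
import numpy as np

def convert(block: np.ndarray) -> np.ndarray:
    return block  # placeholder for the actual (blocking) conversion

async def process_audio(data: np.ndarray) -> np.ndarray:
    # Run the blocking conversion off the event loop; returning an empty
    # array tells the caller there is nothing to play back yet.
    out = await asyncio.to_thread(convert, data)
    return out if out.size else np.empty(0, dtype=np.float32)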
@@ -42,6 +42,7 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
     private isRecording = false;
 
     playBuffer: Float32Array[] = [];
+    unpushedF32Data: Float32Array = new Float32Array(0);
     /**
      * @constructor
      */
@@ -105,11 +106,16 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
         }
 
         const f32Data = request.voice;
-        const chunkNum = f32Data.length / this.BLOCK_SIZE;
+        const concatedF32Data = new Float32Array(this.unpushedF32Data.length + f32Data.length);
+        concatedF32Data.set(this.unpushedF32Data);
+        concatedF32Data.set(f32Data, this.unpushedF32Data.length);
+
+        const chunkNum = Math.floor(concatedF32Data.length / this.BLOCK_SIZE);
         for (let i = 0; i < chunkNum; i++) {
-            const block = f32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
+            const block = concatedF32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
             this.playBuffer.push(block);
         }
+        this.unpushedF32Data = concatedF32Data.slice(chunkNum * this.BLOCK_SIZE);
     }
 
     pushData = (inputData: Float32Array) => {
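Before this change, chunkNum could be fractional and any tail shorter than BLOCK_SIZE was silently dropped; the fix prepends the leftover from the previous message, emits only whole blocks, and carries the remainder forward. A numpy sketch of that carry-over logic (names illustrative; 128 assumes the Web Audio render-quantum block size):

import numpy as np

BLOCK_SIZE = 128  # assumed render-quantum size

def push_voice(leftover: np.ndarray, f32_data: np.ndarray):
    # Returns (blocks, new_leftover): whole BLOCK_SIZE blocks plus the remainder.
    buf = np.concatenate([leftover, f32_data])
    n_blocks = len(buf) // BLOCK_SIZE  # floor division, as in the fixed code
    blocks = [buf[i * BLOCK_SIZE:(i + 1) * BLOCK_SIZE] for i in range(n_blocks)]
    return blocks, buf[n_blocks * BLOCK_SIZE:]

blocks, rest = push_voice(np.zeros(0, np.float32), np.ones(300, np.float32))
assert len(blocks) == 2 and len(rest) == 44  # 300 = 2 * 128 + 44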
@@ -133,10 +139,10 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
         }
 
         if (this.playBuffer.length === 0) {
-            // console.log("[worklet] no play buffer")
+            // console.log("[worklet] no play buffer");
             return true;
         }
-
+        // console.log("[worklet] play buffer");
         //// Skip if silence has continued for a while.
         // let voice: Float32Array | undefined
         // while (true) {
@@ -40,6 +40,10 @@ class MMVC_SocketIOApp:
                 "filename": f"{getFrontendPath()}/assets/buymeacoffee.png",
                 "content_type": "image/png",
             },
+            "/ort-wasm-simd.wasm": {
+                "filename": f"{getFrontendPath()}/ort-wasm-simd.wasm",
+                "content_type": "application/wasm",
+            },
             "": f"{getFrontendPath()}",
             "/": f"{getFrontendPath()}/index.html",
         },
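The added route serves onnxruntime-web's ort-wasm-simd.wasm with the application/wasm content type, which browsers require before WebAssembly.instantiateStreaming will accept the response. A minimal sketch of the same idea (assuming a FastAPI/Starlette app; the file path is illustrative):

from fastapi import FastAPI
from fastapi.responses import FileResponse

app = FastAPI()

@app.get("/ort-wasm-simd.wasm")
def ort_wasm():
    # Without media_type="application/wasm" the browser's streaming-compile
    # path rejects the response.
    return FileResponse("frontend/ort-wasm-simd.wasm", media_type="application/wasm")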
@@ -93,9 +93,9 @@ class Pipeline(object):
                 pitch = None
                 pitchf = None
         except IndexError as e:  # NOQA
-            # print(e)
-            # import traceback
-            # traceback.print_exc()
+            print(e)
+            import traceback
+            traceback.print_exc()
             raise NotEnoughDataExtimateF0()
         return pitch, pitchf
 
@@ -21,7 +21,6 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
         )
 
     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
-        n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
 
@@ -0,0 +1,160 @@
+# Experimental. Not used.
+
+import torch
+from torchcrepe.model import Crepe
+import os
+
+CENTS_PER_BIN = 20
+PITCH_BINS = 360
+
+
+class TorchCrepe2(torch.nn.Module):
+
+    def __init__(self, model='full'):
+        super().__init__()
+        self.crepe = Crepe(model)
+        file = os.path.join(os.path.dirname(__file__), f'{model}.pth')
+        self.crepe.load_state_dict(torch.load(file, map_location="cpu"))
+        self.crepe = self.crepe.to(torch.device("cpu"))
+        self.crepe.eval()
+
+        self.sample_rate = 16000
+        self.hop_length = 160
+        self.window_size = 1024
+
+    def forward(self, audio, f0_min: int, f0_max: int):
+        # total_frames = 1 + int(audio.size(1) // self.hop_length)
+        audio = torch.nn.functional.pad(audio, (self.window_size // 2, self.window_size // 2))
+        # batch_size = total_frames
+
+        start = 0
+        end = audio.size(1)
+
+        # Chunk
+        frames = torch.nn.functional.unfold(
+            audio[:, None, None, start:end],
+            kernel_size=(1, self.window_size),
+            stride=(1, self.hop_length))
+
+        frames = frames.transpose(1, 2).reshape(-1, self.window_size)
+
+        # Place on device
+        # frames = frames.to(device)
+
+        # Mean-center
+        frames -= frames.mean(dim=1, keepdim=True)
+
+        # Scale
+        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
+                            frames.std(dim=1, keepdim=True))
+
+        probabilities = self.crepe(frames.to(torch.device("cpu")))
+        probabilities = probabilities.reshape(
+            audio.size(0), -1, PITCH_BINS).transpose(1, 2)
+
+        minidx = frequency_to_bins(torch.tensor(f0_min))
+        maxidx = frequency_to_bins(torch.tensor(f0_max), torch.ceil)
+
+        probabilities[:, :minidx] = -float('inf')
+        probabilities[:, maxidx:] = -float('inf')
+
+        bins, pitch = weighted_argmax(probabilities)
+
+        return pitch, periodicity(probabilities, bins)
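A hypothetical usage sketch for TorchCrepe2 (assumes torchcrepe is installed and a full.pth checkpoint sits next to the module, as __init__ expects). With hop_length = 160, one second of 16 kHz audio yields 16000 / 160 + 1 = 101 pitch frames:

import torch

model = TorchCrepe2(model='full')      # loads <module dir>/full.pth
audio = torch.randn(1, 16000)          # (batch, samples) at 16 kHz
with torch.no_grad():
    pitch, confidence = model(audio, f0_min=50, f0_max=1100)
print(pitch.shape, confidence.shape)    # both (1, 101): one value per hop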
+def weighted_argmax(logits):
+    """Sample observations using weighted sum near the argmax"""
+    # Find center of analysis window
+    bins = logits.argmax(dim=1)
+
+    # Find bounds of analysis window
+    start = torch.max(torch.tensor(0, device=logits.device), bins - 4)
+    end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5)
+
+    # Mask out everything outside of window
+    for batch in range(logits.size(0)):
+        for time in range(logits.size(2)):
+            logits[batch, :start[batch, time], time] = -float('inf')
+            logits[batch, end[batch, time]:, time] = -float('inf')
+
+    # Construct weights
+    if not hasattr(weighted_argmax, 'weights'):
+        weights = bins_to_cents(torch.arange(360))
+        weighted_argmax.weights = weights[None, :, None]
+
+    # Ensure devices are the same (no-op if they are)
+    weighted_argmax.weights = weighted_argmax.weights.to(logits.device)
+
+    # Convert to probabilities
+    with torch.no_grad():
+        probs = torch.sigmoid(logits)
+
+    # Apply weights
+    cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1)
+
+    # Convert to frequency in Hz
+    return bins, cents_to_frequency(cents)
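weighted_argmax refines the hard argmax to sub-bin resolution: logits outside the window [argmax - 4, argmax + 5) are set to -inf, the remainder are squashed with a sigmoid, and the estimate is the probability-weighted mean of the bin centers in cents. A toy single-frame illustration (made-up values, and without the dither that bins_to_cents adds to the real weights):

import torch

CENTS_PER_BIN = 20
logits = torch.full((360,), -float('inf'))
logits[100:105] = torch.tensor([0.1, 1.0, 3.0, 1.0, 0.1])  # peak at bin 102

probs = torch.sigmoid(logits)  # sigmoid(-inf) == 0, so masked bins drop out
centers = CENTS_PER_BIN * torch.arange(360) + 1997.3794084376191
cents = (centers * probs).sum() / probs.sum()
print(float(cents))  # exactly the bin-102 center here, since the neighbors are symmetric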
+def bins_to_cents(bins):
+    """Converts pitch bins to cents"""
+    cents = CENTS_PER_BIN * bins + 1997.3794084376191
+
+    # Trade quantization error for noise
+    return dither(cents)
+
+
+def dither(cents):
+    """Dither the predicted pitch in cents to remove quantization error"""
+    # noise = scipy.stats.triang.rvs(c=0.5,
+    #                                loc=-CENTS_PER_BIN,
+    #                                scale=2 * CENTS_PER_BIN,
+    #                                size=cents.size())
+
+    # Torch rewrite of the triangular distribution. Exact only for c=0.5; approximate otherwise.
+    c = 0.5
+    loc = -CENTS_PER_BIN
+    scale = 2 * CENTS_PER_BIN
+    u = torch.rand(cents.size())
+    # f = (c - u) / (scale / 2) if u < c else (u - c) / (scale / 2)
+    f = torch.where(u < c, (c - u) / (scale / 2), (u - c) / (scale / 2))
+    noise = 2 * scale * ((1 - f.abs()) ** 0.5) + loc
+    mask = u >= c
+    noise[mask] = 2 * (scale - noise[mask])
+    return cents + cents.new_tensor(noise)
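The file is marked experimental, and the comment only claims the triangular rewrite is exact at c = 0.5, so it is worth checking empirically what dither() actually adds before reusing it: a faithful sample of triang(c=0.5, loc=-CENTS_PER_BIN, scale=2*CENTS_PER_BIN) must stay within plus or minus CENTS_PER_BIN of zero with mean near 0. A sketch of such a check (hypothetical test, using dither from above):

import torch

noise = dither(torch.zeros(100_000))  # zero input isolates the noise term
print(float(noise.min()), float(noise.max()), float(noise.mean()))
# Expected for the scipy reference: min near -20, max near +20, mean near 0.
# Values outside that range mean the rewrite deviates and needs revisiting.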
+def cents_to_frequency(cents):
+    """Converts cents to frequency in Hz"""
+    return 10 * 2 ** (cents / 1200)
+
+
+def periodicity(probabilities, bins):
+    """Computes the periodicity from the network output and pitch bins"""
+    # shape=(batch * time / hop_length, 360)
+    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)
+
+    # shape=(batch * time / hop_length, 1)
+    bins_stacked = bins.reshape(-1, 1).to(torch.int64)
+
+    # Use maximum logit over pitch bins as periodicity
+    periodicity = probs_stacked.gather(1, bins_stacked)
+
+    # shape=(batch, time / hop_length)
+    return periodicity.reshape(probabilities.size(0), probabilities.size(2))
+
+
+def cents_to_bins(cents, quantize_fn=torch.floor):
+    """Converts cents to pitch bins"""
+    bins = (cents - 1997.3794084376191) / CENTS_PER_BIN
+    return quantize_fn(bins).int()
+
+
+def frequency_to_bins(frequency, quantize_fn=torch.floor):
+    """Convert frequency in Hz to pitch bins"""
+    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
+
+
+def frequency_to_cents(frequency):
+    """Convert frequency in Hz to cents"""
+    return 1200 * torch.log2(frequency / 10.)
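The conversion helpers are inverses up to quantization: frequency_to_cents(f) = 1200 * log2(f / 10) and cents_to_frequency undo each other exactly, while the bin helpers quantize to CENTS_PER_BIN = 20 cents (and bins_to_cents re-adds dither, so that leg is stochastic). A round-trip sketch using the functions above:

import torch

f = torch.tensor(440.0)           # A4
cents = frequency_to_cents(f)      # 1200 * log2(44), about 6551.3 cents
back = cents_to_frequency(cents)   # exactly 440.0 again
bin_idx = frequency_to_bins(f)     # floor((6551.3 - 1997.4) / 20) = 227
print(float(cents), float(back), int(bin_idx))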