add downsampling mode

This commit is contained in:
wataru 2023-02-14 22:32:25 +09:00
parent 9d84046a77
commit 852b4216ca
11 changed files with 734 additions and 92 deletions

View File

@ -1 +1,10 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>Voice Changer Client Demo</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
</body>
</html>

File diff suppressed because one or more lines are too long

View File

@ -1,31 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

View File

@ -10,6 +10,7 @@
"build:prod": "npm-run-all clean webpack:prod",
"build:dev": "npm-run-all clean webpack:dev",
"start": "webpack-dev-server --config webpack.dev.js",
"build:mod": "cd ../lib && npm run build:dev && cd - && cp -r ../lib/dist/* node_modules/@dannadori/voice-changer-client-js/dist/",
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [

View File

@ -30,7 +30,9 @@ export const useMicrophoneOptions = () => {
useEffect(() => {
const createAudioContext = () => {
const ctx = new AudioContext()
const ctx = new AudioContext({
sampleRate: 48000,
})
setAudioContext(ctx)
document.removeEventListener('touchstart', createAudioContext);
document.removeEventListener('mousedown', createAudioContext);

View File

@ -1,4 +1,4 @@
import { BufferSize, Protocol, SampleRate, VoiceChangerMode } from "@dannadori/voice-changer-client-js"
import { BufferSize, DownSamplingMode, Protocol, SampleRate, VoiceChangerMode } from "@dannadori/voice-changer-client-js"
import React, { useMemo, useState } from "react"
import { ClientState } from "@dannadori/voice-changer-client-js";
@ -197,6 +197,26 @@ export const useAdvancedSetting = (props: UseAdvancedSettingProps): AdvancedSett
}, [props.clientState.clientSetting.setting.voiceChangerMode, props.clientState.clientSetting.setVoiceChangerMode])
// Advanced-settings row: <select> bound to the client's downSamplingMode
// ("decimate" | "average"); selecting an option pushes the new mode back
// through clientSetting.setDownSamplingMode.
const downSamplingModeRow = useMemo(() => {
    const modeOptions = Object.values(DownSamplingMode).map((mode) => {
        return <option key={mode} value={mode}>{mode}</option>
    })
    return (
        <div className="body-row split-3-7 left-padding-1 guided">
            <div className="body-item-title left-padding-1 ">DownSamplingMode</div>
            <div className="body-select-container">
                <select className="body-select" value={props.clientState.clientSetting.setting.downSamplingMode} onChange={(e) => {
                    props.clientState.clientSetting.setDownSamplingMode(e.target.value as DownSamplingMode)
                }}>
                    {modeOptions}
                </select>
            </div>
        </div>
    )
}, [props.clientState.clientSetting.setting.downSamplingMode, props.clientState.clientSetting.setDownSamplingMode])
const workletSettingRow = useMemo(() => {
return (
@ -265,9 +285,10 @@ export const useAdvancedSetting = (props: UseAdvancedSettingProps): AdvancedSett
<div className="body-row divider"></div>
{workletSettingRow}
<div className="body-row divider"></div>
{downSamplingModeRow}
</>
)
}, [showAdvancedSetting, mmvcServerUrlRow, protocolRow, sampleRateRow, bufferSizeRow, convertChunkNumRow, minConvertSizeRow, crossFadeOverlapRateRow, crossFadeOffsetRateRow, crossFadeEndRateRow, vfForceDisableRow, voiceChangeModeRow, workletSettingRow])
}, [showAdvancedSetting, mmvcServerUrlRow, protocolRow, sampleRateRow, bufferSizeRow, convertChunkNumRow, minConvertSizeRow, crossFadeOverlapRateRow, crossFadeOffsetRateRow, crossFadeEndRateRow, vfForceDisableRow, voiceChangeModeRow, workletSettingRow, downSamplingModeRow])
const advancedSetting = useMemo(() => {

View File

@ -1,7 +1,7 @@
import { io, Socket } from "socket.io-client";
import { DefaultEventsMap } from "@socket.io/component-emitter";
import { Duplex, DuplexOptions } from "readable-stream";
import { Protocol, VoiceChangerMode, VOICE_CHANGER_CLIENT_EXCEPTION } from "./const";
import { DownSamplingMode, Protocol, VoiceChangerMode, VOICE_CHANGER_CLIENT_EXCEPTION } from "./const";
export type Callbacks = {
onVoiceReceived: (voiceChangerMode: VoiceChangerMode, data: ArrayBuffer) => void
@ -19,6 +19,7 @@ export type AudioStreamerSettings = {
voiceChangerMode: VoiceChangerMode;
}
export class AudioStreamer extends Duplex {
private callbacks: Callbacks
private audioStreamerListeners: AudioStreamerListeners
@ -34,6 +35,11 @@ export class AudioStreamer extends Duplex {
// performance monitor
private bufferStart = 0;
// Flags
// private downSamplingMode: DownSamplingMode = DownSamplingMode.decimate
private downSamplingMode: DownSamplingMode = DownSamplingMode.average
constructor(callbacks: Callbacks, audioStreamerListeners: AudioStreamerListeners, options?: DuplexOptions) {
super(options);
this.callbacks = callbacks
@ -84,6 +90,11 @@ export class AudioStreamer extends Duplex {
this.voiceChangerMode = val
}
// set Flags
// Switch the strategy used to downsample 48kHz microphone input before
// transmission: DownSamplingMode.decimate (drop every other sample) or
// DownSamplingMode.average (box-filter averaging). Read by _write_realtime.
setDownSamplingMode = (val: DownSamplingMode) => {
this.downSamplingMode = val
}
getSettings = (): AudioStreamerSettings => {
return {
serverUrl: this.serverUrl,
@ -107,21 +118,63 @@ export class AudioStreamer extends Duplex {
callback();
}
_averageDownsampleBuffer(buffer: Float32Array, originalSampleRate: number, destinationSamplerate: number) {
if (originalSampleRate == destinationSamplerate) {
return buffer;
}
if (destinationSamplerate > originalSampleRate) {
throw "downsampling rate show be smaller than original sample rate";
}
const sampleRateRatio = originalSampleRate / destinationSamplerate;
const newLength = Math.round(buffer.length / sampleRateRatio);
const result = new Float32Array(newLength);
let offsetResult = 0;
let offsetBuffer = 0;
while (offsetResult < result.length) {
var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
// Use average value of skipped samples
var accum = 0, count = 0;
for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
accum += buffer[i];
count++;
}
result[offsetResult] = accum / count;
// Or you can simply get rid of the skipped samples:
// result[offsetResult] = buffer[nextOffsetBuffer];
offsetResult++;
offsetBuffer = nextOffsetBuffer;
}
return result;
}
private _write_realtime = (buffer: Float32Array) => {
let downsampledBuffer: Float32Array | null = null
if (this.downSamplingMode == DownSamplingMode.decimate) {
//////// (Kind 1) 間引き //////////
// bufferSize個のデータ48Khzが入ってくる。
//// 48000Hz で入ってくるので間引いて24000Hzに変換する。
//// バイトサイズは周波数変換で(x1/2), 16bit(2byte)で(x2)
const arrayBuffer = new ArrayBuffer((buffer.length / 2) * 2)
const dataView = new DataView(arrayBuffer);
downsampledBuffer = new Float32Array(buffer.length / 2);
for (let i = 0; i < buffer.length; i++) {
if (i % 2 == 0) {
let s = Math.max(-1, Math.min(1, buffer[i]));
downsampledBuffer[i / 2] = buffer[i]
}
}
} else {
//////// (Kind 2) 平均 //////////
downsampledBuffer = this._averageDownsampleBuffer(buffer, 48000, 24000)
}
// Float to signed16
const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2)
const dataView = new DataView(arrayBuffer);
for (let i = 0; i < downsampledBuffer.length; i++) {
let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
s = s < 0 ? s * 0x8000 : s * 0x7FFF
// 2分の1個目で2バイトずつ進むので((i/2)*2)
dataView.setInt16((i / 2) * 2, s, true);
}
dataView.setInt16(i * 2, s, true);
}
// 256byte(最低バッファサイズ256から間引いた個数x2byte)をchunkとして管理
const chunkByteSize = 256 // (const.ts ★1)
for (let i = 0; i < arrayBuffer.byteLength / chunkByteSize; i++) {
@ -129,6 +182,7 @@ export class AudioStreamer extends Duplex {
this.requestChunks.push(ab)
}
//// リクエストバッファの中身が、リクエスト送信数と違う場合は処理終了。
if (this.requestChunks.length < this.inputChunkNum) {
return
@ -198,15 +252,7 @@ export class AudioStreamer extends Duplex {
}
private sendBuffer = async (newBuffer: Uint8Array) => {
// if (this.serverUrl.length == 0) {
// // console.warn("no server url")
// // return
// // throw "no server url"
// }
const timestamp = Date.now()
// console.log("REQUEST_MESSAGE:", [this.gpu, this.srcId, this.dstId, timestamp, newBuffer.buffer])
// console.log("SERVER_URL", this.serverUrl, this.protocol)
// const convertChunkNum = this.voiceChangerMode === "realtime" ? this.requestParamas.convertChunkNum : 0
if (this.protocol === "sio") {
if (!this.socket) {
console.warn(`sio is not initialized`)
@ -214,26 +260,12 @@ export class AudioStreamer extends Duplex {
}
// console.log("emit!")
this.socket.emit('request_message', [
// this.requestParamas.gpu,
// this.requestParamas.srcId,
// this.requestParamas.dstId,
timestamp,
// convertChunkNum,
// this.requestParamas.crossFadeLowerValue,
// this.requestParamas.crossFadeOffsetRate,
// this.requestParamas.crossFadeEndRate,
newBuffer.buffer]);
} else {
const res = await postVoice(
this.serverUrl + "/test",
// this.requestParamas.gpu,
// this.requestParamas.srcId,
// this.requestParamas.dstId,
timestamp,
// convertChunkNum,
// this.requestParamas.crossFadeLowerValue,
// this.requestParamas.crossFadeOffsetRate,
// this.requestParamas.crossFadeEndRate,
newBuffer.buffer)
if (res.byteLength < 128 * 2) {
@ -248,24 +280,10 @@ export class AudioStreamer extends Duplex {
export const postVoice = async (
url: string,
// gpu: number,
// srcId: number,
// dstId: number,
timestamp: number,
// convertChunkNum: number,
// crossFadeLowerValue: number,
// crossFadeOffsetRate: number,
// crossFadeEndRate: number,
buffer: ArrayBuffer) => {
const obj = {
// gpu,
// srcId,
// dstId,
timestamp,
// convertChunkNum,
// crossFadeLowerValue,
// crossFadeOffsetRate,
// crossFadeEndRate,
buffer: Buffer.from(buffer).toString('base64')
};
const body = JSON.stringify(obj);
@ -283,7 +301,6 @@ export const postVoice = async (
const changedVoiceBase64 = receivedJson["changedVoiceBase64"]
const buf = Buffer.from(changedVoiceBase64, "base64")
const ab = new ArrayBuffer(buf.length);
// console.log("RECIV", buf.length)
const view = new Uint8Array(ab);
for (let i = 0; i < buf.length; ++i) {
view[i] = buf[i];

View File

@ -3,7 +3,7 @@ import { VoiceChangerWorkletNode, VoiceChangerWorkletListener } from "./VoiceCha
import workerjs from "raw-loader!../worklet/dist/index.js";
import { VoiceFocusDeviceTransformer, VoiceFocusTransformDevice } from "amazon-chime-sdk-js";
import { createDummyMediaStream, validateUrl } from "./util";
import { BufferSize, DefaultVoiceChangerClientSetting, Protocol, ServerSettingKey, VoiceChangerMode, VOICE_CHANGER_CLIENT_EXCEPTION, WorkletSetting } from "./const";
import { BufferSize, DefaultVoiceChangerClientSetting, DownSamplingMode, Protocol, ServerSettingKey, VoiceChangerMode, VOICE_CHANGER_CLIENT_EXCEPTION, WorkletSetting } from "./const";
import MicrophoneStream from "microphone-stream";
import { AudioStreamer, Callbacks, AudioStreamerListeners } from "./AudioStreamer";
import { ServerConfigurator } from "./ServerConfigurator";
@ -133,6 +133,9 @@ export class VoiceChangerClient {
this.currentMediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
deviceId: input,
channelCount: 1,
sampleRate: 48000,
sampleSize: 16,
// echoCancellation: false,
// noiseSuppression: false
}
@ -228,6 +231,10 @@ export class VoiceChangerClient {
setVoiceChangerMode = (val: VoiceChangerMode) => {
this.audioStreamer.setVoiceChangerMode(val)
}
//// Audio Streamer Flag
// Thin delegation: forwards the chosen downsampling mode to the underlying
// AudioStreamer, which applies it when converting input audio for the server.
setDownSamplingMode = (val: DownSamplingMode) => {
this.audioStreamer.setDownSamplingMode(val)
}
// configure worklet
configureWorklet = (setting: WorkletSetting) => {

View File

@ -4,6 +4,7 @@
// 24000sample -> 1sec, 128sample(1chunk) -> 5.333msec
// 187.5chunk -> 1sec
// types
export type VoiceChangerServerSetting = {
convertChunkNum: number, // VITSに入力する変換サイズ。(入力データの2倍以上の大きさで指定。それより小さいものが指定された場合は、サーバ側で自動的に入力の2倍のサイズが設定される。)
@ -35,6 +36,7 @@ export type VoiceChangerClientSetting = {
correspondences: Correspondence[],
forceVfDisable: boolean,
voiceChangerMode: VoiceChangerMode,
downSamplingMode: DownSamplingMode,
inputGain: number
outputGain: number
@ -92,6 +94,12 @@ export const VoiceChangerMode = {
} as const
export type VoiceChangerMode = typeof VoiceChangerMode[keyof typeof VoiceChangerMode]
// Selectable strategies for downsampling microphone input (48kHz -> 24kHz)
// before it is sent to the server:
//   decimate — keep every other sample (fast, may alias)
//   average  — box-filter average of skipped samples (smoother)
export const DownSamplingMode = {
    decimate: "decimate",
    average: "average",
} as const
export type DownSamplingMode = typeof DownSamplingMode[keyof typeof DownSamplingMode]
export const SampleRate = {
"48000": 48000,
} as const
@ -186,6 +194,7 @@ export const DefaultVoiceChangerClientSetting: VoiceChangerClientSetting = {
correspondences: [],
forceVfDisable: false,
voiceChangerMode: "realtime",
downSamplingMode: "average",
inputGain: 1.0,
outputGain: 1.0
}

View File

@ -1,5 +1,6 @@
import { useState, useMemo, useRef, useEffect } from "react"
import { VoiceChangerClientSetting, Protocol, BufferSize, VoiceChangerMode, SampleRate, Speaker, DefaultVoiceChangerClientSetting, INDEXEDDB_KEY_CLIENT, Correspondence } from "../const"
import { VoiceChangerClientSetting, Protocol, BufferSize, VoiceChangerMode, SampleRate, Speaker, DefaultVoiceChangerClientSetting, INDEXEDDB_KEY_CLIENT, Correspondence, DownSamplingMode } from "../const"
import { createDummyMediaStream } from "../util"
import { VoiceChangerClient } from "../VoiceChangerClient"
import { useIndexedDB } from "./useIndexedDB"
@ -19,6 +20,7 @@ export type ClientSettingState = {
setVfForceDisabled: (vfForceDisabled: boolean) => Promise<void>
setInputChunkNum: (num: number) => void;
setVoiceChangerMode: (mode: VoiceChangerMode) => void
setDownSamplingMode: (mode: DownSamplingMode) => void
setSampleRate: (num: SampleRate) => void
setSpeakers: (speakers: Speaker[]) => void
setCorrespondences: (file: File | null) => Promise<void>
@ -165,6 +167,17 @@ export const useClientSetting = (props: UseClientSettingProps): ClientSettingSta
}
}, [props.voiceChangerClient])
// Memoized setter for the downsampling mode: pushes the new mode to the
// VoiceChangerClient, mirrors it into settingRef, then publishes a fresh
// setting object so React re-renders (and the IndexedDB persistence effect
// elsewhere in this hook picks it up).
// NOTE(review): silently no-ops until voiceChangerClient is created.
const setDownSamplingMode = useMemo(() => {
return (mode: DownSamplingMode) => {
if (!props.voiceChangerClient) return
props.voiceChangerClient.setDownSamplingMode(mode)
settingRef.current.downSamplingMode = mode
setSetting({ ...settingRef.current })
}
}, [props.voiceChangerClient])
const setSampleRate = useMemo(() => {
return (num: SampleRate) => {
if (!props.voiceChangerClient) return
@ -261,6 +274,7 @@ export const useClientSetting = (props: UseClientSettingProps): ClientSettingSta
setVfForceDisabled,
setInputChunkNum,
setVoiceChangerMode,
setDownSamplingMode,
setSampleRate,
setSpeakers,
setCorrespondences,

View File

@ -401,7 +401,7 @@ class VoiceChanger():
def on_request(self, unpackedData: any):
convertSize = self.settings.convertChunkNum * 128 # 128sample/1chunk
# self.stream_in.write(unpackedData.astype(np.int16).tobytes())
self.stream_in.write(unpackedData.astype(np.int16).tobytes())
# print("convsize:", unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate))
if unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate) + 1024 > convertSize:
convertSize = int(unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate)) + 1024