WIP: local mic

2025-01-23 21:45:00 +03:00 · 2023-02-18 04:15:34 +09:00 · 2023-02-18 04:15:34 +09:00 · b97dc18654
commit b97dc18654
parent be9695307a
13 changed files with 316 additions and 26 deletions
--- a/client/demo/dist/index.js
+++ b/client/demo/dist/index.js
--- a/client/demo/src/103_device_setting.tsx
+++ b/client/demo/src/103_device_setting.tsx
@ -1,4 +1,4 @@
-import { fileSelectorAsDataURL, useIndexedDB } from "@dannadori/voice-changer-client-js"
+import { fileSelectorAsDataURL, ServerAudioDevice, useIndexedDB } from "@dannadori/voice-changer-client-js"
 import React, { useEffect, useMemo, useRef, useState } from "react"
 import { AUDIO_ELEMENT_FOR_PLAY_RESULT, AUDIO_ELEMENT_FOR_TEST_CONVERTED, AUDIO_ELEMENT_FOR_TEST_CONVERTED_ECHOBACK, AUDIO_ELEMENT_FOR_TEST_ORIGINAL, INDEXEDDB_KEY_AUDIO_OUTPUT } from "./const"
 import { useAppState } from "./001_provider/001_AppStateProvider";
@ -60,6 +60,7 @@ export const useDeviceSetting = (): DeviceSettingState => {
    const [inputAudioDeviceInfo, setInputAudioDeviceInfo] = useState<MediaDeviceInfo[]>([])
    const [outputAudioDeviceInfo, setOutputAudioDeviceInfo] = useState<MediaDeviceInfo[]>([])
    const [serverInputAudioDeviceInfo, setServerInputAudioDeviceInfo] = useState<ServerAudioDevice[]>([])
    const [audioInputForGUI, setAudioInputForGUI] = useState<string>("none")
    const [audioOutputForGUI, setAudioOutputForGUI] = useState<string>("none")
@ -70,15 +71,27 @@ export const useDeviceSetting = (): DeviceSettingState => {
    const [outputRecordingStarted, setOutputRecordingStarted] = useState<boolean>(false)
    const [useServerMicrophone, setUseServerMicrophone] = useState<boolean>(false)
    // リスト内の
    useEffect(() => {
        const initialize = async () => {
            const audioInfo = await reloadDevices()
            setInputAudioDeviceInfo(audioInfo[0])
            setOutputAudioDeviceInfo(audioInfo[1])
            if (useServerMicrophone) {
                try {
                    const serverDevices = await appState.serverSetting.getServerDevices()
                    setServerInputAudioDeviceInfo(serverDevices.audio_input_devices)
                } catch (e) {
                    console.warn(e)
                }
            }
        }
        initialize()
-    }, [])
+    }, [useServerMicrophone])
    // Apply the cached setting (this probably also fires on every settings change, but that should be harmless)
    useEffect(() => {
        if (typeof appState.clientSetting.setting.audioInput == "string") {
            if (inputAudioDeviceInfo.find(x => {
@ -92,6 +105,9 @@ export const useDeviceSetting = (): DeviceSettingState => {
    }, [inputAudioDeviceInfo, appState.clientSetting.setting.audioInput])
    const audioInputRow = useMemo(() => {
        if (useServerMicrophone) {
            return <></>
        }
        return (
            <div className="body-row split-3-7 left-padding-1  guided">
                <div className="body-item-title left-padding-1">AudioInput</div>
@ -108,14 +124,41 @@ export const useDeviceSetting = (): DeviceSettingState => {
                </div>
            </div>
        )
-    }, [inputAudioDeviceInfo, audioInputForGUI, appState.clientSetting.setting.audioInput])
+    }, [inputAudioDeviceInfo, audioInputForGUI, useServerMicrophone])
    // Selector row for the server-side microphone; renders nothing unless server-mic mode is on.
    const audioInputServerRow = useMemo(() => {
        if (!useServerMicrophone) {
            return <></>
        }
        return (
            <div className="body-row split-3-7 left-padding-1  guided">
                <div className="body-item-title left-padding-1">AudioInput(Server)</div>
                <div className="body-select-container">
                    <select className="body-select" value={audioInputForGUI} onChange={(e) => {
                        setAudioInputForGUI(e.target.value)
                    }}>
                        {
                            serverInputAudioDeviceInfo.map(x => {
                                // Key by device index rather than name: names are not guaranteed
                                // to be unique, and duplicate keys break React's list reconciliation.
                                // The index is also the value that identifies the device to the server.
                                return <option key={x.index} value={x.index}>{x.name}</option>
                            })
                        }
                    </select>
                </div>
            </div>
        )
    }, [serverInputAudioDeviceInfo, audioInputForGUI, useServerMicrophone])
    // Push the GUI input selection to the browser-side client or, in server-mic mode, to the server.
    useEffect(() => {
        if (audioInputForGUI == "file") {
            // file selector (audioMediaInputRow)
            return
        }
        if (!useServerMicrophone) {
            appState.clientSetting.setAudioInput(audioInputForGUI)
            return
        }
        console.log("server mic")
        // Server-mic mode: detach the client-side input first.
        appState.clientSetting.setAudioInput(null)
        // The GUI value may still be "none" or a browser device id right after the toggle;
        // only forward it when it really is a server device index.
        const deviceIndex = Number(audioInputForGUI)
        if (Number.isInteger(deviceIndex)) {
            appState.serverSetting.setServerMicrophone(deviceIndex)
        }
        // useServerMicrophone is read above, so it must be a dependency; otherwise toggling
        // the checkbox would not take effect until the selection changes.
    }, [appState.audioContext, audioInputForGUI, appState.clientSetting.setAudioInput, useServerMicrophone])
@ -294,10 +337,16 @@ export const useDeviceSetting = (): DeviceSettingState => {
                        <span className="title" onClick={() => { appState.frontendManagerState.stateControls.openDeviceSettingCheckbox.updateState(!appState.frontendManagerState.stateControls.openDeviceSettingCheckbox.checked()) }}>
                            Device Setting
                        </span>
                        <span className="belongings">
                            <input className="belongings-checkbox" type="checkbox" checked={useServerMicrophone} onChange={(e) => {
                                setUseServerMicrophone(e.target.checked)
                            }} /> use server mic (Experimental)
                        </span>
                    </div>
                    <div className="partition-content">
                        {audioInputRow}
                        {audioInputServerRow}
                        {audioMediaInputRow}
                        {audioOutputRow}
                        {audioOutputRecordingRow}
@ -305,7 +354,7 @@ export const useDeviceSetting = (): DeviceSettingState => {
                </div>
            </>
        )
-    }, [audioInputRow, audioMediaInputRow, audioOutputRow, audioOutputRecordingRow])
+    }, [audioInputRow, audioInputServerRow, audioMediaInputRow, audioOutputRow, audioOutputRecordingRow, useServerMicrophone])
    // Runs when recorded output data (from the worklet) has been stored
--- a/client/demo/src/css/App.css
+++ b/client/demo/src/css/App.css
@ -97,6 +97,17 @@ body {
            .title {
                font-size: 1.1rem;
            }
            /* Secondary controls placed next to .title in the same header (e.g. a checkbox with its label). */
            .belongings {
                font-weight: 400;
                font-size: 0.8rem;
                display: flex;
                flex-direction: row;
                /* Children are aligned to the bottom edge of the row. */
                align-items: flex-end;
                margin-left: 10px;
                .belongings-checkbox {
                    /* Lifts the checkbox 3px off the bottom edge it is aligned to. */
                    margin-bottom: 3px;
                }
            }
        }
        .partition-content {
            position: static;
--- a/client/lib/src/AudioStreamer.ts
+++ b/client/lib/src/AudioStreamer.ts
@ -55,7 +55,10 @@ export class AudioStreamer extends Duplex {
            this.socket.on('connect_error', (err) => {
                this.audioStreamerListeners.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_SIO_CONNECT_FAILED, `[SIO] rconnection failed ${err}`)
            })
-            this.socket.on('connect', () => console.log(`[SIO] sonnect to ${this.serverUrl}`));
+            this.socket.on('connect', () => {
                console.log(`[SIO] sonnect to ${this.serverUrl}`)
                console.log(`[SIO] ${this.socket?.id}`)
            });
            this.socket.on('response', (response: any[]) => {
                const cur = Date.now()
                const responseTime = cur - response[0]
@ -104,6 +107,11 @@ export class AudioStreamer extends Duplex {
        }
    }
    /** Returns the id of the current socket, or undefined when `this.socket` is not set. */
    getSocketId = () => this.socket?.id
    // Main Process
    //// Pipe from mic stream 
--- a/client/lib/src/ServerConfigurator.ts
+++ b/client/lib/src/ServerConfigurator.ts
@ -1,4 +1,4 @@
-import { ServerInfo, ServerSettingKey } from "./const";
+import { ServerAudioDevices, ServerInfo, ServerSettingKey } from "./const";
 type FileChunk = {
@ -127,14 +127,14 @@ export class ServerConfigurator {
    // Local Mic
-    getServerMicrophones = async () => {
+    getServerDevices = async () => {
-        const url = this.serverUrl + "/microphone"
+        const url = this.serverUrl + "/device"
-        const info = await new Promise<ServerInfo>((resolve) => {
+        const info = await new Promise<ServerAudioDevices>((resolve) => {
            const request = new Request(url, {
                method: 'GET',
            });
            fetch(request).then(async (response) => {
-                const json = await response.json() as ServerInfo
+                const json = await response.json() as ServerAudioDevices
                resolve(json)
            })
        })
--- a/client/lib/src/VoiceChangerClient.ts
+++ b/client/lib/src/VoiceChangerClient.ts
@ -116,8 +116,9 @@ export class VoiceChangerClient {
    // forceVfDisable is for the condition that vf is enabled in constructor. 
    //noiseSuppression2 => VoiceFocus
-    setup = async (input: string | MediaStream, bufferSize: BufferSize, echoCancel: boolean = true, noiseSuppression: boolean = true, noiseSuppression2: boolean = false) => {
+    setup = async (input: string | MediaStream | null, bufferSize: BufferSize, echoCancel: boolean = true, noiseSuppression: boolean = true, noiseSuppression2: boolean = false) => {
        const lockNum = await this.lock()
        console.log(`Input Setup=> echo: ${echoCancel}, noise1: ${noiseSuppression}, noise2: ${noiseSuppression2}`)
        // condition check
        if (!this.vcNode) {
@ -131,6 +132,17 @@ export class VoiceChangerClient {
            this.currentMediaStream.getTracks().forEach(x => { x.stop() })
            this.currentMediaStream = null
        }
        //// Input デバイスがnullの時はmicStreamを止めてリターン
        if (!input) {
            console.log(`Input Setup=> client mic is disabled.`)
            if (this.micStream) {
                this.micStream.pauseRecording()
            }
            await this.unlock(lockNum)
            return
        }
        if (typeof input == "string") {
            this.currentMediaStream = await navigator.mediaDevices.getUserMedia({
                audio: {
@ -302,6 +314,14 @@ export class VoiceChangerClient {
        return this.configurator.getSettings()
    }
    /** Delegates to the configurator to fetch the server's audio device list. */
    getServerDevices = () => this.configurator.getServerDevices()
    /** Returns the socket id reported by the audio streamer. */
    getSocketId = () => this.audioStreamer.getSocketId()
 }
--- a/client/lib/src/const.ts
+++ b/client/lib/src/const.ts
@ -25,6 +25,7 @@ export type VoiceChangerServerSetting = {
    f0Factor: number
    f0Detector: string // dio or harvest
    recordIO: number // 0:off, 1:on
    serverMicProps: string
 }
 export type VoiceChangerClientSetting = {
@ -81,9 +82,20 @@ export type ServerInfo = {
    f0Factor: number
    f0Detector: string
    recordIO: number
    serverMicProps: string
 }
 /** One audio device entry as returned by the server's device listing. */
 export type ServerAudioDevice = {
    /** Device kind as reported by the server (possible values are not visible here). */
    kind: string;
    /** Numeric index of the device on the server. */
    index: number;
    /** Device name. */
    name: string;
    /** Host API label reported by the server for this device. */
    hostAPI: string;
 }
 /** Server audio devices, split into input and output lists. */
 export type ServerAudioDevices = {
    audio_input_devices: Array<ServerAudioDevice>;
    audio_output_devices: Array<ServerAudioDevice>;
 }
@ -155,7 +167,8 @@ export const ServerSettingKey = {
    "onnxExecutionProvider": "onnxExecutionProvider",
    "f0Factor": "f0Factor",
    "f0Detector": "f0Detector",
-    "recordIO": "recordIO"
+    "recordIO": "recordIO",
    "serverMicProps": "serverMicProps",
 } as const
 export type ServerSettingKey = typeof ServerSettingKey[keyof typeof ServerSettingKey]
@ -174,7 +187,9 @@ export const DefaultVoiceChangerServerSetting: VoiceChangerServerSetting = {
    f0Factor: 1.0,
    onnxExecutionProvider: "CPUExecutionProvider",
    f0Detector: "dio",
-    recordIO: 0
+    recordIO: 0,
    serverMicProps: ""
 }
 export const DefaultVoiceChangerClientSetting: VoiceChangerClientSetting = {
--- a/client/lib/src/hooks/useClientSetting.ts
+++ b/client/lib/src/hooks/useClientSetting.ts
@ -1,7 +1,6 @@
 import { useState, useMemo, useRef, useEffect } from "react"
 import { VoiceChangerClientSetting, Protocol, BufferSize, VoiceChangerMode, SampleRate, Speaker, DefaultVoiceChangerClientSetting, INDEXEDDB_KEY_CLIENT, Correspondence, DownSamplingMode } from "../const"
 import { createDummyMediaStream } from "../util"
 import { VoiceChangerClient } from "../VoiceChangerClient"
 import { useIndexedDB } from "./useIndexedDB"
@ -113,9 +112,7 @@ export const useClientSetting = (props: UseClientSettingProps): ClientSettingSta
    const _setInput = async () => {
        if (!props.voiceChangerClient) return
        if (!settingRef.current.audioInput || settingRef.current.audioInput == "none") {
-            // console.log("[useClient] setup!(1)", settingRef.current.audioInput)
+            await props.voiceChangerClient.setup(null, settingRef.current.bufferSize, settingRef.current.echoCancel, settingRef.current.noiseSuppression, settingRef.current.noiseSuppression2)
            const ms = createDummyMediaStream(props.audioContext!)
            await props.voiceChangerClient.setup(ms, settingRef.current.bufferSize, settingRef.current.echoCancel, settingRef.current.noiseSuppression, settingRef.current.noiseSuppression2)
        } else {
            // console.log("[useClient] setup!(2)", settingRef.current.audioInput)
            await props.voiceChangerClient.setup(settingRef.current.audioInput, settingRef.current.bufferSize, settingRef.current.echoCancel, settingRef.current.noiseSuppression, settingRef.current.noiseSuppression2)
--- a/client/lib/src/hooks/useServerSetting.ts
+++ b/client/lib/src/hooks/useServerSetting.ts
@ -1,5 +1,5 @@
 import { useState, useMemo, useRef, useEffect } from "react"
-import { VoiceChangerServerSetting, ServerInfo, Framework, OnnxExecutionProvider, DefaultVoiceChangerServerSetting, ServerSettingKey, INDEXEDDB_KEY_SERVER, INDEXEDDB_KEY_MODEL_DATA } from "../const"
+import { VoiceChangerServerSetting, ServerInfo, Framework, OnnxExecutionProvider, DefaultVoiceChangerServerSetting, ServerSettingKey, INDEXEDDB_KEY_SERVER, INDEXEDDB_KEY_MODEL_DATA, ServerAudioDevices } from "../const"
 import { VoiceChangerClient } from "../VoiceChangerClient"
 import { useIndexedDB } from "./useIndexedDB"
@ -51,9 +51,11 @@ export type ServerSettingState = {
    setF0Factor: (num: number) => Promise<boolean>;
    setF0Detector: (val: string) => Promise<boolean>;
    setRecordIO: (num: number) => Promise<boolean>;
    setServerMicrophone: (index: number) => Promise<boolean | undefined>
    reloadServerInfo: () => Promise<void>;
    setFileUploadSetting: (val: FileUploadSetting) => void
    loadModel: () => Promise<void>
    getServerDevices: () => Promise<ServerAudioDevices>
    uploadProgress: number
    isUploading: boolean
 }
@ -218,6 +220,19 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
            return await _set_and_store(ServerSettingKey.recordIO, "" + num)
        }
    }, [props.voiceChangerClient])
    // Setter that stores which server-side microphone to use, tagged with this client's socket id.
    // Resolves to undefined when no voice changer client is available.
    const setServerMicrophone = useMemo(() => {
        return async (index: number) => {
            const client = props.voiceChangerClient
            if (!client) {
                return
            }
            // Socket id and device index travel together as one JSON string setting.
            const payload = JSON.stringify({
                sid: client.getSocketId(),
                deviceIndex: index
            })
            return await _set_and_store(ServerSettingKey.serverMicProps, payload)
        }
    }, [props.voiceChangerClient])
    //////////////
    // 操作
    /////////////
@ -368,6 +383,16 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
        await removeItem(INDEXEDDB_KEY_MODEL_DATA)
    }
    // Fetches the server's audio devices; yields empty lists while no client is available.
    const getServerDevices = async (): Promise<ServerAudioDevices> => {
        const client = props.voiceChangerClient
        if (client) {
            const devices = await client.getServerDevices()
            return devices
        }
        const noDevices: ServerAudioDevices = {
            audio_input_devices: [],
            audio_output_devices: []
        }
        return noDevices
    }
    return {
        setting,
@ -387,9 +412,11 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
        setF0Factor,
        setF0Detector,
        setRecordIO,
        setServerMicrophone,
        reloadServerInfo,
        setFileUploadSetting,
        loadModel,
        getServerDevices,
        uploadProgress,
        isUploading,
    }
--- a/client/native/cli_client.py
+++ b/client/native/cli_client.py
@ -0,0 +1,122 @@
 import argparse
 import pyaudio
 import wave
 import struct
 import socketio
 import ssl
 from datetime import datetime
 import time
 # TLS 1.2 context with certificate verification disabled.
 # NOTE(review): nothing in the visible part of this script uses `context`;
 # the socket.io client below is created with ssl_verify=False instead. Confirm whether it is still needed.
 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
 context.verify_mode = ssl.CERT_NONE
 def setupArgParser():
    """Build the command-line parser.

    Options: -p port (int, default 18888), -d input device index (int),
    -s sid (str, default "").
    """
    option_specs = (
        ("-p", dict(type=int, default=18888, help="port")),
        ("-d", dict(type=int, help="device index")),
        ("-s", dict(type=str, default="", help="sid")),
    )
    arg_parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        arg_parser.add_argument(flag, **spec)
    return arg_parser
 class MockStream:
    """
    Mock that replaces audio-stream input/output with WAV-file input/output.

    read() returns two dummy silent buffers, then the frames of the input
    file, then silent buffers again once the file is exhausted. After the
    trailing dummy reads are used up, `vc_end_flag` is set to True.
    """
    def __init__(self, sampling_rate):
        self.sampling_rate = sampling_rate
        self.start_count = 2
        self.end_count = 2
        self.fr = None
        self.fw = None
        # True once the input is exhausted and the trailing dummy reads are done.
        self.vc_end_flag = False
    def open_inputfile(self, input_filename):
        """Open `input_filename` as the WAV source for read()."""
        self.fr = wave.open(input_filename, 'rb')
    def open_outputfile(self, output_filename):
        """Open `output_filename` as a mono, 16-bit WAV sink at the configured sampling rate."""
        self.fw = wave.open(output_filename, 'wb')
        self.fw.setnchannels(1)
        self.fw.setsampwidth(2)
        self.fw.setframerate(self.sampling_rate)
    def read(self, length, exception_on_overflow=False):
        """Return up to `length` frames as bytes (2 bytes per frame); silence when padding.

        `exception_on_overflow` is accepted for call compatibility and ignored.
        """
        if self.start_count > 0:
            wav = bytes(length * 2)
            self.start_count -= 1  # the first two reads send dummy empty data
        else:
            wav = self.fr.readframes(length)
        if len(wav) <= 0:  # once the data runs out, keep sending dummy empty data
            wav = bytes(length * 2)
            self.end_count -= 1
            if self.end_count < 0:
                self._mark_finished()
        return wav
    def _mark_finished(self):
        """Record end-of-input without depending on a global that this module may not define."""
        self.vc_end_flag = True
        # The original unconditionally assigned Hyperparameters.VC_END_FLAG, which raises
        # NameError when no `Hyperparameters` exists; keep that behavior only if it does.
        hyperparameters = globals().get("Hyperparameters")
        if hyperparameters is not None:
            hyperparameters.VC_END_FLAG = True
    def write(self, wav):
        """Append raw frame bytes to the output file."""
        self.fw.writeframes(wav)
    def stop_stream(self):
        """No-op; present so the mock can stand in for an audio stream."""
        pass
    def close(self):
        """Close whichever files are open; safe to call more than once."""
        if self.fr is not None:
            self.fr.close()
            self.fr = None
        if self.fw is not None:
            self.fw.close()
            self.fw = None
 # Module-level output sink, created at import time: a 24000 Hz WAV file named test.wav.
 # MyCustomNamespace.on_response writes the received audio bytes into it.
 mock_stream_out = MockStream(24000)
 mock_stream_out.open_outputfile("test.wav")
 class MyCustomNamespace(socketio.ClientNamespace):  # handlers bound to one socket.io namespace
    """Logs connect/disconnect/response events and stores response audio in mock_stream_out."""
    def on_connect(self):
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('[{}] connect'.format(now))
    def on_disconnect(self):
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('[{}] disconnect'.format(now))
    def on_response(self, msg):
        """Handle a response message: msg[0] is a timestamp, msg[1] the audio bytes."""
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print('[{}] response : {}'.format(now, msg))
        timestamp, data = msg[0], msg[1]
        # Decoded as little-endian int16 samples; the decoded values are not used afterwards.
        sample_count = len(data) // struct.calcsize('<h')
        unpackedData = struct.unpack('<%sh' % sample_count, data)
        mock_stream_out.write(data)
 def my_background_task(sio):  # placeholder for background processing
    """Emit "aaa" as a 'broadcast_message' event on namespace /test once per second, forever."""
    event, payload, target_namespace = 'broadcast_message', "aaa", "/test"
    while True:
        sio.emit(event, payload, namespace=target_namespace)
        sio.sleep(1)
 if __name__ == '__main__':
    # Capture 24 kHz mono int16 audio from the chosen input device and stream it
    # to the server as 'request_message' events on namespace /test.
    parser = setupArgParser()
    args, unknown = parser.parse_known_args()
    port = args.p
    deviceIndex = args.d
    sid = args.s  # parsed but not used by this script yet
    audio = pyaudio.PyAudio()
    audio_input_stream = audio.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=24000,
        frames_per_buffer=4096,
        input_device_index=deviceIndex,
        input=True)
    sio = socketio.Client(ssl_verify=False)
    sio.register_namespace(MyCustomNamespace("/test"))
    try:
        # Honor the -p option (its default, 18888, matches the previously hard-coded port).
        sio.connect("https://192.168.0.3:%d" % port)
        while True:
            in_wav = audio_input_stream.read(4096, exception_on_overflow=False)
            # read() already returns raw int16 PCM bytes. Re-packing each byte with
            # struct.pack('<Nh', *in_wav) would turn every byte into its own 16-bit
            # sample, doubling the payload and corrupting the audio, so send as-is.
            sio.emit('request_message', [1000, in_wav], namespace="/test")
            # sio.start_background_task(my_background_task, sio)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the capture loop.
        pass
    finally:
        # Release the device and socket, and close the WAV sink so the file gets closed properly.
        audio_input_stream.stop_stream()
        audio_input_stream.close()
        audio.terminate()
        if getattr(sio, "connected", False):
            sio.disconnect()
        mock_stream_out.close()
--- a/server/restapi/MMVC_Rest_VoiceChanger.py
+++ b/server/restapi/MMVC_Rest_VoiceChanger.py
@ -23,11 +23,11 @@ class MMVC_Rest_VoiceChanger:
        self.voiceChangerManager = voiceChangerManager
        self.router = APIRouter()
        self.router.add_api_route("/test", self.test, methods=["POST"])
-        self.router.add_api_route("/microphone", self.get_microphone, methods=["GET"])
+        self.router.add_api_route("/device", self.get_device, methods=["GET"])
        self.tlock = threading.Lock()
-    def get_microphone(self):
+    def get_device(self):
        audio = pyaudio.PyAudio()
        audio_input_devices = []
        audio_output_devices = []
--- a/server/sio/MMVC_Namespace.py
+++ b/server/sio/MMVC_Namespace.py
@ -17,7 +17,7 @@ class MMVC_Namespace(socketio.AsyncNamespace):
        return cls._instance
    def on_connect(self, sid, environ):
-        # print('[{}] connet sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S') , sid))
+        print('[{}] connet sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), sid))
        pass
    async def on_request_message(self, sid, msg):
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@ -29,6 +29,10 @@ import librosa
 import librosa.display
 SAMPLING_RATE = 24000
 import pyaudio
 import json
 from multiprocessing import Process, Queue
 class MockStream:
    """
@ -93,6 +97,7 @@ class VocieChangerSettings():
    f0Factor: float = 1.0
    f0Detector: str = "dio"  # dio or harvest
    recordIO: int = 1  # 0:off, 1:on
    serverMicProps: str = ""
    pyTorchModelFile: str = ""
    onnxModelFile: str = ""
@ -101,7 +106,11 @@ class VocieChangerSettings():
    # ↓mutableな物だけ列挙
    intData = ["gpu", "srcId", "dstId", "convertChunkNum", "minConvertSize", "recordIO"]
    floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "crossFadeOverlapRate", "f0Factor"]
-    strData = ["framework", "f0Detector"]
+    strData = ["framework", "f0Detector", "serverMicProps"]
 def readMicrophone(queue, sid, deviceIndex):
    """Target of the server-microphone reader process; currently it only prints its arguments."""
    label = "READ MIC"
    print(label, queue, sid, deviceIndex)
 class VoiceChanger():
@ -277,6 +286,35 @@ class VoiceChanger():
            setattr(self.settings, key, float(val))
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
            if key == "serverMicProps":
                if hasattr(self, "serverMicrophoneReaderProcess"):
                    self.serverMicrophoneReaderProcess.terminate()
                if len(val) == 0:
                    print("server mic close")
                    pass
                else:
                    props = json.loads(val)
                    print(props)
                    sid = props["sid"]
                    deviceIndex = props["deviceIndex"]
                    self.serverMicrophoneReaderProcessQueue = Queue()
                    self.serverMicrophoneReaderProcess = Process(target=readMicrophone, args=(
                        self.serverMicrophoneReaderProcessQueue, sid, deviceIndex,))
                    self.serverMicrophoneReaderProcess.start()
                    try:
                        print(sid, deviceIndex)
                    except Exception as e:
                        print(e)
                # audio = pyaudio.PyAudio()
                # audio_input_stream = audio.open(format=pyaudio.paInt16,
                #                                 channels=1,
                #                                 rate=SAMPLING_RATE,
                #                                 frames_per_buffer=4096,
                #                                 input_device_index=val,
                #                                 input=True)
        else:
            print(f"{key} is not mutalbe variable!")
@ -626,3 +664,6 @@ class VoiceChanger():
            # print(audio1)
        return audio1
    def __del__(self):
        """Print a marker when the instance is finalized."""
        marker = "DESTRUCTOR"
        print(marker)