WIP: local mic

This commit is contained in:
wataru 2023-02-18 04:15:34 +09:00
parent be9695307a
commit b97dc18654
13 changed files with 316 additions and 26 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,4 @@
import { fileSelectorAsDataURL, useIndexedDB } from "@dannadori/voice-changer-client-js"
import { fileSelectorAsDataURL, ServerAudioDevice, useIndexedDB } from "@dannadori/voice-changer-client-js"
import React, { useEffect, useMemo, useRef, useState } from "react"
import { AUDIO_ELEMENT_FOR_PLAY_RESULT, AUDIO_ELEMENT_FOR_TEST_CONVERTED, AUDIO_ELEMENT_FOR_TEST_CONVERTED_ECHOBACK, AUDIO_ELEMENT_FOR_TEST_ORIGINAL, INDEXEDDB_KEY_AUDIO_OUTPUT } from "./const"
import { useAppState } from "./001_provider/001_AppStateProvider";
@ -60,6 +60,7 @@ export const useDeviceSetting = (): DeviceSettingState => {
const [inputAudioDeviceInfo, setInputAudioDeviceInfo] = useState<MediaDeviceInfo[]>([])
const [outputAudioDeviceInfo, setOutputAudioDeviceInfo] = useState<MediaDeviceInfo[]>([])
const [serverInputAudioDeviceInfo, setServerInputAudioDeviceInfo] = useState<ServerAudioDevice[]>([])
const [audioInputForGUI, setAudioInputForGUI] = useState<string>("none")
const [audioOutputForGUI, setAudioOutputForGUI] = useState<string>("none")
@ -70,15 +71,27 @@ export const useDeviceSetting = (): DeviceSettingState => {
const [outputRecordingStarted, setOutputRecordingStarted] = useState<boolean>(false)
const [useServerMicrophone, setUseServerMicrophone] = useState<boolean>(false)
// リスト内の
useEffect(() => {
const initialize = async () => {
const audioInfo = await reloadDevices()
setInputAudioDeviceInfo(audioInfo[0])
setOutputAudioDeviceInfo(audioInfo[1])
if (useServerMicrophone) {
try {
const serverDevices = await appState.serverSetting.getServerDevices()
setServerInputAudioDeviceInfo(serverDevices.audio_input_devices)
} catch (e) {
console.warn(e)
}
}
}
initialize()
}, [])
}, [useServerMicrophone])
// キャッシュの設定は反映(たぶん、設定操作の時も起動していしまう。が問題は起こらないはず)
useEffect(() => {
if (typeof appState.clientSetting.setting.audioInput == "string") {
if (inputAudioDeviceInfo.find(x => {
@ -92,6 +105,9 @@ export const useDeviceSetting = (): DeviceSettingState => {
}, [inputAudioDeviceInfo, appState.clientSetting.setting.audioInput])
const audioInputRow = useMemo(() => {
if (useServerMicrophone) {
return <></>
}
return (
<div className="body-row split-3-7 left-padding-1 guided">
<div className="body-item-title left-padding-1">AudioInput</div>
@ -108,14 +124,41 @@ export const useDeviceSetting = (): DeviceSettingState => {
</div>
</div>
)
}, [inputAudioDeviceInfo, audioInputForGUI, appState.clientSetting.setting.audioInput])
}, [inputAudioDeviceInfo, audioInputForGUI, useServerMicrophone])
const audioInputServerRow = useMemo(() => {
if (!useServerMicrophone) {
return <></>
}
return (
<div className="body-row split-3-7 left-padding-1 guided">
<div className="body-item-title left-padding-1">AudioInput(Server)</div>
<div className="body-select-container">
<select className="body-select" value={audioInputForGUI} onChange={(e) => {
setAudioInputForGUI(e.target.value)
}}>
{
serverInputAudioDeviceInfo.map(x => {
return <option key={x.name} value={x.index}>{x.name}</option>
})
}
</select>
</div>
</div>
)
}, [serverInputAudioDeviceInfo, audioInputForGUI, useServerMicrophone])
useEffect(() => {
if (audioInputForGUI == "file") {
// file selector (audioMediaInputRow)
} else {
if (!useServerMicrophone) {
appState.clientSetting.setAudioInput(audioInputForGUI)
} else {
console.log("server mic")
appState.clientSetting.setAudioInput(null)
appState.serverSetting.setServerMicrophone(Number(audioInputForGUI))
}
}
}, [appState.audioContext, audioInputForGUI, appState.clientSetting.setAudioInput])
@ -294,10 +337,16 @@ export const useDeviceSetting = (): DeviceSettingState => {
<span className="title" onClick={() => { appState.frontendManagerState.stateControls.openDeviceSettingCheckbox.updateState(!appState.frontendManagerState.stateControls.openDeviceSettingCheckbox.checked()) }}>
Device Setting
</span>
<span className="belongings">
<input className="belongings-checkbox" type="checkbox" checked={useServerMicrophone} onChange={(e) => {
setUseServerMicrophone(e.target.checked)
}} /> use server mic (Experimental)
</span>
</div>
<div className="partition-content">
{audioInputRow}
{audioInputServerRow}
{audioMediaInputRow}
{audioOutputRow}
{audioOutputRecordingRow}
@ -305,7 +354,7 @@ export const useDeviceSetting = (): DeviceSettingState => {
</div>
</>
)
}, [audioInputRow, audioMediaInputRow, audioOutputRow, audioOutputRecordingRow])
}, [audioInputRow, audioInputServerRow, audioMediaInputRow, audioOutputRow, audioOutputRecordingRow, useServerMicrophone])
// 出力の録音データ(from worklet)がストアされたら実行

View File

@ -97,6 +97,17 @@ body {
.title {
font-size: 1.1rem;
}
.belongings {
font-weight: 400;
font-size: 0.8rem;
display: flex;
flex-direction: row;
align-items: flex-end;
margin-left: 10px;
.belongings-checkbox {
margin-bottom: 3px;
}
}
}
.partition-content {
position: static;

View File

@ -55,7 +55,10 @@ export class AudioStreamer extends Duplex {
this.socket.on('connect_error', (err) => {
this.audioStreamerListeners.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_SIO_CONNECT_FAILED, `[SIO] rconnection failed ${err}`)
})
this.socket.on('connect', () => console.log(`[SIO] sonnect to ${this.serverUrl}`));
this.socket.on('connect', () => {
console.log(`[SIO] sonnect to ${this.serverUrl}`)
console.log(`[SIO] ${this.socket?.id}`)
});
this.socket.on('response', (response: any[]) => {
const cur = Date.now()
const responseTime = cur - response[0]
@ -104,6 +107,11 @@ export class AudioStreamer extends Duplex {
}
}
getSocketId = () => {
return this.socket?.id
}
// Main Process
//// Pipe from mic stream

View File

@ -1,4 +1,4 @@
import { ServerInfo, ServerSettingKey } from "./const";
import { ServerAudioDevices, ServerInfo, ServerSettingKey } from "./const";
type FileChunk = {
@ -127,14 +127,14 @@ export class ServerConfigurator {
// Local Mic
getServerMicrophones = async () => {
const url = this.serverUrl + "/microphone"
const info = await new Promise<ServerInfo>((resolve) => {
getServerDevices = async () => {
const url = this.serverUrl + "/device"
const info = await new Promise<ServerAudioDevices>((resolve) => {
const request = new Request(url, {
method: 'GET',
});
fetch(request).then(async (response) => {
const json = await response.json() as ServerInfo
const json = await response.json() as ServerAudioDevices
resolve(json)
})
})

View File

@ -116,8 +116,9 @@ export class VoiceChangerClient {
// forceVfDisable is for the condition that vf is enabled in constructor.
//noiseSuppression2 => VoiceFocus
setup = async (input: string | MediaStream, bufferSize: BufferSize, echoCancel: boolean = true, noiseSuppression: boolean = true, noiseSuppression2: boolean = false) => {
setup = async (input: string | MediaStream | null, bufferSize: BufferSize, echoCancel: boolean = true, noiseSuppression: boolean = true, noiseSuppression2: boolean = false) => {
const lockNum = await this.lock()
console.log(`Input Setup=> echo: ${echoCancel}, noise1: ${noiseSuppression}, noise2: ${noiseSuppression2}`)
// condition check
if (!this.vcNode) {
@ -131,6 +132,17 @@ export class VoiceChangerClient {
this.currentMediaStream.getTracks().forEach(x => { x.stop() })
this.currentMediaStream = null
}
//// Input デバイスがnullの時はmicStreamを止めてリターン
if (!input) {
console.log(`Input Setup=> client mic is disabled.`)
if (this.micStream) {
this.micStream.pauseRecording()
}
await this.unlock(lockNum)
return
}
if (typeof input == "string") {
this.currentMediaStream = await navigator.mediaDevices.getUserMedia({
audio: {
@ -302,6 +314,14 @@ export class VoiceChangerClient {
return this.configurator.getSettings()
}
getServerDevices = () => {
return this.configurator.getServerDevices()
}
getSocketId = () => {
return this.audioStreamer.getSocketId()
}
}

View File

@ -25,6 +25,7 @@ export type VoiceChangerServerSetting = {
f0Factor: number
f0Detector: string // dio or harvest
recordIO: number // 0:off, 1:on
serverMicProps: string
}
export type VoiceChangerClientSetting = {
@ -81,9 +82,20 @@ export type ServerInfo = {
f0Factor: number
f0Detector: string
recordIO: number
serverMicProps: string
}
export type ServerAudioDevice = {
kind: string,
index: number,
name: string,
hostAPI: string
}
export type ServerAudioDevices = {
audio_input_devices: ServerAudioDevice[]
audio_output_devices: ServerAudioDevice[]
}
@ -155,7 +167,8 @@ export const ServerSettingKey = {
"onnxExecutionProvider": "onnxExecutionProvider",
"f0Factor": "f0Factor",
"f0Detector": "f0Detector",
"recordIO": "recordIO"
"recordIO": "recordIO",
"serverMicProps": "serverMicProps",
} as const
export type ServerSettingKey = typeof ServerSettingKey[keyof typeof ServerSettingKey]
@ -174,7 +187,9 @@ export const DefaultVoiceChangerServerSetting: VoiceChangerServerSetting = {
f0Factor: 1.0,
onnxExecutionProvider: "CPUExecutionProvider",
f0Detector: "dio",
recordIO: 0
recordIO: 0,
serverMicProps: ""
}
export const DefaultVoiceChangerClientSetting: VoiceChangerClientSetting = {

View File

@ -1,7 +1,6 @@
import { useState, useMemo, useRef, useEffect } from "react"
import { VoiceChangerClientSetting, Protocol, BufferSize, VoiceChangerMode, SampleRate, Speaker, DefaultVoiceChangerClientSetting, INDEXEDDB_KEY_CLIENT, Correspondence, DownSamplingMode } from "../const"
import { createDummyMediaStream } from "../util"
import { VoiceChangerClient } from "../VoiceChangerClient"
import { useIndexedDB } from "./useIndexedDB"
@ -113,9 +112,7 @@ export const useClientSetting = (props: UseClientSettingProps): ClientSettingSta
const _setInput = async () => {
if (!props.voiceChangerClient) return
if (!settingRef.current.audioInput || settingRef.current.audioInput == "none") {
// console.log("[useClient] setup!(1)", settingRef.current.audioInput)
const ms = createDummyMediaStream(props.audioContext!)
await props.voiceChangerClient.setup(ms, settingRef.current.bufferSize, settingRef.current.echoCancel, settingRef.current.noiseSuppression, settingRef.current.noiseSuppression2)
await props.voiceChangerClient.setup(null, settingRef.current.bufferSize, settingRef.current.echoCancel, settingRef.current.noiseSuppression, settingRef.current.noiseSuppression2)
} else {
// console.log("[useClient] setup!(2)", settingRef.current.audioInput)
await props.voiceChangerClient.setup(settingRef.current.audioInput, settingRef.current.bufferSize, settingRef.current.echoCancel, settingRef.current.noiseSuppression, settingRef.current.noiseSuppression2)

View File

@ -1,5 +1,5 @@
import { useState, useMemo, useRef, useEffect } from "react"
import { VoiceChangerServerSetting, ServerInfo, Framework, OnnxExecutionProvider, DefaultVoiceChangerServerSetting, ServerSettingKey, INDEXEDDB_KEY_SERVER, INDEXEDDB_KEY_MODEL_DATA } from "../const"
import { VoiceChangerServerSetting, ServerInfo, Framework, OnnxExecutionProvider, DefaultVoiceChangerServerSetting, ServerSettingKey, INDEXEDDB_KEY_SERVER, INDEXEDDB_KEY_MODEL_DATA, ServerAudioDevices } from "../const"
import { VoiceChangerClient } from "../VoiceChangerClient"
import { useIndexedDB } from "./useIndexedDB"
@ -51,9 +51,11 @@ export type ServerSettingState = {
setF0Factor: (num: number) => Promise<boolean>;
setF0Detector: (val: string) => Promise<boolean>;
setRecordIO: (num: number) => Promise<boolean>;
setServerMicrophone: (index: number) => Promise<boolean | undefined>
reloadServerInfo: () => Promise<void>;
setFileUploadSetting: (val: FileUploadSetting) => void
loadModel: () => Promise<void>
getServerDevices: () => Promise<ServerAudioDevices>
uploadProgress: number
isUploading: boolean
}
@ -218,6 +220,19 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
return await _set_and_store(ServerSettingKey.recordIO, "" + num)
}
}, [props.voiceChangerClient])
const setServerMicrophone = useMemo(() => {
return async (index: number) => {
if (!props.voiceChangerClient) {
return
}
const sid = props.voiceChangerClient.getSocketId()
const serverMicProps = {
sid: sid,
deviceIndex: index
}
return await _set_and_store(ServerSettingKey.serverMicProps, JSON.stringify(serverMicProps))
}
}, [props.voiceChangerClient])
//////////////
// 操作
/////////////
@ -368,6 +383,16 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
await removeItem(INDEXEDDB_KEY_MODEL_DATA)
}
const getServerDevices = async (): Promise<ServerAudioDevices> => {
if (!props.voiceChangerClient) {
return {
audio_input_devices: [],
audio_output_devices: []
}
}
const res = await props.voiceChangerClient.getServerDevices()
return res
}
return {
setting,
@ -387,9 +412,11 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
setF0Factor,
setF0Detector,
setRecordIO,
setServerMicrophone,
reloadServerInfo,
setFileUploadSetting,
loadModel,
getServerDevices,
uploadProgress,
isUploading,
}

122
client/native/cli_client.py Normal file
View File

@ -0,0 +1,122 @@
import argparse
import pyaudio
import wave
import struct
import socketio
import ssl
from datetime import datetime
import time
# TLS context with certificate verification disabled (self-signed dev server).
# NOTE(review): this context is never passed to the socketio client below —
# the client is created with ssl_verify=False instead; confirm whether this
# context is still needed at all.
context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
context.verify_mode = ssl.CERT_NONE
def setupArgParser():
    """Build the CLI argument parser: -p port, -d capture device index, -s session id."""
    p = argparse.ArgumentParser()
    p.add_argument("-p", type=int, default=18888, help="port")
    p.add_argument("-d", type=int, help="device index")
    p.add_argument("-s", type=str, default="", help="sid")
    return p
class MockStream:
    """
    Mock that replaces PyAudio stream I/O with WAV-file I/O.

    read() pads both the start and the end of the stream with two silent
    (zero-filled) buffers, mimicking the warm-up / flush frames of a
    real-time capture stream.
    """

    def __init__(self, sampling_rate):
        self.sampling_rate = sampling_rate
        self.start_count = 2   # leading dummy (silent) reads remaining
        self.end_count = 2     # trailing dummy (silent) reads remaining
        self.vc_end = False    # True once input + trailing padding are exhausted
        self.fr = None         # wave reader, set by open_inputfile
        self.fw = None         # wave writer, set by open_outputfile

    def open_inputfile(self, input_filename):
        self.fr = wave.open(input_filename, 'rb')

    def open_outputfile(self, output_filename):
        # 16-bit mono PCM at the configured sampling rate
        self.fw = wave.open(output_filename, 'wb')
        self.fw.setnchannels(1)
        self.fw.setsampwidth(2)
        self.fw.setframerate(self.sampling_rate)

    def read(self, length, exception_on_overflow=False):
        # `exception_on_overflow` only mirrors the PyAudio signature; unused here.
        if self.start_count > 0:
            wav = bytes(length * 2)  # leading silence (2 bytes per 16-bit frame)
            self.start_count -= 1
        else:
            wav = self.fr.readframes(length)
            if len(wav) <= 0:  # input exhausted: emit trailing silence
                wav = bytes(length * 2)
                self.end_count -= 1
                if self.end_count < 0:
                    # BUGFIX: the original assigned Hyperparameters.VC_END_FLAG,
                    # an undefined name in this module (NameError at runtime).
                    # Track end-of-stream on the instance instead.
                    self.vc_end = True
        return wav

    def write(self, wav):
        self.fw.writeframes(wav)

    def stop_stream(self):
        # kept for PyAudio stream API compatibility
        pass

    def close(self):
        if self.fr is not None:
            self.fr.close()
            self.fr = None
        if self.fw is not None:
            self.fw.close()
            self.fw = None
# Module-level sink: every converted-audio payload received from the server
# is appended to test.wav in the current working directory (24 kHz mono).
mock_stream_out = MockStream(24000)
mock_stream_out.open_outputfile("test.wav")
class MyCustomNamespace(socketio.ClientNamespace):  # socket.io handlers for the "/test" namespace
    def on_connect(self):
        print('[{}] connect'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    def on_disconnect(self):
        print('[{}] disconnect'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

    def on_response(self, msg):
        # msg is [timestamp, pcm_bytes]; append the converted audio to test.wav.
        print('[{}] response : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), msg))
        data = msg[1]
        # NOTE(review): the original also bound msg[0] to `timestamp` and
        # struct.unpack'ed `data` into int16 samples, but used neither result;
        # both dropped as dead code.
        mock_stream_out.write(data)
def my_background_task(sio):
    """Emit a placeholder broadcast on the "/test" namespace once per second, forever."""
    interval_sec = 1
    payload = "aaa"
    while True:
        sio.emit('broadcast_message', payload, namespace="/test")
        sio.sleep(interval_sec)
if __name__ == '__main__':
    parser = setupArgParser()
    args, unknown = parser.parse_known_args()

    port = args.p
    deviceIndex = args.d
    # NOTE(review): sid is parsed but not used yet (WIP) — presumably meant
    # to pair this native client with a browser session; confirm intent.
    sid = args.s

    # Open the local microphone: 16-bit mono PCM @ 24 kHz, 4096-frame buffers.
    audio = pyaudio.PyAudio()
    audio_input_stream = audio.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=24000,
        frames_per_buffer=4096,
        input_device_index=deviceIndex,
        input=True)

    sio = socketio.Client(ssl_verify=False)  # self-signed cert on the dev server
    sio.register_namespace(MyCustomNamespace("/test"))
    # BUGFIX: the original hard-coded ":18888" in the URL, silently ignoring
    # the -p argument parsed above. NOTE(review): the host is still
    # hard-coded to a LAN address — should probably become a CLI argument.
    sio.connect("https://192.168.0.3:{}".format(port))

    while True:
        in_wav = audio_input_stream.read(4096, exception_on_overflow=False)
        # BUGFIX: the original sent struct.pack('<%sh' % len(in_wav), *in_wav),
        # which iterates the capture buffer byte-by-byte and re-packs every
        # single BYTE as a 16-bit sample — doubling the payload and corrupting
        # the audio. The PyAudio buffer is already little-endian int16 PCM, so
        # send it as-is; this is symmetric with on_response, which writes the
        # received payload straight to the output WAV.
        sio.emit('request_message', [1000, in_wav], namespace="/test")

    # sio.start_background_task(my_background_task, sio)

View File

@ -23,11 +23,11 @@ class MMVC_Rest_VoiceChanger:
self.voiceChangerManager = voiceChangerManager
self.router = APIRouter()
self.router.add_api_route("/test", self.test, methods=["POST"])
self.router.add_api_route("/microphone", self.get_microphone, methods=["GET"])
self.router.add_api_route("/device", self.get_device, methods=["GET"])
self.tlock = threading.Lock()
def get_microphone(self):
def get_device(self):
audio = pyaudio.PyAudio()
audio_input_devices = []
audio_output_devices = []

View File

@ -17,7 +17,7 @@ class MMVC_Namespace(socketio.AsyncNamespace):
return cls._instance
def on_connect(self, sid, environ):
# print('[{}] connet sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S') , sid))
print('[{}] connet sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), sid))
pass
async def on_request_message(self, sid, msg):

View File

@ -29,6 +29,10 @@ import librosa
import librosa.display
SAMPLING_RATE = 24000
import pyaudio
import json
from multiprocessing import Process, Queue
class MockStream:
"""
@ -93,6 +97,7 @@ class VocieChangerSettings():
f0Factor: float = 1.0
f0Detector: str = "dio" # dio or harvest
recordIO: int = 1 # 0:off, 1:on
serverMicProps: str = ""
pyTorchModelFile: str = ""
onnxModelFile: str = ""
@ -101,7 +106,11 @@ class VocieChangerSettings():
# ↓mutableな物だけ列挙
intData = ["gpu", "srcId", "dstId", "convertChunkNum", "minConvertSize", "recordIO"]
floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "crossFadeOverlapRate", "f0Factor"]
strData = ["framework", "f0Detector"]
strData = ["framework", "f0Detector", "serverMicProps"]
def readMicrophone(queue, sid, deviceIndex):
print("READ MIC", queue, sid, deviceIndex)
class VoiceChanger():
@ -277,6 +286,35 @@ class VoiceChanger():
setattr(self.settings, key, float(val))
elif key in self.settings.strData:
setattr(self.settings, key, str(val))
if key == "serverMicProps":
if hasattr(self, "serverMicrophoneReaderProcess"):
self.serverMicrophoneReaderProcess.terminate()
if len(val) == 0:
print("server mic close")
pass
else:
props = json.loads(val)
print(props)
sid = props["sid"]
deviceIndex = props["deviceIndex"]
self.serverMicrophoneReaderProcessQueue = Queue()
self.serverMicrophoneReaderProcess = Process(target=readMicrophone, args=(
self.serverMicrophoneReaderProcessQueue, sid, deviceIndex,))
self.serverMicrophoneReaderProcess.start()
try:
print(sid, deviceIndex)
except Exception as e:
print(e)
# audio = pyaudio.PyAudio()
# audio_input_stream = audio.open(format=pyaudio.paInt16,
# channels=1,
# rate=SAMPLING_RATE,
# frames_per_buffer=4096,
# input_device_index=val,
# input=True)
else:
print(f"{key} is not mutalbe variable!")
@ -626,3 +664,6 @@ class VoiceChanger():
# print(audio1)
return audio1
def __del__(self):
print("DESTRUCTOR")