server resampling added

wataru 2023-02-18 20:53:15 +09:00
parent c6309b83f8
commit 15686caf50
12 changed files with 944 additions and 989 deletions

File diff suppressed because one or more lines are too long


@@ -1,6 +1,5 @@
-import { BufferSize, DownSamplingMode, Protocol, SampleRate, VoiceChangerMode } from "@dannadori/voice-changer-client-js"
-import React, { useMemo, useState } from "react"
-import { ClientState } from "@dannadori/voice-changer-client-js";
+import { BufferSize, DownSamplingMode, InputSampleRate, Protocol, SampleRate, VoiceChangerMode } from "@dannadori/voice-changer-client-js"
+import React, { useMemo } from "react"
 import { useAppState } from "./001_provider/001_AppStateProvider";
 import { AnimationTypes, HeaderButton, HeaderButtonProps } from "./components/101_HeaderButton";
@@ -83,6 +82,27 @@ export const useAdvancedSetting = (): AdvancedSettingState => {
         )
     }, [appState.clientSetting.setting.sampleRate, appState.clientSetting.setSampleRate])
+    const sendingSampleRateRow = useMemo(() => {
+        return (
+            <div className="body-row split-3-7 left-padding-1 guided">
+                <div className="body-item-title left-padding-1">Sending Sample Rate</div>
+                <div className="body-select-container">
+                    <select className="body-select" value={appState.clientSetting.setting.sendingSampleRate} onChange={(e) => {
+                        appState.clientSetting.setSendingSampleRate(Number(e.target.value) as InputSampleRate)
+                        appState.serverSetting.setInputSampleRate(Number(e.target.value) as InputSampleRate)
+                    }}>
+                        {
+                            Object.values(InputSampleRate).map(x => {
+                                return <option key={x} value={x}>{x}</option>
+                            })
+                        }
+                    </select>
+                </div>
+            </div>
+        )
+    }, [appState.clientSetting.setting.sendingSampleRate, appState.clientSetting.setSendingSampleRate, appState.serverSetting.setInputSampleRate])
     const bufferSizeRow = useMemo(() => {
         return (
@@ -263,6 +283,7 @@ export const useAdvancedSetting = (): AdvancedSettingState => {
             {protocolRow}
             <div className="body-row divider"></div>
             {sampleRateRow}
+            {sendingSampleRateRow}
             {bufferSizeRow}
             <div className="body-row divider"></div>
@@ -280,7 +301,7 @@ export const useAdvancedSetting = (): AdvancedSettingState => {
         </>
         )
-    }, [mmvcServerUrlRow, protocolRow, sampleRateRow, bufferSizeRow, convertChunkNumRow, minConvertSizeRow, crossFadeOverlapRateRow, crossFadeOffsetRateRow, crossFadeEndRateRow, voiceChangeModeRow, workletSettingRow, downSamplingModeRow])
+    }, [mmvcServerUrlRow, protocolRow, sampleRateRow, sendingSampleRateRow, bufferSizeRow, convertChunkNumRow, minConvertSizeRow, crossFadeOverlapRateRow, crossFadeOffsetRateRow, crossFadeEndRateRow, voiceChangeModeRow, workletSettingRow, downSamplingModeRow])
     const advancedSetting = useMemo(() => {

File diff suppressed because it is too large


@@ -27,7 +27,7 @@
     "license": "ISC",
     "devDependencies": {
         "@types/audioworklet": "^0.0.36",
-        "@types/node": "^18.13.0",
+        "@types/node": "^18.14.0",
         "@types/react": "18.0.28",
         "@types/react-dom": "18.0.11",
         "eslint": "^8.34.0",
@@ -47,9 +47,11 @@
     },
     "dependencies": {
         "@types/readable-stream": "^2.3.15",
-        "amazon-chime-sdk-js": "^3.10.0",
+        "amazon-chime-sdk-js": "^3.11.0",
         "install": "^0.13.0",
         "localforage": "^1.10.0",
         "microphone-stream": "^6.0.1",
         "path-browserify": "^1.0.1",
         "react": "^18.2.0",
         "react-dom": "^18.2.0",
         "readable-stream": "^4.3.0",


@@ -1,7 +1,8 @@
 import { io, Socket } from "socket.io-client";
 import { DefaultEventsMap } from "@socket.io/component-emitter";
 import { Duplex, DuplexOptions } from "readable-stream";
-import { DownSamplingMode, Protocol, VoiceChangerMode, VOICE_CHANGER_CLIENT_EXCEPTION } from "./const";
+import { DefaultVoiceChangerClientSetting, DownSamplingMode, Protocol, SendingSampleRate, VoiceChangerMode, VOICE_CHANGER_CLIENT_EXCEPTION } from "./const";

 export type Callbacks = {
     onVoiceReceived: (voiceChangerMode: VoiceChangerMode, data: ArrayBuffer) => void
@@ -38,7 +39,7 @@ export class AudioStreamer extends Duplex {
     // Flags
     // private downSamplingMode: DownSamplingMode = DownSamplingMode.decimate
     private downSamplingMode: DownSamplingMode = DownSamplingMode.average
+    private sendingSampleRate: number = DefaultVoiceChangerClientSetting.sendingSampleRate

     constructor(callbacks: Callbacks, audioStreamerListeners: AudioStreamerListeners, options?: DuplexOptions) {
         super(options);
@@ -97,6 +98,9 @@
     setDownSamplingMode = (val: DownSamplingMode) => {
         this.downSamplingMode = val
     }
+    setSendingSampleRate = (val: SendingSampleRate) => {
+        this.sendingSampleRate = val
+    }

     getSettings = (): AudioStreamerSettings => {
         return {
@@ -156,9 +160,12 @@
     }

-    private _write_realtime = (buffer: Float32Array) => {
+    private _write_realtime = async (buffer: Float32Array) => {
         let downsampledBuffer: Float32Array | null = null
-        if (this.downSamplingMode == DownSamplingMode.decimate) {
+        if (this.sendingSampleRate == 48000) {
+            downsampledBuffer = buffer
+        } else if (this.downSamplingMode == DownSamplingMode.decimate) {
             //////// (Kind 1) Decimation //////////
             // bufferSize samples of 48kHz data come in.
             //// The input arrives at 48000Hz, so decimate it down to 24000Hz.
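The decimation loop itself falls between these hunks and is not shown. As a rough sketch of what the "(Kind 1)" branch presumably does, keeping every other sample to halve 48kHz to 24kHz (the helper name is illustrative, not from the diff):

```typescript
// Sketch only: halve the sample rate by dropping every other sample.
// Decimation without a low-pass filter can alias; the "average" mode
// (Kind 2, below) is the gentler alternative.
const decimateHalf = (buffer: Float32Array): Float32Array => {
    const out = new Float32Array(Math.floor(buffer.length / 2));
    for (let i = 0; i < out.length; i++) {
        out[i] = buffer[i * 2]; // 48000Hz -> 24000Hz
    }
    return out;
};
```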
@@ -170,7 +177,8 @@
             }
         } else {
             //////// (Kind 2) Averaging //////////
-            downsampledBuffer = this._averageDownsampleBuffer(buffer, 48000, 24000)
+            // downsampledBuffer = this._averageDownsampleBuffer(buffer, 48000, 24000)
+            downsampledBuffer = this._averageDownsampleBuffer(buffer, 48000, this.sendingSampleRate)
         }

         // Float to signed16
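_averageDownsampleBuffer is not shown in this diff. A minimal sketch of an averaging downsampler consistent with how it is called here (an assumption about its internals, not the repository's code):

```typescript
// Sketch: average the input samples that fall into each output sample's
// window. For 48000 -> 24000 this averages pairs of samples, which both
// halves the rate and acts as a crude low-pass filter.
const averageDownsample = (buffer: Float32Array, srcRate: number, dstRate: number): Float32Array => {
    const ratio = srcRate / dstRate;
    const out = new Float32Array(Math.floor(buffer.length / ratio));
    for (let i = 0; i < out.length; i++) {
        const start = Math.floor(i * ratio);
        const end = Math.min(Math.floor((i + 1) * ratio), buffer.length);
        let sum = 0;
        for (let j = start; j < end; j++) sum += buffer[j];
        out[i] = sum / (end - start);
    }
    return out;
};
```

Note the new 48000 short-circuit above: when the sending rate equals the capture rate, the buffer passes through untouched and neither downsampler runs.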
@@ -184,7 +192,9 @@
         // Manage the data as 256-byte chunks (the sample count left after decimating the minimum buffer size of 256, x 2 bytes each)
-        const chunkByteSize = 256 // (const.ts ★1)
+        // const chunkByteSize = 256 // (const.ts ★1)
+        // const chunkByteSize = 256 * 2 // (const.ts ★1)
+        const chunkByteSize = (256 * 2) * (this.sendingSampleRate / 48000) // (const.ts ★1)
         for (let i = 0; i < arrayBuffer.byteLength / chunkByteSize; i++) {
             const ab = arrayBuffer.slice(i * chunkByteSize, (i + 1) * chunkByteSize)
             this.requestChunks.push(ab)
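Worked through, the new chunkByteSize formula keeps each chunk at the same wall-clock duration regardless of the sending rate:

```typescript
// Each Int16 sample is 2 bytes, so 256 samples occupy 512 bytes.
// Halving the sending rate halves the bytes per chunk while the chunk
// still covers the same stretch of time.
const chunkByteSizeFor = (sendingSampleRate: number): number =>
    (256 * 2) * (sendingSampleRate / 48000);

chunkByteSizeFor(48000); // 512 bytes = 256 samples
chunkByteSizeFor(24000); // 256 bytes = 128 samples
```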


@@ -3,7 +3,7 @@ import { VoiceChangerWorkletNode, VoiceChangerWorkletListener } from "./VoiceCha
 import workerjs from "raw-loader!../worklet/dist/index.js";
 import { VoiceFocusDeviceTransformer, VoiceFocusTransformDevice } from "amazon-chime-sdk-js";
 import { createDummyMediaStream, validateUrl } from "./util";
-import { BufferSize, DefaultVoiceChangerClientSetting, DownSamplingMode, Protocol, ServerSettingKey, VoiceChangerMode, VOICE_CHANGER_CLIENT_EXCEPTION, WorkletSetting } from "./const";
+import { BufferSize, DefaultVoiceChangerClientSetting, DownSamplingMode, Protocol, SendingSampleRate, ServerSettingKey, VoiceChangerMode, VOICE_CHANGER_CLIENT_EXCEPTION, WorkletSetting } from "./const";
 import MicrophoneStream from "microphone-stream";
 import { AudioStreamer, Callbacks, AudioStreamerListeners } from "./AudioStreamer";
 import { ServerConfigurator } from "./ServerConfigurator";
@@ -255,6 +255,9 @@ export class VoiceChangerClient {
     setDownSamplingMode = (val: DownSamplingMode) => {
         this.audioStreamer.setDownSamplingMode(val)
     }
+    setSendingSampleRate = (val: SendingSampleRate) => {
+        this.audioStreamer.setSendingSampleRate(val)
+    }

     // configure worklet
     configureWorklet = (setting: WorkletSetting) => {


@@ -26,6 +26,7 @@ export type VoiceChangerServerSetting = {
     f0Detector: string // dio or harvest
     recordIO: number // 0:off, 1:on
     serverMicProps: string
+    inputSampleRate: InputSampleRate
 }

 export type VoiceChangerClientSetting = {
@@ -33,6 +34,7 @@ export type VoiceChangerClientSetting = {
     mmvcServerUrl: string,
     protocol: Protocol,
     sampleRate: SampleRate, // 48000Hz
+    sendingSampleRate: SendingSampleRate,
     bufferSize: BufferSize, // 256, 512, 1024, 2048, 4096, 8192, 16384 (for mic stream)
     inputChunkNum: number, // n of (256 x n) for send buffer
     speakers: Speaker[],
@@ -83,6 +85,7 @@ export type ServerInfo = {
     f0Detector: string
     recordIO: number
     serverMicProps: string
+    inputSampleRate: InputSampleRate
 }

 export type ServerAudioDevice = {
@@ -123,6 +126,18 @@ export const SampleRate = {
 } as const
 export type SampleRate = typeof SampleRate[keyof typeof SampleRate]

+export const SendingSampleRate = {
+    "48000": 48000,
+    "24000": 24000
+} as const
+export type SendingSampleRate = typeof SendingSampleRate[keyof typeof SendingSampleRate]
+
+export const InputSampleRate = {
+    "48000": 48000,
+    "24000": 24000
+} as const
+export type InputSampleRate = typeof InputSampleRate[keyof typeof InputSampleRate]
+
 export const BufferSize = {
     "256": 256,
     "512": 512,
@@ -169,6 +184,7 @@ export const ServerSettingKey = {
     "f0Detector": "f0Detector",
     "recordIO": "recordIO",
     "serverMicProps": "serverMicProps",
+    "inputSampleRate": "inputSampleRate",
 } as const
 export type ServerSettingKey = typeof ServerSettingKey[keyof typeof ServerSettingKey]
@@ -188,8 +204,8 @@ export const DefaultVoiceChangerServerSetting: VoiceChangerServerSetting = {
     onnxExecutionProvider: "CPUExecutionProvider",
     f0Detector: "dio",
     recordIO: 0,
-    serverMicProps: ""
+    serverMicProps: "",
+    inputSampleRate: 48000
 }

 export const DefaultVoiceChangerClientSetting: VoiceChangerClientSetting = {
@@ -197,6 +213,7 @@ export const DefaultVoiceChangerClientSetting: VoiceChangerClientSetting = {
     mmvcServerUrl: "",
     protocol: "sio",
     sampleRate: 48000,
+    sendingSampleRate: 48000,
     bufferSize: 1024,
     inputChunkNum: 48,
     speakers: [


@@ -1,6 +1,6 @@
 import { useState, useMemo, useRef, useEffect } from "react"
-import { VoiceChangerClientSetting, Protocol, BufferSize, VoiceChangerMode, SampleRate, Speaker, DefaultVoiceChangerClientSetting, INDEXEDDB_KEY_CLIENT, Correspondence, DownSamplingMode } from "../const"
+import { VoiceChangerClientSetting, Protocol, BufferSize, VoiceChangerMode, SampleRate, Speaker, DefaultVoiceChangerClientSetting, INDEXEDDB_KEY_CLIENT, Correspondence, DownSamplingMode, SendingSampleRate } from "../const"
 import { VoiceChangerClient } from "../VoiceChangerClient"
 import { useIndexedDB } from "./useIndexedDB"
@@ -22,6 +22,7 @@ export type ClientSettingState = {
     setInputChunkNum: (num: number) => void;
     setVoiceChangerMode: (mode: VoiceChangerMode) => void
     setDownSamplingMode: (mode: DownSamplingMode) => void
+    setSendingSampleRate: (val: SendingSampleRate) => void
     setSampleRate: (num: SampleRate) => void
     setSpeakers: (speakers: Speaker[]) => void
     setCorrespondences: (file: File | null) => Promise<void>
@@ -191,6 +192,15 @@ export const useClientSetting = (props: UseClientSettingProps): ClientSettingSta
         }
     }, [props.voiceChangerClient])

+    const setSendingSampleRate = useMemo(() => {
+        return (val: SendingSampleRate) => {
+            if (!props.voiceChangerClient) return
+            props.voiceChangerClient.setSendingSampleRate(val)
+            settingRef.current.sendingSampleRate = val
+            setSetting({ ...settingRef.current })
+        }
+    }, [props.voiceChangerClient])
+
     const setSampleRate = useMemo(() => {
@@ -292,6 +302,7 @@ export const useClientSetting = (props: UseClientSettingProps): ClientSettingSta
         setInputChunkNum,
         setVoiceChangerMode,
         setDownSamplingMode,
+        setSendingSampleRate,
         setSampleRate,
         setSpeakers,
         setCorrespondences,


@@ -1,5 +1,5 @@
 import { useState, useMemo, useRef, useEffect } from "react"
-import { VoiceChangerServerSetting, ServerInfo, Framework, OnnxExecutionProvider, DefaultVoiceChangerServerSetting, ServerSettingKey, INDEXEDDB_KEY_SERVER, INDEXEDDB_KEY_MODEL_DATA, ServerAudioDevices } from "../const"
+import { VoiceChangerServerSetting, ServerInfo, Framework, OnnxExecutionProvider, DefaultVoiceChangerServerSetting, ServerSettingKey, INDEXEDDB_KEY_SERVER, INDEXEDDB_KEY_MODEL_DATA, ServerAudioDevices, InputSampleRate } from "../const"
 import { VoiceChangerClient } from "../VoiceChangerClient"
 import { useIndexedDB } from "./useIndexedDB"
@@ -52,6 +52,7 @@ export type ServerSettingState = {
     setF0Detector: (val: string) => Promise<boolean>;
     setRecordIO: (num: number) => Promise<boolean>;
     setServerMicrophone: (index: number) => Promise<boolean | undefined>
+    setInputSampleRate: (num: InputSampleRate) => Promise<boolean>
     reloadServerInfo: () => Promise<void>;
     setFileUploadSetting: (val: FileUploadSetting) => void
     loadModel: () => Promise<void>
@@ -233,6 +234,12 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
             return await _set_and_store(ServerSettingKey.serverMicProps, JSON.stringify(serverMicProps))
         }
     }, [props.voiceChangerClient])
+    const setInputSampleRate = useMemo(() => {
+        return async (num: number) => {
+            return await _set_and_store(ServerSettingKey.inputSampleRate, "" + num)
+        }
+    }, [props.voiceChangerClient])

     //////////////
     // Operations
     /////////////
@@ -413,6 +420,7 @@
         setF0Detector,
         setRecordIO,
         setServerMicrophone,
+        setInputSampleRate,
         reloadServerInfo,
         setFileUploadSetting,
         loadModel,


@@ -4,6 +4,8 @@ module.exports = {
     entry: "./src/index.ts",
     resolve: {
         extensions: [".ts", ".js"],
+        fallback: {
+        }
     },
     module: {
         rules: [


@@ -30,7 +30,6 @@ class MMVC_Namespace(socketio.AsyncNamespace):
         else:
             unpackedData = np.array(struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data))
             audio1 = self.voiceChangerManager.changeVoice(unpackedData)
-            # print("sio result:", len(audio1), audio1.shape)
             bin = struct.pack('<%sh' % len(audio1), *audio1)
             await self.emit('response', [timestamp, bin], to=sid)
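The server unpacks each payload as little-endian Int16 ('<h' per sample) and packs the response the same way. For reference, a matching client-side Float32-to-Int16 encoder in TypeScript (a sketch; the client's actual conversion lives in AudioStreamer and is not shown here):

```typescript
// Clamp each float sample to [-1, 1] and scale to signed 16-bit.
// DataView makes the byte order explicitly little-endian, matching
// struct.unpack('<...h', ...) on the server.
const floatTo16BitLEPCM = (input: Float32Array): ArrayBuffer => {
    const buf = new ArrayBuffer(input.length * 2);
    const view = new DataView(buf);
    for (let i = 0; i < input.length; i++) {
        const s = Math.max(-1, Math.min(1, input[i]));
        view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true); // true = little-endian
    }
    return buf;
};
```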


@@ -4,6 +4,7 @@ import os
 import traceback
 import numpy as np
 from dataclasses import dataclass, asdict
+import resampy

 import onnxruntime
@@ -98,13 +99,14 @@ class VocieChangerSettings():
     f0Detector: str = "dio" # dio or harvest
     recordIO: int = 1 # 0:off, 1:on
     serverMicProps: str = ""
+    inputSampleRate: int = 48000 # 48000 or 24000

     pyTorchModelFile: str = ""
     onnxModelFile: str = ""
     configFile: str = ""
     # List only the mutable fields below
-    intData = ["gpu", "srcId", "dstId", "convertChunkNum", "minConvertSize", "recordIO"]
+    intData = ["gpu", "srcId", "dstId", "convertChunkNum", "minConvertSize", "recordIO", "inputSampleRate"]
     floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "crossFadeOverlapRate", "f0Factor"]
     strData = ["framework", "f0Detector", "serverMicProps"]
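The intData/floatData/strData lists declare which settings are mutable at runtime and how the server should cast the string values it receives (note that setInputSampleRate above sends `"" + num`). A TypeScript sketch of that dispatch, for illustration only:

```typescript
// Mirror of the server-side casting rule: the list a key appears in
// decides how its incoming string value is parsed.
const intData = ["gpu", "srcId", "dstId", "convertChunkNum", "minConvertSize", "recordIO", "inputSampleRate"];
const floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "crossFadeOverlapRate", "f0Factor"];
const strData = ["framework", "f0Detector", "serverMicProps"];

const castSetting = (key: string, value: string): number | string | undefined => {
    if (intData.includes(key)) return parseInt(value, 10);
    if (floatData.includes(key)) return parseFloat(value);
    if (strData.includes(key)) return value;
    return undefined; // unknown keys are ignored
};

castSetting("inputSampleRate", "48000"); // 48000 as a number
```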
@@ -512,6 +514,9 @@ class VoiceChanger():
         return result

     def on_request(self, unpackedData: any):
+        if self.settings.inputSampleRate != 24000:
+            print("convert sampling rate!", self.settings.inputSampleRate)
+            unpackedData = resampy.resample(unpackedData, 48000, 24000)
         convertSize = self.settings.convertChunkNum * 128 # 128sample/1chunk
         # print("convsize:", unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate))
         if unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate) + 1024 > convertSize:
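This is the server-side half of the feature: resampy.resample performs band-limited resampling, bringing 48kHz client audio down to the 24kHz the model expects whenever the client did not already downsample. As a conceptual stand-in only (resampy's filter design is far better), a linear-interpolation resampler in TypeScript:

```typescript
// Sketch: resample by linear interpolation between neighboring samples.
// Enough to illustrate the rate conversion; a production path should
// band-limit first, as resampy does.
const linearResample = (input: Float32Array, srcRate: number, dstRate: number): Float32Array => {
    const out = new Float32Array(Math.round(input.length * dstRate / srcRate));
    for (let i = 0; i < out.length; i++) {
        const pos = i * srcRate / dstRate; // fractional source index
        const i0 = Math.floor(pos);
        const i1 = Math.min(i0 + 1, input.length - 1);
        const frac = pos - i0;
        out[i] = input[i0] * (1 - frac) + input[i1] * frac;
    }
    return out;
};
```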
@@ -547,123 +552,3 @@
         self.stream_in.write(unpackedData.astype(np.int16).tobytes())
         self.stream_out.write(result.tobytes())
         return result
-
-    #########################################################################################
-    def overlap_merge(self, now_wav, prev_wav, overlap_length):
-        """
-        Merge the newly generated wav data into the previously generated wav data, cross-fading over overlap_length samples.
-        The trailing overlap_length samples are dropped here because they will be merged and played back on the next call.
-        Parameters
-        ----------
-        now_wav: the wav data generated this time
-        prev_wav: the wav data generated last time
-        overlap_length: length of the overlapped region
-        """
-        if overlap_length == 0:
-            return now_wav
-        gradation = np.arange(overlap_length) / overlap_length
-        now = np.frombuffer(now_wav, dtype='int16')
-        prev = np.frombuffer(prev_wav, dtype='int16')
-        now_head = now[:overlap_length]
-        prev_tail = prev[-overlap_length:]
-        print("merge params:", gradation.shape, now.shape, prev.shape, now_head.shape, prev_tail.shape)
-        merged = prev_tail * (np.cos(gradation * np.pi * 0.5) ** 2) + now_head * (np.cos((1 - gradation) * np.pi * 0.5) ** 2)
-        # merged = prev_tail * (1 - gradation) + now_head * gradation
-        overlapped = np.append(merged, now[overlap_length:-overlap_length])
-        signal = np.round(overlapped, decimals=0)
-        signal = signal.astype(np.int16)
-        # signal = signal.astype(np.int16).tobytes()
-        return signal
-
-    def on_request_(self, unpackedData: any):
-        self._generate_strength(unpackedData)
-
-        convertSize = 8192
-        unpackedData = unpackedData.astype(np.int16)
-        if hasattr(self, 'stored_raw_input') == False:
-            self.stored_raw_input = unpackedData
-        else:
-            self.stored_raw_input = np.concatenate([self.stored_raw_input, unpackedData])
-        self.stored_raw_input = self.stored_raw_input[-1 * (convertSize):]
-        processing_input = self.stored_raw_input
-        print("signal_shape1", unpackedData.shape, processing_input.shape, processing_input.dtype)
-        processing_input = processing_input / self.hps.data.max_wav_value
-        print("type:", processing_input.dtype)
-
-        _f0, _time = pw.dio(processing_input, self.hps.data.sampling_rate, frame_period=5.5)
-        f0 = pw.stonemask(processing_input, _f0, _time, self.hps.data.sampling_rate)
-        f0 = convert_continuos_f0(f0, int(processing_input.shape[0] / self.hps.data.hop_length))
-        f0 = torch.from_numpy(f0.astype(np.float32))
-        print("signal_shape2", f0.shape)
-
-        processing_input = torch.from_numpy(processing_input.astype(np.float32)).clone()
-
-        with torch.no_grad():
-            trans_length = processing_input.size()[0]
-            # spec, sid = get_audio_text_speaker_pair(signal.view(1, trans_length), Hyperparameters.SOURCE_ID)
-            processing_input_v = processing_input.view(1, trans_length) # same as unsqueeze
-            print("processing_input_v shape:", processing_input_v.shape)
-            spec = spectrogram_torch(processing_input_v, self.hps.data.filter_length,
-                                     self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
-                                     center=False)
-            spec = torch.squeeze(spec, 0)
-            sid = torch.LongTensor([int(self.settings.srcId)])
-
-            dispose_stft_specs = 2
-            spec = spec[:, dispose_stft_specs:-dispose_stft_specs]
-            f0 = f0[dispose_stft_specs:-dispose_stft_specs]
-            print("spec shape:", spec.shape)
-            data = TextAudioSpeakerCollate(
-                sample_rate=self.hps.data.sampling_rate,
-                hop_size=self.hps.data.hop_length,
-                f0_factor=self.settings.f0Factor
-            )([(spec, sid, f0)])
-
-            if self.settings.gpu >= 0 or self.gpu_num > 0:
-                # spec, spec_lengths, sid_src, sin, d = [x.cuda(Hyperparameters.GPU_ID) for x in data]
-                spec, spec_lengths, sid_src, sin, d = data
-                spec = spec.cuda(self.settings.gpu)
-                spec_lengths = spec_lengths.cuda(self.settings.gpu)
-                sid_src = sid_src.cuda(self.settings.gpu)
-                sin = sin.cuda(self.settings.gpu)
-                d = tuple([d[:1].cuda(self.settings.gpu) for d in d])
-                sid_target = torch.LongTensor([self.settings.dstId]).cuda(self.settings.gpu)
-                audio = self.net_g.cuda(self.settings.gpu).voice_conversion(spec, spec_lengths,
-                                                                            sin, d, sid_src, sid_target)[0, 0].data.cpu().float().numpy()
-            else:
-                spec, spec_lengths, sid_src, sin, d = data
-                sid_target = torch.LongTensor([self.settings.dstId])
-                audio = self.net_g.voice_conversion(spec, spec_lengths, sin, d, sid_src, sid_target)[0, 0].data.cpu().float().numpy()
-
-        dispose_conv1d_length = 1280
-        audio = audio[dispose_conv1d_length:-dispose_conv1d_length]
-        audio = audio * self.hps.data.max_wav_value
-        audio = audio.astype(np.int16)
-        print("fin audio shape:", audio.shape)
-        audio = audio.tobytes()
-
-        if hasattr(self, "prev_audio"):
-            try:
-                audio1 = self.overlap_merge(audio, self.prev_audio, 1024)
-            except:
-                audio1 = np.zeros(1).astype(np.int16)
-                pass
-                # return np.zeros(1).astype(np.int16)
-        else:
-            audio1 = np.zeros(1).astype(np.int16)
-        self.prev_audio = audio
-
-        self.out.write(audio)
-        self.stream_in.write(unpackedData.tobytes())
-
-        # print(audio1)
-        return audio1
-
-    def __del__(self):
-        print("DESTRUCTOR")