From 808278364462002107eadc07968f0db3afbc8884 Mon Sep 17 00:00:00 2001
From: wataru
Date: Sun, 8 Jan 2023 17:58:27 +0900
Subject: [PATCH] WIP: alpha

---
 client/demo/src/100_options_microphone.tsx  |  43 ++--
 client/demo/src/103_speaker_setting.tsx     |   1 -
 client/demo/src/106_server_control.tsx      |   2 +-
 client/demo/src/hooks/useClient.ts          |   2 -
 client/lib/src/VoiceChangerClient.ts        |   5 +-
 server/const.py                             |   5 +
 server/sio/MMVC_Namespace.py                |  20 +-
 server/voice_changer/VoiceChanger.py        | 220 ++++++++++----------
 server/voice_changer/VoiceChangerManager.py |  16 +-
 9 files changed, 141 insertions(+), 173 deletions(-)

diff --git a/client/demo/src/100_options_microphone.tsx b/client/demo/src/100_options_microphone.tsx
index 323ac667..fd6e7200 100644
--- a/client/demo/src/100_options_microphone.tsx
+++ b/client/demo/src/100_options_microphone.tsx
@@ -53,22 +53,22 @@ export const useMicrophoneOptions = () => {
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.setServerUrl(serverSetting.mmvcServerUrl)
-    }, [serverSetting.mmvcServerUrl])
+    }, [clientState.clientInitialized, serverSetting.mmvcServerUrl])
     //// Protocol change
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.setProtocol(serverSetting.protocol)
-    }, [serverSetting.protocol])
+    }, [clientState.clientInitialized, serverSetting.protocol])
     //// Framework change
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.updateSettings(ServerSettingKey.framework, serverSetting.framework)
-    }, [serverSetting.framework])
+    }, [clientState.clientInitialized, serverSetting.framework])
     //// OnnxExecutionProvider change
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.updateSettings(ServerSettingKey.onnxExecutionProvider, serverSetting.onnxExecutionProvider)
-    }, [serverSetting.onnxExecutionProvider])
+    }, [clientState.clientInitialized, serverSetting.onnxExecutionProvider])

     // 102 DeviceSetting
     //// Input settings
     useEffect(() => {
@@ -82,58 +82,39 @@ export const useMicrophoneOptions = () => {
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.updateSettings(ServerSettingKey.srcId, speakerSetting.srcId)
-    }, [speakerSetting.srcId])
+    }, [clientState.clientInitialized, speakerSetting.srcId])
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.updateSettings(ServerSettingKey.dstId, speakerSetting.dstId)
-    }, [speakerSetting.dstId])
+    }, [clientState.clientInitialized, speakerSetting.dstId])

     // 104 ConvertSetting
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.setInputChunkNum(convertSetting.inputChunkNum)
-    }, [convertSetting.inputChunkNum])
+    }, [clientState.clientInitialized, convertSetting.inputChunkNum])
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.updateSettings(ServerSettingKey.convertChunkNum, convertSetting.convertChunkNum)
-    }, [convertSetting.convertChunkNum])
+    }, [clientState.clientInitialized, convertSetting.convertChunkNum])
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.updateSettings(ServerSettingKey.gpu, convertSetting.gpu)
-    }, [convertSetting.gpu])
+    }, [clientState.clientInitialized, convertSetting.gpu])
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.updateSettings(ServerSettingKey.crossFadeOffsetRate, convertSetting.crossFadeOffsetRate)
-    }, [convertSetting.crossFadeOffsetRate])
+    }, [clientState.clientInitialized, convertSetting.crossFadeOffsetRate])
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.updateSettings(ServerSettingKey.crossFadeEndRate, convertSetting.crossFadeEndRate)
-    }, [convertSetting.crossFadeEndRate])
+    }, [clientState.clientInitialized, convertSetting.crossFadeEndRate])

     // 105 AdvancedSetting
     useEffect(() => {
         if (!clientState.clientInitialized) return
         clientState.setVoiceChangerMode(advancedSetting.voiceChangerMode)
-    }, [advancedSetting.voiceChangerMode])
-
-
-    // // const [options, setOptions] = useState(InitMicrophoneOptionsState)
-    // const [params, setParams] = useState(DefaultVoiceChangerRequestParamas)
-    // const [options, setOptions] = useState(DefaultVoiceChangerOptions)
-    // const [isStarted, setIsStarted] = useState(false)
-
-
-    // useEffect(() => {
-    //     const storeOptions = async () => {
-    //         if (CHROME_EXTENSION) {
-    //             // @ts-ignore
-    //             await chrome.storage.local.set({ microphoneOptions: options })
-    //         }
-    //     }
-    //     storeOptions()
-    // }, [options]) // Moving this above the load step might reset what is in storage, so probably no good. (Needs verification)
-
-
+    }, [clientState.clientInitialized, advancedSetting.voiceChangerMode])

     const voiceChangerSetting = useMemo(() => {
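The change applied throughout 100_options_microphone.tsx is React's exhaustive-deps rule: each effect reads clientState.clientInitialized, so it must list it as a dependency. With the old arrays, a setting changed before initialization finished would never be re-applied once the client came up. A minimal sketch of the guard-and-sync pattern (the hook and its names are illustrative, not part of this patch):

    // Sketch: re-run whenever the guard or the value changes.
    import { useEffect } from "react"

    export const useApplySetting = <T,>(ready: boolean, value: T, apply: (v: T) => void) => {
        useEffect(() => {
            if (!ready) return   // skip until the client is initialized
            apply(value)
        }, [ready, value, apply])  // list everything the effect reads
    }
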
diff --git a/client/demo/src/103_speaker_setting.tsx b/client/demo/src/103_speaker_setting.tsx
index 543a01c9..4564c261 100644
--- a/client/demo/src/103_speaker_setting.tsx
+++ b/client/demo/src/103_speaker_setting.tsx
@@ -86,7 +86,6 @@ export const useSpeakerSetting = () => {
                 set
-        )
     }, [speakers, editSpeakerTargetId, editSpeakerTargetName])

diff --git a/client/demo/src/106_server_control.tsx b/client/demo/src/106_server_control.tsx
index bdfd054f..6d664ed8 100644
--- a/client/demo/src/106_server_control.tsx
+++ b/client/demo/src/106_server_control.tsx
@@ -38,7 +38,7 @@ export const useServerControl = (props: UseServerControlProps) => {
             )
-    }, [isStarted])
+    }, [isStarted, props.convertStart, props.convertStop])

     const performanceRow = useMemo(() => {
         return (

diff --git a/client/demo/src/hooks/useClient.ts b/client/demo/src/hooks/useClient.ts
index a2286331..1e9259aa 100644
--- a/client/demo/src/hooks/useClient.ts
+++ b/client/demo/src/hooks/useClient.ts
@@ -94,7 +94,6 @@ export const useClient = (props: UseClientProps): ClientState => {
                 return
             }
             voiceChangerClientRef.current.setProtocol(protocol)
-            voiceChangerClientRef.current.stop()
         }
     }, [])

@@ -105,7 +104,6 @@ export const useClient = (props: UseClientProps): ClientState => {
                 return
             }
             voiceChangerClientRef.current.setInputChunkNum(num)
-            voiceChangerClientRef.current.stop()
         }
     }, [])
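The useClient.ts hunks are behavioral cleanup: setProtocol and setInputChunkNum now apply to the live client without stopping the stream. The 106_server_control.tsx change lists props.convertStart and props.convertStop in the useMemo dependencies, which is only cheap if those callbacks are referentially stable; a parent that recreates them on each render would force the row to recompute every time. A sketch of the usual countermeasure on the caller's side (names are illustrative, not from this patch):

    // Sketch: memoize the handlers so their identity is stable across renders.
    import { useCallback } from "react"

    export const useConvertHandlers = (client: { start: () => void; stop: () => void }) => {
        const convertStart = useCallback(() => { client.start() }, [client])
        const convertStop = useCallback(() => { client.stop() }, [client])
        return { convertStart, convertStop }
    }
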
diff --git a/client/lib/src/VoiceChangerClient.ts b/client/lib/src/VoiceChangerClient.ts
index ed416ce0..8a2e3c27 100644
--- a/client/lib/src/VoiceChangerClient.ts
+++ b/client/lib/src/VoiceChangerClient.ts
@@ -33,6 +33,8 @@ export class VoiceChnagerClient {
     private promiseForInitialize: Promise
     private _isVoiceChanging = false

+    private sslCertified: string[] = []
+
     private callbacks: Callbacks = {
         onVoiceReceived: (voiceChangerMode: VoiceChangerMode, data: ArrayBuffer): void => {
             // console.log(voiceChangerMode, data)
@@ -176,11 +178,12 @@
         const pageUrl = `${location.protocol}//${location.host}`
         console.log("SERVER CHECK", url, pageUrl)

-        if (url != pageUrl && location.protocol == "https:") {
+        if (url != pageUrl && location.protocol == "https:" && this.sslCertified.includes(url) == false) {
             if (openTab) {
                 const value = window.confirm("MMVC Server is different from this page's origin. Open tab to open ssl connection. OK? (You can close the opened tab after the ssl connection succeeds.)");
                 if (value) {
                     window.open(url, '_blank')
+                    this.sslCertified.push(url)
                 } else {
                     alert("Your voice conversion may fail...")
                 }
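The new sslCertified list remembers, for the lifetime of the page, which server origins the user has already opened a certificate tab for, so the confirm dialog fires once per origin instead of on every connection check. The same idea in isolation (standalone names, not the class above):

    // Sketch: prompt once per origin, then remember the answer for the session.
    const certifiedOrigins: string[] = []

    const ensureSslCertified = (serverUrl: string): void => {
        const pageUrl = `${location.protocol}//${location.host}`
        if (serverUrl == pageUrl || location.protocol != "https:") return
        if (certifiedOrigins.includes(serverUrl)) return  // already confirmed
        if (window.confirm("Open a tab to accept the server certificate?")) {
            window.open(serverUrl, "_blank")
            certifiedOrigins.push(serverUrl)
        } else {
            alert("Your voice conversion may fail...")
        }
    }
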
diff --git a/server/const.py b/server/const.py
index 47e9ac93..35fca3a1 100644
--- a/server/const.py
+++ b/server/const.py
@@ -1 +1,6 @@
 frontend_path = "../client/demo/dist"
+
+
+ERROR_NO_ONNX_SESSION = "ERROR_NO_ONNX_SESSION"
+
+

diff --git a/server/sio/MMVC_Namespace.py b/server/sio/MMVC_Namespace.py
index 6b8c03c1..dfde39ac 100644
--- a/server/sio/MMVC_Namespace.py
+++ b/server/sio/MMVC_Namespace.py
@@ -20,24 +20,14 @@ class MMVC_Namespace(socketio.AsyncNamespace):
         pass

     async def on_request_message(self, sid, msg):
-        # print("on_request_message", torch.cuda.memory_allocated())
-        gpu = int(msg[0])
-        srcId = int(msg[1])
-        dstId = int(msg[2])
-        timestamp = int(msg[3])
-        convertChunkNum = int(msg[4])
-        crossFadeLowerValue = float(msg[5])
-        crossFadeOffsetRate = float(msg[6])
-        crossFadeEndRate = float(msg[7])
-        data = msg[8]
+        timestamp = int(msg[0])
+        data = msg[1]
         unpackedData = np.array(struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data))
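The request message shrinks from nine positional fields to two: conversion parameters (gpu, speaker ids, chunk size, crossfade rates) now live in the server-side settings object, updated via update_setting below and the client's updateSettings calls above, so each audio packet carries only a timestamp plus raw little-endian 16-bit PCM, decoded by the struct.unpack line. A self-contained sketch of that decode step (decode_request is an illustrative name):

    # Sketch: decode the slimmed two-field request.
    import struct
    import numpy as np

    def decode_request(msg):
        timestamp = int(msg[0])
        data = msg[1]                                  # bytes of int16 samples
        count = len(data) // struct.calcsize('<h')     # '<h' = 2 bytes per sample
        samples = np.array(struct.unpack('<%sh' % count, data))
        return timestamp, samples
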
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
+            if key == "gpu" and val >= 0 and val < self.gpu_num and hasattr(self, "onnx_session"):
+                providers = self.onnx_session.get_providers()
+                print("Providers::::", providers)
+                if "CUDAExecutionProvider" in providers:
+                    provider_options=[{'device_id': self.settings.gpu}]
+                    self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
             return self.get_info()
         elif key in self.settings.floatData:
             setattr(self.settings, key, float(val))
@@ -113,22 +124,7 @@ class VoiceChanger():
             print(f"{key} is not a mutable variable!")
             return self.get_info()

-
-    # def set_gpu(self, gpu:int):
-    #     self.settings.gpu = gpu
-    #     return {"gpu":self.settings.gpu}
-
-    # def set_crossfade_setting(self, crossFadeOffsetRate:float, crossFadeEndRate:float):
-    #     self.settings.crossFadeOffsetRate = crossFadeOffsetRate
-    #     self.settings.crossFadeEndRate = crossFadeEndRate
-    #     self.unpackedData_length = 0  # to force strength recalculation at the next VC
-
-    # def set_conversion_setting(self, srcId:int, dstId:int):
-    #     self.settings.srcId = srcId
-    #     self.settings.dstId = dstId
-
-    # def set_convert_chunk_num(self, convertChunkNum):
-    #     self.settings.convertChunkNum = convertChunkNum

     def _generate_strength(self, unpackedData):
@@ -179,6 +175,91 @@ class VoiceChanger():

         return data

+    def _onnx_inference(self, data, inputSize):
+        if hasattr(self, 'onnx_session'):
+            x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x for x in data]
+            sid_tgt1 = torch.LongTensor([self.settings.dstId])
+            # if spec.size()[2] >= 8:
+            audio1 = self.onnx_session.run(
+                ["audio"],
+                {
+                    "specs": spec.numpy(),
+                    "lengths": spec_lengths.numpy(),
+                    "sid_src": sid_src.numpy(),
+                    "sid_tgt": sid_tgt1.numpy()
+                })[0][0,0] * self.hps.data.max_wav_value
+            if hasattr(self, 'np_prev_audio1') == True:
+                prev = self.np_prev_audio1[-1*inputSize:]
+                cur = audio1[-2*inputSize:-1*inputSize]
+                # print(prev.shape, self.np_prev_strength.shape, cur.shape, self.np_cur_strength.shape)
+                powered_prev = prev * self.np_prev_strength
+                powered_cur = cur * self.np_cur_strength
+                result = powered_prev + powered_cur
+                #result = prev * self.np_prev_strength + cur * self.np_cur_strength
+            else:
+                cur = audio1[-2*inputSize:-1*inputSize]
+                result = cur
+            self.np_prev_audio1 = audio1
+            return result
+        else:
+            raise ValueError(ERROR_NO_ONNX_SESSION, "No ONNX Session.")
+
+    def _pyTorch_inference(self, data, inputSize):
+        if self.settings.gpu < 0 or self.gpu_num == 0:
+            with torch.no_grad():
+                x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
+                sid_tgt1 = torch.LongTensor([self.settings.dstId]).cpu()
+                audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0, 0].data * self.hps.data.max_wav_value)
+
+                if self.prev_strength.device != torch.device('cpu'):
+                    print(f"prev_strength move from {self.prev_strength.device} to cpu")
+                    self.prev_strength = self.prev_strength.cpu()
+                if self.cur_strength.device != torch.device('cpu'):
+                    print(f"cur_strength move from {self.cur_strength.device} to cpu")
+                    self.cur_strength = self.cur_strength.cpu()
+
+                if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cpu'):
+                    prev = self.prev_audio1[-1*inputSize:]
+                    cur = audio1[-2*inputSize:-1*inputSize]
+                    result = prev * self.prev_strength + cur * self.cur_strength
+                else:
+                    cur = audio1[-2*inputSize:-1*inputSize]
+                    result = cur
+
+                self.prev_audio1 = audio1
+            result = result.cpu().float().numpy()
+
+        else:
+            with torch.no_grad():
+                x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(self.settings.gpu) for x in data]
+                sid_tgt1 = torch.LongTensor([self.settings.dstId]).cuda(self.settings.gpu)
+                audio1 = self.net_g.cuda(self.settings.gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0, 0].data * self.hps.data.max_wav_value
+
+                if self.prev_strength.device != torch.device('cuda', self.settings.gpu):
+                    print(f"prev_strength move from {self.prev_strength.device} to gpu{self.settings.gpu}")
+                    self.prev_strength = self.prev_strength.cuda(self.settings.gpu)
+                if self.cur_strength.device != torch.device('cuda', self.settings.gpu):
+                    print(f"cur_strength move from {self.cur_strength.device} to gpu{self.settings.gpu}")
+                    self.cur_strength = self.cur_strength.cuda(self.settings.gpu)
+
+                if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cuda', self.settings.gpu):
+                    prev = self.prev_audio1[-1*inputSize:]
+                    cur = audio1[-2*inputSize:-1*inputSize]
+                    result = prev * self.prev_strength + cur * self.cur_strength
+                    # print("merging...", prev.shape, cur.shape)
+                else:
+                    cur = audio1[-2*inputSize:-1*inputSize]
+                    result = cur
+                    # print("no merging...", cur.shape)
+                self.prev_audio1 = audio1
+
+            #print(result)
+            result = result.cpu().float().numpy()
+        return result
+
+
     def on_request(self, unpackedData:any):
         convertSize = self.settings.convertChunkNum * 128 # 128sample/1chunk
         if unpackedData.shape[0] * 2 > convertSize:
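Both _onnx_inference and _pyTorch_inference end with the same overlap-add: the tail of the previously converted chunk is blended with the matching region of the new chunk using the prev/cur strength envelopes prepared by _generate_strength (whose body is not shown in this patch). A standalone sketch of that blend, assuming raised-cosine envelopes rather than the real ones:

    # Sketch: crossfade the previous tail into the new head so chunk
    # boundaries don't click. The envelope shape here is an assumption,
    # not taken from _generate_strength.
    import numpy as np

    def crossfade(prev_tail: np.ndarray, cur_head: np.ndarray) -> np.ndarray:
        n = len(prev_tail)
        t = np.linspace(0.0, 1.0, n)
        prev_strength = np.cos(t * 0.5 * np.pi) ** 2   # fades 1 -> 0
        cur_strength = np.sin(t * 0.5 * np.pi) ** 2    # fades 0 -> 1, sums to 1
        return prev_tail * prev_strength + cur_head * cur_strength
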
@@ -189,96 +270,21 @@ class VoiceChanger():
         self._generate_strength(unpackedData)
         data = self._generate_input(unpackedData, convertSize)

-        # try:
-        #     # if gpu < 0 or (self.gpu_num == 0 and not self.mps_enabled):
-        #     if self.gpu == -2 and hasattr(self, 'onnx_session') == True:
-        #         x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x for x in data]
-        #         sid_tgt1 = torch.LongTensor([self.dstId])
-        #         # if spec.size()[2] >= 8:
-        #         audio1 = self.onnx_session.run(
-        #             ["audio"],
-        #             {
-        #                 "specs": spec.numpy(),
-        #                 "lengths": spec_lengths.numpy(),
-        #                 "sid_src": sid_src.numpy(),
-        #                 "sid_tgt": sid_tgt1.numpy()
-        #             })[0][0,0] * self.hps.data.max_wav_value
-        #         if hasattr(self, 'np_prev_audio1') == True:
-        #             prev = self.np_prev_audio1[-1*unpackedData.shape[0]:]
-        #             cur = audio1[-2*unpackedData.shape[0]:-1*unpackedData.shape[0]]
-        #             # print(prev.shape, self.np_prev_strength.shape, cur.shape, self.np_cur_strength.shape)
-        #             powered_prev = prev * self.np_prev_strength
-        #             powered_cur = cur * self.np_cur_strength
-        #             result = powered_prev + powered_cur
-        #             #result = prev * self.np_prev_strength + cur * self.np_cur_strength
-        #         else:
-        #             cur = audio1[-2*unpackedData.shape[0]:-1*unpackedData.shape[0]]
-        #             result = cur
-        #         self.np_prev_audio1 = audio1
-        #     elif self.gpu < 0 or self.gpu_num == 0:
-        #         with torch.no_grad():
-        #             x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [
-        #                 x.cpu() for x in data]
-        #             sid_tgt1 = torch.LongTensor([self.dstId]).cpu()
-        #             audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0, 0].data * self.hps.data.max_wav_value)
-
-        #             if self.prev_strength.device != torch.device('cpu'):
-        #                 print(f"prev_strength move from {self.prev_strength.device} to cpu")
-        #                 self.prev_strength = self.prev_strength.cpu()
-        #             if self.cur_strength.device != torch.device('cpu'):
-        #                 print(f"cur_strength move from {self.cur_strength.device} to cpu")
-        #                 self.cur_strength = self.cur_strength.cpu()
-
-        #             if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cpu'):
-        #                 prev = self.prev_audio1[-1*unpackedData.shape[0]:]
-        #                 cur = audio1[-2*unpackedData.shape[0]:-1*unpackedData.shape[0]]
-        #                 result = prev * self.prev_strength + cur * self.cur_strength
-        #             else:
-        #                 cur = audio1[-2*unpackedData.shape[0]:-1*unpackedData.shape[0]]
-        #                 result = cur
-
-        #             self.prev_audio1 = audio1
-        #             result = result.cpu().float().numpy()
-
-        #     else:
-        #         with torch.no_grad():
-        #             x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(self.gpu) for x in data]
-        #             sid_tgt1 = torch.LongTensor([self.dstId]).cuda(self.gpu)
-        #             audio1 = self.net_g.cuda(self.gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0, 0].data * self.hps.data.max_wav_value
-
-        #             if self.prev_strength.device != torch.device('cuda', self.gpu):
-        #                 print(f"prev_strength move from {self.prev_strength.device} to gpu{self.gpu}")
-        #                 self.prev_strength = self.prev_strength.cuda(self.gpu)
-        #             if self.cur_strength.device != torch.device('cuda', self.gpu):
-        #                 print(f"cur_strength move from {self.cur_strength.device} to gpu{self.gpu}")
-        #                 self.cur_strength = self.cur_strength.cuda(self.gpu)
-
-        #             if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cuda', self.gpu):
-        #                 prev = self.prev_audio1[-1*unpackedData.shape[0]:]
-        #                 cur = audio1[-2*unpackedData.shape[0]:-1*unpackedData.shape[0]]
-        #                 result = prev * self.prev_strength + cur * self.cur_strength
-        #                 # print("merging...", prev.shape, cur.shape)
-        #             else:
-        #                 cur = audio1[-2*unpackedData.shape[0]:-1*unpackedData.shape[0]]
-        #                 result = cur
-        #                 # print("no merging...", cur.shape)
-        #             self.prev_audio1 = audio1
-
-        #             #print(result)
-        #             result = result.cpu().float().numpy()
-
-        # except Exception as e:
-        #     print("VC PROCESSING!!!! EXCEPTION!!!", e)
-        #     print(traceback.format_exc())
-        #     del self.np_prev_audio1
-        #     del self.prev_audio1
-
-        # result = result.astype(np.int16)
-        # # print("on_request result size:",result.shape)
-        # return result
-        return
+        try:
+            if self.settings.framework == "ONNX":
+                result = self._onnx_inference(data, unpackedData.shape[0])
+            else:
+                result = self._pyTorch_inference(data, unpackedData.shape[0])
+        except Exception as e:
+            print("VC PROCESSING!!!! EXCEPTION!!!", e)
+            print(traceback.format_exc())
+            del self.np_prev_audio1
+            del self.prev_audio1
+
+        result = result.astype(np.int16)
+        # print("on_request result size:",result.shape)
+        return result

diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py
index 768d5a9b..ad06e287 100644
--- a/server/voice_changer/VoiceChangerManager.py
+++ b/server/voice_changer/VoiceChangerManager.py
@@ -27,23 +27,9 @@ class VoiceChangerManager():
         else:
             return {"no info":"no info"}

-    # def set_onnx_provider(self, provider:str):
-    #     if hasattr(self, 'voiceChanger'):
-    #         return self.voiceChanger.set_onnx_provider(provider)
-    #     else:
-    #         return {"error":"no voice changer"}
-
-
-    def changeVoice(self, gpu:int, srcId:int, dstId:int, timestamp:int, convertChunkNum:int, crossFadeLowerValue:float, crossFadeOffsetRate:float, crossFadeEndRate:float, unpackedData:any):
+    def changeVoice(self, unpackedData:any):
         if hasattr(self, 'voiceChanger') == True:
             return self.voiceChanger.on_request(unpackedData)
         else:
             print("Voice Change is not loaded. Did you load a correct model?")
             return np.zeros(1).astype(np.int16)
-
-    def changeVoice_old(self, gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData):
-        if hasattr(self, 'voiceChanger') == True:
-            return self.voiceChanger.on_request(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
-        else:
-            print("Voice Change is not loaded. Did you load a correct model?")
-            return np.zeros(1).astype(np.int16)
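One note on the update_setting hunk in VoiceChanger.py above: when the gpu key changes, the existing onnxruntime session is re-bound to the new device with set_providers rather than by reloading the model. A sketch of that call in isolation (rebind_gpu is an illustrative name; session creation is assumed to have happened elsewhere):

    # Sketch: move a live onnxruntime session to another CUDA device.
    import onnxruntime

    def rebind_gpu(session: onnxruntime.InferenceSession, gpu: int) -> None:
        if gpu >= 0 and "CUDAExecutionProvider" in session.get_providers():
            session.set_providers(
                providers=["CUDAExecutionProvider"],
                provider_options=[{"device_id": gpu}],
            )
        else:
            session.set_providers(providers=["CPUExecutionProvider"])  # CPU fallback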