From 17597fdaabacacae96f09f5270333604bc69f502 Mon Sep 17 00:00:00 2001
From: w-okada
Date: Wed, 29 Nov 2023 00:30:52 +0900
Subject: [PATCH] Add chihaya_jinja_sample

Web Edition improvement(16k test)

bugfix:
- merge slot
- servermode append error

---
 .gitignore                                   |  3 +
 .../src/001_globalHooks/100_useWebInfo.ts    |  9 +++
 .../demo/components2/101-0_Portrait.tsx      |  2 +-
 .../101-8_web-editionSettingArea.tsx         | 10 +++
 server/const.py                              |  8 +--
 .../inferencer/DiffusionSVCInferencer.py     | 26 +++-----
 .../DiffusionSVC/pipeline/Pipeline.py        | 32 +++------
 server/voice_changer/Local/ServerDevice.py   |  4 +-
 server/voice_changer/VoiceChangerManager.py  |  2 +-
 server/voice_changer/utils/Timer.py          | 66 +++++++++----------
 10 files changed, 81 insertions(+), 81 deletions(-)

diff --git a/.gitignore b/.gitignore
index 11ec2bd0..44b31e86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,9 @@
 server/samples_0003_o.json
 server/samples_0003_t2.json
 server/samples_0003_o2.json
 server/samples_0003_d2.json
+server/samples_0004_t.json
+server/samples_0004_o.json
+server/samples_0004_d.json
 server/test_official_v1_v2.json
 server/test_ddpn_v1_v2.json
diff --git a/client/demo/src/001_globalHooks/100_useWebInfo.ts b/client/demo/src/001_globalHooks/100_useWebInfo.ts
index 3e058522..13091959 100644
--- a/client/demo/src/001_globalHooks/100_useWebInfo.ts
+++ b/client/demo/src/001_globalHooks/100_useWebInfo.ts
@@ -45,6 +45,7 @@ export type WebInfoStateAndMethod = WebInfoState & {
 const ModelSampleRateStr = {
     "40k": "40k",
     "32k": "32k",
+    "16k": "16k",
 } as const;
 type ModelSampleRateStr = (typeof ModelSampleRateStr)[keyof typeof ModelSampleRateStr];
@@ -71,18 +72,22 @@ const noF0ModelUrl: { [modelType in VoiceChangerType]: { [inputLength in InputLe
         "24000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_24000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_24000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_24000.bin",
         },
         "16000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_16000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_16000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_16000.bin",
         },
         "12000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_12000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_12000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_12000.bin",
         },
         "8000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_8000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_8000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_8000.bin",
         },
     },
 };
@@ -109,18 +114,22 @@ const f0ModelUrl: { [modelType in VoiceChangerType]: { [inputLength in InputLeng
         "24000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_24000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_24000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_24000.bin",
         },
         "16000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_16000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_16000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_16000.bin",
         },
         "12000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_12000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_12000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_12000.bin",
         },
         "8000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_8000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_8000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_8000.bin",
         },
     },
 };
diff --git a/client/demo/src/components/demo/components2/101-0_Portrait.tsx b/client/demo/src/components/demo/components2/101-0_Portrait.tsx
index 278e8297..ae2d2df7 100644
--- a/client/demo/src/components/demo/components2/101-0_Portrait.tsx
+++ b/client/demo/src/components/demo/components2/101-0_Portrait.tsx
@@ -50,7 +50,7 @@ export const Portrait = (_props: PortraitProps) => {
         }
         vol.innerText = volume.toFixed(4);
         if (webEdition) {
-            buf.innerText = webInfoState.responseTimeInfo.realDuration.toString() ?? "0";
+            buf.innerText = bufferingTime.toString();
             res.innerText = webInfoState.responseTimeInfo.responseTime.toString() ?? "0";
             rtf.innerText = webInfoState.responseTimeInfo.rtf.toString() ?? "0";
         } else {
diff --git a/client/demo/src/components/demo/components2/101-8_web-editionSettingArea.tsx b/client/demo/src/components/demo/components2/101-8_web-editionSettingArea.tsx
index 9db41bbc..ef4d80e7 100644
--- a/client/demo/src/components/demo/components2/101-8_web-editionSettingArea.tsx
+++ b/client/demo/src/components/demo/components2/101-8_web-editionSettingArea.tsx
@@ -55,6 +55,7 @@ export const WebEditionSettingArea = (_props: WebEditionSettingAreaProps) => {
     );
+    const sr16KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "16k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
     const sr32KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "32k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
     const sr40KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "40k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
     const sampleRate = (
@@ -64,6 +65,15 @@ export const WebEditionSettingArea = (_props: WebEditionSettingAreaProps) => {
+                <div
+                    className={sr16KClassName}
+                    onClick={() => {
+                        if (webInfoState.voiceChangerConfig.sampleRate == "16k" || !readyForConfig) return;
+                        webInfoState.setVoiceChangerConfig("rvcv2", "16k", webInfoState.voiceChangerConfig.useF0, webInfoState.voiceChangerConfig.inputLength);
+                    }}
+                >
+                    16k
+                </div>
diff --git a/server/const.py b/server/const.py
index 1af55e3f..2ab78af7 100644
--- a/server/const.py
+++ b/server/const.py
@@ -98,11 +98,9 @@ RVCSampleMode: TypeAlias = Literal[
 def getSampleJsonAndModelIds(mode: RVCSampleMode):
     if mode == "production":
         return [
-            # "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0001.json",
-            # "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0002.json",
-            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_t2.json",
-            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_o2.json",
-            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_d2.json",
+            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_t.json",
+            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_o.json",
+            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_d.json",
         ], [
             ("Tsukuyomi-chan_o", {"useIndex": False}),
             ("Amitaro_o", {"useIndex": False}),
diff --git a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
index 3a364f2d..1f180b19 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
@@ -7,7 +7,7 @@ from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder
 from voice_changer.DiffusionSVC.inferencer.onnx.VocoderOnnx import VocoderOnnx
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2


 class DiffusionSVCInferencer(Inferencer):
@@ -49,18 +49,14 @@ class DiffusionSVCInferencer(Inferencer):
         return model_block_size, model_sampling_rate

     @torch.no_grad()  # 最基本推理代码,将输入标准化为tensor,只与mel打交道
-    def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
-                 gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
-                 spk_emb=None):
-
+    def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, gt_spec=None, infer_speedup=10, method="dpm-solver", k_step=None, use_tqdm=True, spk_emb=None):
         if self.diff_args.model.k_step_max is not None:
             if k_step is None:
                 raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
             if k_step > int(self.diff_args.model.k_step_max):
                 raise ValueError("k_step must <= k_step_max of Shallow Diffusion Model")
             if gt_spec is None:
-                raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from "
-                                 "input mel or output of naive model")
+                raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from " "input mel or output of naive model")

         aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
@@ -75,8 +71,7 @@
         return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)

     @torch.no_grad()
-    def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
-                         aug_shift=0, spk_emb=None):
+    def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, spk_emb=None):
         # spk_id
         spk_emb_dict = None
         if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
@@ -85,9 +80,7 @@
         else:
             spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
         aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
-        out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
-                                    aug_shift=aug_shift, infer=True,
-                                    spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+        out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, infer=True, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
         return out_spec

     @torch.no_grad()
@@ -114,19 +107,18 @@ class DiffusionSVCInferencer(Inferencer):
         silence_front: float,
         skip_diffusion: bool = True,
     ) -> torch.Tensor:
-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
         # print("[ ----Timer::1: ]", t.secs)

-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             if skip_diffusion == 0:
-                out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+                out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method="dpm-solver", k_step=k_step, use_tqdm=False, spk_emb=None)
                 gt_spec = out_mel
         # print("[ ----Timer::2: ]", t.secs)
-
-        with Timer("pre-process", False) as t:  # NOQA
+        with Timer2("pre-process", False) as t:  # NOQA
             if self.vocoder_onnx is None:
                 start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
                 out_wav = self.mel2wav(gt_spec, pitch, start_frame=start_frame)
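Note on the DiffusionSVCInferencer hunks above: the `__call__` and `naive_model_call` signatures are only reflowed onto one line (black-style formatting); the shallow-diffusion guard itself is unchanged. When the model has a `k_step_max`, the caller must pass a `k_step` no larger than it and must supply a `gt_spec` (the mel from the naive model or the input). A standalone sketch of that guard, using only the parameter names from the hunk (the helper function itself is hypothetical, not part of the project):

# Illustrative sketch of the shallow-diffusion argument check shown in __call__ above.
def check_shallow_diffusion_args(k_step_max, k_step, gt_spec):
    if k_step_max is None:
        return  # full diffusion model: no restriction on k_step
    if k_step is None:
        raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
    if k_step > int(k_step_max):
        raise ValueError("k_step must <= k_step_max of Shallow Diffusion Model")
    if gt_spec is None:
        raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from input mel or output of naive model")


check_shallow_diffusion_args(k_step_max=100, k_step=20, gt_spec="mel-from-naive-model")  # passes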
diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
index c808cf4c..1d4a3c9a 100644
--- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
+++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
@@ -17,7 +17,7 @@ from voice_changer.RVC.embedder.Embedder import Embedder
 from voice_changer.common.VolumeExtractor import VolumeExtractor
 from torchaudio.transforms import Resample

-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2

 logger = VoiceChangaerLogger.get_instance().getLogger()
@@ -45,7 +45,7 @@ class Pipeline(object):
         device,
         isHalf,
         resamplerIn: Resample,
-        resamplerOut: Resample
+        resamplerOut: Resample,
     ):
         self.inferencer = inferencer
         inferencer_block_size, inferencer_sampling_rate = inferencer.getConfig()
@@ -64,7 +64,7 @@ class Pipeline(object):
         logger.info("GENERATE INFERENCER" + str(self.inferencer))
         logger.info("GENERATE EMBEDDER" + str(self.embedder))
         logger.info("GENERATE PITCH EXTRACTOR" + str(self.pitchExtractor))
-
+
         self.targetSR = targetSR
         self.device = device
         self.isHalf = False
@@ -103,7 +103,7 @@ class Pipeline(object):
         skip_diffusion=True,
     ):
         # print("---------- pipe line --------------------")
-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
             audio16k = self.resamplerIn(audio_t)
             volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
@@ -111,7 +111,7 @@ class Pipeline(object):
             n_frames = int(audio16k.size(-1) // self.hop_size + 1)
         # print("[Timer::1: ]", t.secs)

-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             # ピッチ検出
             try:
                 # pitch = self.pitchExtractor.extract(
@@ -141,8 +141,7 @@ class Pipeline(object):
             feats = feats.view(1, -1)
         # print("[Timer::2: ]", t.secs)

-        with Timer("pre-process", False) as t:
-
+        with Timer2("pre-process", False) as t:
             # embedding
             with autocast(enabled=self.isHalf):
                 try:
@@ -156,28 +155,17 @@ class Pipeline(object):
                         raise DeviceChangingException()
                     else:
                         raise e
-            feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+            feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode="nearest").permute(0, 2, 1)
         # print("[Timer::3: ]", t.secs)

-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             # 推論実行
             try:
                 with torch.no_grad():
                     with autocast(enabled=self.isHalf):
                         audio1 = (
                             torch.clip(
-                                self.inferencer.infer(
-                                    audio16k,
-                                    feats,
-                                    pitch.unsqueeze(-1),
-                                    volume,
-                                    mask,
-                                    sid,
-                                    k_step,
-                                    infer_speedup,
-                                    silence_front=silence_front,
-                                    skip_diffusion=skip_diffusion
-                                ).to(dtype=torch.float32),
+                                self.inferencer.infer(audio16k, feats, pitch.unsqueeze(-1), volume, mask, sid, k_step, infer_speedup, silence_front=silence_front, skip_diffusion=skip_diffusion).to(dtype=torch.float32),
                                 -1.0,
                                 1.0,
                             )
@@ -191,7 +179,7 @@ class Pipeline(object):
                 raise e
         # print("[Timer::4: ]", t.secs)

-        with Timer("pre-process", False) as t:  # NOQA
+        with Timer2("pre-process", False) as t:  # NOQA
             feats_buffer = feats.squeeze(0).detach().cpu()
             if pitch is not None:
                 pitch_buffer = pitch.squeeze(0).detach().cpu()
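The Pipeline hunks above mostly swap Timer for Timer2 and reflow calls, but they also show the frame-count arithmetic `n_frames = int(audio16k.size(-1) // self.hop_size + 1)`. A worked example of that arithmetic, where the 16 kHz rate matches the `audio16k` resampling target in this file and the hop size of 320 samples is only an illustrative assumption (the real value comes from the inferencer's block size):

# Worked example of the frame-count arithmetic in Pipeline.exec above.
sample_rate = 16000        # audio16k is the input resampled to 16 kHz
hop_size = 320             # hypothetical hop size (20 ms at 16 kHz)
num_samples = sample_rate  # one second of audio

n_frames = int(num_samples // hop_size + 1)
print(n_frames)  # -> 51 frames for one second of audio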
diff --git a/server/voice_changer/Local/ServerDevice.py b/server/voice_changer/Local/ServerDevice.py
index 1f46409c..3e8217d9 100644
--- a/server/voice_changer/Local/ServerDevice.py
+++ b/server/voice_changer/Local/ServerDevice.py
@@ -9,7 +9,7 @@ from mods.log_control import VoiceChangaerLogger
 from voice_changer.Local.AudioDeviceList import checkSamplingRate, list_audio_device
 import time
 import sounddevice as sd
-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2
 import librosa
 from voice_changer.utils.VoiceChangerModel import AudioInOut
@@ -139,7 +139,7 @@ class ServerDevice:
         return out_wav, times

     def _processDataWithTime(self, indata: np.ndarray):
-        with Timer("all_inference_time") as t:
+        with Timer2("all_inference_time", False) as t:
             out_wav, times = self._processData(indata)
         all_inference_time = t.secs
         self.performance = [all_inference_time] + times
diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py
index f14c1617..309b3131 100644
--- a/server/voice_changer/VoiceChangerManager.py
+++ b/server/voice_changer/VoiceChangerManager.py
@@ -364,7 +364,7 @@ class VoiceChangerManager(ServerDeviceCallbacks):
         req = json.loads(request)
         req = ModelMergerRequest(**req)
         req.files = [MergeElement(**f) for f in req.files]
-        slot = len(self.modelSlotManager.getAllSlotInfo()) - 1
+        slot = len(self.modelSlotManager.getAllSlotInfo()) - 2  # Beatrice-JVS が追加されたので -1 -> -2
         if req.voiceChangerType == "RVC":
             merged = RVCModelMerger.merge_models(self.params, req, slot)
             loadParam = LoadModelParams(voiceChangerType="RVC", slot=slot, isSampleMode=False, sampleId="", files=[LoadModelParamFile(name=os.path.basename(merged), kind="rvcModel", dir="")], params={})
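The `- 1` → `- 2` change above is the "merge slot" fix from the commit message. The Japanese comment on the added line says, roughly, "Beatrice-JVS was added, so -1 -> -2": the last slot is apparently now occupied by the bundled Beatrice-JVS model, so merged models must be written to the slot before it. A minimal sketch of that index arithmetic; the slot names below are hypothetical, only the `len(...) - 2` rule mirrors the hunk:

# Minimal sketch of the merge-slot arithmetic shown above.
slots = ["user_slot_0", "user_slot_1", "user_slot_2", "merge_target", "Beatrice-JVS"]

# Old behaviour: len(slots) - 1 pointed at the last slot, which now holds the
# bundled Beatrice-JVS model, so a merge would have overwritten it.
# New behaviour: len(slots) - 2 targets the slot just before it.
merge_slot = len(slots) - 2
print(merge_slot, slots[merge_slot])  # -> 3 merge_target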
diff --git a/server/voice_changer/utils/Timer.py b/server/voice_changer/utils/Timer.py
index 43dc0b16..81eee9af 100644
--- a/server/voice_changer/utils/Timer.py
+++ b/server/voice_changer/utils/Timer.py
@@ -3,45 +3,45 @@ import inspect
 from typing import Dict, List


-class Timer(object):
-    storedSecs: Dict[str, Dict[str, List[float]]] = {}  # Class variable
+# class Timer(object):
+#     storedSecs: Dict[str, Dict[str, List[float]]] = {}  # Class variable

-    def __init__(self, title: str, enalbe: bool = True):
-        self.title = title
-        self.enable = enalbe
-        self.secs = 0
-        self.msecs = 0
-        self.avrSecs = 0
+#     def __init__(self, title: str, enalbe: bool = True):
+#         self.title = title
+#         self.enable = enalbe
+#         self.secs = 0
+#         self.msecs = 0
+#         self.avrSecs = 0

-        if self.enable is False:
-            return
+#         if self.enable is False:
+#             return

-        self.maxStores = 10
+#         self.maxStores = 10

-        current_frame = inspect.currentframe()
-        caller_frame = inspect.getouterframes(current_frame, 2)
-        frame = caller_frame[1]
-        filename = frame.filename
-        line_number = frame.lineno
-        self.key = f"{title}_{filename}_{line_number}"
-        if self.key not in self.storedSecs:
-            self.storedSecs[self.key] = {}
+#         current_frame = inspect.currentframe()
+#         caller_frame = inspect.getouterframes(current_frame, 2)
+#         frame = caller_frame[1]
+#         filename = frame.filename
+#         line_number = frame.lineno
+#         self.key = f"{title}_{filename}_{line_number}"
+#         if self.key not in self.storedSecs:
+#             self.storedSecs[self.key] = {}

-    def __enter__(self):
-        if self.enable is False:
-            return
-        self.start = time.time()
-        return self
+#     def __enter__(self):
+#         if self.enable is False:
+#             return
+#         self.start = time.time()
+#         return self

-    def __exit__(self, *_):
-        if self.enable is False:
-            return
-        self.end = time.time()
-        self.secs = self.end - self.start
-        self.msecs = self.secs * 1000  # millisecs
-        self.storedSecs[self.key].append(self.secs)
-        self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores :]
-        self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
+#     def __exit__(self, *_):
+#         if self.enable is False:
+#             return
+#         self.end = time.time()
+#         self.secs = self.end - self.start
+#         self.msecs = self.secs * 1000  # millisecs
+#         self.storedSecs[self.key].append(self.secs)
+#         self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores :]
+#         self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])


 class Timer2(object):
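The hunk above comments out the old Timer class, and every call site in this patch now uses Timer2 with the signature Timer2(title, enable) and reads t.secs afterwards; the body of Timer2 itself is cut off at the end of this excerpt. A minimal sketch of a context-manager timer with that same interface, written as an assumption about what the call sites rely on rather than as the project's actual Timer2 implementation:

import time


class Timer2Sketch(object):
    """Minimal context-manager timer matching the call-site interface above:
    Timer2Sketch(title, enable) and a .secs attribute after the with-block.
    Illustrative sketch only, not the project's actual Timer2 class."""

    def __init__(self, title: str, enable: bool = True):
        self.title = title
        self.enable = enable
        self.secs = 0.0
        self.msecs = 0.0

    def __enter__(self):
        if self.enable:
            self.start = time.time()
        # Always return self so t.secs is readable even when timing is disabled.
        return self

    def __exit__(self, *_):
        if not self.enable:
            return
        self.secs = time.time() - self.start
        self.msecs = self.secs * 1000  # milliseconds


# Usage mirroring ServerDevice._processDataWithTime in the diff above:
with Timer2Sketch("all_inference_time", False) as t:
    time.sleep(0.01)
print(t.secs)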