Add chihaya_jinja_sample
Web Edition improvement (16k test). Bugfix: merge slot; server-mode append error.
This commit is contained in:
parent b24c781a72, commit 17597fdaab
.gitignore (vendored): 3 additions

@@ -58,6 +58,9 @@ server/samples_0003_o.json
 server/samples_0003_t2.json
 server/samples_0003_o2.json
 server/samples_0003_d2.json
+server/samples_0004_t.json
+server/samples_0004_o.json
+server/samples_0004_d.json
 
 server/test_official_v1_v2.json
 server/test_ddpn_v1_v2.json
@@ -45,6 +45,7 @@ export type WebInfoStateAndMethod = WebInfoState & {
 const ModelSampleRateStr = {
     "40k": "40k",
     "32k": "32k",
+    "16k": "16k",
 } as const;
 type ModelSampleRateStr = (typeof ModelSampleRateStr)[keyof typeof ModelSampleRateStr];
 
@@ -71,18 +72,22 @@ const noF0ModelUrl: { [modelType in VoiceChangerType]: { [inputLength in InputLe
         "24000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_24000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_24000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_24000.bin",
         },
         "16000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_16000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_16000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_16000.bin",
         },
         "12000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_12000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_12000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_12000.bin",
         },
         "8000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_8000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_8000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_8000.bin",
         },
     },
 };
 
@@ -109,18 +114,22 @@ const f0ModelUrl: { [modelType in VoiceChangerType]: { [inputLength in InputLeng
         "24000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_24000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_24000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_24000.bin",
         },
         "16000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_16000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_16000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_16000.bin",
         },
         "12000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_12000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_12000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_12000.bin",
         },
         "8000": {
             "40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_8000.bin",
             "32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_8000.bin",
+            "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_8000.bin",
         },
     },
 };
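The URLs above follow a single naming pattern, so the lookup the Web Edition performs when a sample rate is selected reduces to string assembly over the f0 flag, the input length, and the model sample rate. A minimal Python sketch of that pattern (resolve_model_url is a hypothetical helper for illustration, not code from this repository):

    # Sketch only: mirrors the naming convention of the URL tables above.
    BASE = "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro"

    def resolve_model_url(use_f0: bool, input_length: str, sample_rate: str) -> str:
        f0_tag = "f0" if use_f0 else "nof0"
        return f"{BASE}/rvcv2_amitaro_v2_{sample_rate}_{f0_tag}_{input_length}.bin"

    # The new "16k" row of the noF0 table is reproduced exactly:
    assert resolve_model_url(False, "24000", "16k").endswith("rvcv2_amitaro_v2_16k_nof0_24000.bin")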
@@ -50,7 +50,7 @@ export const Portrait = (_props: PortraitProps) => {
         }
         vol.innerText = volume.toFixed(4);
         if (webEdition) {
-            buf.innerText = webInfoState.responseTimeInfo.realDuration.toString() ?? "0";
+            buf.innerText = bufferingTime.toString();
             res.innerText = webInfoState.responseTimeInfo.responseTime.toString() ?? "0";
             rtf.innerText = webInfoState.responseTimeInfo.rtf.toString() ?? "0";
         } else {
@@ -55,6 +55,7 @@ export const WebEditionSettingArea = (_props: WebEditionSettingAreaProps) => {
             </div>
         );
 
+    const sr16KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "16k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
     const sr32KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "32k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
     const sr40KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "40k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
     const sampleRate = (
 
@@ -64,6 +65,15 @@ export const WebEditionSettingArea = (_props: WebEditionSettingAreaProps) => {
         <div className="character-area-slider-control">
             <span className="character-area-slider-control-kind"></span>
             <span className="character-area-control-buttons">
+                <span
+                    className={!readyForConfig ? "character-area-control-button-disable" : sr16KClassName}
+                    onClick={() => {
+                        if (webInfoState.voiceChangerConfig.sampleRate == "16k" || !readyForConfig) return;
+                        webInfoState.setVoiceChangerConfig("rvcv2", "16k", webInfoState.voiceChangerConfig.useF0, webInfoState.voiceChangerConfig.inputLength);
+                    }}
+                >
+                    16k
+                </span>
                 <span
                     className={!readyForConfig ? "character-area-control-button-disable" : sr32KClassName}
                     onClick={() => {
@@ -98,11 +98,9 @@ RVCSampleMode: TypeAlias = Literal[
 def getSampleJsonAndModelIds(mode: RVCSampleMode):
     if mode == "production":
         return [
-            # "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0001.json",
-            # "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0002.json",
-            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_t2.json",
-            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_o2.json",
-            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_d2.json",
+            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_t.json",
+            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_o.json",
+            "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_d.json",
         ], [
             ("Tsukuyomi-chan_o", {"useIndex": False}),
             ("Amitaro_o", {"useIndex": False}),
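getSampleJsonAndModelIds only returns the JSON URLs and model ids; a caller still has to download and combine the files. A rough sketch of that consumption pattern, assuming each URL serves a JSON array of sample entries (fetch_samples and the array layout are assumptions, not the repository's actual loader):

    import json
    import urllib.request

    def fetch_samples(urls: list[str]) -> list[dict]:
        # Download each sample-definition JSON and concatenate the entries.
        samples: list[dict] = []
        for url in urls:
            with urllib.request.urlopen(url) as res:
                samples.extend(json.loads(res.read()))
        return samples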
@@ -7,7 +7,7 @@ from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder
 from voice_changer.DiffusionSVC.inferencer.onnx.VocoderOnnx import VocoderOnnx
 
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2
 
 
 class DiffusionSVCInferencer(Inferencer):
 
@@ -49,18 +49,14 @@ class DiffusionSVCInferencer(Inferencer):
         return model_block_size, model_sampling_rate
 
     @torch.no_grad()  # Core inference code: normalizes inputs to tensors and works only with mel spectrograms
-    def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
-                 gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
-                 spk_emb=None):
-
+    def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, gt_spec=None, infer_speedup=10, method="dpm-solver", k_step=None, use_tqdm=True, spk_emb=None):
         if self.diff_args.model.k_step_max is not None:
             if k_step is None:
                 raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
             if k_step > int(self.diff_args.model.k_step_max):
                 raise ValueError("k_step must <= k_step_max of Shallow Diffusion Model")
             if gt_spec is None:
-                raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from "
-                                 "input mel or output of naive model")
-
+                raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from " "input mel or output of naive model")
 
         aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
 
@@ -75,8 +71,7 @@ class DiffusionSVCInferencer(Inferencer):
         return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
 
     @torch.no_grad()
-    def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
-                         aug_shift=0, spk_emb=None):
+    def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, spk_emb=None):
         # spk_id
         spk_emb_dict = None
         if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
 
@@ -85,9 +80,7 @@ class DiffusionSVCInferencer(Inferencer):
         else:
             spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
         aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
-        out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
-                                    aug_shift=aug_shift, infer=True,
-                                    spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+        out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, infer=True, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
         return out_spec
 
     @torch.no_grad()
 
@@ -114,19 +107,18 @@ class DiffusionSVCInferencer(Inferencer):
         silence_front: float,
         skip_diffusion: bool = True,
     ) -> torch.Tensor:
-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
         # print("[ ----Timer::1: ]", t.secs)
 
-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             if skip_diffusion == 0:
-                out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+                out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method="dpm-solver", k_step=k_step, use_tqdm=False, spk_emb=None)
                 gt_spec = out_mel
             # print("[ ----Timer::2: ]", t.secs)
 
-        with Timer("pre-process", False) as t:  # NOQA
+        with Timer2("pre-process", False) as t:  # NOQA
             if self.vocoder_onnx is None:
                 start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
                 out_wav = self.mel2wav(gt_spec, pitch, start_frame=start_frame)
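For orientation, the infer path touched above runs three timed stages: the naive model produces an initial mel (gt_spec), the diffusion model optionally refines it unless skip_diffusion is set, and the vocoder renders the waveform. A compressed control-flow sketch following the names in the hunks above (the surrounding class and tensor plumbing are elided, so this is illustrative rather than runnable in isolation):

    # Sketch of the staged flow in DiffusionSVCInferencer.infer (names follow the hunks above).
    def infer_sketch(inferencer, feats, pitch, volume, sid, k_step, infer_speedup, silence_front, skip_diffusion=True):
        # Stage 1: the naive model produces an initial mel spectrogram.
        gt_spec = inferencer.naive_model_call(feats, pitch, volume, spk_id=sid)
        # Stage 2: optional shallow-diffusion refinement, skipped when skip_diffusion is set.
        if skip_diffusion == 0:
            gt_spec = inferencer(feats, pitch, volume, spk_id=sid, gt_spec=gt_spec, infer_speedup=infer_speedup, method="dpm-solver", k_step=k_step)
        # Stage 3: the vocoder renders the waveform, skipping leading silence.
        start_frame = int(silence_front * inferencer.vocoder.vocoder_sample_rate / inferencer.vocoder.vocoder_hop_size)
        return inferencer.mel2wav(gt_spec, pitch, start_frame=start_frame)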
@@ -17,7 +17,7 @@ from voice_changer.RVC.embedder.Embedder import Embedder
 from voice_changer.common.VolumeExtractor import VolumeExtractor
 from torchaudio.transforms import Resample
 
-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2
 
 logger = VoiceChangaerLogger.get_instance().getLogger()
 
@@ -45,7 +45,7 @@ class Pipeline(object):
         device,
         isHalf,
         resamplerIn: Resample,
-        resamplerOut: Resample
+        resamplerOut: Resample,
     ):
         self.inferencer = inferencer
         inferencer_block_size, inferencer_sampling_rate = inferencer.getConfig()
 
@@ -64,7 +64,7 @@ class Pipeline(object):
         logger.info("GENERATE INFERENCER" + str(self.inferencer))
         logger.info("GENERATE EMBEDDER" + str(self.embedder))
         logger.info("GENERATE PITCH EXTRACTOR" + str(self.pitchExtractor))
 
         self.targetSR = targetSR
         self.device = device
         self.isHalf = False
 
@@ -103,7 +103,7 @@ class Pipeline(object):
         skip_diffusion=True,
     ):
         # print("---------- pipe line --------------------")
-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
             audio16k = self.resamplerIn(audio_t)
             volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
 
@@ -111,7 +111,7 @@ class Pipeline(object):
             n_frames = int(audio16k.size(-1) // self.hop_size + 1)
         # print("[Timer::1: ]", t.secs)
 
-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             # pitch detection
             try:
                 # pitch = self.pitchExtractor.extract(
 
@@ -141,8 +141,7 @@ class Pipeline(object):
             feats = feats.view(1, -1)
         # print("[Timer::2: ]", t.secs)
 
-        with Timer("pre-process", False) as t:
-
+        with Timer2("pre-process", False) as t:
             # embedding
             with autocast(enabled=self.isHalf):
                 try:
 
@@ -156,28 +155,17 @@ class Pipeline(object):
                         raise DeviceChangingException()
                     else:
                         raise e
-            feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+            feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode="nearest").permute(0, 2, 1)
         # print("[Timer::3: ]", t.secs)
 
-        with Timer("pre-process", False) as t:
+        with Timer2("pre-process", False) as t:
             # run inference
             try:
                 with torch.no_grad():
                     with autocast(enabled=self.isHalf):
                         audio1 = (
                             torch.clip(
-                                self.inferencer.infer(
-                                    audio16k,
-                                    feats,
-                                    pitch.unsqueeze(-1),
-                                    volume,
-                                    mask,
-                                    sid,
-                                    k_step,
-                                    infer_speedup,
-                                    silence_front=silence_front,
-                                    skip_diffusion=skip_diffusion
-                                ).to(dtype=torch.float32),
+                                self.inferencer.infer(audio16k, feats, pitch.unsqueeze(-1), volume, mask, sid, k_step, infer_speedup, silence_front=silence_front, skip_diffusion=skip_diffusion).to(dtype=torch.float32),
                                 -1.0,
                                 1.0,
                             )
 
@@ -191,7 +179,7 @@ class Pipeline(object):
                 raise e
         # print("[Timer::4: ]", t.secs)
 
-        with Timer("pre-process", False) as t:  # NOQA
+        with Timer2("pre-process", False) as t:  # NOQA
             feats_buffer = feats.squeeze(0).detach().cpu()
             if pitch is not None:
                 pitch_buffer = pitch.squeeze(0).detach().cpu()
@@ -9,7 +9,7 @@ from mods.log_control import VoiceChangaerLogger
 from voice_changer.Local.AudioDeviceList import checkSamplingRate, list_audio_device
 import time
 import sounddevice as sd
-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2
 import librosa
 
 from voice_changer.utils.VoiceChangerModel import AudioInOut
 
@@ -139,7 +139,7 @@ class ServerDevice:
         return out_wav, times
 
     def _processDataWithTime(self, indata: np.ndarray):
-        with Timer("all_inference_time") as t:
+        with Timer2("all_inference_time", False) as t:
             out_wav, times = self._processData(indata)
             all_inference_time = t.secs
             self.performance = [all_inference_time] + times
@@ -364,7 +364,7 @@ class VoiceChangerManager(ServerDeviceCallbacks):
         req = json.loads(request)
         req = ModelMergerRequest(**req)
         req.files = [MergeElement(**f) for f in req.files]
-        slot = len(self.modelSlotManager.getAllSlotInfo()) - 1
+        slot = len(self.modelSlotManager.getAllSlotInfo()) - 2  # Beatrice-JVS was added, so -1 -> -2
         if req.voiceChangerType == "RVC":
             merged = RVCModelMerger.merge_models(self.params, req, slot)
             loadParam = LoadModelParams(voiceChangerType="RVC", slot=slot, isSampleMode=False, sampleId="", files=[LoadModelParamFile(name=os.path.basename(merged), kind="rvcModel", dir="")], params={})
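This is the "merge slot" fix from the commit message: merged models are written to the tail of the slot list, and because a Beatrice-JVS entry was appended to the end, len(slots) - 1 now points at a reserved slot. A toy illustration of the off-by-one (the slot contents here are invented for the example):

    # Toy model of the slot layout; names are made up.
    slots = ["user_0", "user_1", "merged_target", "Beatrice-JVS"]

    old_slot = len(slots) - 1  # 3: would overwrite the appended Beatrice-JVS slot
    new_slot = len(slots) - 2  # 2: lands on the slot intended for merged models

    assert slots[old_slot] == "Beatrice-JVS"
    assert slots[new_slot] == "merged_target"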
@@ -3,45 +3,45 @@ import inspect
 from typing import Dict, List
 
 
-class Timer(object):
-    storedSecs: Dict[str, Dict[str, List[float]]] = {}  # Class variable
+# class Timer(object):
+#     storedSecs: Dict[str, Dict[str, List[float]]] = {}  # Class variable
 
-    def __init__(self, title: str, enalbe: bool = True):
-        self.title = title
-        self.enable = enalbe
-        self.secs = 0
-        self.msecs = 0
-        self.avrSecs = 0
+#     def __init__(self, title: str, enalbe: bool = True):
+#         self.title = title
+#         self.enable = enalbe
+#         self.secs = 0
+#         self.msecs = 0
+#         self.avrSecs = 0
 
-        if self.enable is False:
-            return
+#         if self.enable is False:
+#             return
 
-        self.maxStores = 10
+#         self.maxStores = 10
 
-        current_frame = inspect.currentframe()
-        caller_frame = inspect.getouterframes(current_frame, 2)
-        frame = caller_frame[1]
-        filename = frame.filename
-        line_number = frame.lineno
-        self.key = f"{title}_{filename}_{line_number}"
-        if self.key not in self.storedSecs:
-            self.storedSecs[self.key] = {}
+#         current_frame = inspect.currentframe()
+#         caller_frame = inspect.getouterframes(current_frame, 2)
+#         frame = caller_frame[1]
+#         filename = frame.filename
+#         line_number = frame.lineno
+#         self.key = f"{title}_{filename}_{line_number}"
+#         if self.key not in self.storedSecs:
+#             self.storedSecs[self.key] = {}
 
-    def __enter__(self):
-        if self.enable is False:
-            return
-        self.start = time.time()
-        return self
+#     def __enter__(self):
+#         if self.enable is False:
+#             return
+#         self.start = time.time()
+#         return self
 
-    def __exit__(self, *_):
-        if self.enable is False:
-            return
-        self.end = time.time()
-        self.secs = self.end - self.start
-        self.msecs = self.secs * 1000  # millisecs
-        self.storedSecs[self.key].append(self.secs)
-        self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores :]
-        self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
+#     def __exit__(self, *_):
+#         if self.enable is False:
+#             return
+#         self.end = time.time()
+#         self.secs = self.end - self.start
+#         self.msecs = self.secs * 1000  # millisecs
+#         self.storedSecs[self.key].append(self.secs)
+#         self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores :]
+#         self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
 
 
 class Timer2(object):
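The Timer2 body is cut off in this view, but every migrated call site uses the same contract as the retired Timer: construct with a title and an enable flag, use as a context manager, read t.secs afterwards. A minimal stand-in with that interface, assuming Timer2 keeps Timer's timing semantics (this sketch is not the actual Timer2 implementation):

    import time

    class Timer2Sketch:
        """Stand-in matching the call sites above: with Timer2(title, enable) as t: ... t.secs."""

        def __init__(self, title: str, enable: bool = True):
            self.title = title
            self.enable = enable  # presumably gates per-call-site history, as in the old Timer
            self.secs = 0.0

        def __enter__(self):
            self.start = time.time()
            return self

        def __exit__(self, *_):
            # Always record elapsed time: call sites read t.secs even with enable=False.
            self.secs = time.time() - self.start

    with Timer2Sketch("all_inference_time", False) as t:
        time.sleep(0.01)
    print(t.secs)  # elapsed seconds, read exactly as the call sites above do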