diff --git a/.gitignore b/.gitignore
index 11ec2bd0..44b31e86 100644
--- a/.gitignore
+++ b/.gitignore
@@ -58,6 +58,9 @@ server/samples_0003_o.json
server/samples_0003_t2.json
server/samples_0003_o2.json
server/samples_0003_d2.json
+server/samples_0004_t.json
+server/samples_0004_o.json
+server/samples_0004_d.json
server/test_official_v1_v2.json
server/test_ddpn_v1_v2.json
diff --git a/client/demo/src/001_globalHooks/100_useWebInfo.ts b/client/demo/src/001_globalHooks/100_useWebInfo.ts
index 3e058522..13091959 100644
--- a/client/demo/src/001_globalHooks/100_useWebInfo.ts
+++ b/client/demo/src/001_globalHooks/100_useWebInfo.ts
@@ -45,6 +45,7 @@ export type WebInfoStateAndMethod = WebInfoState & {
const ModelSampleRateStr = {
"40k": "40k",
"32k": "32k",
+ "16k": "16k",
} as const;
type ModelSampleRateStr = (typeof ModelSampleRateStr)[keyof typeof ModelSampleRateStr];
@@ -71,18 +72,22 @@ const noF0ModelUrl: { [modelType in VoiceChangerType]: { [inputLength in InputLe
"24000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_24000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_24000.bin",
+ "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_24000.bin",
},
"16000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_16000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_16000.bin",
+ "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_16000.bin",
},
"12000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_12000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_12000.bin",
+ "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_12000.bin",
},
"8000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_nof0_8000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_nof0_8000.bin",
+ "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_nof0_8000.bin",
},
},
};
@@ -109,18 +114,22 @@ const f0ModelUrl: { [modelType in VoiceChangerType]: { [inputLength in InputLeng
"24000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_24000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_24000.bin",
+ "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_24000.bin",
},
"16000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_16000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_16000.bin",
+ "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_16000.bin",
},
"12000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_12000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_12000.bin",
+ "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_12000.bin",
},
"8000": {
"40k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_40k_f0_8000.bin",
"32k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_32k_f0_8000.bin",
+ "16k": "https://huggingface.co/wok000/vcclient_model/resolve/main/web_model/v_01_alpha/amitaro/rvcv2_amitaro_v2_16k_f0_8000.bin",
},
},
};
diff --git a/client/demo/src/components/demo/components2/101-0_Portrait.tsx b/client/demo/src/components/demo/components2/101-0_Portrait.tsx
index 278e8297..ae2d2df7 100644
--- a/client/demo/src/components/demo/components2/101-0_Portrait.tsx
+++ b/client/demo/src/components/demo/components2/101-0_Portrait.tsx
@@ -50,7 +50,7 @@ export const Portrait = (_props: PortraitProps) => {
}
vol.innerText = volume.toFixed(4);
if (webEdition) {
- buf.innerText = webInfoState.responseTimeInfo.realDuration.toString() ?? "0";
+ buf.innerText = bufferingTime.toString();
res.innerText = webInfoState.responseTimeInfo.responseTime.toString() ?? "0";
rtf.innerText = webInfoState.responseTimeInfo.rtf.toString() ?? "0";
} else {
diff --git a/client/demo/src/components/demo/components2/101-8_web-editionSettingArea.tsx b/client/demo/src/components/demo/components2/101-8_web-editionSettingArea.tsx
index 9db41bbc..ef4d80e7 100644
--- a/client/demo/src/components/demo/components2/101-8_web-editionSettingArea.tsx
+++ b/client/demo/src/components/demo/components2/101-8_web-editionSettingArea.tsx
@@ -55,6 +55,7 @@ export const WebEditionSettingArea = (_props: WebEditionSettingAreaProps) => {
);
+ const sr16KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "16k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
const sr32KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "32k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
const sr40KClassName = "character-area-control-button" + (webInfoState.voiceChangerConfig.sampleRate == "40k" ? " character-area-control-button-active" : " character-area-control-button-stanby");
const sampleRate = (
@@ -64,6 +65,15 @@ export const WebEditionSettingArea = (_props: WebEditionSettingAreaProps) => {
+                <div
+                    className={sr16KClassName}
+                    onClick={() => {
+                        if (webInfoState.voiceChangerConfig.sampleRate == "16k" || !readyForConfig) return;
+                        webInfoState.setVoiceChangerConfig("rvcv2", "16k", webInfoState.voiceChangerConfig.useF0, webInfoState.voiceChangerConfig.inputLength);
+                    }}
+                >
+                    16k
+                </div>
                 <div
                     className={sr32KClassName}
                     onClick={() => {
diff --git a/server/const.py b/server/const.py
index 1af55e3f..2ab78af7 100644
--- a/server/const.py
+++ b/server/const.py
@@ -98,11 +98,9 @@ RVCSampleMode: TypeAlias = Literal[
def getSampleJsonAndModelIds(mode: RVCSampleMode):
if mode == "production":
return [
- # "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0001.json",
- # "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0002.json",
- "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_t2.json",
- "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_o2.json",
- "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0003_d2.json",
+ "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_t.json",
+ "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_o.json",
+ "https://huggingface.co/wok000/vcclient_model/raw/main/samples_0004_d.json",
], [
("Tsukuyomi-chan_o", {"useIndex": False}),
("Amitaro_o", {"useIndex": False}),
diff --git a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
index 3a364f2d..1f180b19 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
@@ -7,7 +7,7 @@ from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder
from voice_changer.DiffusionSVC.inferencer.onnx.VocoderOnnx import VocoderOnnx
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2
class DiffusionSVCInferencer(Inferencer):
@@ -49,18 +49,14 @@ class DiffusionSVCInferencer(Inferencer):
return model_block_size, model_sampling_rate
@torch.no_grad() # 最基本推理代码,将输入标准化为tensor,只与mel打交道
- def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
- gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
- spk_emb=None):
-
+ def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, gt_spec=None, infer_speedup=10, method="dpm-solver", k_step=None, use_tqdm=True, spk_emb=None):
if self.diff_args.model.k_step_max is not None:
if k_step is None:
raise ValueError("k_step must not None when Shallow Diffusion Model inferring")
if k_step > int(self.diff_args.model.k_step_max):
raise ValueError("k_step must <= k_step_max of Shallow Diffusion Model")
if gt_spec is None:
- raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from "
- "input mel or output of naive model")
+ raise ValueError("gt_spec must not None when Shallow Diffusion Model inferring, gt_spec can from " "input mel or output of naive model")
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
@@ -75,8 +71,7 @@ class DiffusionSVCInferencer(Inferencer):
return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
@torch.no_grad()
- def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None,
- aug_shift=0, spk_emb=None):
+ def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, spk_emb=None):
# spk_id
spk_emb_dict = None
if self.diff_args.model.use_speaker_encoder: # with speaker encoder
@@ -85,9 +80,7 @@ class DiffusionSVCInferencer(Inferencer):
else:
spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
- out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
- aug_shift=aug_shift, infer=True,
- spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+ out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, infer=True, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
return out_spec
@torch.no_grad()
@@ -114,19 +107,18 @@ class DiffusionSVCInferencer(Inferencer):
silence_front: float,
skip_diffusion: bool = True,
) -> torch.Tensor:
- with Timer("pre-process", False) as t:
+ with Timer2("pre-process", False) as t:
gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
# print("[ ----Timer::1: ]", t.secs)
- with Timer("pre-process", False) as t:
+ with Timer2("pre-process", False) as t:
if skip_diffusion == 0:
- out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+ out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method="dpm-solver", k_step=k_step, use_tqdm=False, spk_emb=None)
gt_spec = out_mel
# print("[ ----Timer::2: ]", t.secs)
-
- with Timer("pre-process", False) as t: # NOQA
+ with Timer2("pre-process", False) as t: # NOQA
if self.vocoder_onnx is None:
start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
out_wav = self.mel2wav(gt_spec, pitch, start_frame=start_frame)
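For reference, the argument checks whose formatting is collapsed in __call__ above amount to the following standalone logic. The values here are made up for illustration; in the real code k_step_max comes from self.diff_args.model.k_step_max.

    # Hedged, standalone restatement of the shallow-diffusion checks above
    # (illustrative values, not the project's code).
    k_step_max = 100      # hypothetical value of diff_args.model.k_step_max
    k_step = 20
    gt_spec = "mel"       # stands in for an input mel or the naive model output

    if k_step_max is not None:
        if k_step is None:
            raise ValueError("k_step must not be None when running a shallow diffusion model")
        if k_step > int(k_step_max):
            raise ValueError("k_step must be <= k_step_max of the shallow diffusion model")
        if gt_spec is None:
            raise ValueError("gt_spec must not be None; it can come from the input mel or the naive model output")
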
diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
index c808cf4c..1d4a3c9a 100644
--- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
+++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
@@ -17,7 +17,7 @@ from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.common.VolumeExtractor import VolumeExtractor
from torchaudio.transforms import Resample
-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2
logger = VoiceChangaerLogger.get_instance().getLogger()
@@ -45,7 +45,7 @@ class Pipeline(object):
device,
isHalf,
resamplerIn: Resample,
- resamplerOut: Resample
+ resamplerOut: Resample,
):
self.inferencer = inferencer
inferencer_block_size, inferencer_sampling_rate = inferencer.getConfig()
@@ -64,7 +64,7 @@ class Pipeline(object):
logger.info("GENERATE INFERENCER" + str(self.inferencer))
logger.info("GENERATE EMBEDDER" + str(self.embedder))
logger.info("GENERATE PITCH EXTRACTOR" + str(self.pitchExtractor))
-
+
self.targetSR = targetSR
self.device = device
self.isHalf = False
@@ -103,7 +103,7 @@ class Pipeline(object):
skip_diffusion=True,
):
# print("---------- pipe line --------------------")
- with Timer("pre-process", False) as t:
+ with Timer2("pre-process", False) as t:
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
audio16k = self.resamplerIn(audio_t)
volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
@@ -111,7 +111,7 @@ class Pipeline(object):
n_frames = int(audio16k.size(-1) // self.hop_size + 1)
# print("[Timer::1: ]", t.secs)
- with Timer("pre-process", False) as t:
+ with Timer2("pre-process", False) as t:
# ピッチ検出
try:
# pitch = self.pitchExtractor.extract(
@@ -141,8 +141,7 @@ class Pipeline(object):
feats = feats.view(1, -1)
# print("[Timer::2: ]", t.secs)
- with Timer("pre-process", False) as t:
-
+ with Timer2("pre-process", False) as t:
# embedding
with autocast(enabled=self.isHalf):
try:
@@ -156,28 +155,17 @@ class Pipeline(object):
raise DeviceChangingException()
else:
raise e
- feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+ feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode="nearest").permute(0, 2, 1)
# print("[Timer::3: ]", t.secs)
- with Timer("pre-process", False) as t:
+ with Timer2("pre-process", False) as t:
# 推論実行
try:
with torch.no_grad():
with autocast(enabled=self.isHalf):
audio1 = (
torch.clip(
- self.inferencer.infer(
- audio16k,
- feats,
- pitch.unsqueeze(-1),
- volume,
- mask,
- sid,
- k_step,
- infer_speedup,
- silence_front=silence_front,
- skip_diffusion=skip_diffusion
- ).to(dtype=torch.float32),
+ self.inferencer.infer(audio16k, feats, pitch.unsqueeze(-1), volume, mask, sid, k_step, infer_speedup, silence_front=silence_front, skip_diffusion=skip_diffusion).to(dtype=torch.float32),
-1.0,
1.0,
)
@@ -191,7 +179,7 @@ class Pipeline(object):
raise e
# print("[Timer::4: ]", t.secs)
- with Timer("pre-process", False) as t: # NOQA
+ with Timer2("pre-process", False) as t: # NOQA
feats_buffer = feats.squeeze(0).detach().cpu()
if pitch is not None:
pitch_buffer = pitch.squeeze(0).detach().cpu()
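The Pipeline above resamples the incoming audio to 16 kHz via the injected resamplerIn before volume, pitch, and feature extraction. As context for that constructor change, here is a minimal sketch of how such torchaudio resamplers are built and applied; the 48 kHz input rate is an arbitrary example, not a value taken from this diff.

    import torch
    from torchaudio.transforms import Resample

    # Illustrative only: the pipeline works internally at 16 kHz, so the
    # injected resamplers conceptually look like this.
    resampler_in = Resample(orig_freq=48000, new_freq=16000)   # example input rate
    audio_t = torch.randn(1, 48000)                            # one second of dummy audio
    audio16k = resampler_in(audio_t)                           # shape: (1, 16000)
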
diff --git a/server/voice_changer/Local/ServerDevice.py b/server/voice_changer/Local/ServerDevice.py
index 1f46409c..3e8217d9 100644
--- a/server/voice_changer/Local/ServerDevice.py
+++ b/server/voice_changer/Local/ServerDevice.py
@@ -9,7 +9,7 @@ from mods.log_control import VoiceChangaerLogger
from voice_changer.Local.AudioDeviceList import checkSamplingRate, list_audio_device
import time
import sounddevice as sd
-from voice_changer.utils.Timer import Timer
+from voice_changer.utils.Timer import Timer2
import librosa
from voice_changer.utils.VoiceChangerModel import AudioInOut
@@ -139,7 +139,7 @@ class ServerDevice:
return out_wav, times
def _processDataWithTime(self, indata: np.ndarray):
- with Timer("all_inference_time") as t:
+ with Timer2("all_inference_time", False) as t:
out_wav, times = self._processData(indata)
all_inference_time = t.secs
self.performance = [all_inference_time] + times
diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py
index f14c1617..309b3131 100644
--- a/server/voice_changer/VoiceChangerManager.py
+++ b/server/voice_changer/VoiceChangerManager.py
@@ -364,7 +364,7 @@ class VoiceChangerManager(ServerDeviceCallbacks):
req = json.loads(request)
req = ModelMergerRequest(**req)
req.files = [MergeElement(**f) for f in req.files]
- slot = len(self.modelSlotManager.getAllSlotInfo()) - 1
+        slot = len(self.modelSlotManager.getAllSlotInfo()) - 2  # Beatrice-JVS was added, so the offset changes from -1 to -2
if req.voiceChangerType == "RVC":
merged = RVCModelMerger.merge_models(self.params, req, slot)
loadParam = LoadModelParams(voiceChangerType="RVC", slot=slot, isSampleMode=False, sampleId="", files=[LoadModelParamFile(name=os.path.basename(merged), kind="rvcModel", dir="")], params={})
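The -1 to -2 change above shifts the merge target one slot earlier, presumably because the final slot is now occupied by the bundled Beatrice-JVS model. A worked example with made-up numbers:

    # Illustrative numbers only; the real count comes from getAllSlotInfo().
    total_slots = 500                 # hypothetical number of model slots
    slot_before = total_slots - 1     # old behaviour: merge into the last slot
    slot_after = total_slots - 2      # new behaviour: last slot is reserved for Beatrice-JVS
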
diff --git a/server/voice_changer/utils/Timer.py b/server/voice_changer/utils/Timer.py
index 43dc0b16..81eee9af 100644
--- a/server/voice_changer/utils/Timer.py
+++ b/server/voice_changer/utils/Timer.py
@@ -3,45 +3,45 @@ import inspect
from typing import Dict, List
-class Timer(object):
- storedSecs: Dict[str, Dict[str, List[float]]] = {} # Class variable
+# class Timer(object):
+# storedSecs: Dict[str, Dict[str, List[float]]] = {} # Class variable
- def __init__(self, title: str, enalbe: bool = True):
- self.title = title
- self.enable = enalbe
- self.secs = 0
- self.msecs = 0
- self.avrSecs = 0
+# def __init__(self, title: str, enalbe: bool = True):
+# self.title = title
+# self.enable = enalbe
+# self.secs = 0
+# self.msecs = 0
+# self.avrSecs = 0
- if self.enable is False:
- return
+# if self.enable is False:
+# return
- self.maxStores = 10
+# self.maxStores = 10
- current_frame = inspect.currentframe()
- caller_frame = inspect.getouterframes(current_frame, 2)
- frame = caller_frame[1]
- filename = frame.filename
- line_number = frame.lineno
- self.key = f"{title}_{filename}_{line_number}"
- if self.key not in self.storedSecs:
- self.storedSecs[self.key] = {}
+# current_frame = inspect.currentframe()
+# caller_frame = inspect.getouterframes(current_frame, 2)
+# frame = caller_frame[1]
+# filename = frame.filename
+# line_number = frame.lineno
+# self.key = f"{title}_{filename}_{line_number}"
+# if self.key not in self.storedSecs:
+# self.storedSecs[self.key] = {}
- def __enter__(self):
- if self.enable is False:
- return
- self.start = time.time()
- return self
+# def __enter__(self):
+# if self.enable is False:
+# return
+# self.start = time.time()
+# return self
- def __exit__(self, *_):
- if self.enable is False:
- return
- self.end = time.time()
- self.secs = self.end - self.start
- self.msecs = self.secs * 1000 # millisecs
- self.storedSecs[self.key].append(self.secs)
- self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores :]
- self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
+# def __exit__(self, *_):
+# if self.enable is False:
+# return
+# self.end = time.time()
+# self.secs = self.end - self.start
+# self.msecs = self.secs * 1000 # millisecs
+# self.storedSecs[self.key].append(self.secs)
+# self.storedSecs[self.key] = self.storedSecs[self.key][-self.maxStores :]
+# self.avrSecs = sum(self.storedSecs[self.key]) / len(self.storedSecs[self.key])
class Timer2(object):
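
The old Timer class is commented out above and every caller in this diff now uses Timer2, whose body lies outside the shown hunk. As a standalone illustration of the interface those call sites rely on (a title, an enable flag, and elapsed seconds exposed as .secs), here is a minimal context-manager sketch; it is not the actual Timer2 implementation.

    import time

    class TimerSketch:
        """Minimal stand-in with the interface used at the call sites above."""

        def __init__(self, title: str, enable: bool = True):
            self.title = title
            self.enable = enable
            self.secs = 0.0
            self.msecs = 0.0

        def __enter__(self):
            if self.enable:
                self.start = time.perf_counter()
            return self  # return self even when disabled, so `as t` stays usable

        def __exit__(self, *_):
            if self.enable:
                self.secs = time.perf_counter() - self.start
                self.msecs = self.secs * 1000

    # Usage mirroring the call sites in this diff:
    with TimerSketch("pre-process", False) as t:
        pass  # timed work would go here
    print(t.secs)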