From 0d06fcd16bef074376dbe019749f1a6b609d9ab2 Mon Sep 17 00:00:00 2001
From: wataru
Date: Sat, 8 Apr 2023 05:34:26 +0900
Subject: [PATCH] customize pipeline

---
 .../demo/public/assets/gui_settings/RVC.json |  4 ----
 server/voice_changer/RVC/RVC.py              |  9 +++------
 .../RVC/custom_vc_infer_pipeline.py          | 23 ++++-------------------
 3 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/client/demo/public/assets/gui_settings/RVC.json b/client/demo/public/assets/gui_settings/RVC.json
index 5a0e9f4f..9fe5690f 100644
--- a/client/demo/public/assets/gui_settings/RVC.json
+++ b/client/demo/public/assets/gui_settings/RVC.json
@@ -105,10 +105,6 @@
             "name": "indexRatio",
             "options": {}
         },
-        {
-            "name": "noiseScale",
-            "options": {}
-        },
         {
             "name": "silentThreshold",
             "options": {}
diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index cd299152..ea44a8c4 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -180,14 +180,12 @@ class RVC:
             convertSize = convertSize + (128 - (convertSize % 128))
 
         self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the segment to be converted
-        print("convert size", convertSize, self.audio_buffer.shape)
 
         crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]
         rms = np.sqrt(np.square(crop).mean(axis=0))
         vol = max(rms, self.prevVol * 0.0)
         self.prevVol = vol
 
-        print("audio len 01,", len(self.audio_buffer))
         return (self.audio_buffer, convertSize, vol)
 
     def _onnx_inference(self, data):
@@ -212,7 +210,9 @@ class RVC:
             return np.zeros(convertSize).astype(np.int16)
 
         with torch.no_grad():
-            vc = VC(self.settings.modelSamplingRate, dev, self.is_half)
+            repeat = 3 if self.is_half else 1
+            repeat *= self.settings.rvcQuality  # 0 or 3
+            vc = VC(self.settings.modelSamplingRate, dev, self.is_half, repeat)
             sid = 0
             times = [0, 0, 0]
             f0_up_key = self.settings.tran
@@ -245,9 +245,7 @@ class RVC:
         audio = data[0]
         convertSize = data[1]
         vol = data[2]
-        print("audio len 02,", len(audio))
         audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-        print("audio len 03,", len(audio))
 
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)
@@ -266,7 +264,6 @@ class RVC:
         if_f0 = 1
         f0_file = None
 
-        print("audio len 0,", len(audio))
         audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
 
         result = audio_out * np.sqrt(vol)
diff --git a/server/voice_changer/RVC/custom_vc_infer_pipeline.py b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
index c39b5706..6aee1fd8 100644
--- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py
+++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
@@ -76,7 +76,6 @@ class VC(object):
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-        print("padding_mask", padding_mask)
 
         inputs = {
             "source": feats.to(self.device),
@@ -98,9 +97,8 @@ class VC(object):
             npy = npy.astype("float16")
         feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
 
-        print("feats shape1", feats.shape)
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-        print("feats shape2", feats.shape)
+
         t1 = ttime()
         p_len = audio0.shape[0] // self.window
         if (feats.shape[1] < p_len):
@@ -109,23 +107,18 @@ class VC(object):
                 pitch = pitch[:, :p_len]
                 pitchf = pitchf[:, :p_len]
         p_len = torch.tensor([p_len], device=self.device).long()
+
         with torch.no_grad():
- print("vc audio len feat 1,", feats.shape) - if (pitch != None and pitchf != None): - print("vc audio len feat use pitch!!!!!!!,", feats.shape) - audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16) - else: - audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16) + audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16) + del feats, p_len, padding_mask torch.cuda.empty_cache() t2 = ttime() times[0] += (t1 - t0) times[2] += (t2 - t1) - print("vc audio return", len(audio1), audio1) return audio1 def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None): - print("audio len 1,", len(audio)) if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0): try: index = faiss.read_index(file_index) @@ -135,13 +128,7 @@ class VC(object): index = big_npy = None else: index = big_npy = None - audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect') - print("audio_pad len 1,", len(audio_pad)) - opt_ts = [] - print("audio_pad len 2,", len(audio_pad), opt_ts) - - s = 0 audio_opt = [] t = None t1 = ttime() @@ -153,7 +140,6 @@ class VC(object): pitch, pitchf = None, None if (if_f0 == 1): pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0) - print("pitch!", pitch) pitch = pitch[:p_len] pitchf = pitchf[:p_len] pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() @@ -170,5 +156,4 @@ class VC(object): audio_opt = np.concatenate(audio_opt) del pitch, pitchf, sid torch.cuda.empty_cache() - print("result", audio_opt) return audio_opt