From 0d06fcd16bef074376dbe019749f1a6b609d9ab2 Mon Sep 17 00:00:00 2001
From: wataru
Date: Sat, 8 Apr 2023 05:34:26 +0900
Subject: [PATCH] customize pipeline

---
 .../demo/public/assets/gui_settings/RVC.json |  4 ----
 server/voice_changer/RVC/RVC.py              |  9 +++------
 .../RVC/custom_vc_infer_pipeline.py          | 23 ++++-------------------
 3 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/client/demo/public/assets/gui_settings/RVC.json b/client/demo/public/assets/gui_settings/RVC.json
index 5a0e9f4f..9fe5690f 100644
--- a/client/demo/public/assets/gui_settings/RVC.json
+++ b/client/demo/public/assets/gui_settings/RVC.json
@@ -105,10 +105,6 @@
             "name": "indexRatio",
             "options": {}
         },
-        {
-            "name": "noiseScale",
-            "options": {}
-        },
         {
             "name": "silentThreshold",
             "options": {}
diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index cd299152..ea44a8c4 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -180,14 +180,12 @@ class RVC:
             convertSize = convertSize + (128 - (convertSize % 128))
 
         self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the segment to be converted
-        print("convert size", convertSize, self.audio_buffer.shape)
 
         crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]
         rms = np.sqrt(np.square(crop).mean(axis=0))
         vol = max(rms, self.prevVol * 0.0)
         self.prevVol = vol
 
-        print("audio len 01,", len(self.audio_buffer))
         return (self.audio_buffer, convertSize, vol)
 
     def _onnx_inference(self, data):
@@ -212,7 +210,9 @@ class RVC:
             return np.zeros(convertSize).astype(np.int16)
 
         with torch.no_grad():
-            vc = VC(self.settings.modelSamplingRate, dev, self.is_half)
+            repeat = 3 if self.is_half else 1
+            repeat *= self.settings.rvcQuality  # 0 or 3
+            vc = VC(self.settings.modelSamplingRate, dev, self.is_half, repeat)
             sid = 0
             times = [0, 0, 0]
             f0_up_key = self.settings.tran
@@ -245,9 +245,7 @@ class RVC:
         audio = data[0]
         convertSize = data[1]
         vol = data[2]
-        print("audio len 02,", len(audio))
         audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-        print("audio len 03,", len(audio))
 
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)
@@ -266,7 +264,6 @@ class RVC:
         if_f0 = 1
         f0_file = None
 
-        print("audio len 0,", len(audio))
         audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
 
         result = audio_out * np.sqrt(vol)
diff --git a/server/voice_changer/RVC/custom_vc_infer_pipeline.py b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
index c39b5706..6aee1fd8 100644
--- a/server/voice_changer/RVC/custom_vc_infer_pipeline.py
+++ b/server/voice_changer/RVC/custom_vc_infer_pipeline.py
@@ -76,7 +76,6 @@ class VC(object):
         assert feats.dim() == 1, feats.dim()
         feats = feats.view(1, -1)
         padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
-        print("padding_mask", padding_mask)
 
         inputs = {
             "source": feats.to(self.device),
@@ -98,9 +97,8 @@ class VC(object):
             npy = npy.astype("float16")
         feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
 
-        print("feats shape1", feats.shape)
         feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
-        print("feats shape2", feats.shape)
+
         t1 = ttime()
         p_len = audio0.shape[0] // self.window
         if (feats.shape[1] < p_len):
@@ -109,23 +107,18 @@ class VC(object):
                 pitch = pitch[:, :p_len]
                 pitchf = pitchf[:, :p_len]
         p_len = torch.tensor([p_len], device=self.device).long()
+
         with torch.no_grad():
- print("vc audio len feat 1,", feats.shape) - if (pitch != None and pitchf != None): - print("vc audio len feat use pitch!!!!!!!,", feats.shape) - audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16) - else: - audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16) + audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16) + del feats, p_len, padding_mask torch.cuda.empty_cache() t2 = ttime() times[0] += (t1 - t0) times[2] += (t2 - t1) - print("vc audio return", len(audio1), audio1) return audio1 def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None): - print("audio len 1,", len(audio)) if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0): try: index = faiss.read_index(file_index) @@ -135,13 +128,7 @@ class VC(object): index = big_npy = None else: index = big_npy = None - audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect') - print("audio_pad len 1,", len(audio_pad)) - opt_ts = [] - print("audio_pad len 2,", len(audio_pad), opt_ts) - - s = 0 audio_opt = [] t = None t1 = ttime() @@ -153,7 +140,6 @@ class VC(object): pitch, pitchf = None, None if (if_f0 == 1): pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0) - print("pitch!", pitch) pitch = pitch[:p_len] pitchf = pitchf[:p_len] pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() @@ -170,5 +156,4 @@ class VC(object): audio_opt = np.concatenate(audio_opt) del pitch, pitchf, sid torch.cuda.empty_cache() - print("result", audio_opt) return audio_opt