From 04847306afa42c9f521f286173f70cfa9018f6fe Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Sun, 28 May 2023 01:13:33 +0900
Subject: [PATCH 1/4] fix infer faiss params

---
 server/voice_changer/RVC/pipeline/Pipeline.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 8c4364e5..14584699 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -146,11 +146,16 @@ class Pipeline(object):
                     # D, I = self.index.search(npy, 1)
                     # npy = self.feature[I.squeeze()]
 
-                    score, ix = self.index.search(npy, k=8)
-                    weight = np.square(1 / score)
-                    weight /= weight.sum(axis=1, keepdims=True)
-
-                    npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+                    # TODO: make k adjustable
+                    k = 1
+                    if k == 1:
+                        _, ix = self.index.search(npy, 1)
+                        npy = self.big_npy[ix.squeeze()]
+                    else:
+                        score, ix = self.index.search(npy, k=8)
+                        weight = np.square(1 / score)
+                        weight /= weight.sum(axis=1, keepdims=True)
+                        npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
 
                     if self.isHalf is True:
                         npy = npy.astype("float16")
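Patch 1 replaces the unconditional 8-neighbour weighted average with a plain nearest-neighbour lookup, keeping the weighted variant behind the hard-coded k. As a standalone sketch of the two retrieval modes (toy data and dimensions, not the project's code):

    # Minimal sketch of both faiss retrieval paths from this patch, on toy data.
    import faiss
    import numpy as np

    rng = np.random.default_rng(0)
    big_npy = rng.standard_normal((1024, 256)).astype(np.float32)  # feature bank
    index = faiss.IndexFlatL2(256)
    index.add(big_npy)

    npy = rng.standard_normal((10, 256)).astype(np.float32)  # query features

    k = 1  # the patch hard-codes k=1 for now (see its TODO)
    if k == 1:
        _, ix = index.search(npy, 1)
        out = big_npy[ix.squeeze()]  # plain nearest-neighbour replacement
    else:
        # IndexFlatL2 returns squared L2 distances; an exact hit yields a zero
        # distance, so 1/score would overflow for duplicated vectors.
        score, ix = index.search(npy, 8)
        weight = np.square(1 / score)  # closer neighbours get larger weights
        weight /= weight.sum(axis=1, keepdims=True)
        out = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

    print(out.shape)  # (10, 256)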
From 78ccc10a5398098df0f7663c8d3a878a07818bdf Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Sun, 28 May 2023 13:54:57 +0900
Subject: [PATCH 2/4] update resample

---
 server/voice_changer/RVC/RVC.py               | 17 +++++++++--------
 server/voice_changer/RVC/pipeline/Pipeline.py | 13 +++++--------
 .../RVC/pitchExtractor/CrepePitchExtractor.py | 19 ++++++++-----------
 .../RVC/pitchExtractor/DioPitchExtractor.py   |  1 +
 .../pitchExtractor/HarvestPitchExtractor.py   |  1 +
 5 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index 4bbe4dad..09003cc5 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -1,10 +1,10 @@
 import sys
 import os
-import resampy
 from dataclasses import asdict
 from typing import cast
 import numpy as np
 import torch
+import torchaudio
 
 from ModelSample import getModelSamples
 from voice_changer.RVC.SampleDownloader import downloadModelFiles
@@ -89,6 +89,7 @@ class RVC:
                     self.switchModel(self.settings.modelSlotIndex)
                     self.initialLoad = False
                     break
+        self.prevVol = 0.
 
     def getSampleInfo(self, id: str):
        sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:
             convertOffset = -1 * convertSize
             self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted
+            audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)
 
             # crop just the output portion and check its volume (TODO: make the muting gradual)
             cropOffset = -1 * (inputSize + crossfadeSize)
             cropEnd = -1 * (crossfadeSize)
-            crop = self.audio_buffer[cropOffset:cropEnd]
-            rms = np.sqrt(np.square(crop).mean(axis=0))
-            vol = max(rms, self.prevVol * 0.0)
+            crop = audio_buffer[cropOffset:cropEnd]
+            vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+            vol = max(vol, self.prevVol * 0.0)
             self.prevVol = vol
 
-        return (self.audio_buffer, convertSize, vol)
+        return (audio_buffer, convertSize, vol)
 
     def inference(self, data):
         if self.settings.modelSlotIndex < 0:
@@ -325,11 +327,10 @@ class RVC:
         convertSize = data[1]
         vol = data[2]
 
-        audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)
 
+        audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
         repeat = 3 if half else 1
         repeat *= self.settings.rvcQuality  # 0 or 3
         sid = 0
@@ -351,7 +352,7 @@ class RVC:
                 repeat,
             )
 
-            result = audio_out * np.sqrt(vol)
+            result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
 
         return result
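The RVC.py hunks above swap resampy for torchaudio's sinc resampler, which operates on tensors directly and so keeps the audio on the GPU; note the call also moved below the silence gate, so silent frames skip the resample entirely. A minimal sketch of the call on a toy signal (the sample rates are illustrative, not the project's settings); patch 2 continues below with the pipeline and pitch-extractor files:

    # Hedged sketch of the resample swap, on a toy signal.
    import torch
    import torchaudio

    model_sr, target_sr = 40000, 16000
    audio = torch.randn(model_sr * 2)  # two seconds of noise at 40 kHz

    # rolloff=0.99 places the anti-aliasing lowpass cutoff at 99% of the
    # Nyquist frequency; lowering it trades bandwidth for less aliasing.
    audio16k = torchaudio.functional.resample(
        audio, model_sr, target_sr, rolloff=0.99
    )
    print(audio16k.shape)  # torch.Size([32000])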
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 14584699..34612c7f 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -89,7 +89,7 @@ class Pipeline(object):
         self.t_pad = self.sr * repeat
         self.t_pad_tgt = self.targetSR * repeat
 
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
         p_len = audio_pad.shape[0] // self.window
 
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
@@ -115,7 +115,7 @@ class Pipeline(object):
             raise NotEnoughDataExtimateF0()
 
         # adjust the tensor dtype
-        feats = torch.from_numpy(audio_pad)
+        feats = audio_pad
         if self.isHalf is True:
             feats = feats.half()
         else:
@@ -180,13 +180,10 @@ class Pipeline(object):
             with torch.no_grad():
                 audio1 = (
                     (
-                        self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
-                        * 32768
+                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0], -1., 1.) * 32767.5 - .5
                     )
-                    .data.cpu()
-                    .float()
-                    .numpy()
-                    .astype(np.int16)
+                    .data
+                    .to(dtype=torch.int16)
                 )
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
index cd13bcca..d1849f02 100644
--- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
             decoder=torchcrepe.decode.weighted_argmax,
             device=self.device,
         )
-        f0 = f0.squeeze().detach().cpu().numpy()
+        f0 = torchcrepe.filter.median(f0, 3)
+        f0 = f0.squeeze()
 
-        f0 = np.pad(
-            f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+        f0 = torch.nn.functional.pad(
+            f0, (start_frame, n_frames - f0.shape[0] - start_frame)
         )
 
         f0 *= pow(2, f0_up_key / 12)
-        f0bak = f0.copy()
-        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-            f0_mel_max - f0_mel_min
-        ) + 1
-        f0_mel[f0_mel <= 1] = 1
-        f0_mel[f0_mel > 255] = 255
-        f0_coarse = np.rint(f0_mel).astype(np.int)
+        f0bak = f0.detach().cpu().numpy()
+        f0_mel = 1127. * torch.log(1. + f0 / 700.)
+        f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+        f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int64)
 
         return f0_coarse, f0bak
diff --git a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
index eafc72be..ac0d61cd 100644
--- a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
 class DioPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
         n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
diff --git a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
index 4043092f..b4c60886 100644
--- a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
 class HarvestPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
        n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
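Patch 2's Pipeline.py change converts the model output to int16 on the device, clipping to [-1, 1] first and scaling by 32767.5 - 0.5 instead of 32768. A self-contained sketch (toy values, not project code) of why the clip matters: casting an out-of-range float to int16 overflows, while the clipped affine map lands exactly on the int16 range.

    # Toy demonstration of the clipped float -> int16 mapping.
    import torch

    x = torch.tensor([-1.2, -1.0, 0.0, 1.0, 1.2])  # pretend model output

    # x * 32767.5 - 0.5 maps -1.0 -> -32768.0 and 1.0 -> 32767.0, the exact
    # int16 endpoints; without the clip, 1.2 * 32768 would overflow int16.
    safe = (torch.clip(x, -1., 1.) * 32767.5 - .5).to(dtype=torch.int16)
    print(safe)  # tensor([-32768, -32768, 0, 32767, 32767], dtype=torch.int16)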
From 2b452ead0ba57cf96a109f787553d284e25c7acc Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Sun, 28 May 2023 16:47:13 +0900
Subject: [PATCH 3/4] mend

---
 server/voice_changer/RVC/pipeline/Pipeline.py                  | 2 +-
 server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 34612c7f..e44d0e6c 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -180,7 +180,7 @@ class Pipeline(object):
             with torch.no_grad():
                 audio1 = (
                     (
-                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0], -1., 1.) * 32767.5 - .5
+                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), -1., 1.) * 32767.5 - .5
                     )
                     .data
                     .to(dtype=torch.int16)
diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
index d1849f02..493ef945 100644
--- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
@@ -27,7 +27,7 @@ class CrepePitchExtractor(PitchExtractor):
         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 
         f0 = torchcrepe.predict(
-            torch.tensor(audio).unsqueeze(0),
+            audio.unsqueeze(0),
             sr,
             hop_length=window,
             fmin=f0_min,
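Patch 3 casts the inferencer output to float32 before the clip-and-scale. A plausible motivation, not stated in the commit message: the scale factor 32767.5 is not representable in float16, so a clipped full-scale sample of 1.0 scales to 32768.0 in half precision and overflows int16. A toy sketch under that assumption:

    # Toy sketch: half-precision scaling can overflow int16 at full scale.
    import torch

    x = torch.tensor([1.0], dtype=torch.float16)  # clipped full-scale sample

    # In float16 the spacing near 32768 is 32, so 32767.5 rounds to 32768.0
    # and a subsequent int16 cast overflows; the float32 path yields 32767.0.
    half_path = torch.clip(x, -1., 1.) * 32767.5 - .5
    float_path = torch.clip(x.to(torch.float32), -1., 1.) * 32767.5 - .5
    print(half_path.item(), float_path.item())  # 32768.0 vs 32767.0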
From 52f0e496ef70c97b7b3a4e240d62c5fb6948200c Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Sun, 28 May 2023 13:54:57 +0900
Subject: [PATCH 4/4] update resample

---
 server/voice_changer/RVC/RVC.py               | 17 ++++++++-------
 server/voice_changer/RVC/pipeline/Pipeline.py | 13 +++++-------
 .../RVC/pitchExtractor/CrepePitchExtractor.py | 21 ++++++++-----------
 .../RVC/pitchExtractor/DioPitchExtractor.py   |  1 +
 .../pitchExtractor/HarvestPitchExtractor.py   |  1 +
 5 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index 4bbe4dad..09003cc5 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -1,10 +1,10 @@
 import sys
 import os
-import resampy
 from dataclasses import asdict
 from typing import cast
 import numpy as np
 import torch
+import torchaudio
 
 from ModelSample import getModelSamples
 from voice_changer.RVC.SampleDownloader import downloadModelFiles
@@ -89,6 +89,7 @@ class RVC:
                     self.switchModel(self.settings.modelSlotIndex)
                     self.initialLoad = False
                     break
+        self.prevVol = 0.
 
     def getSampleInfo(self, id: str):
         sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:
             convertOffset = -1 * convertSize
             self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted
+            audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)
 
             # crop just the output portion and check its volume (TODO: make the muting gradual)
             cropOffset = -1 * (inputSize + crossfadeSize)
             cropEnd = -1 * (crossfadeSize)
-            crop = self.audio_buffer[cropOffset:cropEnd]
-            rms = np.sqrt(np.square(crop).mean(axis=0))
-            vol = max(rms, self.prevVol * 0.0)
+            crop = audio_buffer[cropOffset:cropEnd]
+            vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+            vol = max(vol, self.prevVol * 0.0)
             self.prevVol = vol
 
-        return (self.audio_buffer, convertSize, vol)
+        return (audio_buffer, convertSize, vol)
 
     def inference(self, data):
         if self.settings.modelSlotIndex < 0:
@@ -325,11 +327,10 @@ class RVC:
         convertSize = data[1]
         vol = data[2]
 
-        audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)
 
+        audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
         repeat = 3 if half else 1
         repeat *= self.settings.rvcQuality  # 0 or 3
         sid = 0
@@ -351,7 +352,7 @@ class RVC:
                 repeat,
             )
 
-            result = audio_out * np.sqrt(vol)
+            result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
 
         return result
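Patch 4 re-sends the resample work with the mend folded in; its RVC.py hunk above builds the crop and volume check on torch tensors, so only the final scalar leaves the device. A minimal standalone sketch (toy buffer and threshold, not the project's settings) of the RMS-based silence gate; the remaining files of patch 4 follow below:

    # Toy sketch of RMS volume gating on a torch tensor.
    import torch

    audio_buffer = torch.randn(48000) * 0.001  # quiet toy buffer
    crop = audio_buffer[-4096:]                # pretend output window

    vol = torch.sqrt(torch.square(crop).mean()).item()  # root mean square
    silent_threshold = 0.01                    # illustrative value
    if vol < silent_threshold:
        print("silent: skip conversion and emit zeros")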
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 14584699..e44d0e6c 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -89,7 +89,7 @@ class Pipeline(object):
         self.t_pad = self.sr * repeat
         self.t_pad_tgt = self.targetSR * repeat
 
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
         p_len = audio_pad.shape[0] // self.window
 
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
@@ -115,7 +115,7 @@ class Pipeline(object):
             raise NotEnoughDataExtimateF0()
 
         # adjust the tensor dtype
-        feats = torch.from_numpy(audio_pad)
+        feats = audio_pad
         if self.isHalf is True:
             feats = feats.half()
         else:
@@ -180,13 +180,10 @@ class Pipeline(object):
             with torch.no_grad():
                 audio1 = (
                     (
-                        self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
-                        * 32768
+                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), -1., 1.) * 32767.5 - .5
                     )
-                    .data.cpu()
-                    .float()
-                    .numpy()
-                    .astype(np.int16)
+                    .data
+                    .to(dtype=torch.int16)
                 )
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
index cd13bcca..493ef945 100644
--- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
@@ -27,7 +27,7 @@ class CrepePitchExtractor(PitchExtractor):
         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 
         f0 = torchcrepe.predict(
-            torch.tensor(audio).unsqueeze(0),
+            audio.unsqueeze(0),
             sr,
             hop_length=window,
             fmin=f0_min,
@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
             decoder=torchcrepe.decode.weighted_argmax,
             device=self.device,
         )
-        f0 = f0.squeeze().detach().cpu().numpy()
+        f0 = torchcrepe.filter.median(f0, 3)
+        f0 = f0.squeeze()
 
-        f0 = np.pad(
-            f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+        f0 = torch.nn.functional.pad(
+            f0, (start_frame, n_frames - f0.shape[0] - start_frame)
         )
 
         f0 *= pow(2, f0_up_key / 12)
-        f0bak = f0.copy()
-        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-            f0_mel_max - f0_mel_min
-        ) + 1
-        f0_mel[f0_mel <= 1] = 1
-        f0_mel[f0_mel > 255] = 255
-        f0_coarse = np.rint(f0_mel).astype(np.int)
+        f0bak = f0.detach().cpu().numpy()
+        f0_mel = 1127. * torch.log(1. + f0 / 700.)
+        f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+        f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int64)
 
         return f0_coarse, f0bak
diff --git a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
index eafc72be..ac0d61cd 100644
--- a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
 class DioPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
         n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
diff --git a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
index 4043092f..b4c60886 100644
--- a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
 class HarvestPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
         n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
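The crepe extractor now keeps f0 as a tensor, median-filters it with torchcrepe.filter.median, and performs the mel-scale coarse quantization in torch. A self-contained sketch (toy pitch values and illustrative bounds, not the project's code) of that quantization: Hz maps to mel via 1127 * ln(1 + f / 700), and [f0_mel_min, f0_mel_max] is rescaled linearly onto integer bins 1..255, with anything at or below f0_min collapsing into bin 1.

    # Toy sketch of the mel-scale coarse pitch quantization.
    import numpy as np
    import torch

    f0_min, f0_max = 50.0, 1100.0  # illustrative bounds
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = torch.tensor([0.0, 50.0, 220.0, 440.0, 1100.0])  # Hz; 0 = unvoiced

    f0_mel = 1127. * torch.log(1. + f0 / 700.)
    f0_mel = torch.clip(
        (f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.
    )
    f0_coarse = f0_mel.round().to(torch.int64)
    print(f0_coarse)  # unvoiced and f0_min both land in bin 1; f0_max -> 255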