update resample

nadare 2023-05-28 13:54:57 +09:00
parent 04847306af
commit 78ccc10a53
5 changed files with 24 additions and 27 deletions

@@ -1,10 +1,10 @@
import sys
import os
- import resampy
from dataclasses import asdict
from typing import cast
import numpy as np
import torch
+ import torchaudio
from ModelSample import getModelSamples
from voice_changer.RVC.SampleDownloader import downloadModelFiles
@@ -89,6 +89,7 @@ class RVC:
self.switchModel(self.settings.modelSlotIndex)
self.initialLoad = False
break
+ self.prevVol = 0.
def getSampleInfo(self, id: str):
sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:
convertOffset = -1 * convertSize
self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the segment to be converted
+ audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)
# crop just the output segment and check its volume (TODO: make the muting gradual)
cropOffset = -1 * (inputSize + crossfadeSize)
cropEnd = -1 * (crossfadeSize)
- crop = self.audio_buffer[cropOffset:cropEnd]
- rms = np.sqrt(np.square(crop).mean(axis=0))
- vol = max(rms, self.prevVol * 0.0)
+ crop = audio_buffer[cropOffset:cropEnd]
+ vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+ vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol
- return (self.audio_buffer, convertSize, vol)
+ return (audio_buffer, convertSize, vol)
def inference(self, data):
if self.settings.modelSlotIndex < 0:
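
Note: the hunk above keeps the volume check on the inference device; the RMS is computed with torch ops and only the resulting scalar crosses back to the host. A minimal runnable sketch of that pattern, not part of the commit, with a random buffer standing in for the cropped output window:

    import torch

    # stand-in for the cropped output window of the audio buffer
    device = "cuda" if torch.cuda.is_available() else "cpu"
    crop = torch.randn(4096, device=device, dtype=torch.float32)

    # root-mean-square volume; only this scalar leaves the device
    vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
    print(float(vol))
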
@@ -325,11 +327,10 @@ class RVC:
convertSize = data[1]
vol = data[2]
- audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16)
+ audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
repeat = 3 if half else 1
repeat *= self.settings.rvcQuality # 0 or 3
sid = 0
@@ -351,7 +352,7 @@ class RVC:
repeat,
)
- result = audio_out * np.sqrt(vol)
+ result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
return result
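
Note: the headline change of the commit is in the hunks above: resampy.resample on a numpy array is replaced by torchaudio.functional.resample on the tensor, so resampling runs on the same device as the rest of the pipeline, and the silence check can now short-circuit before any resampling happens. A minimal sketch of the new call, with an illustrative sample rate in place of self.settings.modelSamplingRate:

    import torch
    import torchaudio

    orig_sr, target_sr = 40000, 16000
    audio = torch.randn(orig_sr)  # one second of fake audio at 40 kHz

    # rolloff sets the lowpass cutoff as a fraction of Nyquist; 0.99 keeps
    # the passband as wide as possible while still suppressing aliasing
    audio16k = torchaudio.functional.resample(audio, orig_sr, target_sr, rolloff=0.99)
    print(audio16k.shape)  # torch.Size([16000])
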

@@ -89,7 +89,7 @@ class Pipeline(object):
self.t_pad = self.sr * repeat
self.t_pad_tgt = self.targetSR * repeat
- audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+ audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
p_len = audio_pad.shape[0] // self.window
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
@@ -115,7 +115,7 @@ class Pipeline(object):
raise NotEnoughDataExtimateF0()
# adjust tensor type
- feats = torch.from_numpy(audio_pad)
+ feats = audio_pad
if self.isHalf is True:
feats = feats.half()
else:
@@ -180,13 +180,10 @@ class Pipeline(object):
with torch.no_grad():
audio1 = (
(
- self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
- * 32768
+ torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0], -1., 1.) * 32767.5 - .5
)
- .data.cpu()
- .float()
- .numpy()
- .astype(np.int16)
+ .data
+ .to(dtype=torch.int16)
)
except RuntimeError as e:
if "HALF" in e.__str__().upper():

@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
decoder=torchcrepe.decode.weighted_argmax,
device=self.device,
)
- f0 = f0.squeeze().detach().cpu().numpy()
+ f0 = torchcrepe.filter.median(f0, 3)
+ f0 = f0.squeeze()
- f0 = np.pad(
-     f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+ f0 = torch.nn.functional.pad(
+     f0, (start_frame, n_frames - f0.shape[0] - start_frame)
)
f0 *= pow(2, f0_up_key / 12)
- f0bak = f0.copy()
- f0_mel = 1127 * np.log(1 + f0 / 700)
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-     f0_mel_max - f0_mel_min
- ) + 1
- f0_mel[f0_mel <= 1] = 1
- f0_mel[f0_mel > 255] = 255
- f0_coarse = np.rint(f0_mel).astype(np.int)
+ f0bak = f0.detach().cpu().numpy()
+ f0_mel = 1127. * torch.log(1. + f0 / 700.)
+ f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+ f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int)
return f0_coarse, f0bak
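
Note: the CrepePitchExtractor hunk keeps the whole coarse-f0 computation in torch: Hz values are mapped to the mel scale, rescaled into [1, 255], and rounded to integer bins, with a single torch.clip replacing the three masked assignments of the numpy version. A self-contained sketch with example pitch bounds (the real f0_mel_min and f0_mel_max come from the extractor's f0 range; np.int is removed in recent NumPy, so the sketch rounds to int64 instead):

    import numpy as np
    import torch

    f0_min, f0_max = 50.0, 1100.0  # example bounds in Hz
    f0_mel_min = 1127.0 * np.log(1.0 + f0_min / 700.0)
    f0_mel_max = 1127.0 * np.log(1.0 + f0_max / 700.0)

    f0 = torch.tensor([0.0, 110.0, 440.0])  # Hz; 0 marks unvoiced frames

    # Hz -> mel, rescale into [1, 255], then round to integer bins
    f0_mel = 1127.0 * torch.log(1.0 + f0 / 700.0)
    f0_mel = torch.clip((f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0)
    f0_coarse = f0_mel.round().to(dtype=torch.int64).numpy()
    print(f0_coarse)  # unvoiced frames land in bin 1
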

@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class DioPitchExtractor(PitchExtractor):
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+ audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr

@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class HarvestPitchExtractor(PitchExtractor):
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+ audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
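
Note: the last two hunks mirror each other: Dio and Harvest are CPU-side algorithms from the WORLD vocoder (in this project presumably computed via the pyworld package; the import is outside the hunks shown), so now that upstream buffers are torch tensors, each extractor first pulls the audio off the device and back into numpy. A minimal sketch of that boundary with a hypothetical input tensor:

    import torch

    audio = torch.randn(16000)  # hypothetical device-side audio tensor

    # detach from autograd, copy to host memory, and widen to float64,
    # which is what WORLD-based pitch estimators expect
    audio_np = audio.detach().cpu().numpy().astype("float64")
    print(audio_np.dtype, audio_np.shape)
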