From 04847306afa42c9f521f286173f70cfa9018f6fe Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Sun, 28 May 2023 01:13:33 +0900
Subject: [PATCH 1/4] fix infer faiss params

---
 server/voice_changer/RVC/pipeline/Pipeline.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 8c4364e5..14584699 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -146,11 +146,16 @@ class Pipeline(object):
                     # D, I = self.index.search(npy, 1)
                     # npy = self.feature[I.squeeze()]
 
-                    score, ix = self.index.search(npy, k=8)
-                    weight = np.square(1 / score)
-                    weight /= weight.sum(axis=1, keepdims=True)
-
-                    npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+                    # TODO: make k adjustable
+                    k = 1
+                    if k == 1:
+                        _, ix = self.index.search(npy, 1)
+                        npy = self.big_npy[ix.squeeze()]
+                    else:
+                        score, ix = self.index.search(npy, k=8)
+                        weight = np.square(1 / score)
+                        weight /= weight.sum(axis=1, keepdims=True)
+                        npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
 
                     if self.isHalf is True:
                         npy = npy.astype("float16")
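Patch 1 replaces the unconditional 8-neighbour weighted average with a plain nearest-neighbour lookup, keeping the weighted variant behind the hard-coded k. As a standalone sketch of the two retrieval modes (toy data and dimensions, not the project's code):

    # Minimal sketch of both faiss retrieval paths from this patch, on toy data.
    import faiss
    import numpy as np

    rng = np.random.default_rng(0)
    big_npy = rng.standard_normal((1024, 256)).astype(np.float32)  # feature bank
    index = faiss.IndexFlatL2(256)
    index.add(big_npy)

    npy = rng.standard_normal((10, 256)).astype(np.float32)  # query features

    k = 1  # the patch hard-codes k=1 for now (see its TODO)
    if k == 1:
        _, ix = index.search(npy, 1)
        out = big_npy[ix.squeeze()]  # plain nearest-neighbour replacement
    else:
        # IndexFlatL2 returns squared L2 distances; an exact hit yields a zero
        # distance, so 1/score would overflow for duplicated vectors.
        score, ix = index.search(npy, 8)
        weight = np.square(1 / score)  # closer neighbours get larger weights
        weight /= weight.sum(axis=1, keepdims=True)
        out = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

    print(out.shape)  # (10, 256)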
From 78ccc10a5398098df0f7663c8d3a878a07818bdf Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Sun, 28 May 2023 13:54:57 +0900
Subject: [PATCH 2/4] update resample

---
 server/voice_changer/RVC/RVC.py               | 17 +++++++++--------
 server/voice_changer/RVC/pipeline/Pipeline.py | 13 +++++--------
 .../RVC/pitchExtractor/CrepePitchExtractor.py | 19 ++++++++-----------
 .../RVC/pitchExtractor/DioPitchExtractor.py   |  1 +
 .../pitchExtractor/HarvestPitchExtractor.py   |  1 +
 5 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index 4bbe4dad..09003cc5 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -1,10 +1,10 @@
 import sys
 import os
-import resampy
 from dataclasses import asdict
 from typing import cast
 import numpy as np
 import torch
+import torchaudio
 
 from ModelSample import getModelSamples
 from voice_changer.RVC.SampleDownloader import downloadModelFiles
@@ -89,6 +89,7 @@ class RVC:
                     self.switchModel(self.settings.modelSlotIndex)
                     self.initialLoad = False
                     break
+        self.prevVol = 0.
 
     def getSampleInfo(self, id: str):
        sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:
             convertOffset = -1 * convertSize
             self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted
+            audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)
 
             # crop just the output portion and check its volume (TODO: make the muting gradual)
             cropOffset = -1 * (inputSize + crossfadeSize)
             cropEnd = -1 * (crossfadeSize)
-            crop = self.audio_buffer[cropOffset:cropEnd]
-            rms = np.sqrt(np.square(crop).mean(axis=0))
-            vol = max(rms, self.prevVol * 0.0)
+            crop = audio_buffer[cropOffset:cropEnd]
+            vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+            vol = max(vol, self.prevVol * 0.0)
             self.prevVol = vol
 
-        return (self.audio_buffer, convertSize, vol)
+        return (audio_buffer, convertSize, vol)
 
     def inference(self, data):
         if self.settings.modelSlotIndex < 0:
@@ -325,11 +327,10 @@ class RVC:
         convertSize = data[1]
         vol = data[2]
 
-        audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)
 
+        audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
         repeat = 3 if half else 1
         repeat *= self.settings.rvcQuality  # 0 or 3
         sid = 0
@@ -351,7 +352,7 @@ class RVC:
                 repeat,
             )
 
-            result = audio_out * np.sqrt(vol)
+            result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
 
         return result
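The RVC.py hunks above swap resampy for torchaudio's sinc resampler, which operates on tensors directly and so keeps the audio on the GPU; note the call also moved below the silence gate, so silent frames skip the resample entirely. A minimal sketch of the call on a toy signal (the sample rates are illustrative, not the project's settings); patch 2 continues below with the pipeline and pitch-extractor files:

    # Hedged sketch of the resample swap, on a toy signal.
    import torch
    import torchaudio

    model_sr, target_sr = 40000, 16000
    audio = torch.randn(model_sr * 2)  # two seconds of noise at 40 kHz

    # rolloff=0.99 places the anti-aliasing lowpass cutoff at 99% of the
    # Nyquist frequency; lowering it trades bandwidth for less aliasing.
    audio16k = torchaudio.functional.resample(
        audio, model_sr, target_sr, rolloff=0.99
    )
    print(audio16k.shape)  # torch.Size([32000])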
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 14584699..34612c7f 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -89,7 +89,7 @@ class Pipeline(object):
         self.t_pad = self.sr * repeat
         self.t_pad_tgt = self.targetSR * repeat
 
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
         p_len = audio_pad.shape[0] // self.window
 
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
@@ -115,7 +115,7 @@ class Pipeline(object):
             raise NotEnoughDataExtimateF0()
 
         # adjust the tensor dtype
-        feats = torch.from_numpy(audio_pad)
+        feats = audio_pad
         if self.isHalf is True:
             feats = feats.half()
         else:
@@ -180,13 +180,10 @@ class Pipeline(object):
             with torch.no_grad():
                 audio1 = (
                     (
-                        self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
-                        * 32768
+                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0], -1., 1.) * 32767.5 - .5
                     )
-                    .data.cpu()
-                    .float()
-                    .numpy()
-                    .astype(np.int16)
+                    .data
+                    .to(dtype=torch.int16)
                 )
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
index cd13bcca..d1849f02 100644
--- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
             decoder=torchcrepe.decode.weighted_argmax,
             device=self.device,
         )
-        f0 = f0.squeeze().detach().cpu().numpy()
+        f0 = torchcrepe.filter.median(f0, 3)
+        f0 = f0.squeeze()
 
-        f0 = np.pad(
-            f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+        f0 = torch.nn.functional.pad(
+            f0, (start_frame, n_frames - f0.shape[0] - start_frame)
         )
 
         f0 *= pow(2, f0_up_key / 12)
-        f0bak = f0.copy()
-        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-            f0_mel_max - f0_mel_min
-        ) + 1
-        f0_mel[f0_mel <= 1] = 1
-        f0_mel[f0_mel > 255] = 255
-        f0_coarse = np.rint(f0_mel).astype(np.int)
+        f0bak = f0.detach().cpu().numpy()
+        f0_mel = 1127. * torch.log(1. + f0 / 700.)
+        f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+        f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int64)
 
         return f0_coarse, f0bak
diff --git a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
index eafc72be..ac0d61cd 100644
--- a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
 class DioPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
         n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
diff --git a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
index 4043092f..b4c60886 100644
--- a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
 class HarvestPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
        n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
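Patch 2's Pipeline.py change converts the model output to int16 on the device, clipping to [-1, 1] first and scaling by 32767.5 - 0.5 instead of 32768. A self-contained sketch (toy values, not project code) of why the clip matters: casting an out-of-range float to int16 overflows, while the clipped affine map lands exactly on the int16 range.

    # Toy demonstration of the clipped float -> int16 mapping.
    import torch

    x = torch.tensor([-1.2, -1.0, 0.0, 1.0, 1.2])  # pretend model output

    # x * 32767.5 - 0.5 maps -1.0 -> -32768.0 and 1.0 -> 32767.0, the exact
    # int16 endpoints; without the clip, 1.2 * 32768 would overflow int16.
    safe = (torch.clip(x, -1., 1.) * 32767.5 - .5).to(dtype=torch.int16)
    print(safe)  # tensor([-32768, -32768, 0, 32767, 32767], dtype=torch.int16)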
From 2b452ead0ba57cf96a109f787553d284e25c7acc Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Sun, 28 May 2023 16:47:13 +0900
Subject: [PATCH 3/4] mend

---
 server/voice_changer/RVC/pipeline/Pipeline.py                  | 2 +-
 server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 34612c7f..e44d0e6c 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -180,7 +180,7 @@ class Pipeline(object):
             with torch.no_grad():
                 audio1 = (
                     (
-                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0], -1., 1.) * 32767.5 - .5
+                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), -1., 1.) * 32767.5 - .5
                     )
                     .data
                     .to(dtype=torch.int16)
diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
index d1849f02..493ef945 100644
--- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
@@ -27,7 +27,7 @@ class CrepePitchExtractor(PitchExtractor):
         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 
         f0 = torchcrepe.predict(
-            torch.tensor(audio).unsqueeze(0),
+            audio.unsqueeze(0),
             sr,
             hop_length=window,
             fmin=f0_min,
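Patch 3 casts the inferencer output to float32 before the clip-and-scale. A plausible motivation, not stated in the commit message: the scale factor 32767.5 is not representable in float16, so a clipped full-scale sample of 1.0 scales to 32768.0 in half precision and overflows int16. A toy sketch under that assumption:

    # Toy sketch: half-precision scaling can overflow int16 at full scale.
    import torch

    x = torch.tensor([1.0], dtype=torch.float16)  # clipped full-scale sample

    # In float16 the spacing near 32768 is 32, so 32767.5 rounds to 32768.0
    # and a subsequent int16 cast overflows; the float32 path yields 32767.0.
    half_path = torch.clip(x, -1., 1.) * 32767.5 - .5
    float_path = torch.clip(x.to(torch.float32), -1., 1.) * 32767.5 - .5
    print(half_path.item(), float_path.item())  # 32768.0 vs 32767.0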
From 52f0e496ef70c97b7b3a4e240d62c5fb6948200c Mon Sep 17 00:00:00 2001
From: nadare <1na2da0re3@gmail.com>
Date: Sun, 28 May 2023 13:54:57 +0900
Subject: [PATCH 4/4] update resample

---
 server/voice_changer/RVC/RVC.py               | 17 ++++++++-------
 server/voice_changer/RVC/pipeline/Pipeline.py | 13 +++++-------
 .../RVC/pitchExtractor/CrepePitchExtractor.py | 21 ++++++++-----------
 .../RVC/pitchExtractor/DioPitchExtractor.py   |  1 +
 .../pitchExtractor/HarvestPitchExtractor.py   |  1 +
 5 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index 4bbe4dad..09003cc5 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -1,10 +1,10 @@
 import sys
 import os
-import resampy
 from dataclasses import asdict
 from typing import cast
 import numpy as np
 import torch
+import torchaudio
 
 from ModelSample import getModelSamples
 from voice_changer.RVC.SampleDownloader import downloadModelFiles
@@ -89,6 +89,7 @@ class RVC:
                     self.switchModel(self.settings.modelSlotIndex)
                     self.initialLoad = False
                     break
+        self.prevVol = 0.
 
     def getSampleInfo(self, id: str):
         sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:
             convertOffset = -1 * convertSize
             self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted
+            audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)
 
             # crop just the output portion and check its volume (TODO: make the muting gradual)
             cropOffset = -1 * (inputSize + crossfadeSize)
             cropEnd = -1 * (crossfadeSize)
-            crop = self.audio_buffer[cropOffset:cropEnd]
-            rms = np.sqrt(np.square(crop).mean(axis=0))
-            vol = max(rms, self.prevVol * 0.0)
+            crop = audio_buffer[cropOffset:cropEnd]
+            vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+            vol = max(vol, self.prevVol * 0.0)
             self.prevVol = vol
 
-        return (self.audio_buffer, convertSize, vol)
+        return (audio_buffer, convertSize, vol)
 
     def inference(self, data):
         if self.settings.modelSlotIndex < 0:
@@ -325,11 +327,10 @@ class RVC:
         convertSize = data[1]
         vol = data[2]
 
-        audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)
 
+        audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
         repeat = 3 if half else 1
         repeat *= self.settings.rvcQuality  # 0 or 3
         sid = 0
@@ -351,7 +352,7 @@ class RVC:
                 repeat,
             )
 
-            result = audio_out * np.sqrt(vol)
+            result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
 
         return result
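Patch 4 re-sends the resample work with the mend folded in; its RVC.py hunk above builds the crop and volume check on torch tensors, so only the final scalar leaves the device. A minimal standalone sketch (toy buffer and threshold, not the project's settings) of the RMS-based silence gate; the remaining files of patch 4 follow below:

    # Toy sketch of RMS volume gating on a torch tensor.
    import torch

    audio_buffer = torch.randn(48000) * 0.001  # quiet toy buffer
    crop = audio_buffer[-4096:]                # pretend output window

    vol = torch.sqrt(torch.square(crop).mean()).item()  # root mean square
    silent_threshold = 0.01                    # illustrative value
    if vol < silent_threshold:
        print("silent: skip conversion and emit zeros")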
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 14584699..e44d0e6c 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -89,7 +89,7 @@ class Pipeline(object):
         self.t_pad = self.sr * repeat
         self.t_pad_tgt = self.targetSR * repeat
 
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
         p_len = audio_pad.shape[0] // self.window
 
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
@@ -115,7 +115,7 @@ class Pipeline(object):
             raise NotEnoughDataExtimateF0()
 
         # adjust the tensor dtype
-        feats = torch.from_numpy(audio_pad)
+        feats = audio_pad
         if self.isHalf is True:
             feats = feats.half()
         else:
@@ -180,13 +180,10 @@ class Pipeline(object):
             with torch.no_grad():
                 audio1 = (
                     (
-                        self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
-                        * 32768
+                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), -1., 1.) * 32767.5 - .5
                     )
-                    .data.cpu()
-                    .float()
-                    .numpy()
-                    .astype(np.int16)
+                    .data
+                    .to(dtype=torch.int16)
                 )
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
index cd13bcca..493ef945 100644
--- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py
@@ -27,7 +27,7 @@ class CrepePitchExtractor(PitchExtractor):
         f0_mel_max = 1127 * np.log(1 + f0_max / 700)
 
         f0 = torchcrepe.predict(
-            torch.tensor(audio).unsqueeze(0),
+            audio.unsqueeze(0),
             sr,
             hop_length=window,
             fmin=f0_min,
@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
             decoder=torchcrepe.decode.weighted_argmax,
             device=self.device,
         )
-        f0 = f0.squeeze().detach().cpu().numpy()
+        f0 = torchcrepe.filter.median(f0, 3)
+        f0 = f0.squeeze()
 
-        f0 = np.pad(
-            f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+        f0 = torch.nn.functional.pad(
+            f0, (start_frame, n_frames - f0.shape[0] - start_frame)
         )
 
         f0 *= pow(2, f0_up_key / 12)
-        f0bak = f0.copy()
-        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-            f0_mel_max - f0_mel_min
-        ) + 1
-        f0_mel[f0_mel <= 1] = 1
-        f0_mel[f0_mel > 255] = 255
-        f0_coarse = np.rint(f0_mel).astype(np.int)
+        f0bak = f0.detach().cpu().numpy()
+        f0_mel = 1127. * torch.log(1. + f0 / 700.)
+        f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+        f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int64)
 
         return f0_coarse, f0bak
diff --git a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
index eafc72be..ac0d61cd 100644
--- a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py
@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
 class DioPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
         n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
diff --git a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
index 4043092f..b4c60886 100644
--- a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py
@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
 
 class HarvestPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
         n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
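The crepe extractor now keeps f0 as a tensor, median-filters it with torchcrepe.filter.median, and performs the mel-scale coarse quantization in torch. A self-contained sketch (toy pitch values and illustrative bounds, not the project's code) of that quantization: Hz maps to mel via 1127 * ln(1 + f / 700), and [f0_mel_min, f0_mel_max] is rescaled linearly onto integer bins 1..255, with anything at or below f0_min collapsing into bin 1.

    # Toy sketch of the mel-scale coarse pitch quantization.
    import numpy as np
    import torch

    f0_min, f0_max = 50.0, 1100.0  # illustrative bounds
    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

    f0 = torch.tensor([0.0, 50.0, 220.0, 440.0, 1100.0])  # Hz; 0 = unvoiced

    f0_mel = 1127. * torch.log(1. + f0 / 700.)
    f0_mel = torch.clip(
        (f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.
    )
    f0_coarse = f0_mel.round().to(torch.int64)
    print(f0_coarse)  # unvoiced and f0_min both land in bin 1; f0_max -> 255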