update resample

nadare 2023-05-28 13:54:57 +09:00
parent 04847306af
commit 78ccc10a53
5 changed files with 24 additions and 27 deletions

View File

@@ -1,10 +1,10 @@
 import sys
 import os
-import resampy
 from dataclasses import asdict
 from typing import cast
 import numpy as np
 import torch
+import torchaudio
 from ModelSample import getModelSamples
 from voice_changer.RVC.SampleDownloader import downloadModelFiles
@@ -89,6 +89,7 @@ class RVC:
 self.switchModel(self.settings.modelSlotIndex)
 self.initialLoad = False
 break
+self.prevVol = 0.

 def getSampleInfo(self, id: str):
 sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:
 convertOffset = -1 * convertSize
 self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted
+audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)

 # Crop out only the output portion and check its volume. (TODO: make the silencing gradual)
 cropOffset = -1 * (inputSize + crossfadeSize)
 cropEnd = -1 * (crossfadeSize)
-crop = self.audio_buffer[cropOffset:cropEnd]
-rms = np.sqrt(np.square(crop).mean(axis=0))
-vol = max(rms, self.prevVol * 0.0)
+crop = audio_buffer[cropOffset:cropEnd]
+vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+vol = max(vol, self.prevVol * 0.0)
 self.prevVol = vol
-return (self.audio_buffer, convertSize, vol)
+return (audio_buffer, convertSize, vol)

 def inference(self, data):
 if self.settings.modelSlotIndex < 0:
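
The hunk above moves the conversion buffer onto the inference device and measures the output volume with torch instead of numpy. A minimal standalone sketch of that volume check, assuming a mono float32 buffer; the function name and arguments are illustrative, not from the repository:

    import numpy as np
    import torch

    def extract_volume(audio_np: np.ndarray, input_size: int, crossfade_size: int, device="cpu"):
        # Move the mono float32 buffer onto the inference device once so the
        # later stages (resample, feature extraction, inference) can reuse it.
        audio_buffer = torch.from_numpy(audio_np).to(device=device, dtype=torch.float32)
        # Crop only the output region and measure its RMS volume on-device
        # (assumes crossfade_size > 0, as in the hunk above).
        crop = audio_buffer[-(input_size + crossfade_size):-crossfade_size]
        vol = torch.sqrt(torch.square(crop).mean()).item()  # scalar back on the host
        return audio_buffer, vol

    buf, vol = extract_volume(np.random.randn(48000).astype(np.float32), 4096, 1024)
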
@@ -325,11 +327,10 @@ class RVC:
 convertSize = data[1]
 vol = data[2]
-audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)

 if vol < self.settings.silentThreshold:
 return np.zeros(convertSize).astype(np.int16)
+audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)

 repeat = 3 if half else 1
 repeat *= self.settings.rvcQuality  # 0 or 3
 sid = 0
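
Above, the resampy call is replaced with torchaudio.functional.resample and moved below the silence check, so silent chunks skip the resample entirely. A hedged sketch of the call on a dummy signal; the 48000 Hz rate is only an example value for modelSamplingRate:

    import torch
    import torchaudio

    model_sr = 48000                       # example value for modelSamplingRate
    audio = torch.randn(model_sr)          # one second of dummy mono audio

    # Windowed-sinc resampling down to the 16 kHz rate the feature extractor expects;
    # rolloff=0.99 keeps the low-pass cutoff close to the new Nyquist frequency.
    audio16k = torchaudio.functional.resample(audio, model_sr, 16000, rolloff=0.99)
    print(audio16k.shape)                  # torch.Size([16000])
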
@@ -351,7 +352,7 @@ class RVC:
 repeat,
 )
-result = audio_out * np.sqrt(vol)
+result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
 return result

View File

@@ -89,7 +89,7 @@ class Pipeline(object):
 self.t_pad = self.sr * repeat
 self.t_pad_tgt = self.targetSR * repeat
-audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
 p_len = audio_pad.shape[0] // self.window
 sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
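
The reflect padding now runs on the torch tensor; F.pad's "reflect" mode does not accept an unbatched 1-D waveform in the PyTorch versions this targets, hence the unsqueeze/squeeze pair. A small sketch with illustrative sizes:

    import torch
    import torch.nn.functional as F

    t_pad = 8000                 # example value for self.sr * repeat
    audio = torch.randn(48000)   # mono waveform already on the device

    # "reflect" padding rejects a bare 1-D tensor, so add a temporary leading
    # dimension, pad both ends, and drop the dimension again.
    audio_pad = F.pad(audio.unsqueeze(0), (t_pad, t_pad), mode="reflect").squeeze(0)
    print(audio_pad.shape)       # torch.Size([64000])
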
@@ -115,7 +115,7 @@ class Pipeline(object):
 raise NotEnoughDataExtimateF0()

 # adjust tensor type
-feats = torch.from_numpy(audio_pad)
+feats = audio_pad
 if self.isHalf is True:
 feats = feats.half()
 else:
@@ -180,13 +180,10 @@ class Pipeline(object):
 with torch.no_grad():
 audio1 = (
 (
-self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
-* 32768
+torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0], -1., 1.) * 32767.5 - .5
 )
-.data.cpu()
-.float()
-.numpy()
-.astype(np.int16)
+.data
+.to(dtype=torch.int16)
 )
 except RuntimeError as e:
 if "HALF" in e.__str__().upper():

View File

@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
 decoder=torchcrepe.decode.weighted_argmax,
 device=self.device,
 )
-f0 = f0.squeeze().detach().cpu().numpy()
+f0 = torchcrepe.filter.median(f0, 3)
+f0 = f0.squeeze()

-f0 = np.pad(
-    f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+f0 = torch.nn.functional.pad(
+    f0, (start_frame, n_frames - f0.shape[0] - start_frame)
 )
 f0 *= pow(2, f0_up_key / 12)
-f0bak = f0.copy()
-f0_mel = 1127 * np.log(1 + f0 / 700)
-f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-    f0_mel_max - f0_mel_min
-) + 1
-f0_mel[f0_mel <= 1] = 1
-f0_mel[f0_mel > 255] = 255
-f0_coarse = np.rint(f0_mel).astype(np.int)
+f0bak = f0.detach().cpu().numpy()
+f0_mel = 1127. * torch.log(1. + f0 / 700.)
+f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int)

 return f0_coarse, f0bak
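
The coarse-pitch computation above keeps f0 on the device: Hz values are mapped to the mel scale, rescaled into bins 1..255 with a single torch.clip, and rounded. A sketch under the assumption that f0_mel_min and f0_mel_max come from the usual RVC pitch limits of 50 Hz and 1100 Hz:

    import numpy as np
    import torch

    f0_min, f0_max = 50.0, 1100.0                        # assumed RVC pitch limits
    f0_mel_min = 1127.0 * np.log(1.0 + f0_min / 700.0)
    f0_mel_max = 1127.0 * np.log(1.0 + f0_max / 700.0)

    f0 = torch.tensor([0.0, 110.0, 220.0, 440.0])        # Hz; 0.0 marks unvoiced frames

    # Hz -> mel, rescale into bins 1..255, round to integer coarse pitch.
    f0_mel = 1127.0 * torch.log(1.0 + f0 / 700.0)
    f0_mel = torch.clip((f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0)
    f0_coarse = f0_mel.round().to(torch.int64).numpy()   # int64 avoids the removed np.int alias
    print(f0_coarse)                                      # 0 Hz clips to bin 1; voiced frames land in 2..255
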

View File

@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor

 class DioPitchExtractor(PitchExtractor):
 def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+audio = audio.detach().cpu().numpy()
 n_frames = int(len(audio) // window) + 1
 start_frame = int(silence_front * sr / window)
 real_silence_front = start_frame * window / sr

View File

@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor

 class HarvestPitchExtractor(PitchExtractor):
 def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+audio = audio.detach().cpu().numpy()
 n_frames = int(len(audio) // window) + 1
 start_frame = int(silence_front * sr / window)
 real_silence_front = start_frame * window / sr