update resample

nadare 2023-05-28 13:54:57 +09:00
parent 04847306af
commit 78ccc10a53
5 changed files with 24 additions and 27 deletions

@@ -1,10 +1,10 @@
import sys
import os
- import resampy
from dataclasses import asdict
from typing import cast
import numpy as np
import torch
+ import torchaudio
from ModelSample import getModelSamples
from voice_changer.RVC.SampleDownloader import downloadModelFiles
@@ -89,6 +89,7 @@ class RVC:
self.switchModel(self.settings.modelSlotIndex)
self.initialLoad = False
break
+ self.prevVol = 0.
def getSampleInfo(self, id: str):
sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:
convertOffset = -1 * convertSize
self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the segment to be converted
+ audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)
# crop just the output segment and check its volume (TODO: make the muting gradual)
cropOffset = -1 * (inputSize + crossfadeSize)
cropEnd = -1 * (crossfadeSize)
- crop = self.audio_buffer[cropOffset:cropEnd]
- rms = np.sqrt(np.square(crop).mean(axis=0))
- vol = max(rms, self.prevVol * 0.0)
+ crop = audio_buffer[cropOffset:cropEnd]
+ vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+ vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol
- return (self.audio_buffer, convertSize, vol)
+ return (audio_buffer, convertSize, vol)
def inference(self, data):
if self.settings.modelSlotIndex < 0:
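
Note: the hunk above keeps the volume check on the inference device; the RMS is computed with torch ops and only the resulting scalar crosses back to the host. A minimal runnable sketch of that pattern, not part of the commit, with a random buffer standing in for the cropped output window:

    import torch

    # stand-in for the cropped output window of the audio buffer
    device = "cuda" if torch.cuda.is_available() else "cpu"
    crop = torch.randn(4096, device=device, dtype=torch.float32)

    # root-mean-square volume; only this scalar leaves the device
    vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
    print(float(vol))
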
@@ -325,11 +327,10 @@ class RVC:
convertSize = data[1]
vol = data[2]
- audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16)
+ audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
repeat = 3 if half else 1
repeat *= self.settings.rvcQuality # 0 or 3
sid = 0
@@ -351,7 +352,7 @@ class RVC:
repeat,
)
- result = audio_out * np.sqrt(vol)
+ result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
return result
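
Note: the headline change of the commit is in the hunks above: resampy.resample on a numpy array is replaced by torchaudio.functional.resample on the tensor, so resampling runs on the same device as the rest of the pipeline, and the silence check can now short-circuit before any resampling happens. A minimal sketch of the new call, with an illustrative sample rate in place of self.settings.modelSamplingRate:

    import torch
    import torchaudio

    orig_sr, target_sr = 40000, 16000
    audio = torch.randn(orig_sr)  # one second of fake audio at 40 kHz

    # rolloff sets the lowpass cutoff as a fraction of Nyquist; 0.99 keeps
    # the passband as wide as possible while still suppressing aliasing
    audio16k = torchaudio.functional.resample(audio, orig_sr, target_sr, rolloff=0.99)
    print(audio16k.shape)  # torch.Size([16000])
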

@@ -89,7 +89,7 @@ class Pipeline(object):
self.t_pad = self.sr * repeat
self.t_pad_tgt = self.targetSR * repeat
- audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+ audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
p_len = audio_pad.shape[0] // self.window
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
@@ -115,7 +115,7 @@ class Pipeline(object):
raise NotEnoughDataExtimateF0()
# adjust tensor type
- feats = torch.from_numpy(audio_pad)
+ feats = audio_pad
if self.isHalf is True:
feats = feats.half()
else:
@@ -180,13 +180,10 @@ class Pipeline(object):
with torch.no_grad():
audio1 = (
(
- self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
- * 32768
+ torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0], -1., 1.) * 32767.5 - .5
)
- .data.cpu()
- .float()
- .numpy()
- .astype(np.int16)
+ .data
+ .to(dtype=torch.int16)
)
except RuntimeError as e:
if "HALF" in e.__str__().upper():

@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
decoder=torchcrepe.decode.weighted_argmax,
device=self.device,
)
- f0 = f0.squeeze().detach().cpu().numpy()
+ f0 = torchcrepe.filter.median(f0, 3)
+ f0 = f0.squeeze()
- f0 = np.pad(
-     f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+ f0 = torch.nn.functional.pad(
+     f0, (start_frame, n_frames - f0.shape[0] - start_frame)
)
f0 *= pow(2, f0_up_key / 12)
- f0bak = f0.copy()
- f0_mel = 1127 * np.log(1 + f0 / 700)
- f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-     f0_mel_max - f0_mel_min
- ) + 1
- f0_mel[f0_mel <= 1] = 1
- f0_mel[f0_mel > 255] = 255
- f0_coarse = np.rint(f0_mel).astype(np.int)
+ f0bak = f0.detach().cpu().numpy()
+ f0_mel = 1127. * torch.log(1. + f0 / 700.)
+ f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+ f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int)
return f0_coarse, f0bak
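
Note: the CrepePitchExtractor hunk keeps the whole coarse-f0 computation in torch: Hz values are mapped to the mel scale, rescaled into [1, 255], and rounded to integer bins, with a single torch.clip replacing the three masked assignments of the numpy version. A self-contained sketch with example pitch bounds (the real f0_mel_min and f0_mel_max come from the extractor's f0 range; np.int is removed in recent NumPy, so the sketch rounds to int64 instead):

    import numpy as np
    import torch

    f0_min, f0_max = 50.0, 1100.0  # example bounds in Hz
    f0_mel_min = 1127.0 * np.log(1.0 + f0_min / 700.0)
    f0_mel_max = 1127.0 * np.log(1.0 + f0_max / 700.0)

    f0 = torch.tensor([0.0, 110.0, 440.0])  # Hz; 0 marks unvoiced frames

    # Hz -> mel, rescale into [1, 255], then round to integer bins
    f0_mel = 1127.0 * torch.log(1.0 + f0 / 700.0)
    f0_mel = torch.clip((f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0)
    f0_coarse = f0_mel.round().to(dtype=torch.int64).numpy()
    print(f0_coarse)  # unvoiced frames land in bin 1
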

@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class DioPitchExtractor(PitchExtractor):
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+ audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr

@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class HarvestPitchExtractor(PitchExtractor):
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+ audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
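
Note: the last two hunks mirror each other: Dio and Harvest are CPU-side algorithms from the WORLD vocoder (in this project presumably computed via the pyworld package; the import is outside the hunks shown), so now that upstream buffers are torch tensors, each extractor first pulls the audio off the device and back into numpy. A minimal sketch of that boundary with a hypothetical input tensor:

    import torch

    audio = torch.randn(16000)  # hypothetical device-side audio tensor

    # detach from autograd, copy to host memory, and widen to float64,
    # which is what WORLD-based pitch estimators expect
    audio_np = audio.detach().cpu().numpy().astype("float64")
    print(audio_np.dtype, audio_np.shape)
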