Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-02-02 16:23:58 +03:00)

Commit 52f0e496ef: update resample
Parent: 04847306af
@@ -1,10 +1,10 @@
 import sys
 import os
-import resampy
 from dataclasses import asdict
 from typing import cast
 import numpy as np
 import torch
+import torchaudio
 from ModelSample import getModelSamples
 from voice_changer.RVC.SampleDownloader import downloadModelFiles
@@ -89,6 +89,7 @@ class RVC:
 self.switchModel(self.settings.modelSlotIndex)
 self.initialLoad = False
 break
+self.prevVol = 0.

 def getSampleInfo(self, id: str):
 sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:

 convertOffset = -1 * convertSize
 self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the part to be converted
+audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)

 # crop only the output part and check its volume (TODO: make the muting gradual)
 cropOffset = -1 * (inputSize + crossfadeSize)
 cropEnd = -1 * (crossfadeSize)
-crop = self.audio_buffer[cropOffset:cropEnd]
-rms = np.sqrt(np.square(crop).mean(axis=0))
-vol = max(rms, self.prevVol * 0.0)
+crop = audio_buffer[cropOffset:cropEnd]
+vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+vol = max(vol, self.prevVol * 0.0)
 self.prevVol = vol

-return (self.audio_buffer, convertSize, vol)
+return (audio_buffer, convertSize, vol)

 def inference(self, data):
 if self.settings.modelSlotIndex < 0:
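Note on the volume check above: the new code computes the RMS directly on the torch tensor and moves only the resulting scalar back to the host. A minimal sketch of that pattern, with illustrative names and sizes (not the repository's API):

import torch

def rms_volume(buffer: torch.Tensor, input_size: int, crossfade_size: int) -> float:
    # Crop the region that will actually be output, mirroring cropOffset/cropEnd
    # above (assumes crossfade_size > 0 so the end index stays negative).
    crop = buffer[-(input_size + crossfade_size):-crossfade_size]
    # RMS runs on whatever device holds the buffer; only the scalar is copied out.
    return torch.sqrt(torch.square(crop).mean()).item()

buf = torch.randn(48000, dtype=torch.float32)   # one second of synthetic audio at 48 kHz
print(rms_volume(buf, 24000, 4000))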
@@ -325,11 +327,10 @@ class RVC:
 convertSize = data[1]
 vol = data[2]

-audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-
 if vol < self.settings.silentThreshold:
 return np.zeros(convertSize).astype(np.int16)

+audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
 repeat = 3 if half else 1
 repeat *= self.settings.rvcQuality  # 0 or 3
 sid = 0
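Note on the resampling swap above: resampy.resample works on numpy arrays on the CPU, while torchaudio.functional.resample takes a torch tensor and can run on the same device as the rest of the pipeline. A minimal sketch with a synthetic signal and placeholder rates, not values taken from this diff:

import torch
import torchaudio

waveform = torch.randn(48000, dtype=torch.float32)   # 1 s of synthetic audio at 48 kHz

# Sinc-interpolation resampling; rolloff narrows the anti-aliasing low-pass filter.
resampled = torchaudio.functional.resample(waveform, 48000, 16000, rolloff=0.99)
print(resampled.shape)   # torch.Size([16000])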
@@ -351,7 +352,7 @@ class RVC:
 repeat,
 )

-result = audio_out * np.sqrt(vol)
+result = audio_out.detach().cpu().numpy() * np.sqrt(vol)

 return result
@@ -89,7 +89,7 @@ class Pipeline(object):
 self.t_pad = self.sr * repeat
 self.t_pad_tgt = self.targetSR * repeat

-audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
 p_len = audio_pad.shape[0] // self.window
 sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
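Note on the padding change above: np.pad reflects a 1-D array directly, while PyTorch's reflection padding expects at least one leading dimension, hence the unsqueeze/squeeze pair around the call. A minimal sketch with a toy signal:

import torch
import torch.nn.functional as F

audio = torch.arange(8, dtype=torch.float32)
t_pad = 3   # must be smaller than the signal length for reflect mode
padded = F.pad(audio.unsqueeze(0), (t_pad, t_pad), mode="reflect").squeeze(0)
print(padded)   # mirrors the edge samples instead of zero-filling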
@@ -115,7 +115,7 @@ class Pipeline(object):
 raise NotEnoughDataExtimateF0()

 # adjust tensor type
-feats = torch.from_numpy(audio_pad)
+feats = audio_pad
 if self.isHalf is True:
 feats = feats.half()
 else:
@@ -180,13 +180,10 @@ class Pipeline(object):
 with torch.no_grad():
 audio1 = (
 (
-self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
-* 32768
+torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), -1., 1.) * 32767.5 - .5
 )
-.data.cpu()
-.float()
-.numpy()
-.astype(np.int16)
+.data
+.to(dtype=torch.int16)
 )
 except RuntimeError as e:
 if "HALF" in e.__str__().upper():
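Note on the int16 conversion above: clipping to [-1, 1] before scaling prevents integer wrap-around when the model output briefly exceeds full scale, and the 32767.5 - 0.5 scaling maps the clipped range onto int16 without overflowing at +1.0. A small self-contained check:

import torch

wave = torch.tensor([-1.2, -1.0, 0.0, 0.999, 1.3])   # synthetic model output
pcm = (torch.clip(wave, -1., 1.) * 32767.5 - .5).to(dtype=torch.int16)
print(pcm)   # tensor([-32768, -32768, 0, 32734, 32767], dtype=torch.int16)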
@@ -27,7 +27,7 @@ class CrepePitchExtractor(PitchExtractor):
 f0_mel_max = 1127 * np.log(1 + f0_max / 700)

 f0 = torchcrepe.predict(
-torch.tensor(audio).unsqueeze(0),
+audio.unsqueeze(0),
 sr,
 hop_length=window,
 fmin=f0_min,
@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
 decoder=torchcrepe.decode.weighted_argmax,
 device=self.device,
 )
-f0 = f0.squeeze().detach().cpu().numpy()
+f0 = torchcrepe.filter.median(f0, 3)
+f0 = f0.squeeze()

-f0 = np.pad(
-    f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+f0 = torch.nn.functional.pad(
+    f0, (start_frame, n_frames - f0.shape[0] - start_frame)
 )

 f0 *= pow(2, f0_up_key / 12)
-f0bak = f0.copy()
-f0_mel = 1127 * np.log(1 + f0 / 700)
-f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-    f0_mel_max - f0_mel_min
-) + 1
-f0_mel[f0_mel <= 1] = 1
-f0_mel[f0_mel > 255] = 255
-f0_coarse = np.rint(f0_mel).astype(np.int)
+f0bak = f0.detach().cpu().numpy()
+f0_mel = 1127. * torch.log(1. + f0 / 700.)
+f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int)

 return f0_coarse, f0bak
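Note on the pitch quantization above: f0 in Hz is mapped to the mel scale and linearly squeezed into the 1..255 coarse bins the model consumes, with unvoiced frames (f0 = 0) pinned to bin 1. A minimal sketch; the 50/1100 Hz bounds are illustrative assumptions, not read from this diff:

import torch

f0_min, f0_max = 50.0, 1100.0   # assumed bounds for illustration
f0_mel_min = 1127. * torch.log(torch.tensor(1. + f0_min / 700.))
f0_mel_max = 1127. * torch.log(torch.tensor(1. + f0_max / 700.))

f0 = torch.tensor([0., 110., 220., 440.])   # Hz; 0 marks an unvoiced frame
f0_mel = 1127. * torch.log(1. + f0 / 700.)
f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
f0_coarse = f0_mel.round().to(dtype=torch.int64)
print(f0_coarse)   # unvoiced frames land in bin 1, voiced frames spread over 2..255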
@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor

 class DioPitchExtractor(PitchExtractor):
 def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+audio = audio.detach().cpu().numpy()
 n_frames = int(len(audio) // window) + 1
 start_frame = int(silence_front * sr / window)
 real_silence_front = start_frame * window / sr
@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor

 class HarvestPitchExtractor(PitchExtractor):
 def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+audio = audio.detach().cpu().numpy()
 n_frames = int(len(audio) // window) + 1
 start_frame = int(silence_front * sr / window)
 real_silence_front = start_frame * window / sr
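Note on the two extractor hunks above: the pipeline now passes a torch tensor everywhere, so the CPU-only Dio and Harvest extractors convert back to numpy at the top of extract(). A minimal sketch of that boundary; the class and the zero-filled return value are illustrative stand-ins, not the repository's code:

import numpy as np
import torch

class NumpyPitchExtractor:
    def extract(self, audio, window=160):
        if isinstance(audio, torch.Tensor):
            # Detach from the autograd graph and copy to host memory before numpy-only code.
            audio = audio.detach().cpu().numpy()
        n_frames = int(len(audio) // window) + 1
        return np.zeros(n_frames)   # stand-in for the real dio/harvest estimate

print(NumpyPitchExtractor().extract(torch.randn(16000)).shape)   # (101,)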