Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-02-03 00:33:57 +03:00)

Commit: 52f0e496ef
Parent: 04847306af
Message: update resample
@@ -1,10 +1,10 @@
 import sys
 import os
-import resampy
 from dataclasses import asdict
 from typing import cast
 import numpy as np
 import torch
+import torchaudio
 from ModelSample import getModelSamples
 from voice_changer.RVC.SampleDownloader import downloadModelFiles

@@ -89,6 +89,7 @@ class RVC:
                 self.switchModel(self.settings.modelSlotIndex)
                 self.initialLoad = False
                 break
+        self.prevVol = 0.

     def getSampleInfo(self, id: str):
         sampleInfos = list(filter(lambda x: x.id == id, self.settings.sampleModels))
@@ -293,16 +294,17 @@ class RVC:

         convertOffset = -1 * convertSize
         self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the part to be converted
+        audio_buffer = torch.from_numpy(self.audio_buffer).to(device=self.pipeline.device, dtype=torch.float32)

         # crop just the output part and check its volume (TODO: make the muting gradual)
         cropOffset = -1 * (inputSize + crossfadeSize)
         cropEnd = -1 * (crossfadeSize)
-        crop = self.audio_buffer[cropOffset:cropEnd]
-        rms = np.sqrt(np.square(crop).mean(axis=0))
-        vol = max(rms, self.prevVol * 0.0)
+        crop = audio_buffer[cropOffset:cropEnd]
+        vol = torch.sqrt(torch.square(crop).mean(axis=0)).detach().cpu().numpy()
+        vol = max(vol, self.prevVol * 0.0)
         self.prevVol = vol

-        return (self.audio_buffer, convertSize, vol)
+        return (audio_buffer, convertSize, vol)

     def inference(self, data):
         if self.settings.modelSlotIndex < 0:
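Note: the input buffer is now pushed to the pipeline device once and the volume check runs there, so only the RMS scalar returns to the host. A minimal sketch of the equivalence (the buffer contents and crop length below are made up for illustration):

```python
import numpy as np
import torch

# Illustrative buffer; in the diff this is self.audio_buffer.
buf = np.random.randn(48000).astype(np.float32)

# Old path: RMS of the output crop computed with NumPy on the CPU.
rms_np = np.sqrt(np.square(buf[-4096:]).mean(axis=0))

# New path: buffer moved to the device once, RMS computed with torch,
# and only the scalar is brought back to the host.
device = "cuda" if torch.cuda.is_available() else "cpu"
buf_t = torch.from_numpy(buf).to(device=device, dtype=torch.float32)
rms_t = torch.sqrt(torch.square(buf_t[-4096:]).mean(axis=0)).detach().cpu().numpy()

assert np.allclose(rms_np, rms_t, atol=1e-4)
```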
@@ -325,11 +327,10 @@ class RVC:
         convertSize = data[1]
         vol = data[2]

-        audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
-
         if vol < self.settings.silentThreshold:
             return np.zeros(convertSize).astype(np.int16)

+        audio = torchaudio.functional.resample(audio, self.settings.modelSamplingRate, 16000, rolloff=0.99)
         repeat = 3 if half else 1
         repeat *= self.settings.rvcQuality  # 0 or 3
         sid = 0
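Note: resampling switches from resampy (NumPy, CPU only) to torchaudio.functional.resample, which accepts the tensor produced above and can run on the GPU. A rough comparison of the two calls (the sample rates are examples; only the torchaudio call with rolloff=0.99 comes from the diff):

```python
import numpy as np
import resampy
import torch
import torchaudio

model_sr, target_sr = 48000, 16000
audio_np = np.random.randn(model_sr).astype(np.float32)

# Old: resampy resamples a NumPy array on the CPU.
down_np = resampy.resample(audio_np, model_sr, target_sr)

# New: torchaudio resamples a tensor (CPU or GPU) and exposes the
# anti-aliasing filter's rolloff parameter.
audio_t = torch.from_numpy(audio_np)
down_t = torchaudio.functional.resample(audio_t, model_sr, target_sr, rolloff=0.99)

print(down_np.shape, down_t.shape)  # both roughly len(audio) * target_sr / model_sr
```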
@@ -351,7 +352,7 @@ class RVC:
             repeat,
         )

-        result = audio_out * np.sqrt(vol)
+        result = audio_out.detach().cpu().numpy() * np.sqrt(vol)

         return result

@@ -89,7 +89,7 @@ class Pipeline(object):
         self.t_pad = self.sr * repeat
         self.t_pad_tgt = self.targetSR * repeat

-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
+        audio_pad = F.pad(audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
         p_len = audio_pad.shape[0] // self.window
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()

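Note: np.pad reflects a 1-D array directly, while torch's reflect padding wants a leading channel dimension, hence the unsqueeze(0)/squeeze(0) pair in the diff. A small sketch of the equivalence (the array is synthetic):

```python
import numpy as np
import torch
import torch.nn.functional as F

pad = 3
x_np = np.arange(10, dtype=np.float32)

# Old: NumPy reflect-pads the 1-D signal directly.
padded_np = np.pad(x_np, (pad, pad), mode="reflect")

# New: reflect padding in torch is applied to a (1, length) tensor,
# so the signal is wrapped and unwrapped around the call.
x_t = torch.from_numpy(x_np)
padded_t = F.pad(x_t.unsqueeze(0), (pad, pad), mode="reflect").squeeze(0)

assert np.array_equal(padded_np, padded_t.numpy())
```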
@@ -115,7 +115,7 @@ class Pipeline(object):
             raise NotEnoughDataExtimateF0()

         # adjust tensor type
-        feats = torch.from_numpy(audio_pad)
+        feats = audio_pad
         if self.isHalf is True:
             feats = feats.half()
         else:
@@ -180,13 +180,10 @@ class Pipeline(object):
             with torch.no_grad():
                 audio1 = (
                     (
-                        self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
-                        * 32768
+                        torch.clip(self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0].to(dtype=torch.float32), -1., 1.) * 32767.5 - .5
                     )
-                    .data.cpu()
-                    .float()
-                    .numpy()
-                    .astype(np.int16)
+                    .data
+                    .to(dtype=torch.int16)
                 )
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
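Note: the inferencer output is now clipped and converted to int16 while still a tensor, instead of round-tripping through NumPy. A toy sketch of the two scalings (the waveform is synthetic):

```python
import numpy as np
import torch

wav = torch.clamp(torch.randn(16000) * 0.3, -1.0, 1.0)  # stand-in for model output in [-1, 1]

# Old: scale by 32768 and cast via NumPy; a sample at exactly +1.0 overflows int16.
old = (wav * 32768).data.cpu().float().numpy().astype(np.int16)

# New: clip to [-1, 1], map onto [-32768.0, 32767.0] with * 32767.5 - 0.5,
# and cast to int16 on the tensor itself.
new = (torch.clip(wav.to(dtype=torch.float32), -1.0, 1.0) * 32767.5 - 0.5).data.to(dtype=torch.int16)

print(old[:5], new[:5].numpy())
```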
@@ -27,7 +27,7 @@ class CrepePitchExtractor(PitchExtractor):
         f0_mel_max = 1127 * np.log(1 + f0_max / 700)

         f0 = torchcrepe.predict(
-            torch.tensor(audio).unsqueeze(0),
+            audio.unsqueeze(0),
             sr,
             hop_length=window,
             fmin=f0_min,
@@ -38,20 +38,17 @@ class CrepePitchExtractor(PitchExtractor):
             decoder=torchcrepe.decode.weighted_argmax,
             device=self.device,
         )
-        f0 = f0.squeeze().detach().cpu().numpy()
+        f0 = torchcrepe.filter.median(f0, 3)
+        f0 = f0.squeeze()

-        f0 = np.pad(
-            f0.astype("float"), (start_frame, n_frames - f0.shape[0] - start_frame)
+        f0 = torch.nn.functional.pad(
+            f0, (start_frame, n_frames - f0.shape[0] - start_frame)
         )

         f0 *= pow(2, f0_up_key / 12)
-        f0bak = f0.copy()
-        f0_mel = 1127 * np.log(1 + f0 / 700)
-        f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
-            f0_mel_max - f0_mel_min
-        ) + 1
-        f0_mel[f0_mel <= 1] = 1
-        f0_mel[f0_mel > 255] = 255
-        f0_coarse = np.rint(f0_mel).astype(np.int)
+        f0bak = f0.detach().cpu().numpy()
+        f0_mel = 1127. * torch.log(1. + f0 / 700.)
+        f0_mel = torch.clip((f0_mel - f0_mel_min) * 254. / (f0_mel_max - f0_mel_min) + 1., 1., 255.)
+        f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int)

         return f0_coarse, f0bak
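Note: the crepe extractor now keeps f0 as a tensor end to end, applies torchcrepe's median filter, and collapses the old clamp-by-indexing steps into one torch.clip. A condensed sketch of the coarse-bin computation (the pitch values and the f0_min/f0_max constants are placeholders, not taken from the repo):

```python
import numpy as np
import torch

f0_min, f0_max = 50.0, 1100.0  # placeholder pitch range
f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700)

f0 = torch.tensor([0.0, 80.0, 220.0, 440.0, 2000.0])  # synthetic pitch track in Hz

# Map f0 to mel, squeeze it into the 1..255 range with a single clip,
# then round to integer bins for the coarse pitch representation.
f0_mel = 1127.0 * torch.log(1.0 + f0 / 700.0)
f0_mel = torch.clip((f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0)
f0_coarse = f0_mel.round().detach().cpu().numpy().astype(np.int64)

print(f0_coarse)  # unvoiced frames (f0 = 0) fall into bin 1, the rest into 2..255
```

The diff itself casts with the deprecated np.int alias; np.int64 is used here only so the sketch runs on current NumPy.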
@@ -6,6 +6,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor

 class DioPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
         n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
@@ -7,6 +7,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor

 class HarvestPitchExtractor(PitchExtractor):
     def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+        audio = audio.detach().cpu().numpy()
         n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
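Note: the DIO and Harvest extractors now receive a tensor from the caller and immediately move it back to the host, since these estimators work on NumPy arrays (typically backed by pyworld; that backend is an assumption here, it is not visible in this diff). A minimal sketch under that assumption:

```python
import numpy as np
import torch
import pyworld as pw  # assumed backend for the DIO/Harvest extractors

sr = 16000
audio = torch.randn(sr)  # the caller now hands the extractor a tensor

# Pull the signal back to the CPU as float64, which pyworld expects,
# before running the CPU-side pitch estimator.
audio_np = audio.detach().cpu().numpy().astype(np.double)
f0, t = pw.dio(audio_np, sr, frame_period=10.0)
f0 = pw.stonemask(audio_np, f0, t, sr)  # optional refinement pass

print(f0.shape)
```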