Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-01-23 21:45:00 +03:00)
optimize starting RVC
parent 2e96f8072a
commit 3a556ebc5f
@@ -284,7 +284,6 @@ class RVC:
            newData.astype(np.float32) / 32768.0
        )  # The data arrives at the sampling rate of the RVC model. (extraDataLength, crossfade, etc. are processed at the same SR) (★1)

        print("newData", newData.shape, crossfadeSize, solaSearchFrame)
        if self.audio_buffer is not None:
            # concatenate with the past data
            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
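For reference, a minimal standalone sketch of the int16-to-float32 normalization used above; the function name and sample values are made up for illustration:

import numpy as np

def to_float32(pcm_int16: np.ndarray) -> np.ndarray:
    # Scale signed 16-bit PCM into the [-1.0, 1.0) float range, as in the hunk above.
    return pcm_int16.astype(np.float32) / 32768.0

chunk = np.array([0, 16384, -32768], dtype=np.int16)  # dummy samples
print(to_float32(chunk))  # [ 0.   0.5 -1. ]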
@@ -295,10 +294,14 @@ class RVC:
            inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
        )

        print("convertSize1", convertSize)
        if convertSize % 128 != 0:  # pad, because truncation would occur at the model's output hop size
            convertSize = convertSize + (128 - (convertSize % 128))
        print("convertSize2", convertSize)

        # pad with zeros if the buffer has not accumulated enough samples
        if self.audio_buffer.shape[0] < convertSize:
            self.audio_buffer = np.concatenate(
                [np.zeros([convertSize]), self.audio_buffer]
            )

        convertOffset = -1 * convertSize
        self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted
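As a reading aid, a hedged standalone sketch of the buffer-sizing logic in the hunk above; the function names are illustrative, and the hop size constant simply mirrors the 128 used there:

import numpy as np

HOP_SIZE = 128  # assumed model output hop size, matching the constant above

def round_up_convert_size(convert_size: int, hop_size: int = HOP_SIZE) -> int:
    # Round up to the next multiple of the hop size so the model output is not truncated.
    if convert_size % hop_size != 0:
        convert_size += hop_size - (convert_size % hop_size)
    return convert_size

def tail_of_buffer(audio_buffer: np.ndarray, convert_size: int) -> np.ndarray:
    # Left-pad with zeros while the buffer is still filling, then keep only the newest samples.
    if audio_buffer.shape[0] < convert_size:
        audio_buffer = np.concatenate([np.zeros([convert_size]), audio_buffer])
    return audio_buffer[-convert_size:]

print(round_up_convert_size(1000))              # 1024
print(tail_of_buffer(np.ones(300), 512).shape)  # (512,)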
@@ -319,7 +322,6 @@ class RVC:
        vol = torch.sqrt(torch.square(crop).mean()).detach().cpu().numpy()
        vol = max(vol, self.prevVol * 0.0)
        self.prevVol = vol
        print("inf0 : ", audio_buffer.shape, convertSize)

        return (audio_buffer, convertSize, vol)
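The volume measure above is a plain RMS of the analysis window; a minimal sketch with a made-up tensor:

import torch

def rms_volume(crop: torch.Tensor) -> float:
    # Root-mean-square level of the analysis window, as computed in the hunk above.
    return float(torch.sqrt(torch.square(crop).mean()))

print(rms_volume(torch.full((480,), 0.5)))  # 0.5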
@@ -347,7 +349,6 @@ class RVC:
        if vol < self.settings.silentThreshold:
            return np.zeros(convertSize).astype(np.int16)

        print("inf1 : ", audio.shape)
        audio = torchaudio.functional.resample(
            audio, self.settings.modelSamplingRate, 16000, rolloff=0.99
        )
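The resampling call above brings the audio down to 16 kHz for feature extraction; a sketch with an assumed 40 kHz model rate standing in for self.settings.modelSamplingRate:

import torch
import torchaudio

model_sr = 40000               # assumed value for illustration
audio = torch.randn(model_sr)  # one second of dummy audio at the model rate
audio_16k = torchaudio.functional.resample(audio, model_sr, 16000, rolloff=0.99)
print(audio_16k.shape)         # torch.Size([16000])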
@@ -14,10 +14,6 @@ from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor


# Rebuild the Pipeline when isHalf changes.
# Changes to the device (GPU, when not accompanied by an isHalf change) and to the pitchExtractor are handled by swapping them in place.


class Pipeline(object):
    embedder: Embedder
    inferencer: Inferencer
@@ -85,12 +81,13 @@ class Pipeline(object):
        repeat,
        protect=0.5,
    ):
        # The audio arrives at a 16000 Hz sampling rate. From this point on, everything is processed at 16000 Hz.

        search_index = (
            self.index is not None and self.big_npy is not None and index_rate != 0
        )
        self.t_pad = self.sr * repeat
        self.t_pad_tgt = self.targetSR * repeat
        print("Audio Feature1", audio.shape)  # The audio arrives at a 16000 Hz sampling rate. From this point on, everything is processed at 16000 Hz.
        audio_pad = F.pad(
            audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect"
        ).squeeze(0)
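A hedged sketch of the reflect padding applied above, with made-up values for self.sr and repeat:

import torch
import torch.nn.functional as F

sr, repeat = 16000, 1
t_pad = sr * repeat          # padding added on each side, as in the hunk above
audio = torch.randn(32000)   # dummy 2-second signal at 16 kHz
audio_pad = F.pad(audio.unsqueeze(0), (t_pad, t_pad), mode="reflect").squeeze(0)
print(audio_pad.shape)       # torch.Size([64000])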
@@ -130,21 +127,9 @@ class Pipeline(object):
        feats = feats.view(1, -1)

        # embedding
        print("audio feature", feats.shape)
        padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
        try:
            # testFeat = feats.clone()
            # while True:
            #     print("embedding audio;", testFeat.shape)
            #     testFeatOut = self.embedder.extractFeatures(
            #         testFeat, embOutputLayer, useFinalProj
            #     )
            #     testFeat = testFeat[:, 1:]
            #     print("embedding vector;", testFeatOut.shape)

            print("embedding audio;", feats.shape)
            feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
            print("embedding vector;", feats.shape)
        except RuntimeError as e:
            if "HALF" in e.__str__().upper():
                raise HalfPrecisionChangingException()
@@ -159,30 +144,17 @@ class Pipeline(object):
        # if self.index is not None and self.feature is not None and index_rate != 0:
        if search_index:
            npy = feats[0].cpu().numpy()
            print("npy shape", npy.shape, npy.shape[0] * 16000)
            # apply silent front for index search
            npyOffset = math.floor(silence_front * 16000) // 360
            print(
                "npyOffset",
                silence_front,
                self.targetSR,
                (silence_front * self.targetSR),
                npyOffset,
            )
            npy = npy[npyOffset:]
            print(
                "npy trimmed shape",
                npy.shape,
            )

            if self.isHalf is True:
                npy = npy.astype("float32")
            # D, I = self.index.search(npy, 1)
            # npy = self.feature[I.squeeze()]

            # TODO: make k adjustable
            k = 1
            if k == 1:
                _, ix = self.index.search(npy, 1)
                print("ix shape", ix.shape)
                npy = self.big_npy[ix.squeeze()]
            else:
                score, ix = self.index.search(npy, k=8)
@@ -193,11 +165,9 @@ class Pipeline(object):
            if self.isHalf is True:
                npy = npy.astype("float16")

            # recover silent front
            npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]), npy])
            print(
                "npy last shape",
                npy.shape,
            )

            feats = (
                torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                + (1 - index_rate) * feats
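The two hunks above retrieve the nearest training-set features from a faiss index and blend them with the live features. A simplified, hedged sketch of the k=1 path; the argument names are placeholders for the corresponding Pipeline attributes, and the silent-front trimming is omitted:

import numpy as np
import torch

def blend_with_index(feats: torch.Tensor, index, big_npy: np.ndarray,
                     index_rate: float, device: torch.device) -> torch.Tensor:
    npy = feats[0].cpu().numpy().astype("float32")
    _, ix = index.search(npy, 1)         # nearest stored feature for each frame
    retrieved = big_npy[ix.squeeze()]    # look the feature vectors up in the training matrix
    # Linear blend between retrieved and live features, weighted by index_rate.
    return (torch.from_numpy(retrieved).unsqueeze(0).to(device) * index_rate
            + (1 - index_rate) * feats)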
@@ -207,6 +177,7 @@ class Pipeline(object):
        feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(
            0, 2, 1
        )

        # adjust the pitch length
        p_len = audio_pad.shape[0] // self.window
        if feats.shape[1] < p_len:
@@ -227,22 +198,14 @@ class Pipeline(object):
        feats = feats.to(feats0.dtype)
        p_len = torch.tensor([p_len], device=self.device).long()

        # apply silent front for inference
        npyOffset = math.floor(silence_front * 16000) // 360
        print(
            "npy last shape2",
            feats.shape,
        )
        feats = feats[:, npyOffset * 2 :, :]
        feats_len = feats.shape[1]
        pitch = pitch[:, -feats_len:]
        pitchf = pitchf[:, -feats_len:]
        p_len = torch.tensor([feats_len], device=self.device).long()

        print(
            "npy last shape3",
            feats.shape,
            feats_len,
        )
        # run inference
        try:
            with torch.no_grad():
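A hedged sketch of the silent-front trimming performed above: frames known to cover leading silence are dropped from the features, and the pitch curves are re-aligned to the trimmed length. The names mirror the hunk, and the 16000 and 360 constants are copied from it:

import math
import torch

def trim_silent_front(feats, pitch, pitchf, silence_front, device):
    offset = math.floor(silence_front * 16000) // 360  # leading frames covered by silence
    feats = feats[:, offset * 2:, :]      # features were interpolated 2x earlier, hence * 2
    feats_len = feats.shape[1]
    pitch = pitch[:, -feats_len:]         # keep the pitch tail aligned with the trimmed features
    pitchf = pitchf[:, -feats_len:]
    p_len = torch.tensor([feats_len], device=device).long()
    return feats, pitch, pitchf, p_len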
@@ -435,7 +435,6 @@ class VoiceChanger:
            raise RuntimeError("Voice Changer is not selected.")

        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
        print("original frame", receivedData.shape[0])
        # pre-processing
        with Timer("pre-process") as t:
            if self.settings.inputSampleRate != processing_sampling_rate:
@@ -453,7 +452,6 @@ class VoiceChanger:
        sola_search_frame = int(0.012 * processing_sampling_rate)
        # sola_search_frame = 0
        block_frame = newData.shape[0]
        print("block frame", newData.shape[0])
        crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
        self._generate_strength(crossfade_frame)
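Finally, a small worked example of the frame bookkeeping in the last hunk, with an assumed 40 kHz processing rate and made-up block and overlap sizes:

processing_sampling_rate = 40000   # assumed value for illustration
cross_fade_overlap_size = 4096     # stands in for self.settings.crossFadeOverlapSize
block_frame = 8192                 # samples in the incoming block (newData.shape[0])

sola_search_frame = int(0.012 * processing_sampling_rate)    # 12 ms SOLA search window -> 480 samples
crossfade_frame = min(cross_fade_overlap_size, block_frame)  # never crossfade more than one block
print(sola_search_frame, crossfade_frame)                    # 480 4096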