optimize convert

wataru 2023-06-02 23:33:46 +09:00
parent bfeef443f5
commit 2e96f8072a
3 changed files with 63 additions and 7 deletions


@@ -280,8 +280,11 @@ class RVC:
crossfadeSize: int,
solaSearchFrame: int = 0,
):
newData = newData.astype(np.float32) / 32768.0
newData = (
newData.astype(np.float32) / 32768.0
) # The input arrives at the RVC model's sampling rate. extraDataLength, crossfade, etc. are also processed at the same SR (★1)
print("newData", newData.shape, crossfadeSize, solaSearchFrame)
if self.audio_buffer is not None:
# Concatenate with the previously buffered data
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
@@ -292,8 +295,10 @@ class RVC:
inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
)
print("convertSize1", convertSize)
if convertSize % 128 != 0: # Round up to the model's output hop size, which would otherwise truncate the tail.
convertSize = convertSize + (128 - (convertSize % 128))
print("convertSize2", convertSize)
convertOffset = -1 * convertSize
self.audio_buffer = self.audio_buffer[convertOffset:] # Extract only the portion to be converted
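For reference, a minimal sketch of the rounding done above, assuming the 128-sample output hop used in this hunk:

# Round convertSize up to the next multiple of the model's output hop size (128 here),
# so nothing is lost to truncation inside the model.
def round_up_to_hop(convert_size: int, hop: int = 128) -> int:
    if convert_size % hop != 0:
        convert_size += hop - (convert_size % hop)
    return convert_size

# e.g. round_up_to_hop(4410) == 4480 and round_up_to_hop(4480) == 4480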
@@ -314,6 +319,7 @@ class RVC:
vol = torch.sqrt(torch.square(crop).mean()).detach().cpu().numpy()
vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol
print("inf0 : ", audio_buffer.shape, convertSize)
return (audio_buffer, convertSize, vol)
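The vol value computed above is an RMS level that a later hunk compares against silentThreshold to skip conversion of silent blocks. A small sketch of that gate (names taken from the diff, the rest assumed):

import numpy as np

# RMS level of the cropped block; blocks below the silent threshold are not
# converted at all and the caller returns zeros instead.
def rms_volume(crop: np.ndarray) -> float:
    return float(np.sqrt(np.square(crop).mean()))

def is_silent(crop: np.ndarray, silent_threshold: float) -> bool:
    return rms_volume(crop) < silent_threshold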
@@ -341,6 +347,7 @@ class RVC:
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16)
print("inf1 : ", audio.shape)
audio = torchaudio.functional.resample(
audio, self.settings.modelSamplingRate, 16000, rolloff=0.99
)
@@ -360,7 +367,8 @@ class RVC:
f0_up_key,
index_rate,
if_f0,
self.settings.extraConvertSize / self.settings.modelSamplingRate,
self.settings.extraConvertSize
/ self.settings.modelSamplingRate, # Length of the extra data in seconds. Processed at the RVC model's sampling rate (★1).
embOutputLayer,
useFinalProj,
repeat,
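The changed argument above passes extraConvertSize converted to seconds, because this buffer is still at the model's sampling rate while the pipeline below runs at 16 kHz (★1). A minimal sketch of that conversion together with the 16 kHz resample from the previous hunk (values assumed):

import torch
import torchaudio

modelSamplingRate = 40000                    # assumed model SR
extraConvertSize = 8192                      # assumed extra context, in samples at the model SR
audio = torch.zeros(modelSamplingRate)       # dummy 1-second buffer at the model SR

# Express the extra context as a duration so the 16 kHz pipeline can reapply it (★1).
silence_front_sec = extraConvertSize / modelSamplingRate   # 8192 / 40000 = 0.2048 s

# The pipeline itself runs at 16 kHz, so the audio is resampled before inference.
audio16k = torchaudio.functional.resample(audio, modelSamplingRate, 16000, rolloff=0.99)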


@@ -1,6 +1,6 @@
import numpy as np
from typing import Any
import math
import torch
import torch.nn.functional as F
from Exceptions import (
@@ -90,7 +90,7 @@ class Pipeline(object):
)
self.t_pad = self.sr * repeat
self.t_pad_tgt = self.targetSR * repeat
print("Audio Feature1", audio.shape) # 16000のサンプリングレートで入ってきている。以降この世界は16000で処理。
audio_pad = F.pad(
audio.unsqueeze(0), (self.t_pad, self.t_pad), mode="reflect"
).squeeze(0)
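A small sketch of the reflect padding applied above; since t_pad = sr * repeat, repeat is effectively a pad length in seconds (value assumed):

import torch
import torch.nn.functional as F

sr = 16000
repeat = 1                                   # assumed; t_pad = sr * repeat samples per side
t_pad = sr * repeat

audio = torch.zeros(2 * sr)                  # dummy 2-second block at 16 kHz
# Reflect-pad both ends so the model sees mirrored context at the block edges
# instead of hard cuts; the pad must be shorter than the block for reflect mode.
audio_pad = F.pad(audio.unsqueeze(0), (t_pad, t_pad), mode="reflect").squeeze(0)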
@@ -130,9 +130,21 @@ class Pipeline(object):
feats = feats.view(1, -1)
# embedding
print("audio feature", feats.shape)
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
try:
# testFeat = feats.clone()
# while True:
# print("embedding audio;", testFeat.shape)
# testFeatOut = self.embedder.extractFeatures(
# testFeat, embOutputLayer, useFinalProj
# )
# testFeat = testFeat[:, 1:]
# print("embedding vector;", testFeatOut.shape)
print("embedding audio;", feats.shape)
feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
print("embedding vector;", feats.shape)
except RuntimeError as e:
if "HALF" in e.__str__().upper():
raise HalfPrecisionChangingException()
@@ -147,6 +159,20 @@ class Pipeline(object):
# if self.index is not None and self.feature is not None and index_rate != 0:
if search_index:
npy = feats[0].cpu().numpy()
print("npy shape", npy.shape, npy.shape[0] * 16000)
npyOffset = math.floor(silence_front * 16000) // 360
print(
"npyOffset",
silence_front,
self.targetSR,
(silence_front * self.targetSR),
npyOffset,
)
npy = npy[npyOffset:]
print(
"npy trimmed shape",
npy.shape,
)
if self.isHalf is True:
npy = npy.astype("float32")
# D, I = self.index.search(npy, 1)
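This is the heart of the optimization: the leading silence_front seconds are only context, so their feature frames are dropped before the comparatively expensive index search. A worked sketch of the offset arithmetic, using the 360-sample frame stride from the hunk and an assumed silence_front:

import math

silence_front = 0.2048        # assumed length of the leading context, in seconds
sample_rate = 16000           # this part of the pipeline runs at 16 kHz

# seconds -> samples -> feature frames (one frame per 360 samples with the stride above);
# those frames are sliced off so the index search only covers audio that will be output.
npyOffset = math.floor(silence_front * sample_rate) // 360   # 3276 // 360 = 9 frames
# npy = npy[npyOffset:]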
@@ -156,6 +182,7 @@ class Pipeline(object):
k = 1
if k == 1:
_, ix = self.index.search(npy, 1)
print("ix shape", ix.shape)
npy = self.big_npy[ix.squeeze()]
else:
score, ix = self.index.search(npy, k=8)
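The hunk is cut off after the k=8 search. For context, a hedged sketch of the two retrieval modes in the usual RVC style, where big_npy holds the feature rows behind the faiss index and closer neighbours get larger weights:

import faiss
import numpy as np

big_npy = np.random.rand(1000, 256).astype("float32")   # training features behind the index
index = faiss.IndexFlatL2(256)
index.add(big_npy)
npy = np.random.rand(50, 256).astype("float32")          # one query row per feature frame
k = 8

if k == 1:
    _, ix = index.search(npy, 1)                          # nearest training feature per frame
    npy = big_npy[ix.squeeze()]
else:
    score, ix = index.search(npy, k=8)                    # top-8 neighbours per frame
    weight = np.square(1 / score)                         # smaller distance -> larger weight
    weight /= weight.sum(axis=1, keepdims=True)
    npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)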
@@ -166,6 +193,11 @@ class Pipeline(object):
if self.isHalf is True:
npy = npy.astype("float16")
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]), npy])
print(
"npy last shape",
npy.shape,
)
feats = (
torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+ (1 - index_rate) * feats
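After the search, the skipped frames are put back as zero rows so the retrieved features line up with the embedder output again, and the two are mixed by index_rate, which is exactly what the last three lines above do. A small self-contained version of that blend (shapes assumed):

import numpy as np
import torch

npyOffset = 9
index_rate = 0.75                                   # assumed blend ratio
npy = np.random.rand(41, 256).astype("float32")     # retrieved features, silent frames removed
feats = torch.rand(1, 50, 256)                      # embedder output (9 skipped + 41 kept frames)

# Re-insert zero rows for the skipped silent frames so npy matches feats in length again.
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype="float32"), npy])

# Linear blend: index_rate of the retrieved features, (1 - index_rate) of the originals.
feats = torch.from_numpy(npy).unsqueeze(0) * index_rate + (1 - index_rate) * feats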
@@ -195,6 +227,22 @@ class Pipeline(object):
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
npyOffset = math.floor(silence_front * 16000) // 360
print(
"npy last shape2",
feats.shape,
)
feats = feats[:, npyOffset * 2 :, :]
feats_len = feats.shape[1]
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
print(
"npy last shape3",
feats.shape,
feats_len,
)
# Run inference
try:
with torch.no_grad():
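Just before inference the same offset is applied again, doubled (the feature sequence has presumably been upsampled 2x relative to the embedder frames by this point, as is common in RVC pipelines), and pitch/pitchf are cut from the end to the same length so every input stays aligned. A hedged numeric sketch:

import math
import torch

silence_front = 0.2048
npyOffset = math.floor(silence_front * 16000) // 360        # 9 embedder frames
feats = torch.rand(1, 100, 256)                             # assumed 2x-upsampled feature sequence
pitch = torch.zeros(1, 100, dtype=torch.long)
pitchf = torch.zeros(1, 100)

feats = feats[:, npyOffset * 2:, :]                         # drop the 18 leading frames
feats_len = feats.shape[1]                                  # 82 frames remain
pitch = pitch[:, -feats_len:]                               # keep the trailing 82 pitch frames
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len]).long()                    # sequence length passed to the generator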


@@ -435,7 +435,7 @@ class VoiceChanger:
raise RuntimeError("Voice Changer is not selected.")
processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
print("original frame", receivedData.shape[0])
# Pre-processing
with Timer("pre-process") as t:
if self.settings.inputSampleRate != processing_sampling_rate:
@@ -453,6 +453,7 @@ class VoiceChanger:
sola_search_frame = int(0.012 * processing_sampling_rate)
# sola_search_frame = 0
block_frame = newData.shape[0]
print("block frame", newData.shape[0])
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
self._generate_strength(crossfade_frame)
@@ -472,8 +473,7 @@ class VoiceChanger:
sola_search_frame + crossfade_frame + block_frame
)
audio = audio[audio_offset:]
a = 0
audio = audio[a:]
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
cor_nom = np.convolve(
audio[: crossfade_frame + sola_search_frame],
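The final hunk is cut off mid-expression in this view. For context, a hedged sketch of the SOLA offset search it is starting, in the style of the DDSP-SVC / RVC-WebUI implementations cited in the comment (sola_buffer is assumed to be the crossfade tail kept from the previous output block):

import numpy as np

crossfade_frame = 1024
sola_search_frame = 512
sola_buffer = np.random.rand(crossfade_frame).astype(np.float32)           # tail of previous block
audio = np.random.rand(crossfade_frame + sola_search_frame + 4096).astype(np.float32)

# Cross-correlate the head of the new block with the previous tail, normalise by local
# energy, and take the shift that aligns the two waveforms best.
head = audio[: crossfade_frame + sola_search_frame]
cor_nom = np.convolve(head, np.flip(sola_buffer), "valid")
cor_den = np.sqrt(np.convolve(head ** 2, np.ones(crossfade_frame), "valid") + 1e-3)
sola_offset = int(np.argmax(cor_nom / cor_den))

# The new block is shifted by sola_offset and then crossfaded into the previous tail.
audio = audio[sola_offset:]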