Improve inference quality and speed

nadare 2023-07-01 16:45:25 +09:00
parent 92cd384486
commit 5a5f7feefd
23 changed files with 141 additions and 92 deletions

View File

@ -23,7 +23,7 @@ else:
from voice_changer.RVC.RVCSettings import RVCSettings
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
@ -46,6 +46,8 @@ class RVC(VoiceChangerModel):
self.pipeline: Pipeline | None = None
self.audio_buffer: AudioInOut | None = None
self.pitchf_buffer: PitchfInOut | None = None
self.feature_buffer: FeatureInOut | None = None
self.prevVol = 0.0
self.slotInfo = slotInfo
self.initialize()
@ -99,11 +101,18 @@ class RVC(VoiceChangerModel):
):
newData = newData.astype(np.float32) / 32768.0  # Input arrives at the RVC model's sampling rate. extraDataLength, Crossfade, etc. are processed at the same SR (★1)
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
if self.audio_buffer is not None:
# Concatenate with the previous data
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
if self.slotInfo.f0:
self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
else:
self.audio_buffer = newData
if self.slotInfo.f0:
self.pitchf_buffer = np.zeros(new_feature_length)
self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
@ -114,36 +123,43 @@ class RVC(VoiceChangerModel):
# If the buffer has not filled up yet, pad with zeros
if self.audio_buffer.shape[0] < convertSize:
self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
if self.slotInfo.f0:
self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])
convertOffset = -1 * convertSize
featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted
if self.slotInfo.f0:
self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
self.feature_buffer = self.feature_buffer[featureOffset:]
# Crop out only the output portion and check its volume. (TODO: make the muting gradual)
cropOffset = -1 * (inputSize + crossfadeSize)
cropEnd = -1 * (crossfadeSize)
crop = self.audio_buffer[cropOffset:cropEnd]
vol = np.sqrt(np.square(crop).mean())
vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol
return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize)
def inference(self, data):
audio = data[0]
pitchf = data[1]
feature = data[2]
convertSize = data[3]
vol = data[4]
outSize = data[5]
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)
if self.pipeline is not None:
device = self.pipeline.device
else:
device = torch.device("cpu")
audio_buffer = torch.from_numpy(self.audio_buffer).to(device=device, dtype=torch.float32)
# Crop out only the output portion and check its volume. (TODO: make the muting gradual)
cropOffset = -1 * (inputSize + crossfadeSize)
cropEnd = -1 * (crossfadeSize)
crop = audio_buffer[cropOffset:cropEnd]
vol = torch.sqrt(torch.square(crop).mean()).detach().cpu().numpy()
vol = max(vol, self.prevVol * 0.0)
self.prevVol = vol
return (audio_buffer, convertSize, vol, outSize)
def inference(self, data):
audio = data[0]
convertSize = data[1]
vol = data[2]
outSize = data[3]
if vol < self.settings.silentThreshold:
return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol)
audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32)
audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99)
repeat = 1 if self.settings.rvcQuality else 0
sid = 0
@ -156,13 +172,15 @@ class RVC(VoiceChangerModel):
useFinalProj = self.slotInfo.useFinalProj
try:
audio_out = self.pipeline.exec(
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
sid,
audio,
pitchf,
feature,
f0_up_key,
index_rate,
if_f0,
self.settings.extraConvertSize / self.slotInfo.samplingRate, # extraDataSize in seconds. Processed at the RVC model's sampling rate (★1).
self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # extraDataSize in seconds. Processed at the RVC model's sampling rate (★1).
embOutputLayer,
useFinalProj,
repeat,
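
The generate_input changes above keep three rolling buffers (audio, pitch, feature) aligned: each new chunk is appended together with zero placeholders for its not-yet-computed pitch/feature frames, missing history is zero-padded on the left, and only the convertSize tail (and the matching number of 10 ms frames) is kept. A minimal standalone sketch of that bookkeeping, assuming NumPy buffers and the 100 frames-per-second feature rate used in the hunk; the helper name and argument list are illustrative, not the project's API.

import numpy as np

FEATURE_RATE = 100  # feature/pitch frames per second (10 ms hop), as in the hunk above

def roll_buffers(audio_buf, pitchf_buf, feature_buf, new_audio,
                 sampling_rate, emb_channels, convert_size):
    """Illustrative rolling-buffer update mirroring generate_input."""
    new_frames = new_audio.shape[0] * FEATURE_RATE // sampling_rate
    if audio_buf is None:
        audio_buf = new_audio
        pitchf_buf = np.zeros(new_frames)
        feature_buf = np.zeros([new_frames, emb_channels])
    else:
        audio_buf = np.concatenate([audio_buf, new_audio])
        # zero placeholders: pitch/features for the new frames are filled in later
        pitchf_buf = np.concatenate([pitchf_buf, np.zeros(new_frames)])
        feature_buf = np.concatenate([feature_buf, np.zeros([new_frames, emb_channels])])
    feature_size = convert_size * FEATURE_RATE // sampling_rate
    if audio_buf.shape[0] < convert_size:
        # not enough history yet: pad with silence on the left
        audio_buf = np.concatenate([np.zeros(convert_size), audio_buf])
        pitchf_buf = np.concatenate([np.zeros(feature_size), pitchf_buf])
        feature_buf = np.concatenate([np.zeros([feature_size, emb_channels]), feature_buf])
    # keep only the tail that will actually be converted
    return (audio_buf[-convert_size:],
            pitchf_buf[-feature_size:],
            feature_buf[-feature_size:])

The RMS volume gate in the hunk is then computed on the crossfade region of the trimmed audio buffer, on the inference device in the new code path.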

View File

@ -35,6 +35,7 @@ class OnnxRVCInferencer(Inferencer):
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
if pitch is None or pitchf is None:
raise RuntimeError("[Voice Changer] Pitch or Pitchf is not found.")
@ -50,7 +51,7 @@ class OnnxRVCInferencer(Inferencer):
"p_len": pitch_length.cpu().numpy().astype(np.int64),
"pitch": pitch.cpu().numpy().astype(np.int64),
"pitchf": pitchf.cpu().numpy().astype(np.float32),
"sid": sid.cpu().numpy().astype(np.int64),
"sid": sid.cpu().numpy().astype(np.int64)
},
)
else:
@ -61,7 +62,7 @@ class OnnxRVCInferencer(Inferencer):
"p_len": pitch_length.cpu().numpy().astype(np.int64),
"pitch": pitch.cpu().numpy().astype(np.int64),
"pitchf": pitchf.cpu().numpy().astype(np.float32),
"sid": sid.cpu().numpy().astype(np.int64),
"sid": sid.cpu().numpy().astype(np.int64)
},
)

View File

@ -4,7 +4,6 @@ from const import EnumInferenceTypes
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
class OnnxRVCInferencerNono(OnnxRVCInferencer):
def loadModel(self, file: str, gpu: int):
super().loadModel(file, gpu)
@ -18,6 +17,7 @@ class OnnxRVCInferencerNono(OnnxRVCInferencer):
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
if self.isHalf:
audio1 = self.model.run(

View File

@ -33,5 +33,6 @@ class RVCInferencer(Inferencer):
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)

View File

@ -33,5 +33,6 @@ class RVCInferencerNono(Inferencer):
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, sid)
return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)

View File

@ -3,7 +3,6 @@ from const import EnumInferenceTypes
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from .rvc_models.infer_pack.models import SynthesizerTrnMs768NSFsid
from typing import Optional
class RVCInferencerv2(Inferencer):
@ -33,6 +32,6 @@ class RVCInferencerv2(Inferencer):
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
out_length: Optional[int] = None,
convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=out_length)
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)

View File

@ -33,5 +33,6 @@ class RVCInferencerv2Nono(Inferencer):
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, sid)
return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)

View File

@ -35,5 +35,6 @@ class VoRASInferencer(Inferencer):
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid)

View File

@ -33,5 +33,6 @@ class WebUIInferencer(Inferencer):
pitch: torch.Tensor,
pitchf: torch.Tensor,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, pitch, pitchf, sid)
return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length)

View File

@ -33,5 +33,6 @@ class WebUIInferencerNono(Inferencer):
pitch: torch.Tensor | None,
pitchf: torch.Tensor | None,
sid: torch.Tensor,
convert_length: int | None,
) -> torch.Tensor:
return self.model.infer(feats, pitch_length, sid)
return self.model.infer(feats, pitch_length, sid, convert_length=convert_length)
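
Each Inferencer above gains a convert_length parameter in the same position and, where the underlying model supports it, forwards it as convert_length= to model.infer. A minimal sketch of a call site using that shared signature; the argument order is inferred from these hunks and the helper is illustrative.

import torch

def run_infer(inferencer, feats, pitch, pitchf, sid, convert_length=None):
    """Illustrative caller for the common Inferencer.infer signature."""
    pitch_length = torch.tensor([feats.shape[1]], device=feats.device).long()
    # pitch/pitchf are None for the Nono (no-f0) variants;
    # convert_length may be None, in which case the model chooses the output length
    return inferencer.infer(feats, pitch_length, pitch, pitchf, sid, convert_length)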

View File

@ -138,6 +138,7 @@ class SynthesizerTrnMsNSFsid(nn.Module):
return o, x_mask, (z, z_p, m_p, logs_p)
class SynthesizerTrnMsNSFsidNono(nn.Module):
def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, emb_channels, sr=None, **kwargs):
super().__init__()
@ -208,10 +209,10 @@ class SynthesizerTrnMsNSFsidNono(nn.Module):
o = self.dec(z_slice, g=g)
return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def infer(self, phone, phone_lengths, sid, max_len=None, out_length=None):
def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=out_length)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -203,6 +203,7 @@ class Generator(torch.nn.Module):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.upsample_rates = upsample_rates
self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
resblock = ResBlock1 if resblock == "1" else ResBlock2
@ -245,7 +246,7 @@ class Generator(torch.nn.Module):
# conv2
self.ups_size[i] += (k - 1)//2
# conv1
self.ups_size[i] += d * (k - 1)//2
self.ups_size[i] += d[-1] * (k - 1)//2
# upsampling
self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2
if i:
@ -297,7 +298,7 @@ class Generator(torch.nn.Module):
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
out = torch.zeros([x.shape[0], 1, x.shape[0] * np.prod(self.upsample_rates)], device=x.device, dtype=x.dtype)
out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype)
out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:]
return out
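
The Generator change above replaces an output size computed from x.shape[0] (the batch dimension) with an explicit out_length and then copies the generated waveform into the tail of that fixed-size buffer, so callers always get back exactly out_length samples. A minimal sketch of the right-aligned copy on its own, outside the module; the function name is illustrative.

import torch

def right_align(x: torch.Tensor, out_length: int) -> torch.Tensor:
    """Place the most recent samples of x (batch, 1, T) at the end of a
    zero buffer of length out_length, as the Generator does above."""
    out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype)
    out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:]
    return out

If x is shorter than out_length the front of the buffer stays silent; if it is longer, only the last out_length samples are kept.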

View File

@ -58,10 +58,12 @@ class SynthesizerTrnMs256NSFsid_ONNX(nn.Module):
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -57,10 +57,10 @@ class SynthesizerTrnMs256NSFsid_nono_ONNX(nn.Module):
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def forward(self, phone, phone_lengths, sid, max_len=None):
def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], g=g)
return o, x_mask, (z, z_p, m_p, logs_p)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -59,10 +59,10 @@ class SynthesizerTrnMs768NSFsid_ONNX(nn.Module):
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -81,10 +81,10 @@ class SynthesizerTrnMs768NSFsid_nono_ONNX(nn.Module):
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def forward(self, phone, phone_lengths, sid, max_len=None):
def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -60,10 +60,10 @@ class SynthesizerTrnMsNSFsidNono_webui_ONNX(nn.Module):
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def forward(self, phone, phone_lengths, sid, max_len=None):
def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -61,10 +61,11 @@ class SynthesizerTrnMsNSFsid_webui_ONNX(nn.Module):
self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim)
def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None):
def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None):
g = self.emb_g(sid).unsqueeze(-1)
m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
z = self.flow(z_p, x_mask, g=g, reverse=True)
o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length)
return o, x_mask, (z, z_p, m_p, logs_p)

View File

@ -13,6 +13,9 @@ from Exceptions import (
from voice_changer.RVC.embedder.Embedder import Embedder
from voice_changer.RVC.inferencer.Inferencer import Inferencer
from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer
from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
@ -70,7 +73,9 @@ class Pipeline(object):
def exec(
self,
sid,
audio,
audio, # torch.tensor [n]
pitchf, # np.array [m]
feature, # np.array [m, feat]
f0_up_key,
index_rate,
if_f0,
@ -98,13 +103,14 @@ class Pipeline(object):
# When RVC Quality is on, turn silence_front off.
silence_front = silence_front if repeat == 0 else 0
pitchf = pitchf if repeat == 0 else torch.zeros([pitchf.shape[0], pitchf.shape[1] * 2])
# Pitch detection
pitch, pitchf = None, None
try:
if if_f0 == 1:
pitch, pitchf = self.pitchExtractor.extract(
audio_pad,
pitchf,
f0_up_key,
self.sr,
self.window,
@ -114,6 +120,9 @@ class Pipeline(object):
pitchf = pitchf[:p_len]
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
else:
pitch = None
pitchf = None
except IndexError:
# print(e)
raise NotEnoughDataExtimateF0()
@ -165,9 +174,8 @@ class Pipeline(object):
npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
# recover silent front
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]).astype("float32"), npy])
npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:]
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
if protect < 0.5 and search_index:
feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
@ -192,14 +200,21 @@ class Pipeline(object):
feats = feats.to(feats0.dtype)
p_len = torch.tensor([p_len], device=self.device).long()
feats_buffer = feats.squeeze(0).detach().cpu()
if pitchf is not None:
pitchf_buffer = pitchf.squeeze(0).detach().cpu()
else:
pitchf_buffer = None
# apply silent front for inference
npyOffset = math.floor(silence_front * 16000) // 360
feats = feats[:, npyOffset * 2 :, :]
feats_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]:
npyOffset = math.floor(silence_front * 16000) // 360
feats = feats[:, npyOffset * 2 :, :]
feats_len = feats.shape[1]
if pitch is not None and pitchf is not None:
pitch = pitch[:, -feats_len:]
pitchf = pitchf[:, -feats_len:]
p_len = torch.tensor([feats_len], device=self.device).long()
# Run inference
try:
@ -220,7 +235,7 @@ class Pipeline(object):
else:
raise e
del feats, p_len, padding_mask
del p_len, padding_mask, pitch, pitchf, feats
torch.cuda.empty_cache()
# infer outputs audio at the model's sampling rate.
@ -230,6 +245,6 @@ class Pipeline(object):
end = -1 * self.t_pad_tgt
audio1 = audio1[offset:end]
del pitch, pitchf, sid
del sid
torch.cuda.empty_cache()
return audio1
return audio1, pitchf_buffer, feats_buffer
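
Pipeline.exec now takes the pitch and feature buffers from the previous call and returns updated copies alongside the audio, so the caller can carry cached pitch/features across chunks instead of recomputing them for the overlapping region. A minimal caller-side sketch of that round-trip, assuming the (audio, pitchf, feature) return tuple shown above; the wrapper class and the trailing *args stand in for the remaining exec arguments and are not the real signature.

class PipelineCache:
    """Illustrative holder for the buffers Pipeline.exec now returns."""

    def __init__(self, pitchf, feature):
        # In RVC.py these start as zero arrays built in generate_input.
        self.pitchf = pitchf    # np.ndarray [m], one value per 10 ms frame
        self.feature = feature  # np.ndarray [m, feat], embedder features

    def exec(self, pipeline, sid, audio, *args, **kwargs):
        # Pass the cached buffers in; keep the updated ones that come back.
        audio_out, self.pitchf, self.feature = pipeline.exec(
            sid, audio, self.pitchf, self.feature, *args, **kwargs)
        return audio_out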

View File

@ -16,7 +16,7 @@ class CrepePitchExtractor(PitchExtractor):
else:
self.device = torch.device("cpu")
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
real_silence_front = start_frame * window / sr
@ -52,11 +52,12 @@ class CrepePitchExtractor(PitchExtractor):
)
f0 *= pow(2, f0_up_key / 12)
f0bak = f0.detach().cpu().numpy()
f0_mel = 1127.0 * torch.log(1.0 + f0 / 700.0)
f0_mel = torch.clip(
pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
f0_mel = np.clip(
(f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
)
f0_coarse = f0_mel.round().detach().cpu().numpy().astype(int)
pitch_coarse = f0_mel.astype(int)
return f0_coarse, f0bak
return pitch_coarse, pitchf
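
The extractor changes above write the newly estimated f0 into the tail of the persistent pitchf buffer (frames already covered by silence_front keep their previous values) and then quantize the whole buffer to a 1-255 coarse mel scale. A minimal NumPy sketch of those two steps using the same formula; the helper name and the explicit f0_mel_min/f0_mel_max parameters are illustrative.

import numpy as np

def update_pitchf(pitchf: np.ndarray, f0: np.ndarray,
                  f0_mel_min: float, f0_mel_max: float):
    """Write new f0 into the tail of pitchf, then build the coarse pitch."""
    # overwrite only the frames re-estimated in this chunk
    pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
    f0bak = pitchf.copy()
    # map Hz to a 1..255 coarse index on a mel-like scale
    f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
    f0_mel = np.clip((f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0,
                     1.0, 255.0)
    pitch_coarse = f0_mel.astype(int)
    return pitch_coarse, f0bak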

View File

@ -8,7 +8,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class DioPitchExtractor(PitchExtractor):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.dio
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
@ -34,13 +34,13 @@ class DioPitchExtractor(PitchExtractor):
f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))
f0 *= pow(2, f0_up_key / 12)
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
f0_mel = np.clip(
(f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
)
pitch_coarse = f0_mel.astype(int)
return pitch_coarse, pitchf
return f0_coarse, f0bak

View File

@ -9,7 +9,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
class HarvestPitchExtractor(PitchExtractor):
pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
audio = audio.detach().cpu().numpy()
n_frames = int(len(audio) // window) + 1
start_frame = int(silence_front * sr / window)
@ -35,13 +35,14 @@ class HarvestPitchExtractor(PitchExtractor):
f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))
f0 *= pow(2, f0_up_key / 12)
f0bak = f0.copy()
f0_mel = 1127 * np.log(1 + f0 / 700)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
f0_mel_max - f0_mel_min
) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)
pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]]
f0bak = pitchf.copy()
f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0)
f0_mel = np.clip(
(f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0
)
pitch_coarse = f0_mel.astype(int)
return pitch_coarse, pitchf
return f0_coarse, f0bak
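
All three extractors frame silence_front the same way: the number of hops covered by the leading silence is skipped during estimation, and the result is padded back out to n_frames so its indices still line up with the pitchf buffer. A minimal sketch of that frame bookkeeping, assuming window is the hop size in samples; the helper names are illustrative.

import numpy as np

def silence_front_frames(audio_len: int, sr: int, window: int, silence_front: float):
    """Frame bookkeeping shared by the extractors (window = hop size in samples)."""
    n_frames = int(audio_len // window) + 1
    start_frame = int(silence_front * sr / window)   # whole hops skipped at the front
    real_silence_front = start_frame * window / sr   # the skipped span in seconds
    return n_frames, start_frame, real_silence_front

def pad_to_frames(f0: np.ndarray, n_frames: int, start_frame: int) -> np.ndarray:
    """Pad the estimate to n_frames so indices line up with the full buffer."""
    return np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame))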

View File

@ -5,6 +5,9 @@ from voice_changer.utils.LoadModelParams import LoadModelParams
AudioInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
PitchfInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
class VoiceChangerModel(Protocol):