From 5a5f7feefd0f62ef94166e6bd00ddab3a7b655b0 Mon Sep 17 00:00:00 2001 From: nadare <1na2da0re3@gmail.com> Date: Sat, 1 Jul 2023 16:45:25 +0900 Subject: [PATCH] =?UTF-8?q?inference=E3=81=AE=E9=AB=98=E5=93=81=E8=B3=AA?= =?UTF-8?q?=E5=8C=96+=E9=AB=98=E9=80=9F=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/voice_changer/RVC/RVC.py | 68 ++++++++++++------- .../RVC/inferencer/OnnxRVCInferencer.py | 5 +- .../RVC/inferencer/OnnxRVCInferencerNono.py | 2 +- .../RVC/inferencer/RVCInferencer.py | 3 +- .../RVC/inferencer/RVCInferencerNono.py | 3 +- .../RVC/inferencer/RVCInferencerv2.py | 5 +- .../RVC/inferencer/RVCInferencerv2Nono.py | 3 +- .../RVC/inferencer/VorasInferencebeta.py | 1 + .../RVC/inferencer/WebUIInferencer.py | 3 +- .../RVC/inferencer/WebUIInferencerNono.py | 3 +- server/voice_changer/RVC/inferencer/models.py | 5 +- .../rvc_models/infer_pack/models.py | 5 +- .../SynthesizerTrnMs256NSFsid_ONNX.py | 6 +- .../SynthesizerTrnMs256NSFsid_nono_ONNX.py | 6 +- .../SynthesizerTrnMs768NSFsid_ONNX.py | 4 +- .../SynthesizerTrnMs768NSFsid_nono_ONNX.py | 4 +- .../SynthesizerTrnMsNSFsidNono_webui_ONNX.py | 4 +- .../SynthesizerTrnMsNSFsid_webui_ONNX.py | 5 +- server/voice_changer/RVC/pipeline/Pipeline.py | 43 ++++++++---- .../RVC/pitchExtractor/CrepePitchExtractor.py | 13 ++-- .../RVC/pitchExtractor/DioPitchExtractor.py | 20 +++--- .../pitchExtractor/HarvestPitchExtractor.py | 19 +++--- .../voice_changer/utils/VoiceChangerModel.py | 3 + 23 files changed, 141 insertions(+), 92 deletions(-) diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index f520e26f..c46bae29 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -23,7 +23,7 @@ else: from voice_changer.RVC.RVCSettings import RVCSettings from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager -from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel +from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel from voice_changer.utils.VoiceChangerParams import VoiceChangerParams from voice_changer.RVC.onnxExporter.export2onnx import export2onnx from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager @@ -46,6 +46,8 @@ class RVC(VoiceChangerModel): self.pipeline: Pipeline | None = None self.audio_buffer: AudioInOut | None = None + self.pitchf_buffer: PitchfInOut | None = None + self.feature_buffer: FeatureInOut | None = None self.prevVol = 0.0 self.slotInfo = slotInfo self.initialize() @@ -99,11 +101,18 @@ class RVC(VoiceChangerModel): ): newData = newData.astype(np.float32) / 32768.0 # RVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1) + new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate if self.audio_buffer is not None: # 過去のデータに連結 self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) + if self.slotInfo.f0: + self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0) + self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0) else: self.audio_buffer = newData + if self.slotInfo.f0: + self.pitchf_buffer = np.zeros(new_feature_length) + self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels]) convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize @@ -114,36 +123,43 @@ class RVC(VoiceChangerModel): # 
バッファがたまっていない場合はzeroで補う if self.audio_buffer.shape[0] < convertSize: self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer]) + if self.slotInfo.f0: + self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer]) + self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer]) convertOffset = -1 * convertSize + featureOffset = -convertSize * 100 // self.slotInfo.samplingRate self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出 + if self.slotInfo.f0: + self.pitchf_buffer = self.pitchf_buffer[featureOffset:] + self.feature_buffer = self.feature_buffer[featureOffset:] + + # 出力部分だけ切り出して音量を確認。(TODO:段階的消音にする) + cropOffset = -1 * (inputSize + crossfadeSize) + cropEnd = -1 * (crossfadeSize) + crop = self.audio_buffer[cropOffset:cropEnd] + vol = np.sqrt(np.square(crop).mean()) + vol = max(vol, self.prevVol * 0.0) + self.prevVol = vol + + return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize) + + def inference(self, data): + audio = data[0] + pitchf = data[1] + feature = data[2] + convertSize = data[3] + vol = data[4] + outSize = data[5] + + if vol < self.settings.silentThreshold: + return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol) if self.pipeline is not None: device = self.pipeline.device else: device = torch.device("cpu") - - audio_buffer = torch.from_numpy(self.audio_buffer).to(device=device, dtype=torch.float32) - - # 出力部分だけ切り出して音量を確認。(TODO:段階的消音にする) - cropOffset = -1 * (inputSize + crossfadeSize) - cropEnd = -1 * (crossfadeSize) - crop = audio_buffer[cropOffset:cropEnd] - vol = torch.sqrt(torch.square(crop).mean()).detach().cpu().numpy() - vol = max(vol, self.prevVol * 0.0) - self.prevVol = vol - - return (audio_buffer, convertSize, vol, outSize) - - def inference(self, data): - audio = data[0] - convertSize = data[1] - vol = data[2] - outSize = data[3] - - if vol < self.settings.silentThreshold: - return np.zeros(convertSize).astype(np.int16) * np.sqrt(vol) - + audio = torch.from_numpy(audio).to(device=device, dtype=torch.float32) audio = torchaudio.functional.resample(audio, self.slotInfo.samplingRate, 16000, rolloff=0.99) repeat = 1 if self.settings.rvcQuality else 0 sid = 0 @@ -156,13 +172,15 @@ class RVC(VoiceChangerModel): useFinalProj = self.slotInfo.useFinalProj try: - audio_out = self.pipeline.exec( + audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec( sid, audio, + pitchf, + feature, f0_up_key, index_rate, if_f0, - self.settings.extraConvertSize / self.slotInfo.samplingRate, # extaraDataSizeの秒数。RVCのモデルのサンプリングレートで処理(★1)。 + self.settings.extraConvertSize / self.slotInfo.samplingRate if self.settings.silenceFront else 0., # extaraDataSizeの秒数。RVCのモデルのサンプリングレートで処理(★1)。 embOutputLayer, useFinalProj, repeat, diff --git a/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py b/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py index e9bfeec7..9d1c62a1 100644 --- a/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py +++ b/server/voice_changer/RVC/inferencer/OnnxRVCInferencer.py @@ -35,6 +35,7 @@ class OnnxRVCInferencer(Inferencer): pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: if pitch is None or pitchf is None: raise RuntimeError("[Voice Changer] Pitch or Pitchf is not found.") @@ -50,7 +51,7 @@ class OnnxRVCInferencer(Inferencer): "p_len": 
pitch_length.cpu().numpy().astype(np.int64), "pitch": pitch.cpu().numpy().astype(np.int64), "pitchf": pitchf.cpu().numpy().astype(np.float32), - "sid": sid.cpu().numpy().astype(np.int64), + "sid": sid.cpu().numpy().astype(np.int64) }, ) else: @@ -61,7 +62,7 @@ class OnnxRVCInferencer(Inferencer): "p_len": pitch_length.cpu().numpy().astype(np.int64), "pitch": pitch.cpu().numpy().astype(np.int64), "pitchf": pitchf.cpu().numpy().astype(np.float32), - "sid": sid.cpu().numpy().astype(np.int64), + "sid": sid.cpu().numpy().astype(np.int64) }, ) diff --git a/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py b/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py index 7c6f8136..d85d4292 100644 --- a/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py +++ b/server/voice_changer/RVC/inferencer/OnnxRVCInferencerNono.py @@ -4,7 +4,6 @@ from const import EnumInferenceTypes from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer - class OnnxRVCInferencerNono(OnnxRVCInferencer): def loadModel(self, file: str, gpu: int): super().loadModel(file, gpu) @@ -18,6 +17,7 @@ class OnnxRVCInferencerNono(OnnxRVCInferencer): pitch: torch.Tensor | None, pitchf: torch.Tensor | None, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: if self.isHalf: audio1 = self.model.run( diff --git a/server/voice_changer/RVC/inferencer/RVCInferencer.py b/server/voice_changer/RVC/inferencer/RVCInferencer.py index fb6368fe..9539d77c 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencer.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencer.py @@ -33,5 +33,6 @@ class RVCInferencer(Inferencer): pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, pitch, pitchf, sid) + return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py index a796fcbd..30355b9a 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerNono.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencerNono.py @@ -33,5 +33,6 @@ class RVCInferencerNono(Inferencer): pitch: torch.Tensor | None, pitchf: torch.Tensor | None, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, sid) + return self.model.infer(feats, pitch_length, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/RVCInferencerv2.py b/server/voice_changer/RVC/inferencer/RVCInferencerv2.py index d9fa7e77..31fbe484 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerv2.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencerv2.py @@ -3,7 +3,6 @@ from const import EnumInferenceTypes from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager from voice_changer.RVC.inferencer.Inferencer import Inferencer from .rvc_models.infer_pack.models import SynthesizerTrnMs768NSFsid -from typing import Optional class RVCInferencerv2(Inferencer): @@ -33,6 +32,6 @@ class RVCInferencerv2(Inferencer): pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, - out_length: Optional[int] = None, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=out_length) + return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length) diff --git 
a/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py b/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py index 9aac3260..7b85dc96 100644 --- a/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py +++ b/server/voice_changer/RVC/inferencer/RVCInferencerv2Nono.py @@ -33,5 +33,6 @@ class RVCInferencerv2Nono(Inferencer): pitch: torch.Tensor | None, pitchf: torch.Tensor | None, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, sid) + return self.model.infer(feats, pitch_length, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/VorasInferencebeta.py b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py index a5b02f40..e7b77cce 100644 --- a/server/voice_changer/RVC/inferencer/VorasInferencebeta.py +++ b/server/voice_changer/RVC/inferencer/VorasInferencebeta.py @@ -35,5 +35,6 @@ class VoRASInferencer(Inferencer): pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: return self.model.infer(feats, pitch_length, pitch, pitchf, sid) diff --git a/server/voice_changer/RVC/inferencer/WebUIInferencer.py b/server/voice_changer/RVC/inferencer/WebUIInferencer.py index eb3d442a..d884f4c7 100644 --- a/server/voice_changer/RVC/inferencer/WebUIInferencer.py +++ b/server/voice_changer/RVC/inferencer/WebUIInferencer.py @@ -33,5 +33,6 @@ class WebUIInferencer(Inferencer): pitch: torch.Tensor, pitchf: torch.Tensor, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, pitch, pitchf, sid) + return self.model.infer(feats, pitch_length, pitch, pitchf, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py b/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py index f6448443..7bc54f14 100644 --- a/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py +++ b/server/voice_changer/RVC/inferencer/WebUIInferencerNono.py @@ -33,5 +33,6 @@ class WebUIInferencerNono(Inferencer): pitch: torch.Tensor | None, pitchf: torch.Tensor | None, sid: torch.Tensor, + convert_length: int | None, ) -> torch.Tensor: - return self.model.infer(feats, pitch_length, sid) + return self.model.infer(feats, pitch_length, sid, convert_length=convert_length) diff --git a/server/voice_changer/RVC/inferencer/models.py b/server/voice_changer/RVC/inferencer/models.py index 7b80bb74..ac14a671 100644 --- a/server/voice_changer/RVC/inferencer/models.py +++ b/server/voice_changer/RVC/inferencer/models.py @@ -138,6 +138,7 @@ class SynthesizerTrnMsNSFsid(nn.Module): return o, x_mask, (z, z_p, m_p, logs_p) + class SynthesizerTrnMsNSFsidNono(nn.Module): def __init__(self, spec_channels, segment_size, inter_channels, hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, spk_embed_dim, gin_channels, emb_channels, sr=None, **kwargs): super().__init__() @@ -208,10 +209,10 @@ class SynthesizerTrnMsNSFsidNono(nn.Module): o = self.dec(z_slice, g=g) return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) - def infer(self, phone, phone_lengths, sid, max_len=None, out_length=None): + def infer(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 
0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=out_length) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py index fc7814d3..abd2dbb8 100644 --- a/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py +++ b/server/voice_changer/RVC/inferencer/rvc_models/infer_pack/models.py @@ -203,6 +203,7 @@ class Generator(torch.nn.Module): super(Generator, self).__init__() self.num_kernels = len(resblock_kernel_sizes) self.num_upsamples = len(upsample_rates) + self.upsample_rates = upsample_rates self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3) resblock = ResBlock1 if resblock == "1" else ResBlock2 @@ -245,7 +246,7 @@ class Generator(torch.nn.Module): # conv2 self.ups_size[i] += (k - 1)//2 # conv1 - self.ups_size[i] += d * (k - 1)//2 + self.ups_size[i] += d[-1] * (k - 1)//2 # upsampling self.ups_size[i] = -(-self.ups_size[i] // upsample_rates[i]) + (upsample_kernel_sizes[i] - upsample_rates[i]) // 2 if i: @@ -297,7 +298,7 @@ class Generator(torch.nn.Module): x = F.leaky_relu(x) x = self.conv_post(x) x = torch.tanh(x) - out = torch.zeros([x.shape[0], 1, x.shape[0] * np.prod(self.upsample_rates)], device=x.device, dtype=x.dtype) + out = torch.zeros([x.shape[0], 1, out_length], device=x.device, dtype=x.dtype) out[:, :, -x.shape[2]:] = x[:, :, -out.shape[2]:] return out diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_ONNX.py index c3c6121f..827a955c 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_ONNX.py @@ -58,10 +58,12 @@ class SynthesizerTrnMs256NSFsid_ONNX(nn.Module): self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) + + diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_nono_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_nono_ONNX.py index d200478f..db216d45 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_nono_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs256NSFsid_nono_ONNX.py @@ -57,10 +57,10 @@ class SynthesizerTrnMs256NSFsid_nono_ONNX(nn.Module): self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, sid, max_len=None): + def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = 
self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) - return o, x_mask, (z, z_p, m_p, logs_p) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) + return o, x_mask, (z, z_p, m_p, logs_p) \ No newline at end of file diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py index ca1d3096..a7193feb 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_ONNX.py @@ -59,10 +59,10 @@ class SynthesizerTrnMs768NSFsid_ONNX(nn.Module): print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_nono_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_nono_ONNX.py index 1971c3a3..d6d8365e 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_nono_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMs768NSFsid_nono_ONNX.py @@ -81,10 +81,10 @@ class SynthesizerTrnMs768NSFsid_nono_ONNX(nn.Module): print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, sid, max_len=None): + def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsidNono_webui_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsidNono_webui_ONNX.py index 2be4d28b..36ad2ebe 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsidNono_webui_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsidNono_webui_ONNX.py @@ -60,10 +60,10 @@ class SynthesizerTrnMsNSFsidNono_webui_ONNX(nn.Module): self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, sid, max_len=None): + def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, 
x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) diff --git a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsid_webui_ONNX.py b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsid_webui_ONNX.py index 61b71901..421f3ddd 100644 --- a/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsid_webui_ONNX.py +++ b/server/voice_changer/RVC/onnxExporter/SynthesizerTrnMsNSFsid_webui_ONNX.py @@ -61,10 +61,11 @@ class SynthesizerTrnMsNSFsid_webui_ONNX(nn.Module): self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) - def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + def forward(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None, convert_length=None): g = self.emb_g(sid).unsqueeze(-1) m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask z = self.flow(z_p, x_mask, g=g, reverse=True) - o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], nsff0, g=g, convert_length=convert_length) return o, x_mask, (z, z_p, m_p, logs_p) + diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py index 6bf78915..4c321f26 100644 --- a/server/voice_changer/RVC/pipeline/Pipeline.py +++ b/server/voice_changer/RVC/pipeline/Pipeline.py @@ -13,6 +13,9 @@ from Exceptions import ( from voice_changer.RVC.embedder.Embedder import Embedder from voice_changer.RVC.inferencer.Inferencer import Inferencer +from voice_changer.RVC.inferencer.OnnxRVCInferencer import OnnxRVCInferencer +from voice_changer.RVC.inferencer.OnnxRVCInferencerNono import OnnxRVCInferencerNono + from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor @@ -70,7 +73,9 @@ class Pipeline(object): def exec( self, sid, - audio, + audio, # torch.tensor [n] + pitchf, # np.array [m] + feature, # np.array [m, feat] f0_up_key, index_rate, if_f0, @@ -98,13 +103,14 @@ class Pipeline(object): # RVC QualityがOnのときにはsilence_frontをオフに。 silence_front = silence_front if repeat == 0 else 0 + pitchf = pitchf if repeat == 0 else torch.zeros([pitchf.shape[0], pitchf.shape[1] * 2]) # ピッチ検出 - pitch, pitchf = None, None try: if if_f0 == 1: pitch, pitchf = self.pitchExtractor.extract( audio_pad, + pitchf, f0_up_key, self.sr, self.window, @@ -114,6 +120,9 @@ class Pipeline(object): pitchf = pitchf[:p_len] pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0) + else: + pitch = None + pitchf = None except IndexError: # print(e) raise NotEnoughDataExtimateF0() @@ -165,9 +174,8 @@ class Pipeline(object): npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) # recover silient font - npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]]).astype("float32"), npy]) + npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:] feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) if protect < 0.5 and search_index: feats0 = F.interpolate(feats0.permute(0, 2, 1), 
scale_factor=2).permute(0, 2, 1) @@ -192,14 +200,21 @@ class Pipeline(object): feats = feats.to(feats0.dtype) p_len = torch.tensor([p_len], device=self.device).long() + feats_buffer = feats.squeeze(0).detach().cpu() + if pitchf is not None: + pitchf_buffer = pitchf.squeeze(0).detach().cpu() + else: + pitchf_buffer = None # apply silent front for inference - npyOffset = math.floor(silence_front * 16000) // 360 - feats = feats[:, npyOffset * 2 :, :] - feats_len = feats.shape[1] - if pitch is not None and pitchf is not None: - pitch = pitch[:, -feats_len:] - pitchf = pitchf[:, -feats_len:] - p_len = torch.tensor([feats_len], device=self.device).long() + if type(self.inferencer) in [OnnxRVCInferencer, OnnxRVCInferencerNono]: + npyOffset = math.floor(silence_front * 16000) // 360 + feats = feats[:, npyOffset * 2 :, :] + feats_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, -feats_len:] + pitchf = pitchf[:, -feats_len:] + p_len = torch.tensor([feats_len], device=self.device).long() + # 推論実行 try: @@ -220,7 +235,7 @@ class Pipeline(object): else: raise e - del feats, p_len, padding_mask + del p_len, padding_mask, pitch, pitchf, feats torch.cuda.empty_cache() # inferで出力されるサンプリングレートはモデルのサンプリングレートになる。 @@ -230,6 +245,6 @@ class Pipeline(object): end = -1 * self.t_pad_tgt audio1 = audio1[offset:end] - del pitch, pitchf, sid + del sid torch.cuda.empty_cache() - return audio1 + return audio1, pitchf_buffer, feats_buffer diff --git a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py index 2f510411..39a30e3b 100644 --- a/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py +++ b/server/voice_changer/RVC/pitchExtractor/CrepePitchExtractor.py @@ -16,7 +16,7 @@ class CrepePitchExtractor(PitchExtractor): else: self.device = torch.device("cpu") - def extract(self, audio, f0_up_key, sr, window, silence_front=0): + def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0): n_frames = int(len(audio) // window) + 1 start_frame = int(silence_front * sr / window) real_silence_front = start_frame * window / sr @@ -52,11 +52,12 @@ class CrepePitchExtractor(PitchExtractor): ) f0 *= pow(2, f0_up_key / 12) - f0bak = f0.detach().cpu().numpy() - f0_mel = 1127.0 * torch.log(1.0 + f0 / 700.0) - f0_mel = torch.clip( + pitchf[-f0.shape[0]:] = f0.detach().cpu().numpy()[:pitchf.shape[0]] + f0bak = pitchf.copy() + f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0) + f0_mel = np.clip( (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0 ) - f0_coarse = f0_mel.round().detach().cpu().numpy().astype(int) + pitch_coarse = f0_mel.astype(int) - return f0_coarse, f0bak + return pitch_coarse, pitchf diff --git a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py index 4ef62203..b6a520e3 100644 --- a/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py +++ b/server/voice_changer/RVC/pitchExtractor/DioPitchExtractor.py @@ -8,7 +8,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor class DioPitchExtractor(PitchExtractor): pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.dio - def extract(self, audio, f0_up_key, sr, window, silence_front=0): + def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0): audio = audio.detach().cpu().numpy() n_frames = int(len(audio) // window) + 1 start_frame = int(silence_front * sr / window) @@ -34,13 +34,13 @@ 
class DioPitchExtractor(PitchExtractor): f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) f0 *= pow(2, f0_up_key / 12) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(int) + pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]] + f0bak = pitchf.copy() + f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0) + f0_mel = np.clip( + (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0 + ) + pitch_coarse = f0_mel.astype(int) + + return pitch_coarse, pitchf - return f0_coarse, f0bak diff --git a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py index 9e166b7c..11c27b16 100644 --- a/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py +++ b/server/voice_changer/RVC/pitchExtractor/HarvestPitchExtractor.py @@ -9,7 +9,7 @@ from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor class HarvestPitchExtractor(PitchExtractor): pitchExtractorType: EnumPitchExtractorTypes = EnumPitchExtractorTypes.harvest - def extract(self, audio, f0_up_key, sr, window, silence_front=0): + def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0): audio = audio.detach().cpu().numpy() n_frames = int(len(audio) // window) + 1 start_frame = int(silence_front * sr / window) @@ -35,13 +35,14 @@ class HarvestPitchExtractor(PitchExtractor): f0 = np.pad(f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)) f0 *= pow(2, f0_up_key / 12) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(int) + pitchf[-f0.shape[0]:] = f0[:pitchf.shape[0]] + f0bak = pitchf.copy() + f0_mel = 1127.0 * np.log(1.0 + f0bak / 700.0) + f0_mel = np.clip( + (f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0 + ) + pitch_coarse = f0_mel.astype(int) + + return pitch_coarse, pitchf return f0_coarse, f0bak diff --git a/server/voice_changer/utils/VoiceChangerModel.py b/server/voice_changer/utils/VoiceChangerModel.py index 7dd4fda9..e28690ac 100644 --- a/server/voice_changer/utils/VoiceChangerModel.py +++ b/server/voice_changer/utils/VoiceChangerModel.py @@ -5,6 +5,9 @@ from voice_changer.utils.LoadModelParams import LoadModelParams AudioInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] +PitchfInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] +FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] + class VoiceChangerModel(Protocol):
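
Note on the scheme above: generate_input now keeps pitchf_buffer and feature_buffer alongside audio_buffer, so frames already computed for earlier chunks can be handed back to Pipeline.exec and reused instead of recomputed; that cache, plus the silence_front skip and the unified np.clip coarse-pitch quantization in the Crepe/Dio/Harvest extractors, is where the quality and speed gains come from. Below is a minimal standalone sketch of the cache arithmetic and of the quantization, assuming (as the patch does) 100 feature frames per second of model-rate audio. It is illustrative only: the RollingFeatureCache class, the coarse_pitch helper, and the 50/1100 Hz f0 bounds are assumptions for the example, not code from this patch.

# --- reviewer sketch, not part of the patch ---------------------------------
import numpy as np


class RollingFeatureCache:
    """Stand-in for the audio/pitchf/feature buffers kept by RVC.generate_input."""

    def __init__(self, sampling_rate: int, emb_channels: int):
        self.sampling_rate = sampling_rate
        self.emb_channels = emb_channels
        self.audio = None      # float32 [n_samples]
        self.pitchf = None     # float   [n_frames]
        self.feature = None    # float   [n_frames, emb_channels]

    def push(self, new_audio: np.ndarray, convert_size: int):
        # 100 feature frames per second of model-rate audio (10 ms hop).
        new_frames = new_audio.shape[0] * 100 // self.sampling_rate
        if self.audio is None:
            self.audio = new_audio
            self.pitchf = np.zeros(new_frames)
            self.feature = np.zeros([new_frames, self.emb_channels])
        else:
            # Append the fresh chunk plus zero placeholders; the pipeline fills the
            # placeholders, and the filled frames are reused on the next call.
            self.audio = np.concatenate([self.audio, new_audio])
            self.pitchf = np.concatenate([self.pitchf, np.zeros(new_frames)])
            self.feature = np.concatenate([self.feature, np.zeros([new_frames, self.emb_channels])])

        # Left-pad with zeros while the buffer is shorter than one conversion window.
        if self.audio.shape[0] < convert_size:
            pad_frames = convert_size * 100 // self.sampling_rate
            self.audio = np.concatenate([np.zeros(convert_size), self.audio])
            self.pitchf = np.concatenate([np.zeros(pad_frames), self.pitchf])
            self.feature = np.concatenate([np.zeros([pad_frames, self.emb_channels]), self.feature])

        # Keep only the most recent conversion window and its feature frames.
        frame_offset = -convert_size * 100 // self.sampling_rate
        self.audio = self.audio[-convert_size:]
        self.pitchf = self.pitchf[frame_offset:]
        self.feature = self.feature[frame_offset:]
        return self.audio, self.pitchf, self.feature


def coarse_pitch(f0: np.ndarray, f0_min: float = 50.0, f0_max: float = 1100.0) -> np.ndarray:
    """Mel-scale quantization of f0 into bins 1..255, in the np.clip form the patch
    applies uniformly to the extractors. The 50/1100 Hz bounds are assumed defaults,
    not values taken from this diff."""
    f0_mel_min = 1127.0 * np.log(1.0 + f0_min / 700.0)
    f0_mel_max = 1127.0 * np.log(1.0 + f0_max / 700.0)
    f0_mel = 1127.0 * np.log(1.0 + f0 / 700.0)
    f0_mel = np.clip((f0_mel - f0_mel_min) * 254.0 / (f0_mel_max - f0_mel_min) + 1.0, 1.0, 255.0)
    return f0_mel.astype(int)


# Usage: 40 ms chunks of 40 kHz audio -> 1600 samples -> 4 feature frames per chunk.
if __name__ == "__main__":
    cache = RollingFeatureCache(sampling_rate=40000, emb_channels=768)
    for _ in range(3):
        audio, pitchf, feature = cache.push(np.zeros(1600, dtype=np.float32), convert_size=8192)
    print(audio.shape, pitchf.shape, feature.shape)  # (8192,) (21,) (21, 768)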