voice-changer/server/voice_changer/RVC/pitchExtractor/RMVPOnnxEPitchExtractor.py
2023-08-26 13:48:03 +09:00

77 lines
3.0 KiB
Python

import numpy as np
from const import PitchExtractorType
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
import onnxruntime
class RMVPOnnxEPitchExtractor(PitchExtractor):
def __init__(self, file: str, gpu: int):
super().__init__()
self.file = file
self.pitchExtractorType: PitchExtractorType = "rmvpe_onnx"
self.f0_min = 50
self.f0_max = 1100
self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700)
self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700)
(
onnxProviders,
onnxProviderOptions,
) = DeviceManager.get_instance().getOnnxExecutionProvider(gpu)
self.onnxProviders = onnxProviders
self.onnxProviderOptions = onnxProviderOptions
so = onnxruntime.SessionOptions()
so.log_severity_level = 3
self.onnx_session = onnxruntime.InferenceSession(self.file, sess_options=so, providers=onnxProviders, provider_options=onnxProviderOptions)
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
try:
# データ変換
if isinstance(audio, np.ndarray) is False:
audio = audio = audio.cpu().numpy()
if isinstance(pitchf, np.ndarray) is False:
pitchf = pitchf.cpu().numpy().astype(np.float32)
if audio.ndim != 1:
raise RuntimeError(f"Exeption in {self.__class__.__name__} audio.ndim is not 1 (size :{audio.ndim}, {audio.shape})")
if pitchf.ndim != 1:
raise RuntimeError(f"Exeption in {self.__class__.__name__} pitchf.ndim is not 1 (size :{pitchf.ndim}, {pitchf.shape})")
# 処理
silenceFrontFrame = silence_front * sr
startWindow = int(silenceFrontFrame / window) # 小数点以下切り捨て
slienceFrontFrameOffset = startWindow * window
targetFrameLength = len(audio) - slienceFrontFrameOffset
minimumFrames = 0.01 * sr
targetFrameLength = max(minimumFrames, targetFrameLength)
audio = audio[-targetFrameLength:]
audio = np.expand_dims(audio, axis=0)
output = self.onnx_session.run(
["f0", "uv"],
{
"waveform": audio.astype(np.float32),
"threshold": np.array([0.3]).astype(np.float32),
},
)
f0 = output[0].squeeze()
f0 *= pow(2, f0_up_key / 12)
pitchf[-f0.shape[0]:] = f0[: pitchf.shape[0]]
f0_mel = 1127.0 * np.log(1.0 + pitchf / 700.0)
f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - self.f0_mel_min) * 254 / (self.f0_mel_max - self.f0_mel_min) + 1
f0_mel[f0_mel <= 1] = 1
f0_mel[f0_mel > 255] = 255
f0_coarse = np.rint(f0_mel).astype(int)
except Exception as e:
raise RuntimeError(f"Exeption in {self.__class__.__name__}", e)
return f0_coarse, pitchf