mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-02-03 00:33:57 +03:00
108 lines
4.0 KiB
Python
108 lines
4.0 KiB
Python
import onnxruntime
|
|
import torch
|
|
import numpy as np
|
|
import json
|
|
# providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
|
|
# Execution providers passed to onnxruntime.InferenceSession when a ModelWrapper
# is constructed. Only the CPU provider is enabled here.
providers = ["CPUExecutionProvider"]
|
|
|
|
|
|
class ModelWrapper:
    """Wrap an ONNX Runtime inference session and expose the model's metadata.

    On construction the wrapper opens an ``onnxruntime.InferenceSession`` with
    the module-level ``providers`` list, decides from the first input's type
    whether the model expects half-precision features, and reads the JSON
    stored under the ``"metadata"`` key of the model's custom metadata map.
    When that metadata cannot be read, fixed defaults are used and the model
    is flagged as deprecated.
    """

    def __init__(self, onnx_model):
        """Open the inference session and load (or default) the model metadata.

        Args:
            onnx_model: Passed unchanged as the first argument of
                ``onnxruntime.InferenceSession`` (presumably a model path or
                serialized model bytes -- confirm against callers).
        """
        self.onnx_model = onnx_model
        self.onnx_session = onnxruntime.InferenceSession(
            self.onnx_model,
            providers=providers,
        )

        # Any first-input type other than "tensor(float)" is treated as a
        # half-precision model.
        first_input_type = self.onnx_session.get_inputs()[0].type
        self.is_half = first_input_type != "tensor(float)"

        modelmeta = self.onnx_session.get_modelmeta()
        try:
            metadata = json.loads(modelmeta.custom_metadata_map["metadata"])
            self.samplingRate = metadata["samplingRate"]
            self.f0 = metadata["f0"]
            self.embChannels = metadata["embChannels"]
            self.modelType = metadata["modelType"]
            self.deprecated = False
            print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
        except Exception:
            # Best-effort fallback for models without (or with incomplete)
            # metadata. ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate.
            self.samplingRate = 48000
            self.f0 = True
            self.embChannels = 256
            self.modelType = 0
            self.deprecated = True
            print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
            print("[Voice Changer] This onnx's version is depricated. Please regenerate onnxfile. Fallback to default")
            print(f"[Voice Changer] Onnx metadata: sr:{self.samplingRate}, f0:{self.f0}")
            print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")

    def getSamplingRate(self):
        """Return the sampling rate from the metadata (48000 on fallback)."""
        return self.samplingRate

    def getF0(self):
        """Return the ``f0`` metadata value (True on fallback)."""
        return self.f0

    def getEmbChannels(self):
        """Return the ``embChannels`` metadata value (256 on fallback)."""
        return self.embChannels

    def getModelType(self):
        """Return the ``modelType`` metadata value (0 on fallback)."""
        return self.modelType

    def getDeprecated(self):
        """Return True when the metadata could not be read and defaults are in use."""
        return self.deprecated

    def set_providers(self, providers, provider_options=None):
        """Forward new execution providers to the underlying session.

        Args:
            providers: Provider names, forwarded as ``providers=``.
            provider_options: Provider option dicts, forwarded as
                ``provider_options=``. When omitted, a fresh ``[{}]`` is used
                on every call, so no list is shared between calls.
        """
        if provider_options is None:
            provider_options = [{}]
        self.onnx_session.set_providers(providers=providers, provider_options=provider_options)

    def get_providers(self):
        """Return whatever the underlying session reports as its providers."""
        return self.onnx_session.get_providers()

    @staticmethod
    def _as_numpy(tensor, dtype):
        """Move ``tensor`` to the CPU and return it as a NumPy array of ``dtype``."""
        return tensor.cpu().numpy().astype(dtype)

    def _feats_dtype(self):
        """Return the NumPy dtype the model expects for the ``feats`` input."""
        return np.float16 if self.is_half else np.float32

    def _run_audio(self, feed):
        """Run the session for the ``"audio"`` output and wrap the result in a tensor."""
        outputs = self.onnx_session.run(["audio"], feed)
        return torch.tensor(np.array(outputs))

    def infer_pitchless(self, feats, p_len, sid):
        """Run inference without pitch inputs.

        Args:
            feats: Cast to float16 when ``is_half`` is set, otherwise float32.
            p_len: Cast to int64.
            sid: Cast to int64.
            All three must support ``.cpu().numpy()`` (presumably torch
            tensors -- confirm against callers).

        Returns:
            ``torch.Tensor`` built from the list of outputs returned by the
            session for the ``"audio"`` output name.
        """
        feed = {
            "feats": self._as_numpy(feats, self._feats_dtype()),
            "p_len": self._as_numpy(p_len, np.int64),
            "sid": self._as_numpy(sid, np.int64),
        }
        return self._run_audio(feed)

    def infer(self, feats, p_len, pitch, pitchf, sid):
        """Run inference with pitch inputs.

        Args:
            feats: Cast to float16 when ``is_half`` is set, otherwise float32.
            p_len: Cast to int64.
            pitch: Cast to int64.
            pitchf: Cast to float32, regardless of ``is_half``.
            sid: Cast to int64.
            All five must support ``.cpu().numpy()`` (presumably torch
            tensors -- confirm against callers).

        Returns:
            ``torch.Tensor`` built from the list of outputs returned by the
            session for the ``"audio"`` output name.
        """
        feed = {
            "feats": self._as_numpy(feats, self._feats_dtype()),
            "p_len": self._as_numpy(p_len, np.int64),
            "pitch": self._as_numpy(pitch, np.int64),
            "pitchf": self._as_numpy(pitchf, np.float32),
            "sid": self._as_numpy(sid, np.int64),
        }
        return self._run_audio(feed)
|