mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-02-03 00:33:57 +03:00
refactor pipeline
This commit is contained in:
parent
e14e823fec
commit
617551db49
@ -13,6 +13,16 @@ class HalfPrecisionChangingException(Exception):
|
||||
return repr("HalfPrecision related exception.")
|
||||
|
||||
|
||||
class DeviceChangingException(Exception):
    """Signal that processing was interrupted by a device change.

    The pipeline raises this when a ``RuntimeError`` whose text contains
    "same device" surfaces during feature extraction. The exception carries
    no payload; its string form is a fixed message.
    """

    def __str__(self) -> str:
        # repr() is applied on purpose: the rendered message keeps its quotes.
        message = "Device changing..."
        return repr(message)
|
||||
|
||||
|
||||
class NotEnoughDataExtimateF0(Exception):
    """Signal that the audio chunk was too short to estimate f0.

    The pipeline raises this in place of an ``IndexError`` that occurs during
    pitch extraction. The string form is a fixed message.
    """

    def __str__(self) -> str:
        # repr() is applied on purpose: the rendered message keeps its quotes.
        message = "Not enough data to estimate f0."
        return repr(message)
|
||||
|
||||
|
||||
class ONNXInputArgumentException(Exception):
    """Signal that ONNX received an invalid argument.

    The string form is a fixed message; constructor arguments do not affect it.
    """

    def __str__(self) -> str:
        # repr() is applied on purpose: the rendered message keeps its quotes.
        message = "ONNX received invalid argument."
        return repr(message)
|
||||
|
@ -1,63 +1,109 @@
|
||||
import numpy as np
|
||||
from typing import Any
|
||||
|
||||
# import parselmouth
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from Exceptions import HalfPrecisionChangingException
|
||||
from Exceptions import (
|
||||
DeviceChangingException,
|
||||
HalfPrecisionChangingException,
|
||||
NotEnoughDataExtimateF0,
|
||||
)
|
||||
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
|
||||
|
||||
|
||||
class VC(object):
|
||||
def __init__(self, tgt_sr, device, is_half, x_pad):
|
||||
self.sr = 16000 # hubert输入采样率
|
||||
self.window = 160 # 每帧点数
|
||||
self.t_pad = self.sr * x_pad # 每条前后pad时间
|
||||
self.t_pad_tgt = tgt_sr * x_pad
|
||||
self.device = device
|
||||
self.is_half = is_half
|
||||
class Pipeline(object):
|
||||
embedder: Embedder
|
||||
inferencer: Inferencer
|
||||
pitchExtractor: PitchExtractor
|
||||
|
||||
def pipeline(
|
||||
index: Any | None
|
||||
feature: Any | None
|
||||
|
||||
targetSR: int
|
||||
device: torch.device
|
||||
isHalf: bool
|
||||
|
||||
def __init__(
    self,
    embedder: Embedder,
    inferencer: Inferencer,
    pitchExtractor: PitchExtractor,
    index: Any | None,
    feature: Any | None,
    targetSR: int,
    device: torch.device,
    isHalf: bool,
):
    """Bundle the components and settings needed for one conversion pipeline.

    Args:
        embedder: Feature extractor; ``exec`` calls ``extractFeatures`` on it.
        inferencer: Model wrapper; ``exec`` calls ``infer`` on it.
        pitchExtractor: Pitch extractor; ``exec`` calls ``extract`` on it.
        index: Search index, or ``None`` to disable index lookup.
        feature: Feature array indexed by the search results, or ``None``.
        targetSR: Sampling rate of the model output.
        device: Device on which tensors are created.
        isHalf: Whether half precision is used.
    """
    self.embedder = embedder
    self.inferencer = inferencer
    self.pitchExtractor = pitchExtractor

    self.index = index
    self.feature = feature

    # device / isHalf are stored once; the earlier duplicate assignments
    # at the end of this method were redundant and have been dropped.
    self.targetSR = targetSR
    self.device = device
    self.isHalf = isHalf

    # Fixed analysis constants: input sampling rate and samples per frame
    # (exec derives the padding length and frame count from them).
    self.sr = 16000
    self.window = 160
|
||||
|
||||
def setDevice(self, device: torch.device):
    """Record ``device`` and forward it to the embedder, then the inferencer."""
    self.device = device
    # Components are looked up one at a time, in the same order as before.
    for component_name in ("embedder", "inferencer"):
        getattr(self, component_name).setDevice(device)
|
||||
|
||||
def setPitchExtractor(self, pitchExtractor: PitchExtractor):
    """Replace the pitch extractor that later ``exec`` calls will use."""
    self.pitchExtractor = pitchExtractor
|
||||
|
||||
def exec(
|
||||
self,
|
||||
sid,
|
||||
audio,
|
||||
f0_up_key,
|
||||
index,
|
||||
big_npy,
|
||||
index_rate,
|
||||
if_f0,
|
||||
silence_front=0,
|
||||
embChannels=256,
|
||||
silence_front,
|
||||
embChannels,
|
||||
repeat,
|
||||
):
|
||||
self.t_pad = self.sr * repeat
|
||||
self.t_pad_tgt = self.targetSR * repeat
|
||||
|
||||
audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
|
||||
p_len = audio_pad.shape[0] // self.window
|
||||
sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
|
||||
|
||||
# Pitch detection
|
||||
pitch, pitchf = None, None
|
||||
if if_f0 == 1:
|
||||
pitch, pitchf = pitchExtractor.extract(
|
||||
audio_pad,
|
||||
f0_up_key,
|
||||
self.sr,
|
||||
self.window,
|
||||
silence_front=silence_front,
|
||||
)
|
||||
pitch = pitch[:p_len]
|
||||
pitchf = pitchf[:p_len]
|
||||
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
||||
pitchf = torch.tensor(
|
||||
pitchf, device=self.device, dtype=torch.float
|
||||
).unsqueeze(0)
|
||||
try:
|
||||
if if_f0 == 1:
|
||||
pitch, pitchf = self.pitchExtractor.extract(
|
||||
audio_pad,
|
||||
f0_up_key,
|
||||
self.sr,
|
||||
self.window,
|
||||
silence_front=silence_front,
|
||||
)
|
||||
pitch = pitch[:p_len]
|
||||
pitchf = pitchf[:p_len]
|
||||
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
||||
pitchf = torch.tensor(
|
||||
pitchf, device=self.device, dtype=torch.float
|
||||
).unsqueeze(0)
|
||||
except IndexError as e:
|
||||
print(e)
|
||||
raise NotEnoughDataExtimateF0()
|
||||
|
||||
# Adjust the tensor dtype
|
||||
feats = torch.from_numpy(audio_pad)
|
||||
if self.is_half is True:
|
||||
if self.isHalf is True:
|
||||
feats = feats.half()
|
||||
else:
|
||||
feats = feats.float()
|
||||
@ -69,25 +115,23 @@ class VC(object):
|
||||
# embedding
|
||||
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
|
||||
try:
|
||||
feats = embedder.extractFeatures(feats, embChannels)
|
||||
feats = self.embedder.extractFeatures(feats, embChannels)
|
||||
except RuntimeError as e:
|
||||
if "HALF" in e.__str__().upper():
|
||||
raise HalfPrecisionChangingException()
|
||||
elif "same device" in e.__str__():
|
||||
raise DeviceChangingException()
|
||||
else:
|
||||
raise e
|
||||
|
||||
# Index - feature extraction
|
||||
if (
|
||||
isinstance(index, type(None)) is False
|
||||
and isinstance(big_npy, type(None)) is False
|
||||
and index_rate != 0
|
||||
):
|
||||
if self.index is not None and self.feature is not None and index_rate != 0:
|
||||
npy = feats[0].cpu().numpy()
|
||||
if self.is_half is True:
|
||||
if self.isHalf is True:
|
||||
npy = npy.astype("float32")
|
||||
D, I = index.search(npy, 1)
|
||||
npy = big_npy[I.squeeze()]
|
||||
if self.is_half is True:
|
||||
D, I = self.index.search(npy, 1)
|
||||
npy = self.feature[I.squeeze()]
|
||||
if self.isHalf is True:
|
||||
npy = npy.astype("float16")
|
||||
|
||||
feats = (
|
||||
@ -110,7 +154,7 @@ class VC(object):
|
||||
with torch.no_grad():
|
||||
audio1 = (
|
||||
(
|
||||
inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
|
||||
self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]
|
||||
* 32768
|
||||
)
|
||||
.data.cpu()
|
||||
|
@ -4,7 +4,6 @@ from Exceptions import NoModeLoadedException
|
||||
from voice_changer.RVC.ModelSlot import ModelSlot
|
||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||
|
||||
from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor
|
||||
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
|
||||
|
||||
# avoiding parse arg error in RVC
|
||||
@ -25,9 +24,7 @@ from voice_changer.RVC.modelMerger.MergeModel import merge_model
|
||||
from voice_changer.RVC.modelMerger.MergeModelRequest import MergeModelRequest
|
||||
from voice_changer.RVC.ModelSlotGenerator import generateModelSlot
|
||||
from voice_changer.RVC.RVCSettings import RVCSettings
|
||||
from voice_changer.RVC.embedder.Embedder import Embedder
|
||||
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
|
||||
from voice_changer.RVC.inferencer.Inferencer import Inferencer
|
||||
from voice_changer.RVC.inferencer.InferencerManager import InferencerManager
|
||||
from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
@ -47,7 +44,7 @@ import faiss
|
||||
from const import UPLOAD_DIR, EnumEmbedderTypes
|
||||
|
||||
|
||||
from voice_changer.RVC.Pipeline import VC
|
||||
from voice_changer.RVC.Pipeline import Pipeline
|
||||
|
||||
providers = [
|
||||
"OpenVINOExecutionProvider",
|
||||
@ -61,10 +58,8 @@ class RVC:
|
||||
initialLoad: bool = True
|
||||
settings: RVCSettings = RVCSettings()
|
||||
|
||||
embedder: Embedder | None = None
|
||||
inferencer: Inferencer | None = None
|
||||
pipeline: Pipeline | None = None
|
||||
|
||||
pitchExtractor: PitchExtractor | None = None
|
||||
deviceManager = DeviceManager.get_instance()
|
||||
|
||||
audio_buffer: AudioInOut | None = None
|
||||
@ -149,7 +144,26 @@ class RVC:
|
||||
print("[Voice Changer] exception! loading embedder", e)
|
||||
traceback.print_exc()
|
||||
|
||||
return inferencer, embedder
|
||||
# pitchExtractor
|
||||
pitchExtractor = PitchExtractorManager.getPitchExtractor(
|
||||
self.settings.f0Detector
|
||||
)
|
||||
|
||||
# index, feature
|
||||
index, feature = self.loadIndex(modelSlot)
|
||||
|
||||
pipeline = Pipeline(
|
||||
embedder,
|
||||
inferencer,
|
||||
pitchExtractor,
|
||||
index,
|
||||
feature,
|
||||
modelSlot.samplingRate,
|
||||
dev,
|
||||
half,
|
||||
)
|
||||
|
||||
return pipeline
|
||||
|
||||
def loadIndex(self, modelSlot: ModelSlot):
|
||||
# Load the index
|
||||
@ -187,16 +201,8 @@ class RVC:
|
||||
|
||||
print("[Voice Changer] Prepare Model of slot:", slot)
|
||||
|
||||
# Load the inferencer and embedder
|
||||
inferencer, embedder = self.createPipeline(modelSlot)
|
||||
|
||||
self.next_inferencer = inferencer
|
||||
self.next_embedder = embedder
|
||||
|
||||
# Load the index
|
||||
index, feature = self.loadIndex(modelSlot)
|
||||
self.next_index = index
|
||||
self.next_feature = feature
|
||||
# Create the pipeline
|
||||
self.next_pipeline = self.createPipeline(modelSlot)
|
||||
|
||||
# Other settings
|
||||
self.next_trans = modelSlot.defaultTrans
|
||||
@ -208,10 +214,7 @@ class RVC:
|
||||
|
||||
def switchModel(self):
|
||||
print("[Voice Changer] Switching model..")
|
||||
self.embedder = self.next_embedder
|
||||
self.inferencer = self.next_inferencer
|
||||
self.feature = self.next_feature
|
||||
self.index = self.next_index
|
||||
self.pipeline = self.next_pipeline
|
||||
self.settings.tran = self.next_trans
|
||||
self.settings.modelSamplingRate = self.next_samplingRate
|
||||
self.settings.framework = self.next_framework
|
||||
@ -229,27 +232,21 @@ class RVC:
|
||||
return True
|
||||
val = val % 1000 # Quick hack for same slot is selected
|
||||
self.prepareModel(val)
|
||||
self.needSwitch = True
|
||||
|
||||
# Settings
|
||||
setattr(self.settings, key, val)
|
||||
|
||||
if key == "gpu" and self.embedder is not None:
|
||||
if key == "gpu":
|
||||
dev = self.deviceManager.getDevice(val)
|
||||
half = self.deviceManager.halfPrecisionAvailable(val)
|
||||
|
||||
# Rebuild the pipeline when half-precision availability changes
|
||||
if (
|
||||
self.inferencer is not None
|
||||
and self.inferencer.isHalf == half
|
||||
and self.embedder.isHalf == half
|
||||
):
|
||||
if self.pipeline is not None and self.pipeline.isHalf == half:
|
||||
print(
|
||||
"USE EXSISTING PIPELINE",
|
||||
half,
|
||||
)
|
||||
self.embedder.setDevice(dev)
|
||||
self.inferencer.setDevice(dev)
|
||||
self.pipeline.setDevice(dev)
|
||||
else:
|
||||
print("CHAGE TO NEW PIPELINE", half)
|
||||
self.prepareModel(self.settings.modelSlotIndex)
|
||||
@ -257,10 +254,11 @@ class RVC:
|
||||
setattr(self.settings, key, float(val))
|
||||
elif key in self.settings.strData:
|
||||
setattr(self.settings, key, str(val))
|
||||
if key == "f0Detector":
|
||||
self.pitchExtractor = PitchExtractorManager.getPitchExtractor(
|
||||
if key == "f0Detector" and self.pipeline is not None:
|
||||
pitchExtractor = PitchExtractorManager.getPitchExtractor(
|
||||
self.settings.f0Detector
|
||||
)
|
||||
self.pipeline.setPitchExtractor(pitchExtractor)
|
||||
else:
|
||||
return False
|
||||
return True
|
||||
@ -323,7 +321,6 @@ class RVC:
|
||||
self.switchModel()
|
||||
self.needSwitch = False
|
||||
|
||||
dev = self.deviceManager.getDevice(self.settings.gpu)
|
||||
half = self.deviceManager.halfPrecisionAvailable(self.settings.gpu)
|
||||
|
||||
audio = data[0]
|
||||
@ -337,7 +334,6 @@ class RVC:
|
||||
|
||||
repeat = 3 if half else 1
|
||||
repeat *= self.settings.rvcQuality # 0 or 3
|
||||
vc = VC(self.settings.modelSamplingRate, dev, half, repeat)
|
||||
sid = 0
|
||||
f0_up_key = self.settings.tran
|
||||
index_rate = self.settings.indexRatio
|
||||
@ -345,20 +341,15 @@ class RVC:
|
||||
|
||||
embChannels = self.settings.modelSlots[self.currentSlot].embChannels
|
||||
|
||||
audio_out = vc.pipeline(
|
||||
self.embedder,
|
||||
self.inferencer,
|
||||
self.pitchExtractor,
|
||||
audio_out = self.pipeline.exec(
|
||||
sid,
|
||||
audio,
|
||||
f0_up_key,
|
||||
self.index,
|
||||
self.feature,
|
||||
index_rate,
|
||||
if_f0,
|
||||
silence_front=self.settings.extraConvertSize
|
||||
/ self.settings.modelSamplingRate,
|
||||
embChannels=embChannels,
|
||||
self.settings.extraConvertSize / self.settings.modelSamplingRate,
|
||||
embChannels,
|
||||
repeat,
|
||||
)
|
||||
|
||||
result = audio_out * np.sqrt(vol)
|
||||
@ -366,8 +357,7 @@ class RVC:
|
||||
return result
|
||||
|
||||
def __del__(self):
|
||||
del self.inferencer
|
||||
del self.embedder
|
||||
del self.pipeline
|
||||
|
||||
print("---------- REMOVING ---------------")
|
||||
|
||||
|
@ -32,13 +32,17 @@ class DeviceManager(object):
|
||||
if id < 0:
|
||||
return False
|
||||
|
||||
gpuName = torch.cuda.get_device_name(id).upper()
|
||||
if (
|
||||
("16" in gpuName and "V100" not in gpuName)
|
||||
or "P40" in gpuName.upper()
|
||||
or "1070" in gpuName
|
||||
or "1080" in gpuName
|
||||
):
|
||||
try:
|
||||
gpuName = torch.cuda.get_device_name(id).upper()
|
||||
if (
|
||||
("16" in gpuName and "V100" not in gpuName)
|
||||
or "P40" in gpuName.upper()
|
||||
or "1070" in gpuName
|
||||
or "1080" in gpuName
|
||||
):
|
||||
return False
|
||||
except Exception as e:
|
||||
print(e)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
@ -14,8 +14,10 @@ from voice_changer.utils.LoadModelParams import LoadModelParams
|
||||
from voice_changer.utils.Timer import Timer
|
||||
from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut
|
||||
from Exceptions import (
|
||||
DeviceChangingException,
|
||||
HalfPrecisionChangingException,
|
||||
NoModeLoadedException,
|
||||
NotEnoughDataExtimateF0,
|
||||
ONNXInputArgumentException,
|
||||
)
|
||||
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||
@ -348,6 +350,12 @@ class VoiceChanger:
|
||||
except HalfPrecisionChangingException as e:
|
||||
print("[Voice Changer] Switching model configuration....", e)
|
||||
return np.zeros(1).astype(np.int16), [0, 0, 0]
|
||||
except NotEnoughDataExtimateF0 as e:
|
||||
print("[Voice Changer] not enough data", e)
|
||||
return np.zeros(1).astype(np.int16), [0, 0, 0]
|
||||
except DeviceChangingException as e:
|
||||
print("[Voice Changer] embedder:", e)
|
||||
return np.zeros(1).astype(np.int16), [0, 0, 0]
|
||||
except Exception as e:
|
||||
print("VC PROCESSING!!!! EXCEPTION!!!", e)
|
||||
print(traceback.format_exc())
|
||||
|
Loading…
Reference in New Issue
Block a user