diff --git a/server/Exceptions.py b/server/Exceptions.py index 22f61a1a..34bab4e7 100644 --- a/server/Exceptions.py +++ b/server/Exceptions.py @@ -13,6 +13,16 @@ class HalfPrecisionChangingException(Exception): return repr("HalfPrecision related exception.") +class DeviceChangingException(Exception): + def __str__(self): + return repr("Device changing...") + + +class NotEnoughDataExtimateF0(Exception): + def __str__(self): + return repr("Not enough data to estimate f0.") + + class ONNXInputArgumentException(Exception): def __str__(self): return repr("ONNX received invalid argument.") diff --git a/server/voice_changer/RVC/Pipeline.py b/server/voice_changer/RVC/Pipeline.py index ac79f678..845926d6 100644 --- a/server/voice_changer/RVC/Pipeline.py +++ b/server/voice_changer/RVC/Pipeline.py @@ -1,63 +1,109 @@ import numpy as np +from typing import Any -# import parselmouth import torch import torch.nn.functional as F -from Exceptions import HalfPrecisionChangingException +from Exceptions import ( + DeviceChangingException, + HalfPrecisionChangingException, + NotEnoughDataExtimateF0, +) from voice_changer.RVC.embedder.Embedder import Embedder from voice_changer.RVC.inferencer.Inferencer import Inferencer from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor -class VC(object): - def __init__(self, tgt_sr, device, is_half, x_pad): - self.sr = 16000 # hubert输入采样率 - self.window = 160 # 每帧点数 - self.t_pad = self.sr * x_pad # 每条前后pad时间 - self.t_pad_tgt = tgt_sr * x_pad - self.device = device - self.is_half = is_half +class Pipeline(object): + embedder: Embedder + inferencer: Inferencer + pitchExtractor: PitchExtractor - def pipeline( + index: Any | None + feature: Any | None + + targetSR: int + device: torch.device + isHalf: bool + + def __init__( self, embedder: Embedder, inferencer: Inferencer, pitchExtractor: PitchExtractor, + index: Any | None, + feature: Any | None, + targetSR, + device, + isHalf, + ): + self.embedder = embedder + self.inferencer = inferencer + self.pitchExtractor = pitchExtractor + + self.index = index + self.feature = feature + + self.targetSR = targetSR + self.device = device + self.isHalf = isHalf + + self.sr = 16000 + self.window = 160 + + self.device = device + self.isHalf = isHalf + + def setDevice(self, device: torch.device): + self.device = device + self.embedder.setDevice(device) + self.inferencer.setDevice(device) + + def setPitchExtractor(self, pitchExtractor: PitchExtractor): + self.pitchExtractor = pitchExtractor + + def exec( + self, sid, audio, f0_up_key, - index, - big_npy, index_rate, if_f0, - silence_front=0, - embChannels=256, + silence_front, + embChannels, + repeat, ): + self.t_pad = self.sr * repeat + self.t_pad_tgt = self.targetSR * repeat + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") p_len = audio_pad.shape[0] // self.window sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() # ピッチ検出 pitch, pitchf = None, None - if if_f0 == 1: - pitch, pitchf = pitchExtractor.extract( - audio_pad, - f0_up_key, - self.sr, - self.window, - silence_front=silence_front, - ) - pitch = pitch[:p_len] - pitchf = pitchf[:p_len] - pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() - pitchf = torch.tensor( - pitchf, device=self.device, dtype=torch.float - ).unsqueeze(0) + try: + if if_f0 == 1: + pitch, pitchf = self.pitchExtractor.extract( + audio_pad, + f0_up_key, + self.sr, + self.window, + silence_front=silence_front, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor( + pitchf, device=self.device, dtype=torch.float + ).unsqueeze(0) + except IndexError as e: + print(e) + raise NotEnoughDataExtimateF0() # tensor型調整 feats = torch.from_numpy(audio_pad) - if self.is_half is True: + if self.isHalf is True: feats = feats.half() else: feats = feats.float() @@ -69,25 +115,23 @@ class VC(object): # embedding padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) try: - feats = embedder.extractFeatures(feats, embChannels) + feats = self.embedder.extractFeatures(feats, embChannels) except RuntimeError as e: if "HALF" in e.__str__().upper(): raise HalfPrecisionChangingException() + elif "same device" in e.__str__(): + raise DeviceChangingException() else: raise e # Index - feature抽出 - if ( - isinstance(index, type(None)) is False - and isinstance(big_npy, type(None)) is False - and index_rate != 0 - ): + if self.index is not None and self.feature is not None and index_rate != 0: npy = feats[0].cpu().numpy() - if self.is_half is True: + if self.isHalf is True: npy = npy.astype("float32") - D, I = index.search(npy, 1) - npy = big_npy[I.squeeze()] - if self.is_half is True: + D, I = self.index.search(npy, 1) + npy = self.feature[I.squeeze()] + if self.isHalf is True: npy = npy.astype("float16") feats = ( @@ -110,7 +154,7 @@ class VC(object): with torch.no_grad(): audio1 = ( ( - inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] + self.inferencer.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768 ) .data.cpu() diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index 5700f89e..606bde8d 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -4,7 +4,6 @@ from Exceptions import NoModeLoadedException from voice_changer.RVC.ModelSlot import ModelSlot from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager -from voice_changer.RVC.pitchExtractor.PitchExtractor import PitchExtractor from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager # avoiding parse arg error in RVC @@ -25,9 +24,7 @@ from voice_changer.RVC.modelMerger.MergeModel import merge_model from voice_changer.RVC.modelMerger.MergeModelRequest import MergeModelRequest from voice_changer.RVC.ModelSlotGenerator import generateModelSlot from voice_changer.RVC.RVCSettings import RVCSettings -from voice_changer.RVC.embedder.Embedder import Embedder from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager -from voice_changer.RVC.inferencer.Inferencer import Inferencer from voice_changer.RVC.inferencer.InferencerManager import InferencerManager from voice_changer.utils.LoadModelParams import FilePaths, LoadModelParams from voice_changer.utils.VoiceChangerModel import AudioInOut @@ -47,7 +44,7 @@ import faiss from const import UPLOAD_DIR, EnumEmbedderTypes -from voice_changer.RVC.Pipeline import VC +from voice_changer.RVC.Pipeline import Pipeline providers = [ "OpenVINOExecutionProvider", @@ -61,10 +58,8 @@ class RVC: initialLoad: bool = True settings: RVCSettings = RVCSettings() - embedder: Embedder | None = None - inferencer: Inferencer | None = None + pipeline: Pipeline | None = None - pitchExtractor: PitchExtractor | None = None deviceManager = DeviceManager.get_instance() audio_buffer: AudioInOut | None = None @@ -149,7 +144,26 @@ class RVC: print("[Voice Changer] exception! loading embedder", e) traceback.print_exc() - return inferencer, embedder + # pitchExtractor + pitchExtractor = PitchExtractorManager.getPitchExtractor( + self.settings.f0Detector + ) + + # index, feature + index, feature = self.loadIndex(modelSlot) + + pipeline = Pipeline( + embedder, + inferencer, + pitchExtractor, + index, + feature, + modelSlot.samplingRate, + dev, + half, + ) + + return pipeline def loadIndex(self, modelSlot: ModelSlot): # Indexのロード @@ -187,16 +201,8 @@ class RVC: print("[Voice Changer] Prepare Model of slot:", slot) - # Inferencer, embedderのロード - inferencer, embedder = self.createPipeline(modelSlot) - - self.next_inferencer = inferencer - self.next_embedder = embedder - - # Indexのロード - index, feature = self.loadIndex(modelSlot) - self.next_index = index - self.next_feature = feature + # pipelineの生成 + self.next_pipeline = self.createPipeline(modelSlot) # その他の設定 self.next_trans = modelSlot.defaultTrans @@ -208,10 +214,7 @@ class RVC: def switchModel(self): print("[Voice Changer] Switching model..") - self.embedder = self.next_embedder - self.inferencer = self.next_inferencer - self.feature = self.next_feature - self.index = self.next_index + self.pipeline = self.next_pipeline self.settings.tran = self.next_trans self.settings.modelSamplingRate = self.next_samplingRate self.settings.framework = self.next_framework @@ -229,27 +232,21 @@ class RVC: return True val = val % 1000 # Quick hack for same slot is selected self.prepareModel(val) - self.needSwitch = True # 設定 setattr(self.settings, key, val) - if key == "gpu" and self.embedder is not None: + if key == "gpu": dev = self.deviceManager.getDevice(val) half = self.deviceManager.halfPrecisionAvailable(val) # half-precisionの使用可否が変わるときは作り直し - if ( - self.inferencer is not None - and self.inferencer.isHalf == half - and self.embedder.isHalf == half - ): + if self.pipeline is not None and self.pipeline.isHalf == half: print( "USE EXSISTING PIPELINE", half, ) - self.embedder.setDevice(dev) - self.inferencer.setDevice(dev) + self.pipeline.setDevice(dev) else: print("CHAGE TO NEW PIPELINE", half) self.prepareModel(self.settings.modelSlotIndex) @@ -257,10 +254,11 @@ class RVC: setattr(self.settings, key, float(val)) elif key in self.settings.strData: setattr(self.settings, key, str(val)) - if key == "f0Detector": - self.pitchExtractor = PitchExtractorManager.getPitchExtractor( + if key == "f0Detector" and self.pipeline is not None: + pitchExtractor = PitchExtractorManager.getPitchExtractor( self.settings.f0Detector ) + self.pipeline.setPitchExtractor(pitchExtractor) else: return False return True @@ -323,7 +321,6 @@ class RVC: self.switchModel() self.needSwitch = False - dev = self.deviceManager.getDevice(self.settings.gpu) half = self.deviceManager.halfPrecisionAvailable(self.settings.gpu) audio = data[0] @@ -337,7 +334,6 @@ class RVC: repeat = 3 if half else 1 repeat *= self.settings.rvcQuality # 0 or 3 - vc = VC(self.settings.modelSamplingRate, dev, half, repeat) sid = 0 f0_up_key = self.settings.tran index_rate = self.settings.indexRatio @@ -345,20 +341,15 @@ class RVC: embChannels = self.settings.modelSlots[self.currentSlot].embChannels - audio_out = vc.pipeline( - self.embedder, - self.inferencer, - self.pitchExtractor, + audio_out = self.pipeline.exec( sid, audio, f0_up_key, - self.index, - self.feature, index_rate, if_f0, - silence_front=self.settings.extraConvertSize - / self.settings.modelSamplingRate, - embChannels=embChannels, + self.settings.extraConvertSize / self.settings.modelSamplingRate, + embChannels, + repeat, ) result = audio_out * np.sqrt(vol) @@ -366,8 +357,7 @@ class RVC: return result def __del__(self): - del self.inferencer - del self.embedder + del self.pipeline print("---------- REMOVING ---------------") diff --git a/server/voice_changer/RVC/deviceManager/DeviceManager.py b/server/voice_changer/RVC/deviceManager/DeviceManager.py index b61acdff..9075e99a 100644 --- a/server/voice_changer/RVC/deviceManager/DeviceManager.py +++ b/server/voice_changer/RVC/deviceManager/DeviceManager.py @@ -32,13 +32,17 @@ class DeviceManager(object): if id < 0: return False - gpuName = torch.cuda.get_device_name(id).upper() - if ( - ("16" in gpuName and "V100" not in gpuName) - or "P40" in gpuName.upper() - or "1070" in gpuName - or "1080" in gpuName - ): + try: + gpuName = torch.cuda.get_device_name(id).upper() + if ( + ("16" in gpuName and "V100" not in gpuName) + or "P40" in gpuName.upper() + or "1070" in gpuName + or "1080" in gpuName + ): + return False + except Exception as e: + print(e) return False return True diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py index d00d3dc8..092a1e12 100755 --- a/server/voice_changer/VoiceChanger.py +++ b/server/voice_changer/VoiceChanger.py @@ -14,8 +14,10 @@ from voice_changer.utils.LoadModelParams import LoadModelParams from voice_changer.utils.Timer import Timer from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut from Exceptions import ( + DeviceChangingException, HalfPrecisionChangingException, NoModeLoadedException, + NotEnoughDataExtimateF0, ONNXInputArgumentException, ) from voice_changer.utils.VoiceChangerParams import VoiceChangerParams @@ -348,6 +350,12 @@ class VoiceChanger: except HalfPrecisionChangingException as e: print("[Voice Changer] Switching model configuration....", e) return np.zeros(1).astype(np.int16), [0, 0, 0] + except NotEnoughDataExtimateF0 as e: + print("[Voice Changer] not enough data", e) + return np.zeros(1).astype(np.int16), [0, 0, 0] + except DeviceChangingException as e: + print("[Voice Changer] embedder:", e) + return np.zeros(1).astype(np.int16), [0, 0, 0] except Exception as e: print("VC PROCESSING!!!! EXCEPTION!!!", e) print(traceback.format_exc())