From 1fd0422b433f164d57e097c357c08899facdb89a Mon Sep 17 00:00:00 2001
From: wataru
Date: Thu, 6 Apr 2023 02:38:50 +0900
Subject: [PATCH] WIP: support vrc

---
 server/voice_changer/RVC/RVC.py      | 101 +--------------------------
 server/voice_changer/VoiceChanger.py |  14 ----
 2 files changed, 1 insertion(+), 114 deletions(-)

diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py
index 5c083968..695672b0 100644
--- a/server/voice_changer/RVC/RVC.py
+++ b/server/voice_changer/RVC/RVC.py
@@ -1,6 +1,7 @@
 import sys
 import os
 
+# avoiding parse arg error in RVC
 sys.argv = ["MMVCServerSIO.py"]
 
 if sys.platform.startswith('darwin'):
@@ -13,7 +14,6 @@ if sys.platform.startswith('darwin'):
 else:
     sys.path.append("RVC")
 
-print("RVC 3")
 import io
 from dataclasses import dataclass, asdict, field
 from functools import reduce
@@ -74,30 +74,8 @@ class RVC:
     def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None, clusterTorchModel: str = None):
         self.device = torch.device("cuda", index=self.settings.gpu)
         self.settings.configFile = config
-        # self.hps = utils.get_hparams_from_file(config)
-        # self.settings.speakers = self.hps.spk
-
         # hubert model
         try:
-            # hubert_path = self.params["hubert"]
-            # useHubertOnnx = self.params["useHubertOnnx"]
-            # self.useHubertOnnx = useHubertOnnx
-
-            # if useHubertOnnx == True:
-            #     ort_options = onnxruntime.SessionOptions()
-            #     ort_options.intra_op_num_threads = 8
-            #     self.hubert_onnx = onnxruntime.InferenceSession(
-            #         HUBERT_ONNX_MODEL_PATH,
-            #         providers=providers
-            #     )
-            # else:
-            #     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
-            #         [hubert_path],
-            #         suffix="",
-            #     )
-            #     model = models[0]
-            #     model.eval()
-            #     self.hubert_model = model.cpu()
             models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(["hubert_base.pt"], suffix="",)
             model = models[0]
             model.eval()
@@ -117,7 +95,6 @@ class RVC:
         if pyTorch_model_file != None:
             cpt = torch.load(pyTorch_model_file, map_location="cpu")
             self.tgt_sr = cpt["config"][-1]
-            # n_spk = cpt["config"][-3]
             is_half = False
             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half)
             net_g.eval()
@@ -126,14 +103,6 @@ class RVC:
             self.net_g = net_g
             self.net_g = self.net_g.to(self.device)
 
-            # self.net_g = SynthesizerTrn(
-            #     self.hps.data.filter_length // 2 + 1,
-            #     self.hps.train.segment_size // self.hps.data.hop_length,
-            #     **self.hps.model
-            # )
-            # self.net_g.eval()
-            # utils.load_checkpoint(pyTorch_model_file, self.net_g, None)
-
         # ONNX model generation
         if onnx_model_file != None:
             ort_options = onnxruntime.SessionOptions()
@@ -201,20 +170,6 @@ class RVC:
     # return 24000
 
     def generate_input(self, newData: any, inputSize: int, crossfadeSize: int):
-        # import wave
-        # filename = "testc2.wav"
-        # if os.path.exists(filename):
-        #     print("[IORecorder] delete old analyze file.", filename)
-        #     os.remove(filename)
-        # fo = wave.open(filename, 'wb')
-        # fo.setnchannels(1)
-        # fo.setsampwidth(2)
-        # # fo.setframerate(24000)
-        # fo.setframerate(self.tgt_sr)
-        # fo.writeframes(newData.astype(np.int16))
-        # fo.close()
-
-        # newData = newData.astype(np.float32) / self.hps.data.max_wav_value
         newData = newData.astype(np.float32) / 32768.0
 
         if hasattr(self, "audio_buffer"):
@@ -267,9 +222,6 @@ class RVC:
         audio = data[0]
         convertSize = data[1]
         vol = data[2]
-        # from scipy.io import wavfile
-        # # wavfile.write("testa.wav", self.tgt_sr, audio * 32768.0)
-        # wavfile.write("testa.wav", 24000, audio * 32768.0)
 
         filename = "testc2.wav"
         audio = load_audio(filename, 16000)
@@ -309,57 +261,6 @@ class RVC:
             del self.onnx_session
 
-# def resize_f0(x, target_len):
-#     source = np.array(x)
-#     source[source < 0.001] = np.nan
-#     target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)), source)
-#     res = np.nan_to_num(target)
-#     return res
-
-
-# def compute_f0_dio(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
-#     if p_len is None:
-#         p_len = wav_numpy.shape[0] // hop_length
-#     f0, t = pw.dio(
-#         wav_numpy.astype(np.double),
-#         fs=sampling_rate,
-#         f0_ceil=800,
-#         frame_period=1000 * hop_length / sampling_rate,
-#     )
-#     f0 = pw.stonemask(wav_numpy.astype(np.double), f0, t, sampling_rate)
-#     for index, pitch in enumerate(f0):
-#         f0[index] = round(pitch, 1)
-#     return resize_f0(f0, p_len)
-
-
-# def compute_f0_harvest(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512):
-#     if p_len is None:
-#         p_len = wav_numpy.shape[0] // hop_length
-#     f0, t = pw.harvest(wav_numpy.astype(np.double), fs=sampling_rate, frame_period=5.5, f0_floor=71.0, f0_ceil=1000.0)
-
-#     for index, pitch in enumerate(f0):
-#         f0[index] = round(pitch, 1)
-#     return resize_f0(f0, p_len)
-
-
-# def get_hubert_content_layer9(hmodel, wav_16k_tensor):
-#     feats = wav_16k_tensor
-#     if feats.dim() == 2:  # double channels
-#         feats = feats.mean(-1)
-#     assert feats.dim() == 1, feats.dim()
-#     feats = feats.view(1, -1)
-#     padding_mask = torch.BoolTensor(feats.shape).fill_(False)
-#     inputs = {
-#         "source": feats.to(wav_16k_tensor.device),
-#         "padding_mask": padding_mask.to(wav_16k_tensor.device),
-#         "output_layer": 9,  # layer 9
-#     }
-#     with torch.no_grad():
-#         logits = hmodel.extract_features(**inputs)
-
-#     return logits[0].transpose(1, 2)
-
-
 import ffmpeg
diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py
index 6ef03765..023128b5 100755
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@@ -49,35 +49,25 @@ class VoiceChanger():
         self.modelType = getModelType()
         print("[VoiceChanger] activate model type:", self.modelType)
 
-        print("RVC!!! 1")
         if self.modelType == "MMVCv15":
-            print("RVC!!! 2")
             from voice_changer.MMVCv15.MMVCv15 import MMVCv15
             self.voiceChanger = MMVCv15()
         elif self.modelType == "MMVCv13":
-            print("RVC!!! 2")
             from voice_changer.MMVCv13.MMVCv13 import MMVCv13
             self.voiceChanger = MMVCv13()
         elif self.modelType == "so-vits-svc-40v2":
-            print("RVC!!! 2")
             from voice_changer.SoVitsSvc40v2.SoVitsSvc40v2 import SoVitsSvc40v2
             self.voiceChanger = SoVitsSvc40v2(params)
         elif self.modelType == "so-vits-svc-40" or self.modelType == "so-vits-svc-40_c":
-            print("RVC!!! 2")
             from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40
             self.voiceChanger = SoVitsSvc40(params)
         elif self.modelType == "DDSP-SVC":
-            print("RVC!!! 2")
             from voice_changer.DDSP_SVC.DDSP_SVC import DDSP_SVC
             self.voiceChanger = DDSP_SVC(params)
         elif self.modelType == "RVC":
-            print("RVC!!! 22222222222")
             from voice_changer.RVC.RVC import RVC
-            print("RVC!!! 2")
             self.voiceChanger = RVC(params)
-
         else:
-            print("RVC!!! 3")
             from voice_changer.MMVCv13.MMVCv13 import MMVCv13
             self.voiceChanger = MMVCv13()
@@ -220,10 +210,6 @@ class VoiceChanger():
         print_convert_processing(
             f" audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
         print_convert_processing(f" cur_overlap_strt:{cur_overlap_start}, cur_overlap_end{cur_overlap_end}")
 
-        print(
-            f" audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
-        print(f" cur_overlap_strt:{cur_overlap_start}, cur_overlap_end{cur_overlap_end}")
-
         powered_cur = cur_overlap * self.np_cur_strength
         powered_result = powered_prev + powered_cur