From dadab1ad131cd39d59d812faa64ffc12a20d7f6e Mon Sep 17 00:00:00 2001 From: w-okada Date: Sun, 12 Nov 2023 23:10:58 +0900 Subject: [PATCH] Experimental LLVC --- .../components/demo/904-3_FileUploader.tsx | 12 + client/lib/src/VoiceChangerClient.ts | 1 + client/lib/src/const.ts | 13 +- client/lib/src/hooks/useServerSetting.ts | 3 + server/const.py | 1 + server/data/ModelSlot.py | 25 +- server/voice_changer/LLVC/LLVC.py | 198 ++++++++ server/voice_changer/LLVC/LLVCInferencer.py | 71 +++ .../LLVC/LLVCModelSlotGenerator.py | 19 + .../LLVC/model/cached_convnet.py | 156 ++++++ server/voice_changer/LLVC/model/llvc.py | 464 ++++++++++++++++++ server/voice_changer/VoiceChangerManager.py | 15 + server/voice_changer/VoiceChangerV2.py | 104 ++-- server/voice_changer/utils/LoadModelParams.py | 2 + .../voice_changer/utils/VoiceChangerModel.py | 2 + 15 files changed, 1009 insertions(+), 77 deletions(-) create mode 100644 server/voice_changer/LLVC/LLVC.py create mode 100644 server/voice_changer/LLVC/LLVCInferencer.py create mode 100644 server/voice_changer/LLVC/LLVCModelSlotGenerator.py create mode 100644 server/voice_changer/LLVC/model/cached_convnet.py create mode 100644 server/voice_changer/LLVC/model/llvc.py diff --git a/client/demo/src/components/demo/904-3_FileUploader.tsx b/client/demo/src/components/demo/904-3_FileUploader.tsx index a3a56e8f..db36a93d 100644 --- a/client/demo/src/components/demo/904-3_FileUploader.tsx +++ b/client/demo/src/components/demo/904-3_FileUploader.tsx @@ -116,6 +116,15 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => { return x.kind == "beatriceModel"; }); return enough; + } else if (setting.voiceChangerType == "LLVC") { + const enough = + !!setting.files.find((x) => { + return x.kind == "llvcModel"; + }) && + !!setting.files.find((x) => { + return x.kind == "llvcConfig"; + }); + return enough; } return false; }; @@ -177,6 +186,9 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => { rows.push(generateFileRow(uploadSetting!, "Model(combo)", "diffusionSVCModel", ["ptc"])); } else if (vcType == "Beatrice") { rows.push(generateFileRow(uploadSetting!, "Beatrice", "beatriceModel", ["bin"])); + } else if (vcType == "LLVC") { + rows.push(generateFileRow(uploadSetting!, "Model", "llvcModel", ["pth"])); + rows.push(generateFileRow(uploadSetting!, "Config", "llvcConfig", ["json"])); } return rows; }; diff --git a/client/lib/src/VoiceChangerClient.ts b/client/lib/src/VoiceChangerClient.ts index 03fc67d9..d3ad5288 100644 --- a/client/lib/src/VoiceChangerClient.ts +++ b/client/lib/src/VoiceChangerClient.ts @@ -58,6 +58,7 @@ export class VoiceChangerClient { // const ctx44k = new AudioContext({ sampleRate: 44100 }) // これでもプチプチが残る const ctx44k = new AudioContext({ sampleRate: 48000 }); // 結局これが一番まし。 + // const ctx44k = new AudioContext({ sampleRate: 16000 }); // LLVCテスト⇒16K出力でプチプチなしで行ける。 console.log("audio out:", ctx44k); try { this.vcOutNode = new VoiceChangerWorkletNode(ctx44k, voiceChangerWorkletListener); // vc node diff --git a/client/lib/src/const.ts b/client/lib/src/const.ts index b971c900..45ad2ab0 100644 --- a/client/lib/src/const.ts +++ b/client/lib/src/const.ts @@ -11,6 +11,7 @@ export const VoiceChangerType = { RVC: "RVC", "Diffusion-SVC": "Diffusion-SVC", Beatrice: "Beatrice", + LLVC: "LLVC", } as const; export type VoiceChangerType = (typeof VoiceChangerType)[keyof typeof VoiceChangerType]; @@ -37,6 +38,9 @@ export const ModelSamplingRate = { export type ModelSamplingRate = (typeof InputSampleRate)[keyof typeof 
InputSampleRate]; export const CrossFadeOverlapSize = { + "128": 128, + "256": 256, + "512": 512, "1024": 1024, "2048": 2048, "4096": 4096, @@ -296,7 +300,14 @@ export type BeatriceModelSlot = ModelSlot & { speakers: { [key: number]: string }; }; -export type ModelSlotUnion = RVCModelSlot | MMVCv13ModelSlot | MMVCv15ModelSlot | SoVitsSvc40ModelSlot | DDSPSVCModelSlot | DiffusionSVCModelSlot | BeatriceModelSlot; +export type LLVCModelSlot = ModelSlot & { + modelFile: string; + configFile: string; + + speakers: { [key: number]: string }; +}; + +export type ModelSlotUnion = RVCModelSlot | MMVCv13ModelSlot | MMVCv15ModelSlot | SoVitsSvc40ModelSlot | DDSPSVCModelSlot | DiffusionSVCModelSlot | BeatriceModelSlot | LLVCModelSlot; type ServerAudioDevice = { kind: "audioinput" | "audiooutput"; diff --git a/client/lib/src/hooks/useServerSetting.ts b/client/lib/src/hooks/useServerSetting.ts index 38414a12..18d30d8c 100644 --- a/client/lib/src/hooks/useServerSetting.ts +++ b/client/lib/src/hooks/useServerSetting.ts @@ -29,6 +29,9 @@ export const ModelFileKind = { diffusionSVCModel: "diffusionSVCModel", beatriceModel: "beatriceModel", + + llvcModel: "llvcModel", + llvcConfig: "llvcConfig", } as const; export type ModelFileKind = (typeof ModelFileKind)[keyof typeof ModelFileKind]; diff --git a/server/const.py b/server/const.py index d5a3d2b4..1af55e3f 100644 --- a/server/const.py +++ b/server/const.py @@ -13,6 +13,7 @@ VoiceChangerType: TypeAlias = Literal[ "RVC", "Diffusion-SVC", "Beatrice", + "LLVC", ] StaticSlot: TypeAlias = Literal["Beatrice-JVS",] diff --git a/server/data/ModelSlot.py b/server/data/ModelSlot.py index 2eaca0fb..5b5234c7 100644 --- a/server/data/ModelSlot.py +++ b/server/data/ModelSlot.py @@ -134,7 +134,24 @@ class BeatriceModelSlot(ModelSlot): speakers: dict = field(default_factory=lambda: {1: "user1", 2: "user2"}) -ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot, BeatriceModelSlot] +@dataclass +class LLVCModelSlot(ModelSlot): + voiceChangerType: VoiceChangerType = "LLVC" + modelFile: str = "" + configFile: str = "" + + +ModelSlots: TypeAlias = Union[ + ModelSlot, + RVCModelSlot, + MMVCv13ModelSlot, + MMVCv15ModelSlot, + SoVitsSvc40ModelSlot, + DDSPSVCModelSlot, + DiffusionSVCModelSlot, + BeatriceModelSlot, + LLVCModelSlot, +] def loadSlotInfo(model_dir: str, slotIndex: int | StaticSlot) -> ModelSlots: @@ -165,10 +182,12 @@ def loadSlotInfo(model_dir: str, slotIndex: int | StaticSlot) -> ModelSlots: return DiffusionSVCModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey}) elif slotInfo.voiceChangerType == "Beatrice": slotInfoKey.extend(list(BeatriceModelSlot.__annotations__.keys())) - if slotIndex == "Beatrice-JVS": + if slotIndex == "Beatrice-JVS": # STATIC Model return BeatriceModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey}) - return BeatriceModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey}) + elif slotInfo.voiceChangerType == "LLVC": + slotInfoKey.extend(list(LLVCModelSlot.__annotations__.keys())) + return LLVCModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey}) else: return ModelSlot() diff --git a/server/voice_changer/LLVC/LLVC.py b/server/voice_changer/LLVC/LLVC.py new file mode 100644 index 00000000..db2c5718 --- /dev/null +++ b/server/voice_changer/LLVC/LLVC.py @@ -0,0 +1,198 @@ +import traceback +from typing import Any, cast +from scipy import signal +import os +from dataclasses import dataclass, asdict, 
field +import resampy +from data.ModelSlot import LLVCModelSlot +from mods.log_control import VoiceChangaerLogger +import numpy as np +from voice_changer.LLVC.LLVCInferencer import LLVCInferencer +from voice_changer.ModelSlotManager import ModelSlotManager +from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager +from voice_changer.utils.Timer import Timer2 +from voice_changer.utils.VoiceChangerModel import AudioInOut, AudioInOutFloat, VoiceChangerModel +from voice_changer.utils.VoiceChangerParams import VoiceChangerParams +import math +import torchaudio +import torch + +logger = VoiceChangaerLogger.get_instance().getLogger() + + +@dataclass +class LLVCSetting: + # Crossfade (CF) / Resample (RE) combinations + # CF:True, RE:True -> usable from the browser + # CF:True, RE:False -> N/A, unnecessary configuration. (Without resampling the audio does not stutter.) + # CF:False, RE:True -> N/A, clicking artifacts appear, so NG (it was NG in both client and server modes) + # CF:False, RE:False -> fine as long as the playback side supports 16K. + + crossfade: bool = True + resample: bool = True + + # List only the variables that can be modified + intData: list[str] = field(default_factory=lambda: []) + floatData: list[str] = field(default_factory=lambda: []) + strData: list[str] = field(default_factory=lambda: []) + + +class LLVC(VoiceChangerModel): + def __init__(self, params: VoiceChangerParams, slotInfo: LLVCModelSlot): + logger.info("[Voice Changer] [LLVC] Creating instance ") + self.voiceChangerType = "LLVC" + self.settings = LLVCSetting() + + self.processingSampleRate = 16000 + bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=self.processingSampleRate) + self.bh = bh + self.ah = ah + + self.params = params + self.slotInfo = slotInfo + self.modelSlotManager = ModelSlotManager.get_instance(self.params.model_dir) + + # # Crossfade / resample settings + # ## Mode that outputs at 16K + # self.settings.crossfade = False + # self.settings.resample = False + + ## Mode that outputs at 48K + self.settings.crossfade = True + self.settings.resample = True + + self.initialize() + + def initialize(self): + print("[Voice Changer] [LLVC] Initializing... 
") + vcparams = VoiceChangerParamsManager.get_instance().params + configPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile) + modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile) + + self.inputSampleRate = 48000 + self.outputSampleRate = 48000 + + self.downsampler = torchaudio.transforms.Resample(self.inputSampleRate, self.processingSampleRate) + self.upsampler = torchaudio.transforms.Resample(self.processingSampleRate, self.outputSampleRate) + + self.inferencer = LLVCInferencer().loadModel(modelPath, configPath) + self.prev_audio1 = None + self.result_buff = None + + def updateSetting(self, key: str, val: Any): + if key in self.settings.intData: + setattr(self.settings, key, int(val)) + ret = True + elif key in self.settings.floatData: + setattr(self.settings, key, float(val)) + ret = True + elif key in self.settings.strData: + setattr(self.settings, key, str(val)) + ret = True + else: + ret = False + + return ret + + def setSamplingRate(self, inputSampleRate, outputSampleRate): + self.inputSampleRate = inputSampleRate + self.outputSampleRate = outputSampleRate + self.downsampler = torchaudio.transforms.Resample(self.inputSampleRate, self.processingSampleRate) + self.upsampler = torchaudio.transforms.Resample(self.processingSampleRate, self.outputSampleRate) + + def _preprocess(self, waveform: AudioInOutFloat, srcSampleRate: int) -> AudioInOutFloat: + """Data preprocessing (torch independent) + - multi-dimensional (multi-channel) handling + - resampling (input sr -> 16K) + - Butterworth filter + Args: + waveform: AudioInOutFloat: input audio + srcSampleRate: int: sample rate of the input audio + + Returns: + waveform: AudioInOutFloat: preprocessed audio (1ch, 16K, np.ndarray) + + Raises: + OSError: if the file specification has failed + + """ + if waveform.ndim == 2: # double channels + waveform = waveform.mean(axis=-1) + waveform16K = resampy.resample(waveform, srcSampleRate, self.processingSampleRate) + # waveform16K = self.downsampler(torch.from_numpy(waveform)).numpy() + waveform16K = signal.filtfilt(self.bh, self.ah, waveform16K) + return waveform16K.copy() + + def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int): + try: + # print("CROSSFADE", crossfade_frame, sola_search_frame) + crossfade_frame16k = math.ceil((crossfade_frame / self.outputSampleRate) * self.processingSampleRate) + sola_search_frame16k = math.ceil((sola_search_frame / self.outputSampleRate) * self.processingSampleRate) + + with Timer2("mainProcess timer", False) as t: + # Startup parameters + # vcParams = VoiceChangerParamsManager.get_instance().params + + # Resampling and Butterworth filter (torch independent) + receivedData = receivedData.astype(np.float32) / 32768.0 + waveformFloat = self._preprocess(receivedData, self.inputSampleRate) + # print(f"input audio shape 48k:{receivedData.shape} -> 16K:{waveformFloat.shape}") + + # Inference + audio1 = self.inferencer.infer(waveformFloat) + audio1 = audio1.detach().cpu().numpy() + # print(f"infered shape: in:{waveformFloat.shape} -> out:{ audio1.shape}") + + # Append crossfade data and resample + if self.settings.crossfade is False and self.settings.resample is False: + # Return the converted audio as-is (no crossfade) + new_audio = audio1 + new_audio = (new_audio * 32767.5).astype(np.int16) + return new_audio + + # (1) Append the crossfade section + crossfade_audio_length = audio1.shape[0] + crossfade_frame16k + sola_search_frame16k + if self.prev_audio1 is not None: + new_audio = np.concatenate([self.prev_audio1, audio1]) + else: + new_audio = audio1 + self.prev_audio1 = new_audio[-crossfade_audio_length:] # keep for the next crossfade + # (2) Resample + if 
self.outputSampleRate != self.processingSampleRate: + new_audio = resampy.resample(new_audio, self.processingSampleRate, self.outputSampleRate) + # new_audio = self.upsampler(torch.from_numpy(new_audio)).numpy() + # new_audio = np.repeat(new_audio, 3) + + # バッファリング。⇒ 最上位(crossfade完了後)で行う必要があるのでとりあえずペンディング + # if self.result_buff is None: + # self.result_buff = new_audio + # else: + # self.result_buff = np.concatenate([self.result_buff, new_audio]) + + # if self.result_buff.shape[0] > receivedData.shape[0]: + # new_audio = self.result_buff[: receivedData.shape[0]] + # self.result_buff = self.result_buff[receivedData.shape[0] :] + # else: + # new_audio = np.zeros(receivedData.shape[0]) + + new_audio = cast(AudioInOutFloat, new_audio) + + new_audio = (new_audio * 32767.5).astype(np.int16) + return new_audio + except Exception as e: + traceback.print_exc() + raise RuntimeError(e) + + def getPipelineInfo(self): + return {"TODO": "LLVC get info"} + + def get_info(self): + data = asdict(self.settings) + + return data + + def get_processing_sampling_rate(self): + return self.processingSampleRate + + def get_model_current(self): + return [] diff --git a/server/voice_changer/LLVC/LLVCInferencer.py b/server/voice_changer/LLVC/LLVCInferencer.py new file mode 100644 index 00000000..faf8cb08 --- /dev/null +++ b/server/voice_changer/LLVC/LLVCInferencer.py @@ -0,0 +1,71 @@ +import numpy as np +import torch +import json +from voice_changer.LLVC.model.llvc import Net + +from voice_changer.utils.VoiceChangerModel import AudioInOutFloat + + +class LLVCInferencer: + def loadModel(self, checkpoint_path: str, config_path: str): + with open(config_path) as f: + config = json.load(f) + model = Net(**config["model_params"]) + model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"]) + + self.config = config + self.model = model + + self.enc_buf, self.dec_buf, self.out_buf = self.model.init_buffers(1, torch.device("cpu")) + + if hasattr(self.model, "convnet_pre"): + self.convnet_pre_ctx = self.model.convnet_pre.init_ctx_buf(1, torch.device("cpu")) + else: + self.convnet_pre_ctx = None + + self.audio_buffer: AudioInOutFloat = np.zeros(0, dtype=np.float32) + self.front_ctx: AudioInOutFloat | None = None + + return self + + def infer( + self, + audio: AudioInOutFloat, + ) -> torch.Tensor: + # print(f"[infer] inputsize:{audio.shape} + rest:{self.audio_buffer.shape}") + self.audio_buffer = np.concatenate([self.audio_buffer, audio]) + # print(f"[infer] concat size", self.audio_buffer.shape) + + try: + L = self.model.L + processing_unit = self.model.dec_chunk_size * L + chunk_size = (len(self.audio_buffer) // processing_unit) * processing_unit + + chunk = self.audio_buffer[:chunk_size] + self.audio_buffer = self.audio_buffer[chunk_size:] + + inputTensor = torch.from_numpy(chunk.astype(np.float32)).to("cpu") + + if self.front_ctx is None: + inputTensor = torch.cat([torch.zeros(L * 2), inputTensor]) + else: + inputTensor = torch.cat([self.front_ctx, inputTensor]) + self.front_ctx = inputTensor[-L * 2 :] + + audio1, self.enc_buf, self.dec_buf, self.out_buf, self.convnet_pre_ctx = self.model( + inputTensor.unsqueeze(0).unsqueeze(0), + self.enc_buf, + self.dec_buf, + self.out_buf, + self.convnet_pre_ctx, + pad=(not self.model.lookahead), + ) + # print(f"[infer] input chunk size {chunk.shape} ->(+32) lookaheadsize{inputTensor.shape}->(same chunk) inferedsize{audio1.shape}") + + audio1 = audio1.squeeze(0).squeeze(0) + return audio1 + except Exception as e: + raise RuntimeError(f"Exeption in 
{self.__class__.__name__}", e) + + # def isTorch(self): + # return True diff --git a/server/voice_changer/LLVC/LLVCModelSlotGenerator.py b/server/voice_changer/LLVC/LLVCModelSlotGenerator.py new file mode 100644 index 00000000..1d486ec3 --- /dev/null +++ b/server/voice_changer/LLVC/LLVCModelSlotGenerator.py @@ -0,0 +1,19 @@ +import os + +from data.ModelSlot import BeatriceModelSlot, LLVCModelSlot +from voice_changer.utils.LoadModelParams import LoadModelParams +from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator + + +class LLVCModelSlotGenerator(ModelSlotGenerator): + @classmethod + def loadModel(cls, props: LoadModelParams): + slotInfo: LLVCModelSlot = LLVCModelSlot() + for file in props.files: + if file.kind == "llvcModel": + slotInfo.modelFile = file.name + if file.kind == "llvcConfig": + slotInfo.configFile = file.name + slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0] + slotInfo.slotIndex = props.slot + return slotInfo diff --git a/server/voice_changer/LLVC/model/cached_convnet.py b/server/voice_changer/LLVC/model/cached_convnet.py new file mode 100644 index 00000000..6e152b85 --- /dev/null +++ b/server/voice_changer/LLVC/model/cached_convnet.py @@ -0,0 +1,156 @@ +# based on https://github.com/YangangCao/Causal-U-Net/blob/main/cunet.py +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + """ + Based on https://github.com/f90/Seq-U-Net/blob/master/sequnet_res.py + """ + + def __init__(self, in_channels, out_channels, kernel_size, dilation, dropout, use_2d): + super().__init__() + self.use_2d = use_2d + if use_2d: + self.filter = nn.Conv2d(in_channels, out_channels, kernel_size, dilation=dilation) + self.gate = nn.Conv2d(in_channels, out_channels, kernel_size, dilation=dilation) + self.dropout = nn.Dropout2d(dropout) + else: + self.filter = nn.Conv1d(in_channels, out_channels, kernel_size, dilation=dilation) + self.gate = nn.Conv1d(in_channels, out_channels, kernel_size, dilation=dilation) + self.dropout = nn.Dropout1d(dropout) + self.output_crop = dilation * (kernel_size - 1) + + def forward(self, x): + filtered = torch.tanh(self.filter(x)) + gated = torch.sigmoid(self.gate(x)) + residual = filtered * gated + # pad dim 1 of x to match residual + if self.use_2d: + x = F.pad(x, (0, 0, 0, 0, 0, residual.shape[1] - x.shape[1])) + output = x[..., self.output_crop :, self.output_crop :] + residual + else: + x = F.pad(x, (0, 0, 0, residual.shape[1] - x.shape[1])) + output = x[..., self.output_crop :] + residual + output = self.dropout(output) + return output + + +class CausalConvBlock(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, dilation, dropout, use_2d): + super().__init__() + if use_2d: + conv_layer = nn.Conv2d + batchnorm_layer = nn.BatchNorm2d + dropout_layer = nn.Dropout2d + else: + conv_layer = nn.Conv1d + batchnorm_layer = nn.BatchNorm1d + dropout_layer = nn.Dropout1d + self.conv = nn.Sequential( + conv_layer(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, dilation=dilation), + batchnorm_layer(num_features=out_channels), + dropout_layer(dropout), + nn.LeakyReLU(inplace=True), + ) + + def forward(self, x): + """ + 1D Causal convolution. 
+ """ + return self.conv(x) + + +class CachedConvNet(nn.Module): + def __init__(self, num_channels, kernel_sizes, dilations, dropout, combine_residuals, use_residual_blocks, out_channels, use_2d, use_pool=False, pool_kernel=2): + super().__init__() + assert len(kernel_sizes) == len(dilations), "kernel_sizes and dilations must be the same length" + assert len(kernel_sizes) == len(out_channels), "kernel_sizes and out_channels must be the same length" + self.num_layers = len(kernel_sizes) + self.ctx_height = max(out_channels) + self.down_convs = nn.ModuleList() + self.num_channels = num_channels + self.kernel_sizes = kernel_sizes + self.combine_residuals = combine_residuals + self.use_2d = use_2d + self.use_pool = use_pool + + # compute buffer lengths for each layer + self.buf_lengths = [(k - 1) * d for k, d in zip(kernel_sizes, dilations)] + + # Compute buffer start indices for each layer + self.buf_indices = [0] + for i in range(len(kernel_sizes) - 1): + self.buf_indices.append(self.buf_indices[-1] + self.buf_lengths[i]) + + if use_residual_blocks: + block = ResidualBlock + else: + block = CausalConvBlock + + if self.use_pool: + self.pool = nn.AvgPool1d(kernel_size=pool_kernel) + + for i in range(self.num_layers): + in_channel = num_channels if i == 0 else out_channels[i - 1] + self.down_convs.append(block(in_channels=in_channel, out_channels=out_channels[i], kernel_size=kernel_sizes[i], dilation=dilations[i], dropout=dropout, use_2d=use_2d)) + + def init_ctx_buf(self, batch_size, device, height=None): + """ + Initialize context buffer for each layer. + """ + if height is not None: + up_ctx = torch.zeros((batch_size, self.ctx_height, height, sum(self.buf_lengths))).to(device) + else: + up_ctx = torch.zeros((batch_size, self.ctx_height, sum(self.buf_lengths))).to(device) + return up_ctx + + def forward(self, x, ctx): + """ + Args: + x: [B, in_channels, T] + Input + ctx: {[B, channels, self.buf_length[0]], ...} + A list of tensors holding context for each unet layer. (len(ctx) == self.num_layers) + Returns: + x: [B, out_channels, T] + ctx: {[B, channels, self.buf_length[0]], ...} + Updated context buffer with output as the + last element. 
+ """ + if self.use_pool: + x = self.pool(x) + + for i in range(self.num_layers): + buf_start_idx = self.buf_indices[i] + buf_end_idx = self.buf_indices[i] + self.buf_lengths[i] + + # concatenate context buffer with input + if self.use_2d: + conv_in = torch.cat((ctx[..., : x.shape[1], : x.shape[-2], buf_start_idx:buf_end_idx], x), dim=-1) + else: + conv_in = torch.cat((ctx[..., : x.shape[-2], buf_start_idx:buf_end_idx], x), dim=-1) + + # Push current output to the context buffer + if self.use_2d: + ctx[..., : x.shape[1], : x.shape[-2], buf_start_idx:buf_end_idx] = conv_in[..., -self.buf_lengths[i] :] + else: + ctx[..., : x.shape[1], buf_start_idx:buf_end_idx] = conv_in[..., -self.buf_lengths[i] :] + + # pad second-to-last index of input with self.buf_lengths[i] // 2 zeros + # on each side to ensure that height of output is the same as input + if self.use_2d: + conv_in = F.pad(conv_in, (0, 0, self.buf_lengths[i] // 2, self.buf_lengths[i] // 2)) + + if self.combine_residuals == "add": + x = x + self.down_convs[i](conv_in) + elif self.combine_residuals == "multiply": + x = x * self.down_convs[i](conv_in) + else: + x = self.down_convs[i](conv_in) + + if self.use_pool: + x = F.interpolate(x, scale_factor=self.pool.kernel_size[0]) + + return x, ctx diff --git a/server/voice_changer/LLVC/model/llvc.py b/server/voice_changer/LLVC/model/llvc.py new file mode 100644 index 00000000..d55a48d6 --- /dev/null +++ b/server/voice_changer/LLVC/model/llvc.py @@ -0,0 +1,464 @@ +import math +from collections import OrderedDict +from typing import Optional + +from torch import Tensor +import torch +import torch.nn as nn +import torch.nn.functional as F + +from voice_changer.LLVC.model.cached_convnet import CachedConvNet + + +class PositionalEncoding(nn.Module): + """This class implements the absolute sinusoidal positional encoding function. + PE(pos, 2i) = sin(pos/(10000^(2i/dmodel))) + PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel))) + Arguments + --------- + input_size: int + Embedding dimension. + max_len : int, optional + Max length of the input sequences (default 2500). 
+ Example + ------- + >>> a = torch.rand((8, 120, 512)) + >>> enc = PositionalEncoding(input_size=a.shape[-1]) + >>> b = enc(a) + >>> b.shape + torch.Size([1, 120, 512]) + """ + + def __init__(self, input_size, max_len=2500): + super().__init__() + self.max_len = max_len + pe = torch.zeros(self.max_len, input_size, requires_grad=False) + positions = torch.arange(0, self.max_len).unsqueeze(1).float() + denominator = torch.exp(torch.arange(0, input_size, 2).float() * -(math.log(10000.0) / input_size)) + + pe[:, 0::2] = torch.sin(positions * denominator) + pe[:, 1::2] = torch.cos(positions * denominator) + pe = pe.unsqueeze(0) + self.register_buffer("pe", pe) + + def forward(self, x): + """ + Arguments + --------- + x : tensor + Input feature shape (batch, time, fea) + """ + return self.pe[:, : x.size(1)].clone().detach() + + +def mod_pad(x, chunk_size, pad): + # Mod pad the input to perform integer number of + # inferences + mod = 0 + if (x.shape[-1] % chunk_size) != 0: + mod = chunk_size - (x.shape[-1] % chunk_size) + + x = F.pad(x, (0, mod)) + x = F.pad(x, pad) + + return x, mod + + +class LayerNormPermuted(nn.LayerNorm): + def __init__(self, *args, **kwargs): + super(LayerNormPermuted, self).__init__(*args, **kwargs) + + def forward(self, x): + """ + Args: + x: [B, C, T] + """ + x = x.permute(0, 2, 1) # [B, T, C] + x = super().forward(x) + x = x.permute(0, 2, 1) # [B, C, T] + return x + + +class DepthwiseSeparableConv(nn.Module): + """ + Depthwise separable convolutions + """ + + def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation): + super(DepthwiseSeparableConv, self).__init__() + + self.layers = nn.Sequential( + nn.Conv1d(in_channels, in_channels, kernel_size, stride, padding, groups=in_channels, dilation=dilation), + LayerNormPermuted(in_channels), + nn.ReLU(), + nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1, padding=0), + LayerNormPermuted(out_channels), + nn.ReLU(), + ) + + def forward(self, x): + return self.layers(x) + + +class DilatedCausalConvEncoder(nn.Module): + """ + A dilated causal convolution based encoder for encoding + time domain audio input into latent space. + """ + + def __init__(self, channels, num_layers, kernel_size=3): + super(DilatedCausalConvEncoder, self).__init__() + self.channels = channels + self.num_layers = num_layers + self.kernel_size = kernel_size + + # Compute buffer lengths for each layer + # buf_length[i] = (kernel_size - 1) * dilation[i] + self.buf_lengths = [(kernel_size - 1) * 2**i for i in range(num_layers)] + + # Compute buffer start indices for each layer + self.buf_indices = [0] + for i in range(num_layers - 1): + self.buf_indices.append(self.buf_indices[-1] + self.buf_lengths[i]) + + # Dilated causal conv layers aggregate previous context to obtain + # contexful encoded input. + _dcc_layers = OrderedDict() + for i in range(num_layers): + dcc_layer = DepthwiseSeparableConv(channels, channels, kernel_size=3, stride=1, padding=0, dilation=2**i) + _dcc_layers.update({"dcc_%d" % i: dcc_layer}) + self.dcc_layers = nn.Sequential(_dcc_layers) + + def init_ctx_buf(self, batch_size, device): + """ + Returns an initialized context buffer for a given batch size. + """ + return torch.zeros((batch_size, self.channels, (self.kernel_size - 1) * (2**self.num_layers - 1)), device=device) + + def forward(self, x, ctx_buf): + """ + Encodes input audio `x` into latent space, and aggregates + contextual information in `ctx_buf`. Also generates new context + buffer with updated context. 
+ Args: + x: [B, in_channels, T] + Input multi-channel audio. + ctx_buf: {[B, channels, self.buf_length[0]], ...} + A list of tensors holding context for each dilation + causal conv layer. (len(ctx_buf) == self.num_layers) + Returns: + ctx_buf: {[B, channels, self.buf_length[0]], ...} + Updated context buffer with output as the + last element. + """ + T = x.shape[-1] # Sequence length # noqa + + for i in range(self.num_layers): + buf_start_idx = self.buf_indices[i] + buf_end_idx = self.buf_indices[i] + self.buf_lengths[i] + + # DCC input: concatenation of current output and context + dcc_in = torch.cat((ctx_buf[..., buf_start_idx:buf_end_idx], x), dim=-1) + + # Push current output to the context buffer + ctx_buf[..., buf_start_idx:buf_end_idx] = dcc_in[..., -self.buf_lengths[i] :] + + # Residual connection + x = x + self.dcc_layers[i](dcc_in) + + return x, ctx_buf + + +class CausalTransformerDecoderLayer(torch.nn.TransformerDecoderLayer): + """ + Adapted from: + "https://github.com/alexmt-scale/causal-transformer-decoder/blob/" + "0caf6ad71c46488f76d89845b0123d2550ef792f/" + "causal_transformer_decoder/model.py#L77" + """ + + def forward2(self, tgt: Tensor, memory: Optional[Tensor] = None, chunk_size: int = 1): + tgt_last_tok = tgt[:, -chunk_size:, :] + + # self attention part + tmp_tgt, sa_map = self.self_attn( + tgt_last_tok, + tgt, + tgt, + attn_mask=None, # not needed because we only care about the last token + key_padding_mask=None, + ) + tgt_last_tok = tgt_last_tok + self.dropout1(tmp_tgt) + tgt_last_tok = self.norm1(tgt_last_tok) + + # encoder-decoder attention + if memory is not None: + tmp_tgt, ca_map = self.multihead_attn( + tgt_last_tok, + memory, + memory, + attn_mask=None, # Attend to the entire chunk + key_padding_mask=None, + ) + tgt_last_tok = tgt_last_tok + self.dropout2(tmp_tgt) + tgt_last_tok = self.norm2(tgt_last_tok) + + # final feed-forward network + tmp_tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt_last_tok)))) + tgt_last_tok = tgt_last_tok + self.dropout3(tmp_tgt) + tgt_last_tok = self.norm3(tgt_last_tok) + return tgt_last_tok, sa_map, ca_map + + +class CausalTransformerDecoder(nn.Module): + """ + A casual transformer decoder which decodes input vectors using + precisely `ctx_len` past vectors in the sequence, and using no future + vectors at all. + """ + + def __init__(self, model_dim, ctx_len, chunk_size, num_layers, nhead, use_pos_enc, ff_dim, dropout): + super(CausalTransformerDecoder, self).__init__() + self.num_layers = num_layers + self.model_dim = model_dim + self.ctx_len = ctx_len + self.chunk_size = chunk_size + self.nhead = nhead + self.use_pos_enc = use_pos_enc + self.unfold = nn.Unfold(kernel_size=(ctx_len + chunk_size, 1), stride=chunk_size) + self.pos_enc = PositionalEncoding(model_dim, max_len=200) + self.tf_dec_layers = nn.ModuleList([CausalTransformerDecoderLayer(d_model=model_dim, nhead=nhead, dim_feedforward=ff_dim, batch_first=True, dropout=dropout) for _ in range(num_layers)]) + + def init_ctx_buf(self, batch_size, device): + return torch.zeros((batch_size, self.num_layers + 1, self.ctx_len, self.model_dim), device=device) + + def _causal_unfold(self, x): + """ + Unfolds the sequence into a batch of sequences + prepended with `ctx_len` previous values. 
+ + Args: + x: [B, ctx_len + L, C] + ctx_len: int + Returns: + [B * L, ctx_len + 1, C] + """ + B, T, C = x.shape + x = x.permute(0, 2, 1) # [B, C, ctx_len + L] + x = self.unfold(x.unsqueeze(-1)) # [B, C * (ctx_len + chunk_size), -1] + x = x.permute(0, 2, 1) + x = x.reshape(B, -1, C, self.ctx_len + self.chunk_size) + x = x.reshape(-1, C, self.ctx_len + self.chunk_size) + x = x.permute(0, 2, 1) + return x + + def forward(self, tgt, mem, ctx_buf, probe=False): + """ + Args: + x: [B, model_dim, T] + ctx_buf: [B, num_layers, model_dim, ctx_len] + """ + mem, _ = mod_pad(mem, self.chunk_size, (0, 0)) + tgt, mod = mod_pad(tgt, self.chunk_size, (0, 0)) + + # Input sequence length + B, C, T = tgt.shape + + tgt = tgt.permute(0, 2, 1) + mem = mem.permute(0, 2, 1) + + # Prepend mem with the context + mem = torch.cat((ctx_buf[:, 0, :, :], mem), dim=1) + ctx_buf[:, 0, :, :] = mem[:, -self.ctx_len :, :] + mem_ctx = self._causal_unfold(mem) + if self.use_pos_enc: + mem_ctx = mem_ctx + self.pos_enc(mem_ctx) + + # Attention chunk size: required to ensure the model + # wouldn't trigger an out-of-memory error when working + # on long sequences. + K = 1000 + + for i, tf_dec_layer in enumerate(self.tf_dec_layers): + # Update the tgt with context + tgt = torch.cat((ctx_buf[:, i + 1, :, :], tgt), dim=1) + ctx_buf[:, i + 1, :, :] = tgt[:, -self.ctx_len :, :] + + # Compute encoded output + tgt_ctx = self._causal_unfold(tgt) + if self.use_pos_enc and i == 0: + tgt_ctx = tgt_ctx + self.pos_enc(tgt_ctx) + tgt = torch.zeros_like(tgt_ctx)[:, -self.chunk_size :, :] + for i in range(int(math.ceil(tgt.shape[0] / K))): + tgt[i * K : (i + 1) * K], _sa_map, _ca_map = tf_dec_layer.forward2(tgt_ctx[i * K : (i + 1) * K], mem_ctx[i * K : (i + 1) * K], self.chunk_size) + tgt = tgt.reshape(B, T, C) + + tgt = tgt.permute(0, 2, 1) + if mod != 0: + tgt = tgt[..., :-mod] + + return tgt, ctx_buf + + +class MaskNet(nn.Module): + def __init__(self, enc_dim, num_enc_layers, dec_dim, dec_buf_len, dec_chunk_size, num_dec_layers, use_pos_enc, skip_connection, proj, decoder_dropout): + super(MaskNet, self).__init__() + self.skip_connection = skip_connection + self.proj = proj + + # Encoder based on dilated causal convolutions. + self.encoder = DilatedCausalConvEncoder(channels=enc_dim, num_layers=num_enc_layers) + + # Project between encoder and decoder dimensions + self.proj_e2d_e = nn.Sequential(nn.Conv1d(enc_dim, dec_dim, kernel_size=1, stride=1, padding=0, groups=dec_dim), nn.ReLU()) + self.proj_e2d_l = nn.Sequential(nn.Conv1d(enc_dim, dec_dim, kernel_size=1, stride=1, padding=0, groups=dec_dim), nn.ReLU()) + self.proj_d2e = nn.Sequential(nn.Conv1d(dec_dim, enc_dim, kernel_size=1, stride=1, padding=0, groups=dec_dim), nn.ReLU()) + + # Transformer decoder that operates on chunks of size + # buffer size. + + self.decoder = CausalTransformerDecoder(model_dim=dec_dim, ctx_len=dec_buf_len, chunk_size=dec_chunk_size, num_layers=num_dec_layers, nhead=8, use_pos_enc=use_pos_enc, ff_dim=2 * dec_dim, dropout=decoder_dropout) + + def forward(self, x, l, enc_buf, dec_buf): # noqa + """ + Generates a mask based on encoded input `e` and the one-hot + label `label`. 
+ + Args: + x: [B, C, T] + Input audio sequence + l: [B, C] + Label embedding + ctx_buf: {[B, C, ], ...} + List of context buffers maintained by DCC encoder + """ + # Enocder the label integrated input + e, enc_buf = self.encoder(x, enc_buf) + + # Label integration + l = l.unsqueeze(2) * e # noqa + + # Project to `dec_dim` dimensions + if self.proj: + e = self.proj_e2d_e(e) + m = self.proj_e2d_l(l) + # Cross-attention to predict the mask + m, dec_buf = self.decoder(m, e, dec_buf) + else: + # Cross-attention to predict the mask + m, dec_buf = self.decoder(l, e, dec_buf) + + # Project mask to encoder dimensions + if self.proj: + m = self.proj_d2e(m) + + # Final mask after residual connection + if self.skip_connection: + m = l + m + + return m, enc_buf, dec_buf + + +class Net(nn.Module): + def __init__(self, label_len, L=8, enc_dim=512, num_enc_layers=10, dec_dim=256, dec_buf_len=100, num_dec_layers=2, dec_chunk_size=72, out_buf_len=2, use_pos_enc=True, skip_connection=True, proj=True, lookahead=True, decoder_dropout=0.0, convnet_config=None): + super(Net, self).__init__() + self.L = L + self.dec_chunk_size = dec_chunk_size + self.out_buf_len = out_buf_len + self.enc_dim = enc_dim + self.lookahead = lookahead + + self.convnet_config = convnet_config + if convnet_config["convnet_prenet"]: + self.convnet_pre = CachedConvNet(1, convnet_config["kernel_sizes"], convnet_config["dilations"], convnet_config["dropout"], convnet_config["combine_residuals"], convnet_config["use_residual_blocks"], convnet_config["out_channels"], use_2d=False) + + # Input conv to convert input audio to a latent representation + kernel_size = 3 * L if lookahead else L + self.in_conv = nn.Sequential(nn.Conv1d(in_channels=1, out_channels=enc_dim, kernel_size=kernel_size, stride=L, padding=0, bias=False), nn.ReLU()) + + # Label embedding layer + label_len = 1 + self.label_embedding = nn.Sequential(nn.Linear(label_len, 512), nn.LayerNorm(512), nn.ReLU(), nn.Linear(512, enc_dim), nn.LayerNorm(enc_dim), nn.ReLU()) + + # Mask generator + self.mask_gen = MaskNet(enc_dim=enc_dim, num_enc_layers=num_enc_layers, dec_dim=dec_dim, dec_buf_len=dec_buf_len, dec_chunk_size=dec_chunk_size, num_dec_layers=num_dec_layers, use_pos_enc=use_pos_enc, skip_connection=skip_connection, proj=proj, decoder_dropout=decoder_dropout) + + # Output conv layer + self.out_conv = nn.Sequential(nn.ConvTranspose1d(in_channels=enc_dim, out_channels=1, kernel_size=(out_buf_len + 1) * L, stride=L, padding=out_buf_len * L, bias=False), nn.Tanh()) + + def init_buffers(self, batch_size, device): + enc_buf = self.mask_gen.encoder.init_ctx_buf(batch_size, device) + dec_buf = self.mask_gen.decoder.init_ctx_buf(batch_size, device) + out_buf = torch.zeros(batch_size, self.enc_dim, self.out_buf_len, device=device) + return enc_buf, dec_buf, out_buf + + def forward(self, x, init_enc_buf=None, init_dec_buf=None, init_out_buf=None, convnet_pre_ctx=None, pad=True): + """ + Extracts the audio corresponding to the `label` in the given + `mixture`. Generates `chunk_size` samples per iteration. 
+ + Args: + mixed: [B, n_mics, T] + input audio mixture + label: [B, num_labels] + one hot label + Returns: + out: [B, n_spk, T] + extracted audio with sounds corresponding to the `label` + """ + label = torch.zeros(x.shape[0], 1, device=x.device) + mod = 0 + if pad: + pad_size = (self.L, self.L) if self.lookahead else (0, 0) + x, mod = mod_pad(x, chunk_size=self.L, pad=pad_size) + + if hasattr(self, "convnet_pre"): + if convnet_pre_ctx is None: + convnet_pre_ctx = self.convnet_pre.init_ctx_buf(x.shape[0], x.device) + + convnet_out, convnet_pre_ctx = self.convnet_pre(x, convnet_pre_ctx) + + if self.convnet_config["skip_connection"] == "add": + x = x + convnet_out + elif self.convnet_config["skip_connection"] == "multiply": + x = x * convnet_out + else: + x = convnet_out + + if init_enc_buf is None or init_dec_buf is None or init_out_buf is None: + assert init_enc_buf is None and init_dec_buf is None and init_out_buf is None, "Both buffers have to initialized, or " "both of them have to be None." + enc_buf, dec_buf, out_buf = self.init_buffers(x.shape[0], x.device) + else: + ( + enc_buf, + dec_buf, + out_buf, + ) = ( + init_enc_buf, + init_dec_buf, + init_out_buf, + ) + + # Generate latent space representation of the input + x = self.in_conv(x) + + # Generate label embedding + l = self.label_embedding(label) # [B, label_len] --> [B, channels] # noqa + + # Generate mask corresponding to the label + m, enc_buf, dec_buf = self.mask_gen(x, l, enc_buf, dec_buf) + + # Apply mask and decode + x = x * m + x = torch.cat((out_buf, x), dim=-1) + out_buf = x[..., -self.out_buf_len :] + x = self.out_conv(x) + + # Remove mod padding, if present. + if mod != 0: + x = x[:, :, :-mod] + + if init_enc_buf is None: + return x + else: + return x, enc_buf, dec_buf, out_buf, convnet_pre_ctx diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index 0f6498e9..f14c1617 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -199,6 +199,13 @@ class VoiceChangerManager(ServerDeviceCallbacks): slotInfo = BeatriceModelSlotGenerator.loadModel(params) self.modelSlotManager.save_model_slot(params.slot, slotInfo) + + elif params.voiceChangerType == "LLVC": + from voice_changer.LLVC.LLVCModelSlotGenerator import LLVCModelSlotGenerator + + slotInfo = LLVCModelSlotGenerator.loadModel(params) + self.modelSlotManager.save_model_slot(params.slot, slotInfo) + logger.info(f"params, {params}") def get_info(self): @@ -291,6 +298,14 @@ class VoiceChangerManager(ServerDeviceCallbacks): self.voiceChangerModel = Beatrice(self.params, slotInfo) self.voiceChanger = VoiceChangerV2(self.params) self.voiceChanger.setModel(self.voiceChangerModel) + elif slotInfo.voiceChangerType == "LLVC": + logger.info("................LLVC") + from voice_changer.LLVC.LLVC import LLVC + + self.voiceChangerModel = LLVC(self.params, slotInfo) + self.voiceChanger = VoiceChangerV2(self.params) + self.voiceChanger.setModel(self.voiceChangerModel) + pass else: logger.info(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}") diff --git a/server/voice_changer/VoiceChangerV2.py b/server/voice_changer/VoiceChangerV2.py index 81e473b7..8bf72fca 100644 --- a/server/voice_changer/VoiceChangerV2.py +++ b/server/voice_changer/VoiceChangerV2.py @@ -90,22 +90,16 @@ class VoiceChangerV2(VoiceChangerIF): self.params = params self.gpu_num = torch.cuda.device_count() self.prev_audio = np.zeros(4096) - self.mps_enabled: bool = ( - getattr(torch.backends, 
"mps", None) is not None - and torch.backends.mps.is_available() - ) + self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available() self.onnx_device = onnxruntime.get_device() self.noCrossFade = False - logger.info( - f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})" - ) + logger.info(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})") def setModel(self, model: VoiceChangerModel): self.voiceChanger = model - self.voiceChanger.setSamplingRate( - self.settings.inputSampleRate, self.settings.outputSampleRate - ) + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + # if model.voiceChangerType == "Beatrice" or model.voiceChangerType == "LLVC": if model.voiceChangerType == "Beatrice": self.noCrossFade = True else: @@ -113,15 +107,11 @@ class VoiceChangerV2(VoiceChangerIF): def setInputSampleRate(self, sr: int): self.settings.inputSampleRate = sr - self.voiceChanger.setSamplingRate( - self.settings.inputSampleRate, self.settings.outputSampleRate - ) + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) def setOutputSampleRate(self, sr: int): self.settings.outputSampleRate = sr - self.voiceChanger.setSamplingRate( - self.settings.inputSampleRate, self.settings.outputSampleRate - ) + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) def get_info(self): data = asdict(self.settings) @@ -140,9 +130,7 @@ class VoiceChangerV2(VoiceChangerIF): if key == "serverAudioStated" and val == 0: self.settings.inputSampleRate = 48000 self.settings.outputSampleRate = 48000 - self.voiceChanger.setSamplingRate( - self.settings.inputSampleRate, self.settings.outputSampleRate - ) + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) if key in self.settings.intData: setattr(self.settings, key, int(val)) @@ -156,6 +144,7 @@ class VoiceChangerV2(VoiceChangerIF): STREAM_OUTPUT_FILE, self.settings.inputSampleRate, self.settings.outputSampleRate, + # 16000, ) if key == "recordIO" and val == 0: if hasattr(self, "ioRecorder"): @@ -165,9 +154,7 @@ class VoiceChangerV2(VoiceChangerIF): if hasattr(self, "ioRecorder"): self.ioRecorder.close() if key == "inputSampleRate" or key == "outputSampleRate": - self.voiceChanger.setSamplingRate( - self.settings.inputSampleRate, self.settings.outputSampleRate - ) + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) elif key in self.settings.floatData: setattr(self.settings, key, float(val)) elif key in self.settings.strData: @@ -180,12 +167,7 @@ class VoiceChangerV2(VoiceChangerIF): return self.get_info() def _generate_strength(self, crossfadeSize: int): - if ( - self.crossfadeSize != crossfadeSize - or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate - or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate - or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize - ): + if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize: self.crossfadeSize = crossfadeSize self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate 
self.currentCrossFadeEndRate = self.settings.crossFadeEndRate @@ -214,9 +196,7 @@ class VoiceChangerV2(VoiceChangerIF): ] ) - logger.info( - f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}" - ) + logger.info(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}") # ひとつ前の結果とサイズが変わるため、記録は消去する。 if hasattr(self, "np_prev_audio1") is True: @@ -231,21 +211,15 @@ class VoiceChangerV2(VoiceChangerIF): return self.voiceChanger.get_processing_sampling_rate() # receivedData: tuple of short - def on_request( - self, receivedData: AudioInOut - ) -> tuple[AudioInOut, list[Union[int, float]]]: + def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]: try: if self.voiceChanger is None: - raise VoiceChangerIsNotSelectedException( - "Voice Changer is not selected." - ) + raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.") with Timer("main-process") as t: - processing_sampling_rate = ( - self.voiceChanger.get_processing_sampling_rate() - ) + processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate() - if self.noCrossFade: # Beatrice + if self.noCrossFade: # Beatrice, LLVC audio = self.voiceChanger.inference( receivedData, crossfade_frame=0, @@ -257,9 +231,7 @@ class VoiceChangerV2(VoiceChangerIF): else: sola_search_frame = int(0.012 * processing_sampling_rate) block_frame = receivedData.shape[0] - crossfade_frame = min( - self.settings.crossFadeOverlapSize, block_frame - ) + crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame) self._generate_strength(crossfade_frame) audio = self.voiceChanger.inference( @@ -270,9 +242,7 @@ class VoiceChangerV2(VoiceChangerIF): if hasattr(self, "sola_buffer") is True: np.set_printoptions(threshold=10000) - audio_offset = -1 * ( - sola_search_frame + crossfade_frame + block_frame - ) + audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame) audio = audio[audio_offset:] # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI @@ -297,25 +267,16 @@ class VoiceChangerV2(VoiceChangerIF): result = output_wav else: - logger.info( - "[Voice Changer] warming up... generating sola buffer." - ) + logger.info("[Voice Changer] warming up... 
generating sola buffer.") result = np.zeros(4096).astype(np.int16) - if ( - hasattr(self, "sola_buffer") is True - and sola_offset < sola_search_frame - ): - offset = -1 * ( - sola_search_frame + crossfade_frame - sola_offset - ) + if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame: + offset = -1 * (sola_search_frame + crossfade_frame - sola_offset) end = -1 * (sola_search_frame - sola_offset) sola_buf_org = audio[offset:end] self.sola_buffer = sola_buf_org * self.np_prev_strength else: - self.sola_buffer = ( - audio[-crossfade_frame:] * self.np_prev_strength - ) + self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength # self.sola_buffer = audio[- crossfade_frame:] mainprocess_time = t.secs @@ -324,12 +285,15 @@ class VoiceChangerV2(VoiceChangerIF): with Timer("post-process") as t: result = result.astype(np.int16) - print_convert_processing( - f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz" - ) + print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz") if receivedData.shape[0] != result.shape[0]: - outputData = pad_array(result, receivedData.shape[0]) + # print("TODO FIX:::::PADDING", receivedData.shape[0], result.shape[0]) + if self.voiceChanger.voiceChangerType == "LLVC": + outputData = result + else: + outputData = pad_array(result, receivedData.shape[0]) + pass else: outputData = result @@ -340,9 +304,7 @@ class VoiceChangerV2(VoiceChangerIF): postprocess_time = t.secs - print_convert_processing( - f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}" - ) + print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}") perf = [0, mainprocess_time, postprocess_time] return outputData, perf @@ -351,9 +313,7 @@ class VoiceChangerV2(VoiceChangerIF): logger.warn(f"[Voice Changer] [Exception], {e}") return np.zeros(1).astype(np.int16), [0, 0, 0] except ONNXInputArgumentException as e: - logger.warn( - f"[Voice Changer] [Exception] onnx are waiting valid input., {e}" - ) + logger.warn(f"[Voice Changer] [Exception] onnx are waiting valid input., {e}") return np.zeros(1).astype(np.int16), [0, 0, 0] except HalfPrecisionChangingException: logger.warn("[Voice Changer] Switching model configuration....") @@ -365,9 +325,7 @@ class VoiceChangerV2(VoiceChangerIF): logger.warn(f"[Voice Changer] embedder: {e}") return np.zeros(1).astype(np.int16), [0, 0, 0] except VoiceChangerIsNotSelectedException: - logger.warn( - "[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc." - ) + logger.warn("[Voice Changer] Voice Changer is not selected. 
Wait a bit and if there is no improvement, please re-select vc.") return np.zeros(1).astype(np.int16), [0, 0, 0] except DeviceCannotSupportHalfPrecisionException: # Fallback handling is done in RVC.py, so this just returns dummy data. diff --git a/server/voice_changer/utils/LoadModelParams.py b/server/voice_changer/utils/LoadModelParams.py index d548a897..f9a6358c 100644 --- a/server/voice_changer/utils/LoadModelParams.py +++ b/server/voice_changer/utils/LoadModelParams.py @@ -20,6 +20,8 @@ LoadModelParamFileKind: TypeAlias = Literal[ "ddspSvcDiffusionConfig", "diffusionSVCModel", "beatriceModel", + "llvcModel", + "llvcConfig", ] diff --git a/server/voice_changer/utils/VoiceChangerModel.py b/server/voice_changer/utils/VoiceChangerModel.py index 1a3ad4b2..4e722ac0 100644 --- a/server/voice_changer/utils/VoiceChangerModel.py +++ b/server/voice_changer/utils/VoiceChangerModel.py @@ -6,6 +6,8 @@ from voice_changer.utils.LoadModelParams import LoadModelParams AudioInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] +AudioInOutFloat: TypeAlias = np.ndarray[Any, np.dtype[np.float32]] + PitchfInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]] FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
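The patch routes every incoming block through LLVC._preprocess() before inference: downmix to mono, resample from the client rate to the 16 kHz processing rate, then a 5th-order Butterworth high-pass applied with filtfilt. A minimal standalone sketch of that chain, using the same constants as LLVC.__init__ and a random placeholder signal:

import numpy as np
import resampy
from scipy import signal

PROC_SR = 16000  # LLVC processing rate
bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=PROC_SR)

def preprocess(waveform: np.ndarray, src_sr: int) -> np.ndarray:
    if waveform.ndim == 2:                           # stereo -> mono
        waveform = waveform.mean(axis=-1)
    wav16k = resampy.resample(waveform, src_sr, PROC_SR)
    wav16k = signal.filtfilt(bh, ah, wav16k)         # zero-phase high-pass
    return wav16k.copy()

x = (np.random.randn(48000, 2) * 0.1).astype(np.float32)   # placeholder: 1 s of 48 kHz stereo
print(preprocess(x, 48000).shape)                    # (16000,)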
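LLVCInferencer.infer() is a streaming interface: it appends each incoming chunk to an internal buffer, consumes only a multiple of dec_chunk_size * L samples per call, and threads the encoder/decoder/output buffers plus the 2*L-sample front context across calls. A hedged usage sketch, assuming the patch is applied and a trained checkpoint/config pair exists; "llvc_model.pth" and "config.json" are placeholder paths:

import numpy as np
from voice_changer.LLVC.LLVCInferencer import LLVCInferencer

inferencer = LLVCInferencer().loadModel("llvc_model.pth", "config.json")

stream = np.zeros(16000 * 3, dtype=np.float32)       # placeholder: 3 s of 16 kHz audio
out_chunks = []
for start in range(0, len(stream), 512):             # arbitrary client block size
    converted = inferencer.infer(stream[start:start + 512])   # returns a torch.Tensor
    out_chunks.append(converted.detach().cpu().numpy())
converted_audio = np.concatenate(out_chunks)
# Per-call output length can differ from the input length; whatever does not
# fill a full dec_chunk_size * L unit stays buffered for the next call.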
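When crossfading is enabled, LLVC.inference() receives crossfade_frame and sola_search_frame expressed at the output rate and converts them to the 16 kHz processing rate with math.ceil before deciding how much converted audio to keep in prev_audio1 for the next block. A small sketch of that arithmetic; 4096 and 576 are illustrative 48 kHz frame counts, not values fixed by the patch:

import math

PROC_SR = 16000
OUT_SR = 48000

def to_processing_rate(frames_at_output_rate: int) -> int:
    return math.ceil(frames_at_output_rate / OUT_SR * PROC_SR)

print(to_processing_rate(4096))   # crossfade overlap  -> 1366
print(to_processing_rate(576))    # SOLA search window -> 192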
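On the model side, DilatedCausalConvEncoder caches (kernel_size - 1) * 2**i samples for layer i, and init_ctx_buf() allocates one flat buffer whose length is the sum of those per-layer lengths, which telescopes to (kernel_size - 1) * (2**num_layers - 1). A quick check with the defaults used by Net (kernel_size=3, num_enc_layers=10):

kernel_size = 3
num_layers = 10
buf_lengths = [(kernel_size - 1) * 2**i for i in range(num_layers)]
assert sum(buf_lengths) == (kernel_size - 1) * (2**num_layers - 1)
print(sum(buf_lengths))   # 2046 cached samples per batch element and channel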