mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-01-23 13:35:12 +03:00
Experimental LLVC
This commit is contained in:
parent
1e68e01e39
commit
dadab1ad13
@ -116,6 +116,15 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
|
||||
return x.kind == "beatriceModel";
|
||||
});
|
||||
return enough;
|
||||
} else if (setting.voiceChangerType == "LLVC") {
|
||||
const enough =
|
||||
!!setting.files.find((x) => {
|
||||
return x.kind == "llvcModel";
|
||||
}) &&
|
||||
!!setting.files.find((x) => {
|
||||
return x.kind == "llvcConfig";
|
||||
});
|
||||
return enough;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
@ -177,6 +186,9 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
|
||||
rows.push(generateFileRow(uploadSetting!, "Model(combo)", "diffusionSVCModel", ["ptc"]));
|
||||
} else if (vcType == "Beatrice") {
|
||||
rows.push(generateFileRow(uploadSetting!, "Beatrice", "beatriceModel", ["bin"]));
|
||||
} else if (vcType == "LLVC") {
|
||||
rows.push(generateFileRow(uploadSetting!, "Model", "llvcModel", ["pth"]));
|
||||
rows.push(generateFileRow(uploadSetting!, "Config", "llvcConfig", ["json"]));
|
||||
}
|
||||
return rows;
|
||||
};
|
||||
|
@ -58,6 +58,7 @@ export class VoiceChangerClient {
|
||||
|
||||
// const ctx44k = new AudioContext({ sampleRate: 44100 }) // これでもプチプチが残る
|
||||
const ctx44k = new AudioContext({ sampleRate: 48000 }); // 結局これが一番まし。
|
||||
// const ctx44k = new AudioContext({ sampleRate: 16000 }); // LLVCテスト⇒16K出力でプチプチなしで行ける。
|
||||
console.log("audio out:", ctx44k);
|
||||
try {
|
||||
this.vcOutNode = new VoiceChangerWorkletNode(ctx44k, voiceChangerWorkletListener); // vc node
|
||||
|
@ -11,6 +11,7 @@ export const VoiceChangerType = {
|
||||
RVC: "RVC",
|
||||
"Diffusion-SVC": "Diffusion-SVC",
|
||||
Beatrice: "Beatrice",
|
||||
LLVC: "LLVC",
|
||||
} as const;
|
||||
export type VoiceChangerType = (typeof VoiceChangerType)[keyof typeof VoiceChangerType];
|
||||
|
||||
@ -37,6 +38,9 @@ export const ModelSamplingRate = {
|
||||
export type ModelSamplingRate = (typeof InputSampleRate)[keyof typeof InputSampleRate];
|
||||
|
||||
export const CrossFadeOverlapSize = {
|
||||
"128": 128,
|
||||
"256": 256,
|
||||
"512": 512,
|
||||
"1024": 1024,
|
||||
"2048": 2048,
|
||||
"4096": 4096,
|
||||
@ -296,7 +300,14 @@ export type BeatriceModelSlot = ModelSlot & {
|
||||
speakers: { [key: number]: string };
|
||||
};
|
||||
|
||||
export type ModelSlotUnion = RVCModelSlot | MMVCv13ModelSlot | MMVCv15ModelSlot | SoVitsSvc40ModelSlot | DDSPSVCModelSlot | DiffusionSVCModelSlot | BeatriceModelSlot;
|
||||
/**
 * Model-slot description for the "LLVC" voice changer type.
 *
 * Extends the common `ModelSlot` fields with the two files an LLVC slot is
 * uploaded with (kinds `llvcModel` and `llvcConfig`).
 */
export type LLVCModelSlot = ModelSlot & {
    /** Name of the model file stored for this slot. */
    modelFile: string;
    /** Name of the config file stored for this slot. */
    configFile: string;

    // NOTE(review): the server-side LLVCModelSlot dataclass declares no
    // `speakers` field, so this may never be populated -- confirm before relying on it.
    speakers: Record<number, string>;
};
|
||||
|
||||
export type ModelSlotUnion = RVCModelSlot | MMVCv13ModelSlot | MMVCv15ModelSlot | SoVitsSvc40ModelSlot | DDSPSVCModelSlot | DiffusionSVCModelSlot | BeatriceModelSlot | LLVCModelSlot;
|
||||
|
||||
type ServerAudioDevice = {
|
||||
kind: "audioinput" | "audiooutput";
|
||||
|
@ -29,6 +29,9 @@ export const ModelFileKind = {
|
||||
diffusionSVCModel: "diffusionSVCModel",
|
||||
|
||||
beatriceModel: "beatriceModel",
|
||||
|
||||
llvcModel: "llvcModel",
|
||||
llvcConfig: "llvcConfig",
|
||||
} as const;
|
||||
export type ModelFileKind = (typeof ModelFileKind)[keyof typeof ModelFileKind];
|
||||
|
||||
|
@ -13,6 +13,7 @@ VoiceChangerType: TypeAlias = Literal[
|
||||
"RVC",
|
||||
"Diffusion-SVC",
|
||||
"Beatrice",
|
||||
"LLVC",
|
||||
]
|
||||
|
||||
StaticSlot: TypeAlias = Literal["Beatrice-JVS",]
|
||||
|
@ -134,7 +134,24 @@ class BeatriceModelSlot(ModelSlot):
|
||||
speakers: dict = field(default_factory=lambda: {1: "user1", 2: "user2"})
|
||||
|
||||
|
||||
ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot, BeatriceModelSlot]
|
||||
@dataclass
class LLVCModelSlot(ModelSlot):
    # Slot metadata for an LLVC model; extends the generic ModelSlot fields.

    # Discriminator that loadSlotInfo() matches on to pick this slot class.
    voiceChangerType: VoiceChangerType = "LLVC"
    # Name of the model file inside the slot directory ("" until one is uploaded).
    modelFile: str = ""
    # Name of the config file inside the slot directory ("" until one is uploaded).
    configFile: str = ""
|
||||
|
||||
|
||||
ModelSlots: TypeAlias = Union[
|
||||
ModelSlot,
|
||||
RVCModelSlot,
|
||||
MMVCv13ModelSlot,
|
||||
MMVCv15ModelSlot,
|
||||
SoVitsSvc40ModelSlot,
|
||||
DDSPSVCModelSlot,
|
||||
DiffusionSVCModelSlot,
|
||||
BeatriceModelSlot,
|
||||
LLVCModelSlot,
|
||||
]
|
||||
|
||||
|
||||
def loadSlotInfo(model_dir: str, slotIndex: int | StaticSlot) -> ModelSlots:
|
||||
@ -165,10 +182,12 @@ def loadSlotInfo(model_dir: str, slotIndex: int | StaticSlot) -> ModelSlots:
|
||||
return DiffusionSVCModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey})
|
||||
elif slotInfo.voiceChangerType == "Beatrice":
|
||||
slotInfoKey.extend(list(BeatriceModelSlot.__annotations__.keys()))
|
||||
if slotIndex == "Beatrice-JVS":
|
||||
if slotIndex == "Beatrice-JVS": # STATIC Model
|
||||
return BeatriceModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey})
|
||||
|
||||
return BeatriceModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey})
|
||||
elif slotInfo.voiceChangerType == "LLVC":
|
||||
slotInfoKey.extend(list(LLVCModelSlot.__annotations__.keys()))
|
||||
return LLVCModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey})
|
||||
else:
|
||||
return ModelSlot()
|
||||
|
||||
|
198
server/voice_changer/LLVC/LLVC.py
Normal file
198
server/voice_changer/LLVC/LLVC.py
Normal file
@ -0,0 +1,198 @@
|
||||
import traceback
|
||||
from typing import Any, cast
|
||||
from scipy import signal
|
||||
import os
|
||||
from dataclasses import dataclass, asdict, field
|
||||
import resampy
|
||||
from data.ModelSlot import LLVCModelSlot
|
||||
from mods.log_control import VoiceChangaerLogger
|
||||
import numpy as np
|
||||
from voice_changer.LLVC.LLVCInferencer import LLVCInferencer
|
||||
from voice_changer.ModelSlotManager import ModelSlotManager
|
||||
from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager
|
||||
from voice_changer.utils.Timer import Timer2
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut, AudioInOutFloat, VoiceChangerModel
|
||||
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||
import math
|
||||
import torchaudio
|
||||
import torch
|
||||
|
||||
logger = VoiceChangaerLogger.get_instance().getLogger()
|
||||
|
||||
|
||||
@dataclass
class LLVCSetting:
    # Runtime settings of the LLVC voice changer.
    #
    # Crossfade (CF) / Resample (RE) combinations:
    #   CF:True,  RE:True  -> usable from the browser
    #   CF:True,  RE:False -> N/A, pointless (without resampling the audio does not crackle)
    #   CF:False, RE:True  -> N/A, output crackles (failed in both client and server mode)
    #   CF:False, RE:False -> fine as long as the playback side supports 16 kHz

    crossfade: bool = True
    resample: bool = True

    # Names of the settings that updateSetting() may change, grouped by the
    # type the incoming value is coerced to. All empty: nothing is tunable yet.
    intData: list[str] = field(default_factory=list)
    floatData: list[str] = field(default_factory=list)
    strData: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
class LLVC(VoiceChangerModel):
    """Voice changer backend that runs an LLVC model at a fixed 16 kHz processing rate.

    Incoming int16 audio is converted to float, down-mixed, resampled to
    16 kHz and high-pass filtered, pushed through ``LLVCInferencer`` and then
    (optionally) extended with crossfade context and resampled to the output rate.
    """

    def __init__(self, params: VoiceChangerParams, slotInfo: LLVCModelSlot):
        """Store the parameters, build the high-pass filter and load the model.

        Args:
            params: global voice changer parameters (``model_dir`` is read here).
            slotInfo: slot metadata naming the model/config files to load.
        """
        logger.info("[Voice Changer] [LLVC] Creating instance ")
        self.voiceChangerType = "LLVC"
        self.settings = LLVCSetting()

        self.processingSampleRate = 16000
        # 5th-order Butterworth high-pass (48 Hz cut-off) applied to the 16 kHz input.
        bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=self.processingSampleRate)
        self.bh = bh
        self.ah = ah

        self.params = params
        self.slotInfo = slotInfo
        self.modelSlotManager = ModelSlotManager.get_instance(self.params.model_dir)

        # Crossfade / resampling mode.
        # To emit 16 kHz directly, set both flags to False instead:
        #   self.settings.crossfade = False
        #   self.settings.resample = False
        # Current mode: 48 kHz output.
        self.settings.crossfade = True
        self.settings.resample = True

        self.initialize()

    def initialize(self):
        """Resolve the slot's file paths, set default sample rates and load the inferencer."""
        logger.info("[Voice Changer] [LLVC] Initializing... ")
        vcparams = VoiceChangerParamsManager.get_instance().params
        slotDir = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex))
        configPath = os.path.join(slotDir, self.slotInfo.configFile)
        modelPath = os.path.join(slotDir, self.slotInfo.modelFile)

        self.inputSampleRate = 48000
        self.outputSampleRate = 48000

        # Kept for the (currently commented-out) torchaudio resampling path;
        # the active path resamples with resampy.
        self.downsampler = torchaudio.transforms.Resample(self.inputSampleRate, self.processingSampleRate)
        self.upsampler = torchaudio.transforms.Resample(self.processingSampleRate, self.outputSampleRate)

        self.inferencer = LLVCInferencer().loadModel(modelPath, configPath)
        self.prev_audio1 = None
        self.result_buff = None

    def updateSetting(self, key: str, val: Any):
        """Update one setting, coercing ``val`` to the type its group requires.

        Returns:
            True if ``key`` is listed in intData/floatData/strData and was
            updated, False if the key is unknown.
        """
        if key in self.settings.intData:
            setattr(self.settings, key, int(val))
            ret = True
        elif key in self.settings.floatData:
            setattr(self.settings, key, float(val))
            ret = True
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
            ret = True
        else:
            ret = False

        return ret

    def setSamplingRate(self, inputSampleRate, outputSampleRate):
        """Set the I/O sample rates and rebuild the torchaudio resamplers to match."""
        self.inputSampleRate = inputSampleRate
        self.outputSampleRate = outputSampleRate
        self.downsampler = torchaudio.transforms.Resample(self.inputSampleRate, self.processingSampleRate)
        self.upsampler = torchaudio.transforms.Resample(self.processingSampleRate, self.outputSampleRate)

    def _preprocess(self, waveform: AudioInOutFloat, srcSampleRate: int) -> AudioInOutFloat:
        """Pre-process the input audio (torch independent).

        Steps: average a 2-D input over its last axis, resample from
        ``srcSampleRate`` to the processing rate, apply the high-pass filter.

        Args:
            waveform: input audio as a float array.
            srcSampleRate: sample rate of ``waveform``.

        Returns:
            The pre-processed 1-D audio at the processing rate (np.ndarray copy).
        """
        if waveform.ndim == 2:  # two-dimensional input: average over the last axis
            waveform = waveform.mean(axis=-1)
        waveform16K = resampy.resample(waveform, srcSampleRate, self.processingSampleRate)
        # waveform16K = self.downsampler(torch.from_numpy(waveform)).numpy()
        waveform16K = signal.filtfilt(self.bh, self.ah, waveform16K)
        # filtfilt can hand back a negatively-strided view; copy() yields a contiguous array.
        return waveform16K.copy()

    @staticmethod
    def _to_int16(audio):
        """Convert float audio to int16 PCM, clipping to [-1, 1] first.

        Without the clip, samples outside [-1, 1] overflow the int16 range in
        ``astype`` and wrap around, which is audible as loud clicks.
        """
        return (np.clip(audio, -1.0, 1.0) * 32767.5).astype(np.int16)

    def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int):
        """Convert one chunk of int16 audio.

        Args:
            receivedData: input samples (scaled by 1/32768 to float here).
            crossfade_frame: crossfade length in output-rate samples.
            sola_search_frame: SOLA search length in output-rate samples.

        Returns:
            int16 audio. With crossfade and resample both disabled this is the
            raw model output at the processing rate; otherwise it is the
            previous tail plus the new output, resampled to the output rate
            when that differs from the processing rate.

        Raises:
            RuntimeError: wraps any exception raised during processing.
        """
        try:
            # Crossfade / SOLA lengths are given at the output rate; rescale to 16 kHz.
            crossfade_frame16k = math.ceil((crossfade_frame / self.outputSampleRate) * self.processingSampleRate)
            sola_search_frame16k = math.ceil((sola_search_frame / self.outputSampleRate) * self.processingSampleRate)

            with Timer2("mainPorcess timer", False):
                # Resampling and Butterworth filter (torch independent)
                receivedData = receivedData.astype(np.float32) / 32768.0
                waveformFloat = self._preprocess(receivedData, self.inputSampleRate)

                # Inference
                audio1 = self.inferencer.infer(waveformFloat)
                audio1 = audio1.detach().cpu().numpy()

                if self.settings.crossfade is False and self.settings.resample is False:
                    # Return the converted audio as-is (no crossfade data, no resampling).
                    return self._to_int16(audio1)

                # (1) Prepend the tail kept from the previous call as crossfade material.
                crossfade_audio_length = audio1.shape[0] + crossfade_frame16k + sola_search_frame16k
                if self.prev_audio1 is not None:
                    new_audio = np.concatenate([self.prev_audio1, audio1])
                else:
                    new_audio = audio1
                self.prev_audio1 = new_audio[-crossfade_audio_length:]  # kept for the next crossfade

                # (2) Resample to the output rate.
                if self.outputSampleRate != self.processingSampleRate:
                    new_audio = resampy.resample(new_audio, self.processingSampleRate, self.outputSampleRate)
                    # new_audio = self.upsampler(torch.from_numpy(new_audio)).numpy()

                # Output buffering has to happen at the top level (after the
                # crossfade is finished), so it is intentionally not done here.

                new_audio = cast(AudioInOutFloat, new_audio)
                return self._to_int16(new_audio)
        except Exception as e:
            traceback.print_exc()
            raise RuntimeError(e) from e

    def getPipelineInfo(self):
        """Return pipeline information (placeholder)."""
        return {"TODO": "LLVC get info"}

    def get_info(self):
        """Return the current settings as a plain dict."""
        data = asdict(self.settings)

        return data

    def get_processing_sampling_rate(self):
        """Return the sample rate the model runs at."""
        return self.processingSampleRate

    def get_model_current(self):
        """Return the per-model current values exposed to the client (none for LLVC)."""
        return []
|
71
server/voice_changer/LLVC/LLVCInferencer.py
Normal file
71
server/voice_changer/LLVC/LLVCInferencer.py
Normal file
@ -0,0 +1,71 @@
|
||||
import numpy as np
|
||||
import torch
|
||||
import json
|
||||
from voice_changer.LLVC.model.llvc import Net
|
||||
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOutFloat
|
||||
|
||||
|
||||
class LLVCInferencer:
    """Stateful streaming wrapper around the LLVC ``Net`` model (CPU only).

    Audio is consumed in multiples of ``model.dec_chunk_size * model.L``
    samples; any remainder is kept in ``audio_buffer`` for the next call. The
    model's streaming buffers are carried from call to call.
    """

    def loadModel(self, checkpoint_path: str, config_path: str):
        """Build the model from a JSON config, load its weights and reset streaming state.

        Args:
            checkpoint_path: checkpoint file whose ``"model"`` entry is the state dict.
            config_path: JSON file whose ``"model_params"`` entry holds the ``Net`` kwargs.

        Returns:
            self, so the call can be chained after the constructor.
        """
        with open(config_path) as f:
            config = json.load(f)
        model = Net(**config["model_params"])
        # NOTE(security): torch.load unpickles the file; only load checkpoints from trusted sources.
        model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"])
        # Inference only: disable dropout and make batch norm use its running statistics.
        model.eval()

        self.config = config
        self.model = model

        self.enc_buf, self.dec_buf, self.out_buf = self.model.init_buffers(1, torch.device("cpu"))

        if hasattr(self.model, "convnet_pre"):
            self.convnet_pre_ctx = self.model.convnet_pre.init_ctx_buf(1, torch.device("cpu"))
        else:
            self.convnet_pre_ctx = None

        # Input samples not yet consumed because they did not fill a whole processing unit.
        self.audio_buffer: AudioInOutFloat = np.zeros(0, dtype=np.float32)
        # Last 2*L samples fed to the model; prepended to the next chunk.
        self.front_ctx: torch.Tensor | None = None

        return self

    def infer(
        self,
        audio: "AudioInOutFloat",
    ) -> torch.Tensor:
        """Run the model on as much buffered audio as forms whole processing units.

        Args:
            audio: new 1-D float samples, appended to the internal buffer.

        Returns:
            The model output with its two leading dimensions squeezed away.

        Raises:
            RuntimeError: wraps any exception raised while running the model.
        """
        self.audio_buffer = np.concatenate([self.audio_buffer, audio])

        try:
            L = self.model.L
            processing_unit = self.model.dec_chunk_size * L
            chunk_size = (len(self.audio_buffer) // processing_unit) * processing_unit

            chunk = self.audio_buffer[:chunk_size]
            self.audio_buffer = self.audio_buffer[chunk_size:]

            inputTensor = torch.from_numpy(chunk.astype(np.float32)).to("cpu")

            # Prepend 2*L samples of context: zeros on the first call, otherwise
            # the tail of the previous model input.
            if self.front_ctx is None:
                inputTensor = torch.cat([torch.zeros(L * 2), inputTensor])
            else:
                inputTensor = torch.cat([self.front_ctx, inputTensor])
            self.front_ctx = inputTensor[-L * 2 :]

            # no_grad: the buffers are fed back on every call, so with autograd
            # enabled each call would extend a graph that is never released.
            with torch.no_grad():
                audio1, self.enc_buf, self.dec_buf, self.out_buf, self.convnet_pre_ctx = self.model(
                    inputTensor.unsqueeze(0).unsqueeze(0),
                    self.enc_buf,
                    self.dec_buf,
                    self.out_buf,
                    self.convnet_pre_ctx,
                    pad=(not self.model.lookahead),
                )

            audio1 = audio1.squeeze(0).squeeze(0)
            return audio1
        except Exception as e:
            raise RuntimeError(f"Exeption in {self.__class__.__name__}", e) from e
|
||||
|
||||
# def isTorch(self):
|
||||
# return True
|
19
server/voice_changer/LLVC/LLVCModelSlotGenerator.py
Normal file
19
server/voice_changer/LLVC/LLVCModelSlotGenerator.py
Normal file
@ -0,0 +1,19 @@
|
||||
import os
|
||||
|
||||
from data.ModelSlot import BeatriceModelSlot, LLVCModelSlot
|
||||
from voice_changer.utils.LoadModelParams import LoadModelParams
|
||||
from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator
|
||||
|
||||
|
||||
class LLVCModelSlotGenerator(ModelSlotGenerator):
    """Builds the slot metadata for a freshly uploaded LLVC model."""

    @classmethod
    def loadModel(cls, props: LoadModelParams):
        """Create an ``LLVCModelSlot`` from the uploaded files.

        The file of kind ``llvcModel`` becomes ``modelFile`` and the file of
        kind ``llvcConfig`` becomes ``configFile``; the slot name is the model
        file's base name without its extension.
        """
        # Upload kind -> slot attribute that records the file name.
        attr_for_kind = {"llvcModel": "modelFile", "llvcConfig": "configFile"}

        slotInfo: LLVCModelSlot = LLVCModelSlot()
        for uploaded in props.files:
            target = attr_for_kind.get(uploaded.kind)
            if target is not None:
                setattr(slotInfo, target, uploaded.name)

        stem, _ext = os.path.splitext(os.path.basename(slotInfo.modelFile))
        slotInfo.name = stem
        slotInfo.slotIndex = props.slot
        return slotInfo
|
156
server/voice_changer/LLVC/model/cached_convnet.py
Normal file
156
server/voice_changer/LLVC/model/cached_convnet.py
Normal file
@ -0,0 +1,156 @@
|
||||
# based on https://github.com/YangangCao/Causal-U-Net/blob/main/cunet.py
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class ResidualBlock(nn.Module):
    """Gated convolution block (tanh filter * sigmoid gate) with a cropped skip connection.

    Based on https://github.com/f90/Seq-U-Net/blob/master/sequnet_res.py
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation, dropout, use_2d):
        super().__init__()
        self.use_2d = use_2d
        conv_cls, dropout_cls = (nn.Conv2d, nn.Dropout2d) if use_2d else (nn.Conv1d, nn.Dropout1d)
        # Created in the order filter -> gate -> dropout, as the attribute names are state-dict keys.
        self.filter = conv_cls(in_channels, out_channels, kernel_size, dilation=dilation)
        self.gate = conv_cls(in_channels, out_channels, kernel_size, dilation=dilation)
        self.dropout = dropout_cls(dropout)
        # The unpadded convolutions shorten the output by this many samples.
        self.output_crop = dilation * (kernel_size - 1)

    def forward(self, x):
        """Return ``dropout(crop(pad_channels(x)) + tanh(filter(x)) * sigmoid(gate(x)))``."""
        residual = torch.tanh(self.filter(x)) * torch.sigmoid(self.gate(x))
        # Zero-pad dim 1 of x so the skip path has as many channels as the residual.
        extra_channels = residual.shape[1] - x.shape[1]
        if self.use_2d:
            x = F.pad(x, (0, 0, 0, 0, 0, extra_channels))
            skip = x[..., self.output_crop :, self.output_crop :]
        else:
            x = F.pad(x, (0, 0, 0, extra_channels))
            skip = x[..., self.output_crop :]
        return self.dropout(skip + residual)
|
||||
|
||||
|
||||
class CausalConvBlock(nn.Module):
    """Convolution -> batch norm -> dropout -> LeakyReLU, in a 1-D or 2-D flavour.

    The convolution is unpadded; the caller is expected to prepend the context
    that makes it causal.
    """

    def __init__(self, in_channels, out_channels, kernel_size, dilation, dropout, use_2d):
        super().__init__()
        if use_2d:
            conv_cls, norm_cls, drop_cls = nn.Conv2d, nn.BatchNorm2d, nn.Dropout2d
        else:
            conv_cls, norm_cls, drop_cls = nn.Conv1d, nn.BatchNorm1d, nn.Dropout1d
        stages = [
            conv_cls(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, dilation=dilation),
            norm_cls(num_features=out_channels),
            drop_cls(dropout),
            nn.LeakyReLU(inplace=True),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the convolution stack to ``x``."""
        return self.conv(x)
|
||||
|
||||
|
||||
class CachedConvNet(nn.Module):
    """Stack of unpadded convolution blocks that is made streamable by a context buffer.

    Every layer owns a slice of one shared context tensor holding the last
    ``(kernel_size - 1) * dilation`` input samples it saw, so consecutive
    chunks can be processed one after another.
    """

    def __init__(self, num_channels, kernel_sizes, dilations, dropout, combine_residuals, use_residual_blocks, out_channels, use_2d, use_pool=False, pool_kernel=2):
        super().__init__()
        assert len(kernel_sizes) == len(dilations), "kernel_sizes and dilations must be the same length"
        assert len(kernel_sizes) == len(out_channels), "kernel_sizes and out_channels must be the same length"
        self.num_layers = len(kernel_sizes)
        self.ctx_height = max(out_channels)
        self.down_convs = nn.ModuleList()
        self.num_channels = num_channels
        self.kernel_sizes = kernel_sizes
        self.combine_residuals = combine_residuals
        self.use_2d = use_2d
        self.use_pool = use_pool

        # Context length needed by each layer.
        self.buf_lengths = [(k - 1) * d for k, d in zip(kernel_sizes, dilations)]

        # Start offset of each layer's slice inside the shared context buffer
        # (running sum of the preceding layers' lengths).
        self.buf_indices = [0]
        for length in self.buf_lengths[:-1]:
            self.buf_indices.append(self.buf_indices[-1] + length)

        block = ResidualBlock if use_residual_blocks else CausalConvBlock

        if self.use_pool:
            self.pool = nn.AvgPool1d(kernel_size=pool_kernel)

        previous_channels = num_channels
        for i in range(self.num_layers):
            self.down_convs.append(block(in_channels=previous_channels, out_channels=out_channels[i], kernel_size=kernel_sizes[i], dilation=dilations[i], dropout=dropout, use_2d=use_2d))
            previous_channels = out_channels[i]

    def init_ctx_buf(self, batch_size, device, height=None):
        """Return a zeroed context buffer shared by all layers.

        Shape is ``(batch, ctx_height, sum(buf_lengths))``, with an extra
        ``height`` dimension before the last one when ``height`` is given.
        """
        total_length = sum(self.buf_lengths)
        if height is not None:
            shape = (batch_size, self.ctx_height, height, total_length)
        else:
            shape = (batch_size, self.ctx_height, total_length)
        return torch.zeros(shape).to(device)

    def forward(self, x, ctx):
        """Run all layers on ``x`` and update ``ctx`` in place.

        Args:
            x: input, ``[B, in_channels, T]`` in the 1-D case.
            ctx: context buffer as created by ``init_ctx_buf``.

        Returns:
            ``(x, ctx)``: the output and the same, updated, context buffer.
        """
        if self.use_pool:
            x = self.pool(x)

        for layer, start, length in zip(self.down_convs, self.buf_indices, self.buf_lengths):
            window = slice(start, start + length)

            if self.use_2d:
                # Prepend this layer's cached context, then cache the new tail.
                conv_in = torch.cat((ctx[..., : x.shape[1], : x.shape[-2], window], x), dim=-1)
                ctx[..., : x.shape[1], : x.shape[-2], window] = conv_in[..., -length:]
                # Pad the second-to-last dim by length // 2 on each side so the
                # output height matches the input height.
                half = length // 2
                conv_in = F.pad(conv_in, (0, 0, half, half))
            else:
                conv_in = torch.cat((ctx[..., : x.shape[-2], window], x), dim=-1)
                ctx[..., : x.shape[1], window] = conv_in[..., -length:]

            layer_out = layer(conv_in)
            if self.combine_residuals == "add":
                x = x + layer_out
            elif self.combine_residuals == "multiply":
                x = x * layer_out
            else:
                x = layer_out

        if self.use_pool:
            x = F.interpolate(x, scale_factor=self.pool.kernel_size[0])

        return x, ctx
|
464
server/voice_changer/LLVC/model/llvc.py
Normal file
464
server/voice_changer/LLVC/model/llvc.py
Normal file
@ -0,0 +1,464 @@
|
||||
import math
|
||||
from collections import OrderedDict
|
||||
from typing import Optional
|
||||
|
||||
from torch import Tensor
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from voice_changer.LLVC.model.cached_convnet import CachedConvNet
|
||||
|
||||
|
||||
class PositionalEncoding(nn.Module):
    """This class implements the absolute sinusoidal positional encoding function.
    PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
    PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))

    Note that ``forward`` returns the encoding itself (with a batch dimension
    of 1); it does not add it to the input.

    Arguments
    ---------
    input_size: int
        Embedding dimension (even or odd).
    max_len : int, optional
        Max length of the input sequences (default 2500).
    Example
    -------
    >>> a = torch.rand((8, 120, 512))
    >>> enc = PositionalEncoding(input_size=a.shape[-1])
    >>> b = enc(a)
    >>> b.shape
    torch.Size([1, 120, 512])
    """

    def __init__(self, input_size, max_len=2500):
        super().__init__()
        self.max_len = max_len
        pe = torch.zeros(self.max_len, input_size, requires_grad=False)
        positions = torch.arange(0, self.max_len).unsqueeze(1).float()
        denominator = torch.exp(torch.arange(0, input_size, 2).float() * -(math.log(10000.0) / input_size))
        angles = positions * denominator

        pe[:, 0::2] = torch.sin(angles)
        # For an odd input_size there is one cosine column fewer than sine
        # columns; trim so the assignment shapes match.
        pe[:, 1::2] = torch.cos(angles[:, : input_size // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """
        Arguments
        ---------
        x : tensor
            Input feature shape (batch, time, fea)

        Returns
        -------
        tensor
            Detached copy of the first ``x.size(1)`` positions, shape (1, time, fea).
        """
        return self.pe[:, : x.size(1)].clone().detach()
|
||||
|
||||
|
||||
def mod_pad(x, chunk_size, pad):
    """Right-pad the last dim of ``x`` to a multiple of ``chunk_size``, then apply ``pad``.

    Padding to a whole number of chunks lets the caller run an integer number
    of inferences.

    Returns:
        ``(padded_x, mod)`` where ``mod`` is the number of samples added to
        reach the multiple (0 when the length already was one).
    """
    remainder = x.shape[-1] % chunk_size
    mod = chunk_size - remainder if remainder != 0 else 0
    x = F.pad(F.pad(x, (0, mod)), pad)
    return x, mod
|
||||
|
||||
|
||||
class LayerNormPermuted(nn.LayerNorm):
    """``nn.LayerNorm`` for channel-first input: normalises over C of a [B, C, T] tensor."""

    def forward(self, x):
        """
        Args:
            x: [B, C, T]
        Returns:
            Tensor of the same [B, C, T] layout, layer-normalised over C.
        """
        channels_last = x.permute(0, 2, 1)  # [B, T, C]
        normalised = super().forward(channels_last)
        return normalised.permute(0, 2, 1)  # back to [B, C, T]
|
||||
|
||||
|
||||
class DepthwiseSeparableConv(nn.Module):
    """
    Depthwise separable convolution: a per-channel (grouped) convolution
    followed by a 1x1 pointwise convolution, each with layer norm and ReLU.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation):
        super().__init__()

        depthwise = nn.Conv1d(in_channels, in_channels, kernel_size, stride, padding, groups=in_channels, dilation=dilation)
        pointwise = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.layers = nn.Sequential(
            depthwise,
            LayerNormPermuted(in_channels),
            nn.ReLU(),
            pointwise,
            LayerNormPermuted(out_channels),
            nn.ReLU(),
        )

    def forward(self, x):
        """Apply the depthwise and pointwise stages to ``x``."""
        return self.layers(x)
|
||||
|
||||
|
||||
class DilatedCausalConvEncoder(nn.Module):
    """
    A dilated causal convolution based encoder for encoding
    time domain audio input into latent space.

    Layer ``i`` uses dilation ``2**i`` and keeps the last
    ``(kernel_size - 1) * 2**i`` samples of its input in a shared context
    buffer, so audio can be encoded chunk by chunk.
    """

    def __init__(self, channels, num_layers, kernel_size=3):
        super(DilatedCausalConvEncoder, self).__init__()
        self.channels = channels
        self.num_layers = num_layers
        self.kernel_size = kernel_size

        # Compute buffer lengths for each layer
        # buf_length[i] = (kernel_size - 1) * dilation[i]
        self.buf_lengths = [(kernel_size - 1) * 2**i for i in range(num_layers)]

        # Compute buffer start indices for each layer
        self.buf_indices = [0]
        for i in range(num_layers - 1):
            self.buf_indices.append(self.buf_indices[-1] + self.buf_lengths[i])

        # Dilated causal conv layers aggregate previous context to obtain
        # contextful encoded input.
        _dcc_layers = OrderedDict()
        for i in range(num_layers):
            # The conv kernel must match the kernel_size the buffer lengths were
            # derived from; otherwise the residual addition in forward() gets
            # operands of different lengths.
            dcc_layer = DepthwiseSeparableConv(channels, channels, kernel_size=kernel_size, stride=1, padding=0, dilation=2**i)
            _dcc_layers.update({"dcc_%d" % i: dcc_layer})
        self.dcc_layers = nn.Sequential(_dcc_layers)

    def init_ctx_buf(self, batch_size, device):
        """
        Returns an initialized (zeroed) context buffer for a given batch size.
        Its last dimension, (kernel_size - 1) * (2**num_layers - 1), is the sum
        of all per-layer buffer lengths.
        """
        return torch.zeros((batch_size, self.channels, (self.kernel_size - 1) * (2**self.num_layers - 1)), device=device)

    def forward(self, x, ctx_buf):
        """
        Encodes input audio `x` into latent space, and aggregates
        contextual information in `ctx_buf`. Also updates the context
        buffer in place.
        Args:
            x: [B, in_channels, T]
                Input multi-channel audio.
            ctx_buf: [B, channels, sum(self.buf_lengths)]
                Context buffer as created by `init_ctx_buf`; each layer owns
                the slice starting at `self.buf_indices[i]`.
        Returns:
            x: encoded output, same shape as the input.
            ctx_buf: the updated context buffer.
        """
        for i in range(self.num_layers):
            buf_start_idx = self.buf_indices[i]
            buf_end_idx = self.buf_indices[i] + self.buf_lengths[i]

            # DCC input: concatenation of current output and context
            dcc_in = torch.cat((ctx_buf[..., buf_start_idx:buf_end_idx], x), dim=-1)

            # Push current output to the context buffer
            ctx_buf[..., buf_start_idx:buf_end_idx] = dcc_in[..., -self.buf_lengths[i] :]

            # Residual connection
            x = x + self.dcc_layers[i](dcc_in)

        return x, ctx_buf
|
||||
|
||||
|
||||
class CausalTransformerDecoderLayer(torch.nn.TransformerDecoderLayer):
    """
    Transformer decoder layer that only computes outputs for the last
    `chunk_size` target positions.

    Adapted from:
    "https://github.com/alexmt-scale/causal-transformer-decoder/blob/"
    "0caf6ad71c46488f76d89845b0123d2550ef792f/"
    "causal_transformer_decoder/model.py#L77"
    """

    def forward2(self, tgt: Tensor, memory: Optional[Tensor] = None, chunk_size: int = 1):
        """
        Args:
            tgt: target sequence; indexed as [batch, time, feature].
            memory: optional encoder output to cross-attend to.
            chunk_size: number of trailing target positions to decode.
        Returns:
            (output for the last `chunk_size` positions,
             self-attention map,
             cross-attention map, or None when `memory` is None)
        """
        tgt_last_tok = tgt[:, -chunk_size:, :]

        # self attention part
        tmp_tgt, sa_map = self.self_attn(
            tgt_last_tok,
            tgt,
            tgt,
            attn_mask=None,  # not needed because we only care about the last token
            key_padding_mask=None,
        )
        tgt_last_tok = tgt_last_tok + self.dropout1(tmp_tgt)
        tgt_last_tok = self.norm1(tgt_last_tok)

        # encoder-decoder attention
        # Defined up front so the return below is valid when memory is None.
        ca_map = None
        if memory is not None:
            tmp_tgt, ca_map = self.multihead_attn(
                tgt_last_tok,
                memory,
                memory,
                attn_mask=None,  # Attend to the entire chunk
                key_padding_mask=None,
            )
            tgt_last_tok = tgt_last_tok + self.dropout2(tmp_tgt)
            tgt_last_tok = self.norm2(tgt_last_tok)

        # final feed-forward network
        tmp_tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt_last_tok))))
        tgt_last_tok = tgt_last_tok + self.dropout3(tmp_tgt)
        tgt_last_tok = self.norm3(tgt_last_tok)
        return tgt_last_tok, sa_map, ca_map
|
||||
|
||||
|
||||
class CausalTransformerDecoder(nn.Module):
    """
    A causal transformer decoder which decodes input vectors using
    precisely `ctx_len` past vectors in the sequence, and using no future
    vectors at all.

    The time axis is cut into windows of `ctx_len + chunk_size` steps that
    advance by `chunk_size`; every window is handed to the decoder layers
    independently. The `ctx_len` steps preceding the current call are
    carried between calls in a context buffer (see `init_ctx_buf`).
    """

    def __init__(self, model_dim, ctx_len, chunk_size, num_layers, nhead, use_pos_enc, ff_dim, dropout):
        super(CausalTransformerDecoder, self).__init__()
        self.num_layers = num_layers
        self.model_dim = model_dim
        self.ctx_len = ctx_len
        self.chunk_size = chunk_size
        self.nhead = nhead
        self.use_pos_enc = use_pos_enc
        # Sliding-window extractor over the time axis: window length is
        # ctx_len + chunk_size and consecutive windows are chunk_size apart.
        self.unfold = nn.Unfold(kernel_size=(ctx_len + chunk_size, 1), stride=chunk_size)
        # NOTE(review): max_len is fixed at 200 here; presumably the window
        # length (ctx_len + chunk_size) must not exceed it - confirm against
        # PositionalEncoding.
        self.pos_enc = PositionalEncoding(model_dim, max_len=200)
        self.tf_dec_layers = nn.ModuleList([CausalTransformerDecoderLayer(d_model=model_dim, nhead=nhead, dim_feedforward=ff_dim, batch_first=True, dropout=dropout) for _ in range(num_layers)])

    def init_ctx_buf(self, batch_size, device):
        """
        Create a zero-filled context buffer.

        Returns:
            Tensor of shape [batch_size, num_layers + 1, ctx_len, model_dim].
            Slot 0 holds the context of the memory (`mem`) stream and slot
            `i + 1` holds the context of the input to decoder layer `i`.
        """
        return torch.zeros((batch_size, self.num_layers + 1, self.ctx_len, self.model_dim), device=device)

    def _causal_unfold(self, x):
        """
        Unfolds the sequence into a batch of overlapping windows, each one
        made of `ctx_len` previous values followed by `chunk_size` new ones.

        Args:
            x: [B, ctx_len + L, C]
        Returns:
            [B * n_windows, ctx_len + chunk_size, C], with one window per
            `chunk_size` step along the `L` new values.
        """
        B, T, C = x.shape
        window = self.ctx_len + self.chunk_size
        x = x.permute(0, 2, 1)  # [B, C, ctx_len + L]
        x = self.unfold(x.unsqueeze(-1))  # [B, C * (ctx_len + chunk_size), n_windows]
        x = x.permute(0, 2, 1)  # [B, n_windows, C * (ctx_len + chunk_size)]
        x = x.reshape(B, -1, C, window)
        x = x.reshape(-1, C, window)  # [B * n_windows, C, ctx_len + chunk_size]
        x = x.permute(0, 2, 1)  # [B * n_windows, ctx_len + chunk_size, C]
        return x

    def forward(self, tgt, mem, ctx_buf, probe=False):
        """
        Args:
            tgt: [B, model_dim, T]
                Sequence to decode.
            mem: [B, model_dim, T]
                Sequence the decoder layers attend to.
            ctx_buf: [B, num_layers + 1, ctx_len, model_dim]
                Context buffer as created by `init_ctx_buf`. It is updated
                IN PLACE with the last `ctx_len` steps of every stream.
            probe: unused by this method.
        Returns:
            (tgt, ctx_buf): the decoded sequence, [B, model_dim, T], and the
            updated context buffer (the same tensor object that was passed).
        """
        # NOTE(review): mod_pad is defined elsewhere; presumably it pads the
        # time axis up to a multiple of chunk_size and returns the amount of
        # padding added as its second value - confirm.
        mem, _ = mod_pad(mem, self.chunk_size, (0, 0))
        tgt, mod = mod_pad(tgt, self.chunk_size, (0, 0))

        # Input sequence length
        B, C, T = tgt.shape

        tgt = tgt.permute(0, 2, 1)  # [B, T, C]
        mem = mem.permute(0, 2, 1)  # [B, T, C]

        # Prepend mem with the context and remember its tail for the next call
        mem = torch.cat((ctx_buf[:, 0, :, :], mem), dim=1)
        ctx_buf[:, 0, :, :] = mem[:, -self.ctx_len :, :]
        mem_ctx = self._causal_unfold(mem)
        if self.use_pos_enc:
            mem_ctx = mem_ctx + self.pos_enc(mem_ctx)

        # Attention chunk size: required to ensure the model
        # wouldn't trigger an out-of-memory error when working
        # on long sequences.
        K = 1000

        for layer_idx, tf_dec_layer in enumerate(self.tf_dec_layers):
            # Update the tgt with context
            tgt = torch.cat((ctx_buf[:, layer_idx + 1, :, :], tgt), dim=1)
            ctx_buf[:, layer_idx + 1, :, :] = tgt[:, -self.ctx_len :, :]

            # Compute encoded output
            tgt_ctx = self._causal_unfold(tgt)
            # Positional encoding is only added to the input of the first layer.
            if self.use_pos_enc and layer_idx == 0:
                tgt_ctx = tgt_ctx + self.pos_enc(tgt_ctx)

            # Output holder: one chunk_size-long slice per window.
            tgt = torch.zeros_like(tgt_ctx)[:, -self.chunk_size :, :]
            # Run the layer on at most K windows at a time. A dedicated loop
            # variable is used so the layer index above is never shadowed.
            for start in range(0, tgt.shape[0], K):
                stop = start + K
                tgt[start:stop], _sa_map, _ca_map = tf_dec_layer.forward2(tgt_ctx[start:stop], mem_ctx[start:stop], self.chunk_size)
            tgt = tgt.reshape(B, T, C)

        tgt = tgt.permute(0, 2, 1)  # back to [B, C, T]
        # Drop whatever mod_pad appended at the end of the time axis.
        if mod != 0:
            tgt = tgt[..., :-mod]

        return tgt, ctx_buf
|
||||
|
||||
|
||||
class MaskNet(nn.Module):
    """
    Mask generator: a dilated causal convolution encoder followed by a
    causal transformer decoder, with optional 1x1 grouped-convolution
    projections between the encoder and decoder channel widths.
    """

    def __init__(self, enc_dim, num_enc_layers, dec_dim, dec_buf_len, dec_chunk_size, num_dec_layers, use_pos_enc, skip_connection, proj, decoder_dropout):
        super(MaskNet, self).__init__()
        self.skip_connection = skip_connection
        self.proj = proj

        # Encoder based on dilated causal convolutions.
        self.encoder = DilatedCausalConvEncoder(channels=enc_dim, num_layers=num_enc_layers)

        # Project between encoder and decoder dimensions.
        # These layers are always created, but only used when `proj` is True.
        self.proj_e2d_e = nn.Sequential(nn.Conv1d(enc_dim, dec_dim, kernel_size=1, stride=1, padding=0, groups=dec_dim), nn.ReLU())
        self.proj_e2d_l = nn.Sequential(nn.Conv1d(enc_dim, dec_dim, kernel_size=1, stride=1, padding=0, groups=dec_dim), nn.ReLU())
        self.proj_d2e = nn.Sequential(nn.Conv1d(dec_dim, enc_dim, kernel_size=1, stride=1, padding=0, groups=dec_dim), nn.ReLU())

        # Transformer decoder that operates on chunks of `dec_chunk_size`
        # steps with `dec_buf_len` steps of past context.
        self.decoder = CausalTransformerDecoder(model_dim=dec_dim, ctx_len=dec_buf_len, chunk_size=dec_chunk_size, num_layers=num_dec_layers, nhead=8, use_pos_enc=use_pos_enc, ff_dim=2 * dec_dim, dropout=decoder_dropout)

    def forward(self, x, l, enc_buf, dec_buf):  # noqa
        """
        Generates a mask from the input `x` and the label embedding `l`.

        Args:
            x: [B, C, T]
                Input sequence fed to the encoder.
            l: [B, C]
                Label embedding; it is broadcast over the time axis and
                multiplied with the encoder output.
            enc_buf:
                Context buffer passed to, and returned by, the encoder.
            dec_buf:
                Context buffer passed to, and returned by, the decoder.
        Returns:
            (m, enc_buf, dec_buf): the mask and the two updated buffers.
        """
        # Encode the input
        e, enc_buf = self.encoder(x, enc_buf)

        # Label integration
        l = l.unsqueeze(2) * e  # noqa

        if self.proj:
            # Project to `dec_dim` dimensions
            e = self.proj_e2d_e(e)
            m = self.proj_e2d_l(l)
            # Cross-attention to predict the mask
            m, dec_buf = self.decoder(m, e, dec_buf)
            # Project mask back to encoder dimensions
            m = self.proj_d2e(m)
        else:
            # Cross-attention to predict the mask
            m, dec_buf = self.decoder(l, e, dec_buf)

        # Final mask after residual connection
        if self.skip_connection:
            m = l + m

        return m, enc_buf, dec_buf
|
||||
|
||||
|
||||
class Net(nn.Module):
    """
    Streaming audio-to-audio network: optional cached conv pre-net, strided
    input convolution, label-conditioned mask generator, and a transposed
    convolution that maps the masked latent back to a waveform.
    """

    def __init__(self, label_len, L=8, enc_dim=512, num_enc_layers=10, dec_dim=256, dec_buf_len=100, num_dec_layers=2, dec_chunk_size=72, out_buf_len=2, use_pos_enc=True, skip_connection=True, proj=True, lookahead=True, decoder_dropout=0.0, convnet_config=None):
        """
        Args:
            label_len: accepted for interface compatibility; the label
                embedding below always uses an input width of 1.
            L: stride of the input/output convolutions.
            enc_dim: number of latent channels.
            out_buf_len: number of latent frames carried over between calls
                for the output convolution.
            lookahead: when True the input convolution kernel is 3 * L
                instead of L.
            convnet_config: optional dict configuring the conv pre-net. When
                it is None, or its "convnet_prenet" entry is falsy, no
                pre-net is created.
            Remaining arguments are forwarded to `MaskNet`.
        """
        super(Net, self).__init__()
        self.L = L
        self.dec_chunk_size = dec_chunk_size
        self.out_buf_len = out_buf_len
        self.enc_dim = enc_dim
        self.lookahead = lookahead

        self.convnet_config = convnet_config
        # Guard against the default `convnet_config=None`: subscripting None
        # would raise TypeError and make the default unusable.
        if convnet_config is not None and convnet_config["convnet_prenet"]:
            self.convnet_pre = CachedConvNet(1, convnet_config["kernel_sizes"], convnet_config["dilations"], convnet_config["dropout"], convnet_config["combine_residuals"], convnet_config["use_residual_blocks"], convnet_config["out_channels"], use_2d=False)

        # Input conv to convert input audio to a latent representation
        kernel_size = 3 * L if lookahead else L
        self.in_conv = nn.Sequential(nn.Conv1d(in_channels=1, out_channels=enc_dim, kernel_size=kernel_size, stride=L, padding=0, bias=False), nn.ReLU())

        # Label embedding layer.
        # The `label_len` argument is deliberately overridden: forward()
        # always feeds a [B, 1] zero label.
        label_len = 1
        self.label_embedding = nn.Sequential(nn.Linear(label_len, 512), nn.LayerNorm(512), nn.ReLU(), nn.Linear(512, enc_dim), nn.LayerNorm(enc_dim), nn.ReLU())

        # Mask generator
        self.mask_gen = MaskNet(enc_dim=enc_dim, num_enc_layers=num_enc_layers, dec_dim=dec_dim, dec_buf_len=dec_buf_len, dec_chunk_size=dec_chunk_size, num_dec_layers=num_dec_layers, use_pos_enc=use_pos_enc, skip_connection=skip_connection, proj=proj, decoder_dropout=decoder_dropout)

        # Output conv layer
        self.out_conv = nn.Sequential(nn.ConvTranspose1d(in_channels=enc_dim, out_channels=1, kernel_size=(out_buf_len + 1) * L, stride=L, padding=out_buf_len * L, bias=False), nn.Tanh())

    def init_buffers(self, batch_size, device):
        """
        Create fresh streaming state.

        Returns:
            (enc_buf, dec_buf, out_buf): the encoder and decoder context
            buffers of the mask generator, and a zero tensor of shape
            [batch_size, enc_dim, out_buf_len] for the output convolution.
        """
        enc_buf = self.mask_gen.encoder.init_ctx_buf(batch_size, device)
        dec_buf = self.mask_gen.decoder.init_ctx_buf(batch_size, device)
        out_buf = torch.zeros(batch_size, self.enc_dim, self.out_buf_len, device=device)
        return enc_buf, dec_buf, out_buf

    def forward(self, x, init_enc_buf=None, init_dec_buf=None, init_out_buf=None, convnet_pre_ctx=None, pad=True):
        """
        Convert the input audio.

        Args:
            x: [B, 1, T]
                Input audio (the input convolution has a single input
                channel).
            init_enc_buf, init_dec_buf, init_out_buf:
                Streaming state from a previous call. Either all three are
                given or all three are None, in which case fresh buffers are
                created with `init_buffers`.
            convnet_pre_ctx:
                State of the conv pre-net; created on demand when None and a
                pre-net exists.
            pad:
                When True the input is passed through `mod_pad` (chunk size
                `L`, with `(L, L)` padding when `lookahead` is set) and the
                reported padding is trimmed from the output again.
        Returns:
            The converted audio only, when `init_enc_buf` is None; otherwise
            the tuple `(x, enc_buf, dec_buf, out_buf, convnet_pre_ctx)`.
        """
        # The label is a constant zero vector of width 1.
        label = torch.zeros(x.shape[0], 1, device=x.device)
        mod = 0
        if pad:
            pad_size = (self.L, self.L) if self.lookahead else (0, 0)
            x, mod = mod_pad(x, chunk_size=self.L, pad=pad_size)

        # `convnet_pre` only exists when the constructor was given a config
        # that enables it.
        if hasattr(self, "convnet_pre"):
            if convnet_pre_ctx is None:
                convnet_pre_ctx = self.convnet_pre.init_ctx_buf(x.shape[0], x.device)

            convnet_out, convnet_pre_ctx = self.convnet_pre(x, convnet_pre_ctx)

            if self.convnet_config["skip_connection"] == "add":
                x = x + convnet_out
            elif self.convnet_config["skip_connection"] == "multiply":
                x = x * convnet_out
            else:
                x = convnet_out

        if init_enc_buf is None or init_dec_buf is None or init_out_buf is None:
            assert init_enc_buf is None and init_dec_buf is None and init_out_buf is None, "Both buffers have to initialized, or " "both of them have to be None."
            enc_buf, dec_buf, out_buf = self.init_buffers(x.shape[0], x.device)
        else:
            enc_buf, dec_buf, out_buf = init_enc_buf, init_dec_buf, init_out_buf

        # Generate latent space representation of the input
        x = self.in_conv(x)

        # Generate label embedding
        l = self.label_embedding(label)  # [B, label_len] --> [B, channels] # noqa

        # Generate mask corresponding to the label
        m, enc_buf, dec_buf = self.mask_gen(x, l, enc_buf, dec_buf)

        # Apply mask and decode. The previous call's trailing latent frames
        # are prepended, and the new trailing frames are kept for next time.
        x = x * m
        x = torch.cat((out_buf, x), dim=-1)
        out_buf = x[..., -self.out_buf_len :]
        x = self.out_conv(x)

        # Remove mod padding, if present.
        if mod != 0:
            x = x[:, :, :-mod]

        if init_enc_buf is None:
            return x
        else:
            return x, enc_buf, dec_buf, out_buf, convnet_pre_ctx
|
@ -199,6 +199,13 @@ class VoiceChangerManager(ServerDeviceCallbacks):
|
||||
|
||||
slotInfo = BeatriceModelSlotGenerator.loadModel(params)
|
||||
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
|
||||
|
||||
elif params.voiceChangerType == "LLVC":
|
||||
from voice_changer.LLVC.LLVCModelSlotGenerator import LLVCModelSlotGenerator
|
||||
|
||||
slotInfo = LLVCModelSlotGenerator.loadModel(params)
|
||||
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
|
||||
|
||||
logger.info(f"params, {params}")
|
||||
|
||||
def get_info(self):
|
||||
@ -291,6 +298,14 @@ class VoiceChangerManager(ServerDeviceCallbacks):
|
||||
self.voiceChangerModel = Beatrice(self.params, slotInfo)
|
||||
self.voiceChanger = VoiceChangerV2(self.params)
|
||||
self.voiceChanger.setModel(self.voiceChangerModel)
|
||||
elif slotInfo.voiceChangerType == "LLVC":
|
||||
logger.info("................LLVC")
|
||||
from voice_changer.LLVC.LLVC import LLVC
|
||||
|
||||
self.voiceChangerModel = LLVC(self.params, slotInfo)
|
||||
self.voiceChanger = VoiceChangerV2(self.params)
|
||||
self.voiceChanger.setModel(self.voiceChangerModel)
|
||||
pass
|
||||
|
||||
else:
|
||||
logger.info(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")
|
||||
|
@ -90,22 +90,16 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
self.params = params
|
||||
self.gpu_num = torch.cuda.device_count()
|
||||
self.prev_audio = np.zeros(4096)
|
||||
self.mps_enabled: bool = (
|
||||
getattr(torch.backends, "mps", None) is not None
|
||||
and torch.backends.mps.is_available()
|
||||
)
|
||||
self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
|
||||
self.onnx_device = onnxruntime.get_device()
|
||||
self.noCrossFade = False
|
||||
|
||||
logger.info(
|
||||
f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})"
|
||||
)
|
||||
logger.info(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})")
|
||||
|
||||
def setModel(self, model: VoiceChangerModel):
|
||||
self.voiceChanger = model
|
||||
self.voiceChanger.setSamplingRate(
|
||||
self.settings.inputSampleRate, self.settings.outputSampleRate
|
||||
)
|
||||
self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
|
||||
# if model.voiceChangerType == "Beatrice" or model.voiceChangerType == "LLVC":
|
||||
if model.voiceChangerType == "Beatrice":
|
||||
self.noCrossFade = True
|
||||
else:
|
||||
@ -113,15 +107,11 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
|
||||
def setInputSampleRate(self, sr: int):
|
||||
self.settings.inputSampleRate = sr
|
||||
self.voiceChanger.setSamplingRate(
|
||||
self.settings.inputSampleRate, self.settings.outputSampleRate
|
||||
)
|
||||
self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
|
||||
|
||||
def setOutputSampleRate(self, sr: int):
|
||||
self.settings.outputSampleRate = sr
|
||||
self.voiceChanger.setSamplingRate(
|
||||
self.settings.inputSampleRate, self.settings.outputSampleRate
|
||||
)
|
||||
self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
|
||||
|
||||
def get_info(self):
|
||||
data = asdict(self.settings)
|
||||
@ -140,9 +130,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
if key == "serverAudioStated" and val == 0:
|
||||
self.settings.inputSampleRate = 48000
|
||||
self.settings.outputSampleRate = 48000
|
||||
self.voiceChanger.setSamplingRate(
|
||||
self.settings.inputSampleRate, self.settings.outputSampleRate
|
||||
)
|
||||
self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
|
||||
|
||||
if key in self.settings.intData:
|
||||
setattr(self.settings, key, int(val))
|
||||
@ -156,6 +144,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
STREAM_OUTPUT_FILE,
|
||||
self.settings.inputSampleRate,
|
||||
self.settings.outputSampleRate,
|
||||
# 16000,
|
||||
)
|
||||
if key == "recordIO" and val == 0:
|
||||
if hasattr(self, "ioRecorder"):
|
||||
@ -165,9 +154,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
if hasattr(self, "ioRecorder"):
|
||||
self.ioRecorder.close()
|
||||
if key == "inputSampleRate" or key == "outputSampleRate":
|
||||
self.voiceChanger.setSamplingRate(
|
||||
self.settings.inputSampleRate, self.settings.outputSampleRate
|
||||
)
|
||||
self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
|
||||
elif key in self.settings.floatData:
|
||||
setattr(self.settings, key, float(val))
|
||||
elif key in self.settings.strData:
|
||||
@ -180,12 +167,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
return self.get_info()
|
||||
|
||||
def _generate_strength(self, crossfadeSize: int):
|
||||
if (
|
||||
self.crossfadeSize != crossfadeSize
|
||||
or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate
|
||||
or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate
|
||||
or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize
|
||||
):
|
||||
if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
|
||||
self.crossfadeSize = crossfadeSize
|
||||
self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
|
||||
self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
|
||||
@ -214,9 +196,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
]
|
||||
)
|
||||
|
||||
logger.info(
|
||||
f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}"
|
||||
)
|
||||
logger.info(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")
|
||||
|
||||
# ひとつ前の結果とサイズが変わるため、記録は消去する。
|
||||
if hasattr(self, "np_prev_audio1") is True:
|
||||
@ -231,21 +211,15 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
return self.voiceChanger.get_processing_sampling_rate()
|
||||
|
||||
# receivedData: tuple of short
|
||||
def on_request(
|
||||
self, receivedData: AudioInOut
|
||||
) -> tuple[AudioInOut, list[Union[int, float]]]:
|
||||
def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
|
||||
try:
|
||||
if self.voiceChanger is None:
|
||||
raise VoiceChangerIsNotSelectedException(
|
||||
"Voice Changer is not selected."
|
||||
)
|
||||
raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
|
||||
|
||||
with Timer("main-process") as t:
|
||||
processing_sampling_rate = (
|
||||
self.voiceChanger.get_processing_sampling_rate()
|
||||
)
|
||||
processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
|
||||
|
||||
if self.noCrossFade: # Beatrice
|
||||
if self.noCrossFade: # Beatrice, LLVC
|
||||
audio = self.voiceChanger.inference(
|
||||
receivedData,
|
||||
crossfade_frame=0,
|
||||
@ -257,9 +231,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
else:
|
||||
sola_search_frame = int(0.012 * processing_sampling_rate)
|
||||
block_frame = receivedData.shape[0]
|
||||
crossfade_frame = min(
|
||||
self.settings.crossFadeOverlapSize, block_frame
|
||||
)
|
||||
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
|
||||
self._generate_strength(crossfade_frame)
|
||||
|
||||
audio = self.voiceChanger.inference(
|
||||
@ -270,9 +242,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
|
||||
if hasattr(self, "sola_buffer") is True:
|
||||
np.set_printoptions(threshold=10000)
|
||||
audio_offset = -1 * (
|
||||
sola_search_frame + crossfade_frame + block_frame
|
||||
)
|
||||
audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
|
||||
audio = audio[audio_offset:]
|
||||
|
||||
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
|
||||
@ -297,25 +267,16 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
|
||||
result = output_wav
|
||||
else:
|
||||
logger.info(
|
||||
"[Voice Changer] warming up... generating sola buffer."
|
||||
)
|
||||
logger.info("[Voice Changer] warming up... generating sola buffer.")
|
||||
result = np.zeros(4096).astype(np.int16)
|
||||
|
||||
if (
|
||||
hasattr(self, "sola_buffer") is True
|
||||
and sola_offset < sola_search_frame
|
||||
):
|
||||
offset = -1 * (
|
||||
sola_search_frame + crossfade_frame - sola_offset
|
||||
)
|
||||
if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
|
||||
offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
|
||||
end = -1 * (sola_search_frame - sola_offset)
|
||||
sola_buf_org = audio[offset:end]
|
||||
self.sola_buffer = sola_buf_org * self.np_prev_strength
|
||||
else:
|
||||
self.sola_buffer = (
|
||||
audio[-crossfade_frame:] * self.np_prev_strength
|
||||
)
|
||||
self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
|
||||
# self.sola_buffer = audio[- crossfade_frame:]
|
||||
|
||||
mainprocess_time = t.secs
|
||||
@ -324,12 +285,15 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
with Timer("post-process") as t:
|
||||
result = result.astype(np.int16)
|
||||
|
||||
print_convert_processing(
|
||||
f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz"
|
||||
)
|
||||
print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz")
|
||||
|
||||
if receivedData.shape[0] != result.shape[0]:
|
||||
outputData = pad_array(result, receivedData.shape[0])
|
||||
# print("TODO FIX:::::PADDING", receivedData.shape[0], result.shape[0])
|
||||
if self.voiceChanger.voiceChangerType == "LLVC":
|
||||
outputData = result
|
||||
else:
|
||||
outputData = pad_array(result, receivedData.shape[0])
|
||||
|
||||
pass
|
||||
else:
|
||||
outputData = result
|
||||
@ -340,9 +304,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
|
||||
postprocess_time = t.secs
|
||||
|
||||
print_convert_processing(
|
||||
f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}"
|
||||
)
|
||||
print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
|
||||
perf = [0, mainprocess_time, postprocess_time]
|
||||
|
||||
return outputData, perf
|
||||
@ -351,9 +313,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
logger.warn(f"[Voice Changer] [Exception], {e}")
|
||||
return np.zeros(1).astype(np.int16), [0, 0, 0]
|
||||
except ONNXInputArgumentException as e:
|
||||
logger.warn(
|
||||
f"[Voice Changer] [Exception] onnx are waiting valid input., {e}"
|
||||
)
|
||||
logger.warn(f"[Voice Changer] [Exception] onnx are waiting valid input., {e}")
|
||||
return np.zeros(1).astype(np.int16), [0, 0, 0]
|
||||
except HalfPrecisionChangingException:
|
||||
logger.warn("[Voice Changer] Switching model configuration....")
|
||||
@ -365,9 +325,7 @@ class VoiceChangerV2(VoiceChangerIF):
|
||||
logger.warn(f"[Voice Changer] embedder: {e}")
|
||||
return np.zeros(1).astype(np.int16), [0, 0, 0]
|
||||
except VoiceChangerIsNotSelectedException:
|
||||
logger.warn(
|
||||
"[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc."
|
||||
)
|
||||
logger.warn("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
|
||||
return np.zeros(1).astype(np.int16), [0, 0, 0]
|
||||
except DeviceCannotSupportHalfPrecisionException:
|
||||
# RVC.pyでfallback処理をするので、ここはダミーデータ返すだけ。
|
||||
|
@ -20,6 +20,8 @@ LoadModelParamFileKind: TypeAlias = Literal[
|
||||
"ddspSvcDiffusionConfig",
|
||||
"diffusionSVCModel",
|
||||
"beatriceModel",
|
||||
"llvcModel",
|
||||
"llvcConfig",
|
||||
]
|
||||
|
||||
|
||||
|
@ -6,6 +6,8 @@ from voice_changer.utils.LoadModelParams import LoadModelParams
|
||||
|
||||
|
||||
AudioInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
|
||||
AudioInOutFloat: TypeAlias = np.ndarray[Any, np.dtype[np.float32]]
|
||||
|
||||
PitchfInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
|
||||
FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user