This commit is contained in:
w-okada 2023-08-09 16:55:59 +09:00
parent 5b43daa705
commit 50f963ff6b
17 changed files with 1458 additions and 3100 deletions

3
.gitignore vendored
View File

@ -61,3 +61,6 @@ start_trainer.sh
# venv
venv/
beatrice_internal_api.cp310-win_amd64.pyd

View File

@ -1 +1,10 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>Voice Changer Client Demo</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
</body>
</html>

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -111,6 +111,11 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
return x.kind == "diffusionSVCModel";
});
return enough;
} else if (setting.voiceChangerType == "Beatrice") {
const enough = !!setting.files.find((x) => {
return x.kind == "beatriceModel";
});
return enough;
}
return false;
};
@ -170,6 +175,8 @@ export const FileUploaderScreen = (props: FileUploaderScreenProps) => {
rows.push(generateFileRow(uploadSetting!, "Model(diff)", "ddspSvcDiffusion", ["pth", "pt"], "diff/"));
} else if (vcType == "Diffusion-SVC") {
rows.push(generateFileRow(uploadSetting!, "Model(combo)", "diffusionSVCModel", ["ptc"]));
} else if (vcType == "Beatrice") {
rows.push(generateFileRow(uploadSetting!, "Beatrice", "beatriceModel", ["bin"]));
}
return rows;
};

View File

@ -10,7 +10,8 @@ export const VoiceChangerType = {
"so-vits-svc-40": "so-vits-svc-40",
"DDSP-SVC": "DDSP-SVC",
"RVC": "RVC",
"Diffusion-SVC":"Diffusion-SVC"
"Diffusion-SVC":"Diffusion-SVC",
"Beatrice": "Beatrice"
} as const
export type VoiceChangerType = typeof VoiceChangerType[keyof typeof VoiceChangerType]
@ -287,7 +288,15 @@ export type DiffusionSVCModelSlot = ModelSlot & {
speakers: { [key: number]: string }
}
export type ModelSlotUnion = RVCModelSlot | MMVCv13ModelSlot | MMVCv15ModelSlot | SoVitsSvc40ModelSlot | DDSPSVCModelSlot | DiffusionSVCModelSlot
/**
 * Model slot for the "Beatrice" voice changer type: the common `ModelSlot`
 * fields plus the Beatrice-specific ones below.
 */
export type BeatriceModelSlot = ModelSlot & {
    /** Model file recorded for this slot. */
    modelFile: string
    /** Selected target speaker id — presumably a key of `speakers`; confirm against the server. */
    dstId: number
    /** Speaker table keyed by numeric speaker id. */
    speakers: Record<number, string>
}
export type ModelSlotUnion = RVCModelSlot | MMVCv13ModelSlot | MMVCv15ModelSlot | SoVitsSvc40ModelSlot | DDSPSVCModelSlot | DiffusionSVCModelSlot | BeatriceModelSlot
type ServerAudioDevice = {
kind: "audioinput" | "audiooutput",

View File

@ -29,6 +29,8 @@ export const ModelFileKind = {
"diffusionSVCModel": "diffusionSVCModel",
"beatriceModel": "beatriceModel",
} as const
export type ModelFileKind = typeof ModelFileKind[keyof typeof ModelFileKind]

View File

@ -11,7 +11,8 @@ VoiceChangerType: TypeAlias = Literal[
"so-vits-svc-40",
"DDSP-SVC",
"RVC",
"Diffusion-SVC"
"Diffusion-SVC",
"Beatrice",
]
STORED_SETTING_FILE = "stored_setting.json"

View File

@ -124,7 +124,15 @@ class DiffusionSVCModelSlot(ModelSlot):
embChannels: int = 768
ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot]
@dataclass
class BeatriceModelSlot(ModelSlot):
    # Slot record for a Beatrice model: the ModelSlot fields plus the ones below.

    # Tag value that loadSlotInfo compares against to select this class.
    voiceChangerType: VoiceChangerType = "Beatrice"
    # Model file of this slot; defaults to an empty string.
    modelFile: str = ""
    # Selected target speaker id — presumably a key of `speakers`; TODO confirm.
    dstId: int = 1
    # Speaker id -> speaker name. default_factory builds a new dict per instance,
    # so instances do not share one mutable default.
    speakers: dict = field(default_factory=lambda: {1: "user1", 2: "user2"})
ModelSlots: TypeAlias = Union[ModelSlot, RVCModelSlot, MMVCv13ModelSlot, MMVCv15ModelSlot, SoVitsSvc40ModelSlot, DDSPSVCModelSlot, DiffusionSVCModelSlot, BeatriceModelSlot]
def loadSlotInfo(model_dir: str, slotIndex: int) -> ModelSlots:
@ -153,6 +161,9 @@ def loadSlotInfo(model_dir: str, slotIndex: int) -> ModelSlots:
elif slotInfo.voiceChangerType == "Diffusion-SVC":
slotInfoKey.extend(list(DiffusionSVCModelSlot.__annotations__.keys()))
return DiffusionSVCModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey})
elif slotInfo.voiceChangerType == "Beatrice":
slotInfoKey.extend(list(BeatriceModelSlot.__annotations__.keys()))
return BeatriceModelSlot(**{k: v for k, v in jsonDict.items() if k in slotInfoKey})
else:
return ModelSlot()
@ -168,6 +179,7 @@ def loadAllSlotInfo(model_dir: str):
def saveSlotInfo(model_dir: str, slotIndex: int, slotInfo: ModelSlots):
    """Write a slot's settings to ``<model_dir>/<slotIndex>/params.json``.

    Args:
        model_dir: Directory holding one sub-directory per slot.
        slotIndex: Slot number; selects the sub-directory that is written to.
        slotInfo: Dataclass instance to serialize. As a side effect its
            ``slotIndex`` attribute is set to -1.

    Raises:
        OSError: If ``params.json`` cannot be opened for writing, e.g. because
            the slot directory does not exist.
    """
    slotDir = os.path.join(model_dir, str(slotIndex))
    print("SlotInfo:::", slotInfo)
    slotInfoDict = asdict(slotInfo)
    # The slot index is injected dynamically.
    # NOTE(review): the dict above is captured before this reset, so the JSON
    # still contains the original slotIndex — confirm this ordering is intended.
    slotInfo.slotIndex = -1
    # Context manager guarantees the file is flushed and closed, also on error.
    with open(os.path.join(slotDir, "params.json"), "w") as f:
        json.dump(slotInfoDict, f, indent=4)

View File

@ -0,0 +1,59 @@
from typing import Union
import os
import numpy as np
from data.ModelSlot import BeatriceModelSlot
from mods.log_control import VoiceChangaerLogger
from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from beatrice_internal_api import BeatriceInternalAPI
logger = VoiceChangaerLogger.get_instance().getLogger()
class BeatriceAPI(BeatriceInternalAPI):
    # Subclass of BeatriceInternalAPI, which is imported from the external
    # beatrice_internal_api module.
    def __init__(self, sample_rate: float = 48000.0):
        # Placeholder body: sample_rate is accepted but not used, and the
        # base-class initializer is not called from here.
        pass
class Beatrice(VoiceChangerModel):
    """Placeholder for the Beatrice voice changer model.

    Every method except ``__del__`` and ``get_model_current`` raises
    ``RuntimeError("not implemented")``, including the constructor.
    """

    def __init__(self, params: VoiceChangerParams, slotInfo: BeatriceModelSlot):
        """Not implemented in this placeholder; always raises RuntimeError."""
        raise RuntimeError("not implemented")

    def initialize(self):
        """Not implemented in this placeholder; always raises RuntimeError."""
        raise RuntimeError("not implemented")

    def setSamplingRate(self, inputSampleRate, outputSampleRate):
        """Not implemented in this placeholder; always raises RuntimeError."""
        raise RuntimeError("not implemented")

    def update_settings(self, key: str, val: int | float | str):
        """Not implemented in this placeholder; always raises RuntimeError."""
        raise RuntimeError("not implemented")

    def get_info(self):
        """Not implemented in this placeholder; always raises RuntimeError."""
        raise RuntimeError("not implemented")

    def get_processing_sampling_rate(self):
        """Not implemented in this placeholder; always raises RuntimeError."""
        raise RuntimeError("not implemented")

    def generate_input(
        self,
        newData: AudioInOut,
        crossfadeSize: int,
        solaSearchFrame: int = 0,
    ):
        """Not implemented in this placeholder; always raises RuntimeError."""
        raise RuntimeError("not implemented")

    def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int):
        """Not implemented in this placeholder; always raises RuntimeError."""
        raise RuntimeError("not implemented")

    def __del__(self):
        """Drop the ``pipeline`` attribute if the instance has one."""
        # The finalizer also runs for instances whose __init__ raised, and this
        # class never assigns `pipeline`; an unguarded `del` would raise
        # AttributeError during finalization.
        if hasattr(self, "pipeline"):
            del self.pipeline

    def get_model_current(self):
        """Return the current target speaker as a one-element key/val list.

        NOTE(review): reads ``self.settings``, which nothing in this class
        assigns — confirm where it is expected to come from.
        """
        return [
            {
                "key": "dstId",
                "val": self.settings.dstId,
            },
        ]

View File

@ -0,0 +1,17 @@
import os
from data.ModelSlot import BeatriceModelSlot
from voice_changer.utils.LoadModelParams import LoadModelParams
from voice_changer.utils.ModelSlotGenerator import ModelSlotGenerator
class BeatriceModelSlotGenerator(ModelSlotGenerator):
    # Builds a BeatriceModelSlot from the files of a load-model request.
    @classmethod
    def loadModel(cls, props: LoadModelParams):
        # Start from a slot that carries the dataclass defaults.
        slotInfo: BeatriceModelSlot = BeatriceModelSlot()
        for file in props.files:
            # Only files tagged "beatriceModel" set the model file; if several
            # match, the last one wins.
            if file.kind == "beatriceModel":
                slotInfo.modelFile = file.name
        # NOTE(review): placement of the next statement after the loop (rather
        # than inside the `if`) is assumed — confirm against the repository.
        # Slot name is the model file's base name without its extension.
        slotInfo.name = os.path.splitext(os.path.basename(slotInfo.modelFile))[0]
        slotInfo.slotIndex = props.slot
        return slotInfo

View File

@ -0,0 +1,16 @@
from dataclasses import dataclass, field
@dataclass
class BeatriceSettings:
    # Settings container for the Beatrice voice changer.
    # gpu: int = -9999
    dstId: int = 0
    modelSamplingRate: int = 48000
    silentThreshold: float = 0.00001
    # default_factory builds a new dict per instance.
    speakers: dict[str, int] = field(default_factory=lambda: {})
    # NOTE(review): the three lists below are assumed to be class-level
    # attributes; being unannotated, @dataclass would not turn them into fields.
    # They group setting names by value type — presumably so incoming values
    # can be cast before assignment; verify against the caller.
    intData = [
        # "gpu",
        "dstId",
    ]
    floatData = ["silentThreshold"]
    strData = []

View File

@ -1,8 +1,5 @@
import os
from dataclasses import asdict
from data.ModelSlot import DiffusionSVCModelSlot, ModelSlot, RVCModelSlot
from data.ModelSlot import DiffusionSVCModelSlot, ModelSlot
from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import load_model_vocoder_from_combo
from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager
from voice_changer.utils.LoadModelParams import LoadModelParams
@ -15,7 +12,7 @@ def get_divisors(n):
if n % i == 0:
divisors.append(i)
if i != n // i:
divisors.append(n //i)
divisors.append(n // i)
return sorted(divisors)

View File

@ -63,7 +63,7 @@ class RVCModelSlotGenerator(ModelSlotGenerator):
# elif slot.embedder == "hubert_jp":
# slot.embedder = "hubert_jp"
else:
raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder")
raise RuntimeError("[Voice Changer][setInfoByPytorch] unknown embedder")
elif config_len == 18:
# Original RVC

View File

@ -185,6 +185,11 @@ class VoiceChangerManager(ServerDeviceCallbacks):
slotInfo = DiffusionSVCModelSlotGenerator.loadModel(params)
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
elif params.voiceChangerType == "Beatrice":
from voice_changer.Beatrice.BeatriceModelSlotGenerator import BeatriceModelSlotGenerator
slotInfo = BeatriceModelSlotGenerator.loadModel(params)
self.modelSlotManager.save_model_slot(params.slot, slotInfo)
logger.info(f"params, {params}")
def get_info(self):
@ -267,6 +272,13 @@ class VoiceChangerManager(ServerDeviceCallbacks):
self.voiceChangerModel = DiffusionSVC(self.params, slotInfo)
self.voiceChanger = VoiceChangerV2(self.params)
self.voiceChanger.setModel(self.voiceChangerModel)
elif slotInfo.voiceChangerType == "Beatrice":
logger.info("................Beatrice")
from voice_changer.Beatrice.Beatrice import Beatrice
self.voiceChangerModel = Beatrice(self.params, slotInfo)
self.voiceChanger = VoiceChangerV2(self.params)
self.voiceChanger.setModel(self.voiceChangerModel)
else:
logger.info(f"[Voice Changer] unknown voice changer model: {slotInfo.voiceChangerType}")
if hasattr(self, "voiceChangerModel"):

View File

@ -18,6 +18,7 @@ import numpy as np
from dataclasses import dataclass, asdict, field
import onnxruntime
from mods.log_control import VoiceChangaerLogger
from voice_changer.Beatrice.Beatrice import Beatrice
from voice_changer.IORecorder import IORecorder
@ -89,12 +90,17 @@ class VoiceChangerV2(VoiceChangerIF):
self.prev_audio = np.zeros(4096)
self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
self.onnx_device = onnxruntime.get_device()
self.noCrossFade = False
logger.info(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})")
def setModel(self, model: VoiceChangerModel):
    """Attach ``model`` and configure this instance for it.

    Stores the model, passes the configured input/output sample rates to it,
    and sets ``noCrossFade`` to True only when the model is a Beatrice instance.
    """
    self.voiceChanger = model
    input_rate = self.settings.inputSampleRate
    output_rate = self.settings.outputSampleRate
    self.voiceChanger.setSamplingRate(input_rate, output_rate)
    # Beatrice models are the only ones that run without cross-fading.
    self.noCrossFade = isinstance(model, Beatrice)
def setInputSampleRate(self, sr: int):
self.settings.inputSampleRate = sr
@ -202,57 +208,67 @@ class VoiceChangerV2(VoiceChangerIF):
raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
with Timer("main-process") as t:
processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
sola_search_frame = int(0.012 * processing_sampling_rate)
block_frame = receivedData.shape[0]
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
self._generate_strength(crossfade_frame)
audio = self.voiceChanger.inference(
receivedData,
crossfade_frame=crossfade_frame,
sola_search_frame=sola_search_frame
)
if hasattr(self, "sola_buffer") is True:
np.set_printoptions(threshold=10000)
audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
audio = audio[audio_offset:]
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
cor_nom = np.convolve(
audio[: crossfade_frame + sola_search_frame],
np.flip(self.sola_buffer),
"valid",
if self.noCrossFade: # Beatrice
audio = self.voiceChanger.inference(
receivedData,
crossfade_frame=0,
sola_search_frame=0,
)
cor_den = np.sqrt(
np.convolve(
audio[: crossfade_frame + sola_search_frame] ** 2,
np.ones(crossfade_frame),
# block_frame = receivedData.shape[0]
# result = audio[:block_frame]
result = audio
else:
sola_search_frame = int(0.012 * processing_sampling_rate)
block_frame = receivedData.shape[0]
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
self._generate_strength(crossfade_frame)
audio = self.voiceChanger.inference(
receivedData,
crossfade_frame=crossfade_frame,
sola_search_frame=sola_search_frame
)
if hasattr(self, "sola_buffer") is True:
np.set_printoptions(threshold=10000)
audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
audio = audio[audio_offset:]
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
cor_nom = np.convolve(
audio[: crossfade_frame + sola_search_frame],
np.flip(self.sola_buffer),
"valid",
)
+ 1e-3
)
sola_offset = int(np.argmax(cor_nom / cor_den))
sola_end = sola_offset + block_frame
output_wav = audio[sola_offset:sola_end].astype(np.float64)
output_wav[:crossfade_frame] *= self.np_cur_strength
output_wav[:crossfade_frame] += self.sola_buffer[:]
cor_den = np.sqrt(
np.convolve(
audio[: crossfade_frame + sola_search_frame] ** 2,
np.ones(crossfade_frame),
"valid",
)
+ 1e-3
)
sola_offset = int(np.argmax(cor_nom / cor_den))
sola_end = sola_offset + block_frame
output_wav = audio[sola_offset:sola_end].astype(np.float64)
output_wav[:crossfade_frame] *= self.np_cur_strength
output_wav[:crossfade_frame] += self.sola_buffer[:]
result = output_wav
else:
logger.info("[Voice Changer] warming up... generating sola buffer.")
result = np.zeros(4096).astype(np.int16)
result = output_wav
else:
logger.info("[Voice Changer] warming up... generating sola buffer.")
result = np.zeros(4096).astype(np.int16)
if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
end = -1 * (sola_search_frame - sola_offset)
sola_buf_org = audio[offset:end]
self.sola_buffer = sola_buf_org * self.np_prev_strength
else:
self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
# self.sola_buffer = audio[- crossfade_frame:]
if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
end = -1 * (sola_search_frame - sola_offset)
sola_buf_org = audio[offset:end]
self.sola_buffer = sola_buf_org * self.np_prev_strength
else:
self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
# self.sola_buffer = audio[- crossfade_frame:]
mainprocess_time = t.secs

View File

@ -19,6 +19,7 @@ LoadModelParamFileKind: TypeAlias = Literal[
"ddspSvcDiffusion",
"ddspSvcDiffusionConfig",
"diffusionSVCModel",
"beatriceModel",
]