WIP DDSPSVC

This commit is contained in:
wataru 2023-05-08 05:51:24 +09:00
parent af4cf4857e
commit 3e0772d955
29 changed files with 2932 additions and 329 deletions

2
.gitignore vendored
View File

@ -40,7 +40,9 @@ client/lib/worklet/dist
docker/cudnn/
server/pretrain/
server/weights/
server/weights_/
server/weights__/
start_trainer.sh

View File

@ -49,6 +49,42 @@
"showOnnxExportButton": false
}
},
{
"name": "commonFileSelect",
"options": {
"title": "Model(.pt,.pth)",
"acceptExtentions": ["pt", "pth"],
"fileKind": "ddspSvcModel"
}
},
{
"name": "commonFileSelect",
"options": {
"title": "Model config(.yaml)",
"acceptExtentions": ["yaml"],
"fileKind": "ddspSvcModelConfig"
}
},
{
"name": "commonFileSelect",
"options": {
"title": "Diff(.pt,.pth)",
"acceptExtentions": ["pt", "pth"],
"fileKind": "ddspSvcDiffusion"
}
},
{
"name": "commonFileSelect",
"options": {
"title": "Diff config(.yaml)",
"acceptExtentions": ["yaml"],
"fileKind": "ddspSvcDiffusionConfig"
}
},
{
"name": "modelUploadButtonRow2",
"options": {}
},
{
"name": "framework",
"options": {
@ -63,6 +99,10 @@
"lab": [],
"deviceSetting": [
{
"name": "audioDeviceMode",
"options": {}
},
{
"name": "audioInput",
"options": {}
@ -70,6 +110,10 @@
{
"name": "audioOutput",
"options": {}
},
{
"name": "ioBuffer",
"options": {}
}
],
"qualityControl": [

View File

@ -54,6 +54,10 @@
"lab": [],
"deviceSetting": [
{
"name": "audioDeviceMode",
"options": {}
},
{
"name": "audioInput",
"options": {}
@ -61,6 +65,10 @@
{
"name": "audioOutput",
"options": {}
},
{
"name": "ioBuffer",
"options": {}
}
],
"qualityControl": [

View File

@ -53,6 +53,10 @@
"lab": [],
"deviceSetting": [
{
"name": "audioDeviceMode",
"options": {}
},
{
"name": "audioInput",
"options": {}
@ -60,6 +64,10 @@
{
"name": "audioOutput",
"options": {}
},
{
"name": "ioBuffer",
"options": {}
}
],
"qualityControl": [

View File

@ -1 +1,10 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>Voice Changer Client Demo</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
</body>
</html>

File diff suppressed because one or more lines are too long

View File

@ -1,31 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

View File

@ -49,6 +49,42 @@
"showOnnxExportButton": false
}
},
{
"name": "commonFileSelect",
"options": {
"title": "Model(.pt,.pth)",
"acceptExtentions": ["pt", "pth"],
"fileKind": "ddspSvcModel"
}
},
{
"name": "commonFileSelect",
"options": {
"title": "Model config(.yaml)",
"acceptExtentions": ["yaml"],
"fileKind": "ddspSvcModelConfig"
}
},
{
"name": "commonFileSelect",
"options": {
"title": "Diff(.pt,.pth)",
"acceptExtentions": ["pt", "pth"],
"fileKind": "ddspSvcDiffusion"
}
},
{
"name": "commonFileSelect",
"options": {
"title": "Diff config(.yaml)",
"acceptExtentions": ["yaml"],
"fileKind": "ddspSvcDiffusionConfig"
}
},
{
"name": "modelUploadButtonRow2",
"options": {}
},
{
"name": "framework",
"options": {
@ -63,6 +99,10 @@
"lab": [],
"deviceSetting": [
{
"name": "audioDeviceMode",
"options": {}
},
{
"name": "audioInput",
"options": {}
@ -70,6 +110,10 @@
{
"name": "audioOutput",
"options": {}
},
{
"name": "ioBuffer",
"options": {}
}
],
"qualityControl": [

View File

@ -54,6 +54,10 @@
"lab": [],
"deviceSetting": [
{
"name": "audioDeviceMode",
"options": {}
},
{
"name": "audioInput",
"options": {}
@ -61,6 +65,10 @@
{
"name": "audioOutput",
"options": {}
},
{
"name": "ioBuffer",
"options": {}
}
],
"qualityControl": [

View File

@ -53,6 +53,10 @@
"lab": [],
"deviceSetting": [
{
"name": "audioDeviceMode",
"options": {}
},
{
"name": "audioInput",
"options": {}
@ -60,6 +64,10 @@
{
"name": "audioOutput",
"options": {}
},
{
"name": "ioBuffer",
"options": {}
}
],
"qualityControl": [

View File

@ -48,6 +48,8 @@ import { ModelSwitchRow, ModelSwitchRowProps } from "./components/204v2_ModelSwi
import { EnableDirectMLRow, EnableDirectMLRowProps } from "./components/813_EnableDirectMLRow"
import { AudioDeviceModeRow, AudioDeviceModeRowProps } from "./components/410_AudioDeviceModeRow"
import { IOBufferRow, IOBufferRowProps } from "./components/411_IOBufferRow"
import { CommonFileSelectRow, CommonFileSelectRowProps } from "./components/301-e_CommonFileSelectRow"
import { ModelUploadButtonRow2, ModelUploadButtonRow2Props } from "./components/301-f_ModelUploadButtonRow"
export const catalog: { [key: string]: (props: any) => JSX.Element } = {}
@ -81,7 +83,10 @@ const initialize = () => {
addToCatalog("modelUploader", (props: ModelUploaderRowProps) => { return <ModelUploaderRow {...props} /> })
addToCatalog("framework", (props: FrameworkRowProps) => { return <FrameworkRow {...props} /> })
addToCatalog("modelSamplingRate", (props: ModelSamplingRateRowProps) => { return <ModelSamplingRateRow {...props} /> })
// addToCatalog("onnxExport", (props: OnnxExportRowProps) => { return <OnnxExportRow {...props} /> })
addToCatalog("commonFileSelect", (props: CommonFileSelectRowProps) => { return <CommonFileSelectRow {...props} /> })
addToCatalog("modelUploadButtonRow2", (props: ModelUploadButtonRow2Props) => { return <ModelUploadButtonRow2 {...props} /> })
addToCatalog("audioInput", (props: AudioInputRowProps) => { return <AudioInputRow {...props} /> })
addToCatalog("audioOutput", (props: AudioOutputRowProps) => { return <AudioOutputRow {...props} /> })

View File

@ -0,0 +1,81 @@
import React, { useMemo } from "react"
import { fileSelector } from "@dannadori/voice-changer-client-js"
import { useAppState } from "../../../001_provider/001_AppStateProvider"
import { useGuiState } from "../001_GuiStateProvider"
/**
 * Props for {@link CommonFileSelectRow}, one generic "pick a file" row of the
 * model-upload panel.
 */
export type CommonFileSelectRowProps = {
/** Text rendered in the row's title cell. */
title: string
/** Extensions (without the leading dot) a selected file's name must end with. */
acceptExtentions: string[]
/** Which field of the per-slot upload settings this row reads and writes. */
fileKind: Filekinds
}
/**
 * Names of the upload-setting fields a file-select row can target.
 * Each key maps to itself so the object doubles as a runtime lookup table.
 */
export const Filekinds = {
    ddspSvcModel: "ddspSvcModel",
    ddspSvcModelConfig: "ddspSvcModelConfig",
    ddspSvcDiffusion: "ddspSvcDiffusion",
    ddspSvcDiffusionConfig: "ddspSvcDiffusionConfig",
} as const

/** Union of the string literals held by the `Filekinds` object. */
export type Filekinds = (typeof Filekinds)[keyof typeof Filekinds]
/**
 * A single row of the model-upload panel that lets the user pick (or clear)
 * one file for the currently selected model slot.
 *
 * The row shows `props.title`, the name of the file currently stored under
 * `props.fileKind` in the slot's upload settings, and "select" / "clear"
 * buttons. Selecting a file whose extension is not listed in
 * `props.acceptExtentions` shows an alert and leaves the settings untouched.
 *
 * @param props - Title, accepted extensions and target settings field.
 * @returns The memoized row element.
 */
export const CommonFileSelectRow = (props: CommonFileSelectRowProps) => {
    const appState = useAppState()
    const guiState = useGuiState()

    const commonFileSelectRow = useMemo(() => {
        const slot = guiState.modelSlotNum

        // The slot may not exist yet; in that case show an empty filename.
        const targetSlot = appState.serverSetting.fileUploadSettings[slot]
        const targetModel = targetSlot ? targetSlot[props.fileKind] : null
        // `||` is intentional: an empty stored filename falls back to the File's own name.
        const filenameText = targetModel?.filename || targetModel?.file?.name || ""

        // True when `filename` has an extension listed in acceptExtentions.
        // The comparison ignores case so that e.g. "MODEL.PT" is accepted, and a
        // name without any dot is never treated as "all extension".
        const checkExtention = (filename: string) => {
            const dotIndex = filename.lastIndexOf(".")
            if (dotIndex < 0) {
                return false
            }
            const ext = filename.slice(dotIndex + 1).toLowerCase()
            if (!ext) {
                return false
            }
            return props.acceptExtentions.some(accepted => accepted.toLowerCase() === ext)
        }

        const onFileLoadClicked = async () => {
            // Nothing to update when the slot has no settings entry; bail out
            // before opening the picker instead of throwing afterwards.
            const current = appState.serverSetting.fileUploadSettings[slot]
            if (!current) {
                return
            }
            const file = await fileSelector("")
            if (!checkExtention(file.name)) {
                alert(`モデルファイルの拡張子は${props.acceptExtentions}である必要があります。`)
                return
            }
            // Hand the setter a fresh object rather than mutating React state in place.
            appState.serverSetting.setFileUploadSetting(slot, {
                ...current,
                [props.fileKind]: { file: file },
            })
        }

        const onFileClearClicked = () => {
            const current = appState.serverSetting.fileUploadSettings[slot]
            if (!current) {
                return
            }
            appState.serverSetting.setFileUploadSetting(slot, {
                ...current,
                [props.fileKind]: null,
            })
        }

        return (
            <div className="body-row split-3-3-4 left-padding-1 guided">
                <div className="body-item-title left-padding-2">{props.title}</div>
                <div className="body-item-text">
                    <div>{filenameText}</div>
                </div>
                <div className="body-button-container">
                    <div className="body-button" onClick={onFileLoadClicked}>select</div>
                    <div className="body-button left-margin-1" onClick={onFileClearClicked}>clear</div>
                </div>
            </div>
        )
        // Every prop read inside the memo is listed so the row never renders stale data.
    }, [
        appState.serverSetting.fileUploadSettings,
        appState.serverSetting.setFileUploadSetting,
        guiState.modelSlotNum,
        props.title,
        props.fileKind,
        props.acceptExtentions,
    ])

    return commonFileSelectRow
}

View File

@ -0,0 +1,42 @@
import React, { useMemo } from "react"
import { useAppState } from "../../../001_provider/001_AppStateProvider"
import { useGuiState } from "../001_GuiStateProvider"
/** Props for {@link ModelUploadButtonRow2}; the row currently takes no options. */
export type ModelUploadButtonRow2Props = {
}
/**
 * Row holding the "upload" button for the currently selected model slot.
 *
 * While an upload is running the button is rendered disabled with a
 * "wait..." label and a progress message is shown; otherwise clicking the
 * button starts `loadModel` for the slot. A "not uploaded" marker is shown
 * when the slot has settings that have not been uploaded yet.
 *
 * @param _props - Unused.
 * @returns The memoized row element.
 */
export const ModelUploadButtonRow2 = (_props: ModelUploadButtonRow2Props) => {
    const appState = useAppState()
    const guiState = useGuiState()

    return useMemo(() => {
        const serverSetting = appState.serverSetting
        const slot = guiState.modelSlotNum
        const busy = serverSetting.isUploading

        // Button appearance and behaviour: inert while an upload is in flight.
        let buttonClass = "body-button"
        let buttonLabel = "upload"
        let buttonHandler: () => void = async () => {
            serverSetting.loadModel(slot)
        }
        if (busy) {
            buttonClass = "body-button-disabled"
            buttonLabel = "wait..."
            buttonHandler = () => { }
        }

        // Progress message: only shown while uploading.
        let statusText = ""
        if (busy) {
            if (serverSetting.uploadProgress == 0) {
                statusText = `loading model...(wait about 20sec)`
            } else {
                statusText = `uploading.... ${serverSetting.uploadProgress.toFixed(1)}%`
            }
        }

        // Marker for a slot whose settings exist but were not uploaded yet.
        const slotSetting = serverSetting.fileUploadSettings[slot]
        let notUploadedText = ""
        if (slotSetting != undefined && !slotSetting.uploaded) {
            notUploadedText = "not uploaded"
        }

        return (
            <div className="body-row split-3-3-4 left-padding-1 guided">
                <div className="body-item-title left-padding-2"></div>
                <div className="body-item-text">
                    {statusText}
                </div>
                <div className="body-button-container">
                    <div className={buttonClass} onClick={buttonHandler}>{buttonLabel}</div>
                    <div className="body-item-text-em" >{notUploadedText}</div>
                </div>
            </div>
        )
    }, [appState.serverSetting.isUploading, appState.serverSetting.uploadProgress, appState.serverSetting.loadModel, guiState.modelSlotNum, appState.serverSetting.fileUploadSettings])
}

View File

@ -25,6 +25,11 @@ export type FileUploadSetting = {
framework: Framework
params: string
ddspSvcModel: ModelData | null
ddspSvcModelConfig: ModelData | null
ddspSvcDiffusion: ModelData | null
ddspSvcDiffusionConfig: ModelData | null
}
const InitialFileUploadSetting: FileUploadSetting = {
@ -36,11 +41,17 @@ const InitialFileUploadSetting: FileUploadSetting = {
feature: null,
index: null,
ddspSvcModel: null,
ddspSvcModelConfig: null,
ddspSvcDiffusion: null,
ddspSvcDiffusionConfig: null,
isHalf: true,
uploaded: false,
defaultTune: 0,
framework: Framework.PyTorch,
params: "{}"
params: "{}",
}
export type UseServerSettingProps = {
@ -191,15 +202,35 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
// (e) モデルアップロード
const _uploadFile = useMemo(() => {
return async (modelData: ModelData, onprogress: (progress: number, end: boolean) => void) => {
return async (modelData: ModelData, onprogress: (progress: number, end: boolean) => void, dir: string = "") => {
if (!props.voiceChangerClient) return
const num = await props.voiceChangerClient.uploadFile(modelData.data!, modelData.filename!, onprogress)
const res = await props.voiceChangerClient.concatUploadedFile(modelData.filename!, num)
const num = await props.voiceChangerClient.uploadFile(modelData.data!, dir + modelData.filename!, onprogress)
const res = await props.voiceChangerClient.concatUploadedFile(dir + modelData.filename!, num)
console.log("uploaded", num, res)
}
}, [props.voiceChangerClient])
const loadModel = useMemo(() => {
return async (slot: number) => {
if (props.clientType == "DDSP-SVC") {
if (!fileUploadSettings[slot].ddspSvcModel) {
alert("DDSPモデルを指定する必要があります。")
return
}
if (!fileUploadSettings[slot].ddspSvcModelConfig) {
alert("DDSP Configファイルを指定する必要があります。")
return
}
if (!fileUploadSettings[slot].ddspSvcDiffusion) {
alert("Diffusionモデルを指定する必要があります。")
return
}
if (!fileUploadSettings[slot].ddspSvcDiffusionConfig) {
alert("Diffusion Configファイルを指定する必要があります。")
return
}
} else {
if (!fileUploadSettings[slot].pyTorchModel && !fileUploadSettings[slot].onnxModel) {
alert("PyTorchモデルとONNXモデルのどちらか一つ以上指定する必要があります。")
return
@ -208,6 +239,7 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
alert("Configファイルを指定する必要があります。")
return
}
}
if (!props.voiceChangerClient) return
@ -272,21 +304,52 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
})
}
const configFileName = fileUploadSetting.configFile ? fileUploadSetting.configFile.filename || "-" : "-"
// DDSP-SVC
const ddspSvcModels = [fileUploadSetting.ddspSvcModel, fileUploadSetting.ddspSvcModelConfig, fileUploadSetting.ddspSvcDiffusion, fileUploadSetting.ddspSvcDiffusionConfig].filter(x => { return x != null }) as ModelData[]
for (let i = 0; i < ddspSvcModels.length; i++) {
if (!ddspSvcModels[i].data) {
ddspSvcModels[i].data = await ddspSvcModels[i].file!.arrayBuffer()
ddspSvcModels[i].filename = await ddspSvcModels[i].file!.name
}
}
for (let i = 0; i < ddspSvcModels.length; i++) {
const progRate = 1 / ddspSvcModels.length
const progOffset = 100 * i * progRate
const dir = i == 0 || i == 1 ? "ddsp_mod/" : "ddsp_diff/"
await _uploadFile(ddspSvcModels[i], (progress: number, _end: boolean) => {
setUploadProgress(progress * progRate + progOffset)
}, dir)
}
const configFileName = fileUploadSetting.configFile?.filename || "-"
const params = JSON.stringify({
trans: fileUploadSetting.defaultTune || 0
trans: fileUploadSetting.defaultTune || 0,
files: {
ddspSvcModel: fileUploadSetting.ddspSvcModel?.filename ? "ddsp_mod/" + fileUploadSetting.ddspSvcModel?.filename : "",
ddspSvcModelConfig: fileUploadSetting.ddspSvcModelConfig?.filename ? "ddsp_mod/" + fileUploadSetting.ddspSvcModelConfig?.filename : "",
ddspSvcDiffusion: fileUploadSetting.ddspSvcDiffusion?.filename ? "ddsp_diff/" + fileUploadSetting.ddspSvcDiffusion?.filename : "",
ddspSvcDiffusionConfig: fileUploadSetting.ddspSvcDiffusionConfig?.filename ? "ddsp_diff/" + fileUploadSetting.ddspSvcDiffusionConfig.filename : "",
}
})
if (fileUploadSetting.isHalf == undefined) {
fileUploadSetting.isHalf = false
}
const pyTorchModel = fileUploadSetting.pyTorchModel?.filename || null
const onnxModel = fileUploadSetting.onnxModel?.filename || null
const clusterTorchModel = fileUploadSetting.clusterTorchModel?.filename || null
const feature = fileUploadSetting.feature?.filename || null
const index = fileUploadSetting.index?.filename || null
const loadPromise = props.voiceChangerClient.loadModel(
slot,
configFileName,
fileUploadSetting.pyTorchModel?.filename || null,
fileUploadSetting.onnxModel?.filename || null,
fileUploadSetting.clusterTorchModel?.filename || null,
fileUploadSetting.feature?.filename || null,
fileUploadSetting.index?.filename || null,
pyTorchModel,
onnxModel,
clusterTorchModel,
feature,
index,
fileUploadSetting.isHalf,
params,
)
@ -332,7 +395,11 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
uploaded: false, // キャッシュから読み込まれるときには、まだuploadされていないから。
defaultTune: fileUploadSetting.defaultTune,
framework: fileUploadSetting.framework,
params: fileUploadSetting.params
params: fileUploadSetting.params,
ddspSvcModel: fileUploadSetting.ddspSvcModel ? { data: fileUploadSetting.ddspSvcModel.data, filename: fileUploadSetting.ddspSvcModel.filename } : null,
ddspSvcModelConfig: fileUploadSetting.ddspSvcModelConfig ? { data: fileUploadSetting.ddspSvcModelConfig.data, filename: fileUploadSetting.ddspSvcModelConfig.filename } : null,
ddspSvcDiffusion: fileUploadSetting.ddspSvcDiffusion ? { data: fileUploadSetting.ddspSvcDiffusion.data, filename: fileUploadSetting.ddspSvcDiffusion.filename } : null,
ddspSvcDiffusionConfig: fileUploadSetting.ddspSvcDiffusionConfig ? { data: fileUploadSetting.ddspSvcDiffusionConfig.data, filename: fileUploadSetting.ddspSvcDiffusionConfig.filename } : null,
}
setItem(`${INDEXEDDB_KEY_MODEL_DATA}_${slot}`, saveData)
} catch (e) {

View File

@ -9,7 +9,7 @@
"editor.formatOnSave": true //
},
"flake8.args": [
"--ignore=E501,E402,E722,E741,W503"
"--ignore=E501,E402,E722,E741,E203,W503"
// "--max-line-length=150",
// "--max-complexity=20"
]

View File

@ -170,6 +170,34 @@ if __name__ == "MMVCServerSIO":
"position": 1,
}
)
if os.path.exists(voiceChangerParams.hubert_soft) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights/resolve/main/ddsp-svc30/embedder/hubert-soft-0d54a1f4.pt",
"saveTo": voiceChangerParams.hubert_soft,
"position": 2,
}
)
if os.path.exists(voiceChangerParams.nsf_hifigan) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights/resolve/main/ddsp-svc30/nsf_hifigan_20221211/model.bin",
"saveTo": voiceChangerParams.nsf_hifigan,
"position": 3,
}
)
nsf_hifigan_config = os.path.join(
os.path.dirname(voiceChangerParams.nsf_hifigan), "config.json"
)
if os.path.exists(nsf_hifigan_config) is False:
downloadParams.append(
{
"url": "https://huggingface.co/wok000/weights/raw/main/ddsp-svc30/nsf_hifigan_20221211/config.json",
"saveTo": nsf_hifigan_config,
"position": 4,
}
)
with ThreadPoolExecutor() as pool:
pool.map(download, downloadParams)

View File

@ -19,7 +19,7 @@ class ValidationErrorLoggingRoute(APIRoute):
async def custom_route_handler(request: Request) -> Response:
try:
return await original_route_handler(request)
except RequestValidationError as exc:
except RequestValidationError as exc: # type: ignore
print("Exception", request.url, str(exc))
body = await request.body()
detail = {"errors": exc.errors(), "body": body.decode()}

View File

@ -1,3 +1,4 @@
import json
import os
import shutil
from typing import Union
@ -49,10 +50,7 @@ class MMVC_Rest_Fileuploader:
def post_concat_uploaded_file(
self, filename: str = Form(...), filenameChunkNum: int = Form(...)
):
slot = 0
res = concat_file_chunks(
slot, UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR
)
res = concat_file_chunks(UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR)
json_compatible_item_data = jsonable_encoder(res)
return JSONResponse(content=json_compatible_item_data)
@ -94,23 +92,38 @@ class MMVC_Rest_Fileuploader:
featureFilename=featureFilename,
indexFilename=indexFilename,
)
props: LoadModelParams = LoadModelParams(
slot=slot, isHalf=isHalf, params=params, files=files
)
paramDict = json.loads(params)
print("paramDict", paramDict)
# Change Filepath
for field in fields(props.files):
for field in fields(files):
key = field.name
val = getattr(props.files, key)
val = getattr(files, key)
if val != "-":
uploadPath = os.path.join(UPLOAD_DIR, val)
storeDir = os.path.join(UPLOAD_DIR, f"{slot}")
storePath = os.path.join(UPLOAD_DIR, f"{slot}", val)
storeDir = os.path.dirname(storePath)
os.makedirs(storeDir, exist_ok=True)
storePath = os.path.join(storeDir, val)
shutil.move(uploadPath, storePath)
setattr(props.files, key, storePath)
setattr(files, key, storePath)
else:
setattr(props.files, key, None)
setattr(files, key, None)
newFilesDict = {}
for key, val in paramDict["files"].items():
if val != "-" and val != "":
uploadPath = os.path.join(UPLOAD_DIR, val)
storePath = os.path.join(UPLOAD_DIR, f"{slot}", val)
storeDir = os.path.dirname(storePath)
os.makedirs(storeDir, exist_ok=True)
shutil.move(uploadPath, storePath)
newFilesDict[key] = storePath
paramDict["files"] = newFilesDict
props: LoadModelParams = LoadModelParams(
slot=slot, isHalf=isHalf, params=paramDict, files=files
)
info = self.voiceChangerManager.loadModel(props)
json_compatible_item_data = jsonable_encoder(info)

View File

@ -8,7 +8,10 @@ from fastapi import UploadFile
def upload_file(upload_dirname: str, file: UploadFile, filename: str):
if file and filename:
fileobj = file.file
upload_dir = open(os.path.join(upload_dirname, filename), 'wb+')
target_path = os.path.join(upload_dirname, filename)
target_dir = os.path.dirname(target_path)
os.makedirs(target_dir, exist_ok=True)
upload_dir = open(target_path, "wb+")
shutil.copyfileobj(fileobj, upload_dir)
upload_dir.close()
@ -16,20 +19,21 @@ def upload_file(upload_dirname: str, file: UploadFile, filename: str):
return {"status": "ERROR", "msg": "uploaded file is not found."}
def concat_file_chunks(slot: int, upload_dirname: str, filename: str, chunkNum: int, dest_dirname: str):
# target_dir = os.path.join(dest_dirname, f"{slot}")
target_dir = os.path.join(dest_dirname)
def concat_file_chunks(
upload_dirname: str, filename: str, chunkNum: int, dest_dirname: str
):
target_path = os.path.join(upload_dirname, filename)
target_dir = os.path.dirname(target_path)
os.makedirs(target_dir, exist_ok=True)
target_file_name = os.path.join(target_dir, filename)
if os.path.exists(target_file_name):
os.remove(target_file_name)
with open(target_file_name, "ab") as target_file:
if os.path.exists(target_path):
os.remove(target_path)
with open(target_path, "ab") as out:
for i in range(chunkNum):
chunkName = f"{filename}_{i}"
chunk_file_path = os.path.join(upload_dirname, chunkName)
stored_chunk_file = open(chunk_file_path, 'rb')
target_file.write(stored_chunk_file.read())
stored_chunk_file = open(chunk_file_path, "rb")
out.write(stored_chunk_file.read())
stored_chunk_file.close()
os.remove(chunk_file_path)
target_file.close()
return {"status": "OK", "msg": f"concat files {target_file_name} "}
out.close()
return {"status": "OK", "msg": f"concat files {out} "}

View File

@ -1,4 +1,5 @@
import os,glob
import os
import glob
# def get_file_list(top_dir):
@ -20,5 +21,6 @@ def get_dir_list(top_dir):
dirlist.append(filename)
return dirlist
# Return the list of paths matched by glob.glob(top_dir).
# NOTE(review): despite its name, `top_dir` is passed to glob unchanged, so it is
# treated as a glob pattern rather than a directory to walk — confirm with callers.
def get_file_list(top_dir):
return glob.glob(top_dir)

View File

@ -1,9 +1,11 @@
import json
import sys
import os
from voice_changer.utils.LoadModelParams import LoadModelParams
from voice_changer.utils.VoiceChangerModel import AudioInOut
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from dataclasses import asdict
import numpy as np
import torch
from torchaudio.transforms import Resample
from torch.nn import functional as F
if sys.platform.startswith("darwin"):
baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
@ -15,14 +17,19 @@ if sys.platform.startswith("darwin"):
else:
sys.path.append("DDSP-SVC")
from dataclasses import dataclass, asdict, field
import numpy as np
import torch
import ddsp.vocoder as vo # type:ignore
from ddsp.core import upsample # type:ignore
from enhancer import Enhancer # type:ignore
from diffusion.infer_gt_mel import DiffGtMel # type: ignore
from voice_changer.utils.VoiceChangerModel import AudioInOut
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from voice_changer.utils.LoadModelParams import LoadModelParams
from voice_changer.DDSP_SVC.DDSP_SVCSetting import DDSP_SVCSettings
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from Exceptions import NoModeLoadedException
from voice_changer.DDSP_SVC.SvcDDSP import SvcDDSP
providers = [
"OpenVINOExecutionProvider",
@ -32,197 +39,192 @@ providers = [
]
@dataclass
class DDSP_SVCSettings:
gpu: int = 0
dstId: int = 0
f0Detector: str = "dio" # dio or harvest # parselmouth
tran: int = 20
predictF0: int = 0 # 0:False, 1:True
silentThreshold: float = 0.00001
extraConvertSize: int = 1024 * 32
enableEnhancer: int = 0
enhancerTune: int = 0
framework: str = "PyTorch" # PyTorch or ONNX
pyTorchModelFile: str = ""
onnxModelFile: str = ""
configFile: str = ""
speakers: dict[str, int] = field(default_factory=lambda: {})
# ↓mutableな物だけ列挙
intData = [
"gpu",
"dstId",
"tran",
"predictF0",
"extraConvertSize",
"enableEnhancer",
"enhancerTune",
]
floatData = ["silentThreshold", "clusterInferRatio"]
strData = ["framework", "f0Detector"]
# Blend two audio segments `a` and `b` using their spectra: the result is
# a * fade_out^2 + b * fade_in^2 plus a synthesized cross term weighted by
# fade_out * fade_in.
# NOTE(review): assumes `a` and `b` are 1-D torch tensors of equal length n and
# that `fade_out` / `fade_in` broadcast against a length-n tensor — TODO confirm.
def phase_vocoder(a, b, fade_out, fade_in):
# One-sided FFTs of both segments.
fa = torch.fft.rfft(a)
fb = torch.fft.rfft(b)
# Sum of the two magnitude spectra.
absab = torch.abs(fa) + torch.abs(fb)
n = a.shape[0]
# Double every bin except the first; for even n the last bin is also left as is.
if n % 2 == 0:
absab[1:-1] *= 2
else:
absab[1:] *= 2
phia = torch.angle(fa)
phib = torch.angle(fb)
# Per-bin phase difference, wrapped into [-pi, pi).
deltaphase = phib - phia
deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5)
# Per-bin angular advance across the segment: 2*pi*k plus the wrapped phase difference.
w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase
# Normalized sample positions 0 .. (n-1)/n as a column, so w * t has one row per sample.
t = torch.arange(n).unsqueeze(-1).to(a) / n
# Cross term: for each sample, sum over bins of magnitude * cos(w * t + phase of a), divided by n.
result = (
a * (fade_out**2)
+ b * (fade_in**2)
+ torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n
)
return result
class DDSP_SVC:
initialLoad: bool = True
settings: DDSP_SVCSettings = DDSP_SVCSettings()
diff_model: DiffGtMel = DiffGtMel()
svc_model: SvcDDSP = SvcDDSP()
# diff_model: DiffGtMel = DiffGtMel()
audio_buffer: AudioInOut | None = None
prevVol: float = 0
# resample_kernel = {}
def __init__(self, params: VoiceChangerParams):
self.settings = DDSP_SVCSettings()
self.net_g = None
self.onnx_session = None
self.gpu_num = torch.cuda.device_count()
self.prevVol = 0
self.params = params
self.svc_model.setVCParams(params)
EmbedderManager.initialize(params)
print("DDSP-SVC initialization:", params)
def useDevice(self):
if self.settings.gpu >= 0 and torch.cuda.is_available():
return torch.device("cuda", index=self.settings.gpu)
else:
return torch.device("cpu")
# def useDevice(self):
# if self.settings.gpu >= 0 and torch.cuda.is_available():
# return torch.device("cuda", index=self.settings.gpu)
# else:
# return torch.device("cpu")
def loadModel(self, props: LoadModelParams):
self.settings.pyTorchModelFile = props.files.pyTorchModelFilename
# model
model, args = vo.load_model(
self.settings.pyTorchModelFile, device=self.useDevice()
)
self.model = model
self.args = args
self.sampling_rate = args.data.sampling_rate
self.hop_size = int(
self.args.data.block_size
* self.sampling_rate
/ self.args.data.sampling_rate
)
# target_slot_idx = props.slot
self.device = torch.device("cuda", index=0)
params = props.params
# hubert
self.vec_path = self.params.hubert_soft
self.encoder = vo.Units_Encoder(
self.args.data.encoder,
self.vec_path,
self.args.data.encoder_sample_rate,
self.args.data.encoder_hop_size,
device=self.useDevice(),
)
modelFile = params["files"]["ddspSvcModel"]
diffusionFile = params["files"]["ddspSvcDiffusion"]
self.svc_model.update_model(modelFile)
# ort_options = onnxruntime.SessionOptions()
# ort_options.intra_op_num_threads = 8
# self.onnx_session = onnxruntime.InferenceSession(
# "model_DDSP-SVC/hubert4.0.onnx",
# providers=providers
print("diffusion file", diffusionFile)
self.diff_model.flush_model(diffusionFile, ddsp_config=self.svc_model.args)
print("params:", params)
# print("params_arg:", self.args)
# self.settings.pyTorchModelFile = props.files.pyTorchModelFilename
# # model
# model, args = vo.load_model(
# self.settings.pyTorchModelFile, device=self.useDevice()
# )
# self.model = model
# self.args = args
# self.sampling_rate = args.data.sampling_rate
# self.hop_size = int(
# self.args.data.block_size
# * self.sampling_rate
# / self.args.data.sampling_rate
# )
# inputs = self.onnx_session.get_inputs()
# outputs = self.onnx_session.get_outputs()
# for input in inputs:
# print("input::::", input)
# for output in outputs:
# print("output::::", output)
# f0dec
self.f0_detector = vo.F0_Extractor(
# "crepe",
self.settings.f0Detector,
self.sampling_rate,
self.hop_size,
float(50),
float(1100),
)
# # hubert
# self.vec_path = self.params.hubert_soft
# self.encoder = vo.Units_Encoder(
# self.args.data.encoder,
# self.vec_path,
# self.args.data.encoder_sample_rate,
# self.args.data.encoder_hop_size,
# device=self.useDevice(),
# )
self.volume_extractor = vo.Volume_Extractor(self.hop_size)
self.enhancer_path = self.params.nsf_hifigan
self.enhancer = Enhancer(
self.args.enhancer.type, self.enhancer_path, device=self.useDevice()
)
# # f0dec
# self.f0_detector = vo.F0_Extractor(
# # "crepe",
# self.settings.f0Detector,
# self.sampling_rate,
# self.hop_size,
# float(50),
# float(1100),
# )
# self.volume_extractor = vo.Volume_Extractor(self.hop_size)
# self.enhancer_path = self.params.nsf_hifigan
# self.enhancer = Enhancer(
# self.args.enhancer.type, self.enhancer_path, device=self.useDevice()
# )
return self.get_info()
def update_settings(self, key: str, val: int | float | str):
if key == "onnxExecutionProvider" and self.onnx_session is not None:
if val == "CUDAExecutionProvider":
if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
self.settings.gpu = 0
provider_options = [{"device_id": self.settings.gpu}]
self.onnx_session.set_providers(
providers=[val], provider_options=provider_options
)
else:
self.onnx_session.set_providers(providers=[val])
elif key in self.settings.intData:
val = int(val)
setattr(self.settings, key, val)
if (
key == "gpu"
and val >= 0
and val < self.gpu_num
and self.onnx_session is not None
):
providers = self.onnx_session.get_providers()
print("Providers:", providers)
if "CUDAExecutionProvider" in providers:
provider_options = [{"device_id": self.settings.gpu}]
self.onnx_session.set_providers(
providers=["CUDAExecutionProvider"],
provider_options=provider_options,
)
if key == "gpu" and len(self.settings.pyTorchModelFile) > 0:
model, _args = vo.load_model(
self.settings.pyTorchModelFile, device=self.useDevice()
)
self.model = model
self.enhancer = Enhancer(
self.args.enhancer.type, self.enhancer_path, device=self.useDevice()
)
self.encoder = vo.Units_Encoder(
self.args.data.encoder,
self.vec_path,
self.args.data.encoder_sample_rate,
self.args.data.encoder_hop_size,
device=self.useDevice(),
)
# if key == "onnxExecutionProvider" and self.onnx_session is not None:
# if val == "CUDAExecutionProvider":
# if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
# self.settings.gpu = 0
# provider_options = [{"device_id": self.settings.gpu}]
# self.onnx_session.set_providers(
# providers=[val], provider_options=provider_options
# )
# else:
# self.onnx_session.set_providers(providers=[val])
# elif key in self.settings.intData:
# val = int(val)
# setattr(self.settings, key, val)
# if (
# key == "gpu"
# and val >= 0
# and val < self.gpu_num
# and self.onnx_session is not None
# ):
# providers = self.onnx_session.get_providers()
# print("Providers:", providers)
# if "CUDAExecutionProvider" in providers:
# provider_options = [{"device_id": self.settings.gpu}]
# self.onnx_session.set_providers(
# providers=["CUDAExecutionProvider"],
# provider_options=provider_options,
# )
# if key == "gpu" and len(self.settings.pyTorchModelFile) > 0:
# model, _args = vo.load_model(
# self.settings.pyTorchModelFile, device=self.useDevice()
# )
# self.model = model
# self.enhancer = Enhancer(
# self.args.enhancer.type, self.enhancer_path, device=self.useDevice()
# )
# self.encoder = vo.Units_Encoder(
# self.args.data.encoder,
# self.vec_path,
# self.args.data.encoder_sample_rate,
# self.args.data.encoder_hop_size,
# device=self.useDevice(),
# )
elif key in self.settings.floatData:
setattr(self.settings, key, float(val))
elif key in self.settings.strData:
setattr(self.settings, key, str(val))
if key == "f0Detector":
print("f0Detector update", val)
# if val == "dio":
# val = "parselmouth"
# elif key in self.settings.floatData:
# setattr(self.settings, key, float(val))
# elif key in self.settings.strData:
# setattr(self.settings, key, str(val))
# if key == "f0Detector":
# print("f0Detector update", val)
# # if val == "dio":
# # val = "parselmouth"
if hasattr(self, "sampling_rate") is False:
self.sampling_rate = 44100
self.hop_size = 512
# if hasattr(self, "sampling_rate") is False:
# self.sampling_rate = 44100
# self.hop_size = 512
self.f0_detector = vo.F0_Extractor(
val, self.sampling_rate, self.hop_size, float(50), float(1100)
)
else:
return False
# self.f0_detector = vo.F0_Extractor(
# val, self.sampling_rate, self.hop_size, float(50), float(1100)
# )
# else:
# return False
return True
def get_info(self):
data = asdict(self.settings)
# data = asdict(self.settings)
data["onnxExecutionProviders"] = (
self.onnx_session.get_providers() if self.onnx_session is not None else []
)
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
for f in files:
if data[f] is not None and os.path.exists(data[f]):
data[f] = os.path.basename(data[f])
else:
data[f] = ""
# data["onnxExecutionProviders"] = (
# self.onnx_session.get_providers() if self.onnx_session is not None else []
# )
# files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
# for f in files:
# if data[f] is not None and os.path.exists(data[f]):
# data[f] = os.path.basename(data[f])
# else:
# data[f] = ""
data = {}
return data
def get_processing_sampling_rate(self):
return self.sampling_rate
return self.svc_model.args.data.sampling_rate
def generate_input(
self,
@ -232,6 +234,7 @@ class DDSP_SVC:
solaSearchFrame: int = 0,
):
newData = newData.astype(np.float32) / 32768.0
# newData = newData.astype(np.float32)
if self.audio_buffer is not None:
self.audio_buffer = np.concatenate(
@ -244,96 +247,186 @@ class DDSP_SVC:
inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
)
if convertSize % self.hop_size != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。
convertSize = convertSize + (self.hop_size - (convertSize % self.hop_size))
# if convertSize % self.hop_size != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。
# convertSize = convertSize + (self.hop_size - (convertSize % self.hop_size))
convertOffset = -1 * convertSize
self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出
# f0
f0 = self.f0_detector.extract(
self.audio_buffer * 32768.0,
uv_interp=True,
silence_front=self.settings.extraConvertSize / self.sampling_rate,
)
f0 = torch.from_numpy(f0).float().unsqueeze(-1).unsqueeze(0)
f0 = f0 * 2 ** (float(self.settings.tran) / 12)
# # f0
# f0 = self.f0_detector.extract(
# self.audio_buffer * 32768.0,
# uv_interp=True,
# silence_front=self.settings.extraConvertSize / self.sampling_rate,
# )
# f0 = torch.from_numpy(f0).float().unsqueeze(-1).unsqueeze(0)
# f0 = f0 * 2 ** (float(self.settings.tran) / 12)
# volume, mask
volume = self.volume_extractor.extract(self.audio_buffer)
mask = (volume > 10 ** (float(-60) / 20)).astype("float")
mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
mask = np.array(
[np.max(mask[n : n + 9]) for n in range(len(mask) - 8)] # noqa: E203
)
mask = torch.from_numpy(mask).float().unsqueeze(-1).unsqueeze(0)
mask = upsample(mask, self.args.data.block_size).squeeze(-1)
volume = torch.from_numpy(volume).float().unsqueeze(-1).unsqueeze(0)
# # volume, mask
# volume = self.volume_extractor.extract(self.audio_buffer)
# mask = (volume > 10 ** (float(-60) / 20)).astype("float")
# mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
# mask = np.array(
# [np.max(mask[n : n + 9]) for n in range(len(mask) - 8)] # noqa: E203
# )
# mask = torch.from_numpy(mask).float().unsqueeze(-1).unsqueeze(0)
# mask = upsample(mask, self.args.data.block_size).squeeze(-1)
# volume = torch.from_numpy(volume).float().unsqueeze(-1).unsqueeze(0)
# embed
audio = (
torch.from_numpy(self.audio_buffer)
.float()
.to(self.useDevice())
.unsqueeze(0)
)
seg_units = self.encoder.encode(audio, self.sampling_rate, self.hop_size)
# # embed
# audio = (
# torch.from_numpy(self.audio_buffer)
# .float()
# .to(self.useDevice())
# .unsqueeze(0)
# )
# seg_units = self.encoder.encode(audio, self.sampling_rate, self.hop_size)
cropOffset = -1 * (inputSize + crossfadeSize)
cropEnd = -1 * (crossfadeSize)
crop = self.audio_buffer[cropOffset:cropEnd]
# cropOffset = -1 * (inputSize + crossfadeSize)
# cropEnd = -1 * (crossfadeSize)
# crop = self.audio_buffer[cropOffset:cropEnd]
rms = np.sqrt(np.square(crop).mean(axis=0))
vol = max(rms, self.prevVol * 0.0)
self.prevVol = vol
# rms = np.sqrt(np.square(crop).mean(axis=0))
# vol = max(rms, self.prevVol * 0.0)
# self.prevVol = vol
return (seg_units, f0, volume, mask, convertSize, vol)
return (self.audio_buffer, inputSize, crossfadeSize, solaSearchFrame)
def _onnx_inference(self, data):
if hasattr(self, "onnx_session") is False or self.onnx_session is None:
print("[Voice Changer] No onnx session.")
raise NoModeLoadedException("ONNX")
# def _onnx_inference(self, data):
# if hasattr(self, "onnx_session") is False or self.onnx_session is None:
# print("[Voice Changer] No onnx session.")
# raise NoModeLoadedException("ONNX")
raise NoModeLoadedException("ONNX")
# raise NoModeLoadedException("ONNX")
def _pyTorch_inference(self, data):
if hasattr(self, "model") is False or self.model is None:
print("[Voice Changer] No pyTorch session.")
raise NoModeLoadedException("pytorch")
# if hasattr(self, "model") is False or self.model is None:
# print("[Voice Changer] No pyTorch session.")
# raise NoModeLoadedException("pytorch")
c = data[0].to(self.useDevice())
f0 = data[1].to(self.useDevice())
volume = data[2].to(self.useDevice())
mask = data[3].to(self.useDevice())
input_wav = data[0]
# inputSize = data[1]
# crossfadeSize = data[2]
# solaSearchFrame = data[3]
# last_delay_frame = int(0.02 * self.svc_model.args.data.sampling_rate)
# convertSize = data[4]
# vol = data[5]
# if vol < self.settings.silentThreshold:
# print("threshold")
# return np.zeros(convertSize).astype(np.int16)
# fade_in_window = (
# torch.sin(
# np.pi * torch.arange(0, 1, 1 / crossfadeSize, device=self.device) / 2
# )
# ** 2
# )
# fade_out_window = 1 - fade_in_window
with torch.no_grad():
spk_id = torch.LongTensor(np.array([[self.settings.dstId]])).to(
self.useDevice()
)
seg_output, _, (s_h, s_n) = self.model(
c, f0, volume, spk_id=spk_id, spk_mix_dict=None
)
seg_output *= mask
if self.settings.enableEnhancer:
seg_output, output_sample_rate = self.enhancer.enhance(
seg_output,
self.args.data.sampling_rate,
f0,
self.args.data.block_size,
# adaptive_key=float(self.settings.enhancerTune),
adaptive_key="auto",
silence_front=self.settings.extraConvertSize / self.sampling_rate,
_audio, _model_sr = self.svc_model.infer(
input_wav,
44100,
spk_id=1,
threhold=-45,
pitch_adjust=10,
use_spk_mix=False,
spk_mix_dict=None,
use_enhancer=False,
pitch_extractor_type="harvest",
f0_min=50,
f0_max=1100,
safe_prefix_pad_length=0, # TBD なにこれ?
diff_model=self.diff_model,
diff_acc=20, # TBD なにこれ?
diff_spk_id=1,
diff_use=True,
diff_use_dpm=False, # TBD なにこれ?
k_step=120, # TBD なにこれ?
diff_silence=False, # TBD なにこれ?
)
result = seg_output.squeeze().cpu().numpy() * 32768.0
return np.array(result).astype(np.int16)
print(" _model_sr", _model_sr)
print("_audio", _audio.shape)
print("_audio", _audio)
return _audio.cpu().numpy() * 32768.0
# if _model_sr != self.svc_model.args.data.sampling_rate:
# key_str = str(_model_sr) + "_" + str(self.svc_model.args.data.sampling_rate)
# if key_str not in self.resample_kernel:
# self.resample_kernel[key_str] = Resample(
# _model_sr,
# self.svc_model.args.data.sampling_rate,
# lowpass_filter_width=128,
# ).to(self.device)
# _audio = self.resample_kernel[key_str](_audio)
# temp_wav = _audio[
# -inputSize
# - crossfadeSize
# - solaSearchFrame
# - last_delay_frame : -last_delay_frame
# ]
# # sola shift
# conv_input = temp_wav[None, None, : crossfadeSize + solaSearchFrame]
# cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
# cor_den = torch.sqrt(
# F.conv1d(
# conv_input**2,
# torch.ones(1, 1, crossfadeSize, device=self.device),
# )
# + 1e-8
# )
# sola_shift = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
# temp_wav = temp_wav[sola_shift : sola_shift + inputSize + crossfadeSize]
# print("sola_shift: " + str(int(sola_shift)))
# # phase vocoder
# # if self.config.use_phase_vocoder:
# if False:
# temp_wav[:crossfadeSize] = phase_vocoder(
# self.sola_buffer,
# temp_wav[:crossfadeSize],
# fade_out_window,
# fade_in_window,
# )
# else:
# temp_wav[:crossfadeSize] *= fade_in_window
# temp_wav[:crossfadeSize] += self.sola_buffer * fade_out_window
# self.sola_buffer = temp_wav[-crossfadeSize:]
# result = temp_wav[:-crossfadeSize, None].repeat(1, 2).cpu().numpy()
###########################################
# c = data[0].to(self.useDevice())
# f0 = data[1].to(self.useDevice())
# volume = data[2].to(self.useDevice())
# mask = data[3].to(self.useDevice())
# # convertSize = data[4]
# # vol = data[5]
# # if vol < self.settings.silentThreshold:
# # print("threshold")
# # return np.zeros(convertSize).astype(np.int16)
# with torch.no_grad():
# spk_id = torch.LongTensor(np.array([[self.settings.dstId]])).to(
# self.useDevice()
# )
# seg_output, _, (s_h, s_n) = self.model(
# c, f0, volume, spk_id=spk_id, spk_mix_dict=None
# )
# seg_output *= mask
# if self.settings.enableEnhancer:
# seg_output, output_sample_rate = self.enhancer.enhance(
# seg_output,
# self.args.data.sampling_rate,
# f0,
# self.args.data.block_size,
# # adaptive_key=float(self.settings.enhancerTune),
# adaptive_key="auto",
# silence_front=self.settings.extraConvertSize / self.sampling_rate,
# )
# result = seg_output.squeeze().cpu().numpy() * 32768.0
# return np.array(result).astype(np.int16)
def inference(self, data):
if self.settings.framework == "ONNX":
@ -342,9 +435,9 @@ class DDSP_SVC:
audio = self._pyTorch_inference(data)
return audio
def destroy(self):
del self.net_g
del self.onnx_session
# def destroy(self):
# del self.net_g
# del self.onnx_session
def __del__(self):
del self.net_g

View File

@ -0,0 +1,36 @@
from dataclasses import dataclass, field
@dataclass
class DDSP_SVCSettings:
    """Runtime settings for the DDSP-SVC voice changer.

    The annotated attributes are dataclass fields (and therefore part of
    ``asdict()`` output). ``intData`` / ``floatData`` / ``strData`` are plain
    class attributes, not fields: they list the field names that may be
    changed at runtime, grouped by the type the incoming value is cast to.
    """

    gpu: int = 0
    dstId: int = 0

    f0Detector: str = "dio"  # dio or harvest # parselmouth
    tran: int = 20
    predictF0: int = 0  # 0:False, 1:True
    silentThreshold: float = 0.00001
    extraConvertSize: int = 1024 * 32

    enableEnhancer: int = 0
    enhancerTune: int = 0

    framework: str = "PyTorch"  # PyTorch or ONNX
    pyTorchModelFile: str = ""
    onnxModelFile: str = ""
    configFile: str = ""

    speakers: dict[str, int] = field(default_factory=dict)

    # Only the mutable settings are listed below.
    intData = [
        "gpu",
        "dstId",
        "tran",
        "predictF0",
        "extraConvertSize",
        "enableEnhancer",
        "enhancerTune",
    ]
    # Every name listed here must be a field of this class; "clusterInferRatio"
    # was removed because no such field exists on DDSP_SVCSettings.
    floatData = ["silentThreshold"]
    strData = ["framework", "f0Detector"]

View File

@ -0,0 +1,16 @@
from const import EnumInferenceTypes, EnumEmbedderTypes
from dataclasses import dataclass
@dataclass
class ModelSlot:
    """File names and default transpose value held for one DDSP-SVC model slot.

    All fields default to "empty" values so a slot can be created first and
    filled in afterwards.
    """

    # Presumably the path of the DDSP model checkpoint -- confirm against the loader.
    pyTorchModelFile: str = ""
    # Presumably the path of the companion diffusion checkpoint -- confirm against the loader.
    pyTorchDiffusionModelFile: str = ""
    defaultTrans: int = 0

    # Fields that are not enabled yet (kept for reference):
    # modelType: EnumDDSPSVCInferenceTypes = EnumDDSPSVCInferenceTypes.pyTorchRVC
    # samplingRate: int = -1
    # f0: bool = True
    # embChannels: int = 256
    # deprecated: bool = False
    # embedder: EnumEmbedderTypes = EnumEmbedderTypes.hubert

View File

@ -0,0 +1,107 @@
from const import EnumEmbedderTypes, EnumInferenceTypes
from voice_changer.RVC.ModelSlot import ModelSlot
from voice_changer.utils.LoadModelParams import FilePaths
import torch
import onnxruntime
import json
def generateModelSlot(files: FilePaths, params):
    """Create a ModelSlot for the given files and fill in the model metadata.

    Args:
        files: container providing ``pyTorchModelFilename``,
            ``onnxModelFilename``, ``featureFilename`` and ``indexFilename``.
        params: container of load parameters; ``params["trans"]`` is used as
            the default transpose when the key is present, otherwise 0.

    Returns:
        The populated ModelSlot. The slot is treated as ONNX whenever
        ``files.onnxModelFilename`` is not None.
    """
    slot = ModelSlot()
    slot.pyTorchModelFile = files.pyTorchModelFilename
    slot.onnxModelFile = files.onnxModelFilename
    slot.featureFile = files.featureFilename
    slot.indexFile = files.indexFilename

    default_trans = 0
    if "trans" in params:
        default_trans = params["trans"]
    slot.defaultTrans = default_trans

    slot.isONNX = slot.onnxModelFile is not None

    # Read the remaining metadata from whichever model file is in use.
    if slot.isONNX:
        _setInfoByONNX(slot, slot.onnxModelFile)
        return slot

    _setInfoByPytorch(slot, slot.pyTorchModelFile)
    return slot
def _setInfoByPytorch(slot: ModelSlot, file: str):
    """Fill in the slot's metadata by inspecting a PyTorch checkpoint.

    The checkpoint must contain ``"config"`` (a sequence) and ``"f0"``. A
    config of length 18 is treated as an original-RVC model (256 embedding
    channels, hubert embedder); any other length is treated as a WebUI model,
    whose embedding channel count is ``config[17]`` and whose embedder comes
    from ``"embedder_name"`` (a trailing ``"768"`` is stripped).

    Args:
        slot: the ModelSlot to update in place (``f0``, ``modelType``,
            ``embChannels``, ``embedder``, ``samplingRate``).
        file: path of the checkpoint file.

    Raises:
        RuntimeError: if the checkpoint names an embedder that is not one of
            hubert, contentvec or hubert_jp.
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source.
    cpt = torch.load(file, map_location="cpu")
    config = cpt["config"]

    # Both model families derive f0 the same way.
    slot.f0 = bool(cpt["f0"] == 1)

    if len(config) == 18:
        slot.modelType = (
            EnumInferenceTypes.pyTorchRVC
            if slot.f0
            else EnumInferenceTypes.pyTorchRVCNono
        )
        slot.embChannels = 256
        slot.embedder = EnumEmbedderTypes.hubert
    else:
        slot.modelType = (
            EnumInferenceTypes.pyTorchWebUI
            if slot.f0
            else EnumInferenceTypes.pyTorchWebUINono
        )
        slot.embChannels = config[17]

        embedder_name = cpt["embedder_name"]
        if embedder_name.endswith("768"):
            embedder_name = embedder_name[:-3]

        for candidate in (
            EnumEmbedderTypes.hubert,
            EnumEmbedderTypes.contentvec,
            EnumEmbedderTypes.hubert_jp,
        ):
            if embedder_name == candidate.value:
                slot.embedder = candidate
                break
        else:
            # The message previously named setInfoByONNX, pointing at the
            # wrong loader.
            raise RuntimeError(
                f"[Voice Changer][setInfoByPytorch] unknown embedder: {embedder_name}"
            )

    # The sampling rate is the last entry of the config.
    slot.samplingRate = config[-1]
def _setInfoByONNX(slot: ModelSlot, file: str):
    """Fill in the slot's metadata from the custom metadata of an ONNX model.

    The model is opened on the CPU provider and the JSON stored under the
    ``"metadata"`` key of its custom metadata map is read. If anything goes
    wrong while reading it (missing keys, invalid JSON, unknown embedder),
    the slot falls back to fixed defaults, is marked ``deprecated`` and a
    warning is printed.

    Args:
        slot: the ModelSlot to update in place.
        file: path of the ONNX model file.
    """
    session = onnxruntime.InferenceSession(file, providers=["CPUExecutionProvider"])
    model_meta = session.get_modelmeta()
    try:
        meta = json.loads(model_meta.custom_metadata_map["metadata"])

        # slot.modelType = metadata["modelType"]
        slot.embChannels = meta["embChannels"]

        if "embedder" not in meta:
            # Models without an embedder entry are treated as hubert.
            slot.embedder = EnumEmbedderTypes.hubert
        else:
            for known in (
                EnumEmbedderTypes.hubert,
                EnumEmbedderTypes.contentvec,
                EnumEmbedderTypes.hubert_jp,
            ):
                if meta["embedder"] == known.value:
                    slot.embedder = known
                    break
            else:
                raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder")

        slot.f0 = meta["f0"]
        if slot.f0:
            slot.modelType = EnumInferenceTypes.onnxRVC
        else:
            slot.modelType = EnumInferenceTypes.onnxRVCNono
        slot.samplingRate = meta["samplingRate"]
        slot.deprecated = False

    except Exception as err:
        # Fallback values used when the metadata cannot be read.
        slot.modelType = EnumInferenceTypes.onnxRVC
        slot.embChannels = 256
        slot.embedder = EnumEmbedderTypes.hubert
        slot.f0 = True
        slot.samplingRate = 48000
        slot.deprecated = True

        print("[Voice Changer] setInfoByONNX", err)
        print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
        print("[Voice Changer] This onnxfie is depricated. Please regenerate onnxfile.")
        print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")

    del session

View File

@ -0,0 +1,197 @@
# original from: https://raw.githubusercontent.com/yxlllc/DDSP-SVC/master/gui_diff.py
import torch
from ddsp.vocoder import load_model, F0_Extractor, Volume_Extractor, Units_Encoder # type: ignore
from ddsp.core import upsample # type: ignore
from enhancer import Enhancer # type: ignore
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
import numpy as np
class SvcDDSP:
    """Holds a DDSP-SVC model with its units encoder and enhancer, and runs inference.

    Typical use: ``setVCParams()`` once, ``update_model()`` whenever the model
    file changes, then ``infer()`` per audio chunk.
    """

    def __init__(self) -> None:
        self.model = None
        self.model_path = None
        self.args = None
        self.params = None
        self.device = None
        self.units_encoder = None
        self.encoder_type = None
        self.encoder_ckpt = None
        self.enhancer = None
        self.enhancer_type = None
        self.enhancer_ckpt = None

    def setVCParams(self, params: VoiceChangerParams):
        """Store the voice-changer parameters; ``params.nsf_hifigan`` is used as the enhancer path."""
        self.params = params

    def update_model(self, model_path):
        """Load the DDSP model at ``model_path`` and (re)build its encoder and enhancer.

        Nothing is reloaded when the same path is already loaded. The units
        encoder and the enhancer are rebuilt only when their type or
        checkpoint differs from what is currently loaded.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # load ddsp model
        if self.model is None or self.model_path != model_path:
            self.model, self.args = load_model(model_path, device=self.device)
            self.model_path = model_path
            print("ARGS:", self.args)

            # load units encoder
            if (
                self.units_encoder is None
                or self.args.data.encoder != self.encoder_type
                or self.args.data.encoder_ckpt != self.encoder_ckpt
            ):
                if self.args.data.encoder == "cnhubertsoftfish":
                    cnhubertsoft_gate = self.args.data.cnhubertsoft_gate
                else:
                    cnhubertsoft_gate = 10

                # The encoder checkpoint path is taken from the model config
                # (args.data.encoder_ckpt). Resolving it per encoder type from
                # VoiceChangerParams (hubert_soft / hubert_base) is not wired
                # up yet.
                self.units_encoder = Units_Encoder(
                    self.args.data.encoder,
                    self.args.data.encoder_ckpt,
                    self.args.data.encoder_sample_rate,
                    self.args.data.encoder_hop_size,
                    cnhubertsoft_gate=cnhubertsoft_gate,
                    device=self.device,
                )
                self.encoder_type = self.args.data.encoder
                self.encoder_ckpt = self.args.data.encoder_ckpt

            # load enhancer
            # The enhancer is built from params.nsf_hifigan, so that path (not
            # args.enhancer.ckpt) is the value to compare with the cached one;
            # otherwise the enhancer would be rebuilt on every check.
            enhancerPath = self.params.nsf_hifigan
            if (
                self.enhancer is None
                or self.args.enhancer.type != self.enhancer_type
                or enhancerPath != self.enhancer_ckpt
            ):
                self.enhancer = Enhancer(
                    self.args.enhancer.type, enhancerPath, device=self.device
                )
                self.enhancer_type = self.args.enhancer.type
                self.enhancer_ckpt = enhancerPath

    def infer(
        self,
        audio,
        sample_rate,
        spk_id=1,
        threhold=-45,
        pitch_adjust=0,
        use_spk_mix=False,
        spk_mix_dict=None,
        use_enhancer=True,
        enhancer_adaptive_key="auto",
        pitch_extractor_type="crepe",
        f0_min=50,
        f0_max=1100,
        safe_prefix_pad_length=0,
        diff_model=None,
        diff_acc=None,
        diff_spk_id=None,
        diff_use=False,
        diff_use_dpm=False,
        k_step=None,
        diff_silence=False,
        audio_alignment=False,
    ):
        """Convert ``audio`` with the loaded model.

        Args:
            audio: input samples as a numpy array (converted with
                ``torch.from_numpy``).
            sample_rate: sampling rate of ``audio``.
            spk_id: target speaker id passed to the DDSP model.
            threhold: volume gate; frames whose volume is not above
                ``10 ** (threhold / 20)`` are masked out.
            pitch_adjust: pitch shift in semitones (f0 is scaled by
                ``2 ** (pitch_adjust / 12)``).
            use_spk_mix: when true, ``spk_mix_dict`` is forwarded to the models.
            spk_mix_dict: speaker mix dictionary, used only with ``use_spk_mix``.
            use_enhancer: run the enhancer on the output (skipped when
                ``diff_use`` is true).
            enhancer_adaptive_key: forwarded to the enhancer as ``adaptive_key``.
            pitch_extractor_type: f0 extractor name passed to ``F0_Extractor``.
            f0_min, f0_max: f0 search range passed to ``F0_Extractor``.
            safe_prefix_pad_length: when above 0.03, ``value - 0.03`` is passed
                on as ``silence_front`` (presumably seconds -- confirm).
            diff_model: optional diffusion model; used when ``diff_use`` is true.
            diff_acc, diff_use_dpm, k_step, diff_silence: forwarded to
                ``diff_model.infer``.
            diff_spk_id: speaker id for the diffusion model; may be None.
            diff_use: enable the diffusion stage.
            audio_alignment: trim the output to the input length.

        Returns:
            ``(output, output_sample_rate)`` where ``output`` is the squeezed
            output tensor.
        """
        print("Infering...")
        print("audio", audio)
        # load input
        # audio, sample_rate = librosa.load(input_wav, sr=None, mono=True)
        hop_size = (
            self.args.data.block_size * sample_rate / self.args.data.sampling_rate
        )
        if audio_alignment:
            audio_length = len(audio)

        # safe front silence
        if safe_prefix_pad_length > 0.03:
            silence_front = safe_prefix_pad_length - 0.03
        else:
            silence_front = 0
        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)

        # extract f0
        pitch_extractor = F0_Extractor(
            pitch_extractor_type, sample_rate, hop_size, float(f0_min), float(f0_max)
        )
        f0 = pitch_extractor.extract(
            audio, uv_interp=True, device=self.device, silence_front=silence_front
        )
        f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        f0 = f0 * 2 ** (float(pitch_adjust) / 12)

        # extract volume
        volume_extractor = Volume_Extractor(hop_size)
        volume = volume_extractor.extract(audio)
        mask = (volume > 10 ** (float(threhold) / 20)).astype("float")
        # Widen the mask: each frame takes the maximum over a 9-frame window
        # centred on it (edges padded with the first/last value).
        mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
        mask = np.array([np.max(mask[n : n + 9]) for n in range(len(mask) - 8)])  # type: ignore
        mask = torch.from_numpy(mask).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        mask = upsample(mask, self.args.data.block_size).squeeze(-1)
        volume = (
            torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        )

        # extract units
        units = self.units_encoder.encode(audio_t, sample_rate, hop_size)

        # spk_id or spk_mix_dict
        spk_id = torch.LongTensor(np.array([[spk_id]])).to(self.device)
        # diff_spk_id defaults to None, and np.array([[None]]) cannot be turned
        # into a LongTensor, so convert only when a value was supplied.
        if diff_spk_id is not None:
            diff_spk_id = torch.LongTensor(np.array([[diff_spk_id]])).to(self.device)
        dictionary = None
        if use_spk_mix:
            dictionary = spk_mix_dict

        # forward and return the output
        with torch.no_grad():
            output, _, _ = self.model(
                units, f0, volume, spk_id=spk_id, spk_mix_dict=dictionary
            )

            if diff_use and diff_model is not None:
                output = diff_model.infer(
                    output,
                    f0,
                    units,
                    volume,
                    acc=diff_acc,
                    spk_id=diff_spk_id,
                    k_step=k_step,
                    use_dpm=diff_use_dpm,
                    silence_front=silence_front,
                    use_silence=diff_silence,
                    spk_mix_dict=dictionary,
                )
            output *= mask
            if use_enhancer and not diff_use:
                output, output_sample_rate = self.enhancer.enhance(
                    output,
                    self.args.data.sampling_rate,
                    f0,
                    self.args.data.block_size,
                    adaptive_key=enhancer_adaptive_key,
                    silence_front=silence_front,
                )
            else:
                output_sample_rate = self.args.data.sampling_rate

            output = output.squeeze()
            if audio_alignment:
                # The slice result was previously discarded, so the output was
                # never actually trimmed.
                output = output[:audio_length]
            return output, output_sample_rate

View File

@ -0,0 +1,56 @@
import torch
class DeviceManager(object):
    """Process-wide helper for choosing a torch device and querying GPU capabilities.

    Use ``DeviceManager.get_instance()`` to obtain the shared instance.
    """

    _instance = None

    @classmethod
    def get_instance(cls):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self.gpu_num = torch.cuda.device_count()
        # torch.backends.mps does not exist on older torch builds, hence the getattr guard.
        self.mps_enabled: bool = (
            getattr(torch.backends, "mps", None) is not None
            and torch.backends.mps.is_available()
        )

    def getDevice(self, id: int):
        """Return the torch device for GPU index ``id``.

        A negative ``id``, or the absence of both CUDA GPUs and MPS, selects
        the CPU. When MPS is available it takes precedence over CUDA.
        """
        if id < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
            dev = torch.device("cpu")
        elif self.mps_enabled:
            dev = torch.device("mps")
        else:
            dev = torch.device("cuda", index=id)
        return dev

    def halfPrecisionAvailable(self, id: int):
        """Return True when half precision may be used on CUDA GPU ``id``.

        Returns False when there is no CUDA GPU, when ``id`` is negative, when
        the device name cannot be read, or when the name matches one of the
        excluded patterns (contains "16" but not "V100", or contains "P40",
        "1070" or "1080").
        """
        if self.gpu_num == 0:
            return False
        if id < 0:
            return False

        try:
            gpuName = torch.cuda.get_device_name(id).upper()
            if (
                ("16" in gpuName and "V100" not in gpuName)
                or "P40" in gpuName
                or "1070" in gpuName
                or "1080" in gpuName
            ):
                return False
        except Exception as e:
            print(e)
            return False

        return True

    def getDeviceMemory(self, id: int):
        """Return the total memory of CUDA device ``id``, or 0 if it cannot be queried."""
        try:
            return torch.cuda.get_device_properties(id).total_memory
        except Exception:
            # Best effort: any query failure is reported as 0. A bare
            # ``except`` would also swallow KeyboardInterrupt/SystemExit.
            return 0

View File

@ -20,7 +20,6 @@ def list_audio_device():
audio = pyaudio.PyAudio()
audio_input_devices: list[ServerAudioDevice] = []
audio_output_devices: list[ServerAudioDevice] = []
# audio_devices = {}
host_apis = []
for api_index in range(audio.get_host_api_count()):

View File

@ -71,8 +71,7 @@ class RVC:
def loadModel(self, props: LoadModelParams):
target_slot_idx = props.slot
params_str = props.params
params = json.loads(params_str)
params = props.params
modelSlot = generateModelSlot(props.files, params)
self.settings.modelSlots[target_slot_idx] = modelSlot

View File

@ -1,4 +1,5 @@
from dataclasses import dataclass
from typing import Any
@dataclass
@ -16,4 +17,4 @@ class LoadModelParams:
slot: int
isHalf: bool
files: FilePaths
params: str
params: Any