Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-01-23 05:25:01 +03:00)

Commit 3e0772d955: WIP DDSPSVC
Parent commit: af4cf4857e
.gitignore (vendored, 2 lines changed)
@@ -40,7 +40,9 @@ client/lib/worklet/dist

docker/cudnn/

server/pretrain/
server/weights/
server/weights_/
server/weights__/

start_trainer.sh
client/demo/dist/index.html (vendored, 11 lines changed)
@@ -1 +1,10 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
    <head>
        <meta charset="utf-8" />
        <title>Voice Changer Client Demo</title>
        <script defer src="index.js"></script></head>
    <body style="width: 100%; height: 100%; margin: 0px">
        <div id="app" style="width: 100%; height: 100%"></div>
    </body>
</html>
client/demo/dist/index.js (vendored, 1761 lines changed)
File diff suppressed because one or more lines are too long
client/demo/dist/index.js.LICENSE.txt (vendored, 31 lines changed)
@@ -1,31 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */

/**
 * @license React
 * react-dom.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @license React
 * react.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @license React
 * scheduler.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */
@@ -49,6 +49,42 @@
                "showOnnxExportButton": false
            }
        },
        {
            "name": "commonFileSelect",
            "options": {
                "title": "Model(.pt,.pth)",
                "acceptExtentions": ["pt", "pth"],
                "fileKind": "ddspSvcModel"
            }
        },
        {
            "name": "commonFileSelect",
            "options": {
                "title": "Model config(.yaml)",
                "acceptExtentions": ["yaml"],
                "fileKind": "ddspSvcModelConfig"
            }
        },
        {
            "name": "commonFileSelect",
            "options": {
                "title": "Diff(.pt,.pth)",
                "acceptExtentions": ["pt", "pth"],
                "fileKind": "ddspSvcDiffusion"
            }
        },
        {
            "name": "commonFileSelect",
            "options": {
                "title": "Diff config(.yaml)",
                "acceptExtentions": ["yaml"],
                "fileKind": "ddspSvcDiffusionConfig"
            }
        },
        {
            "name": "modelUploadButtonRow2",
            "options": {}
        },
        {
            "name": "framework",
            "options": {

@@ -63,6 +99,10 @@
        "lab": [],

        "deviceSetting": [
            {
                "name": "audioDeviceMode",
                "options": {}
            },
            {
                "name": "audioInput",
                "options": {}

@@ -70,6 +110,10 @@
            {
                "name": "audioOutput",
                "options": {}
            },
            {
                "name": "ioBuffer",
                "options": {}
            }
        ],
        "qualityControl": [

@@ -54,6 +54,10 @@
        "lab": [],

        "deviceSetting": [
            {
                "name": "audioDeviceMode",
                "options": {}
            },
            {
                "name": "audioInput",
                "options": {}

@@ -61,6 +65,10 @@
            {
                "name": "audioOutput",
                "options": {}
            },
            {
                "name": "ioBuffer",
                "options": {}
            }
        ],
        "qualityControl": [

@@ -53,6 +53,10 @@
        "lab": [],

        "deviceSetting": [
            {
                "name": "audioDeviceMode",
                "options": {}
            },
            {
                "name": "audioInput",
                "options": {}

@@ -60,6 +64,10 @@
            {
                "name": "audioOutput",
                "options": {}
            },
            {
                "name": "ioBuffer",
                "options": {}
            }
        ],
        "qualityControl": [
@@ -48,6 +48,8 @@ import { ModelSwitchRow, ModelSwitchRowProps } from "./components/204v2_ModelSwi
import { EnableDirectMLRow, EnableDirectMLRowProps } from "./components/813_EnableDirectMLRow"
import { AudioDeviceModeRow, AudioDeviceModeRowProps } from "./components/410_AudioDeviceModeRow"
import { IOBufferRow, IOBufferRowProps } from "./components/411_IOBufferRow"
import { CommonFileSelectRow, CommonFileSelectRowProps } from "./components/301-e_CommonFileSelectRow"
import { ModelUploadButtonRow2, ModelUploadButtonRow2Props } from "./components/301-f_ModelUploadButtonRow"

export const catalog: { [key: string]: (props: any) => JSX.Element } = {}

@@ -81,7 +83,10 @@ const initialize = () => {
    addToCatalog("modelUploader", (props: ModelUploaderRowProps) => { return <ModelUploaderRow {...props} /> })
    addToCatalog("framework", (props: FrameworkRowProps) => { return <FrameworkRow {...props} /> })
    addToCatalog("modelSamplingRate", (props: ModelSamplingRateRowProps) => { return <ModelSamplingRateRow {...props} /> })
    // addToCatalog("onnxExport", (props: OnnxExportRowProps) => { return <OnnxExportRow {...props} /> })
    addToCatalog("commonFileSelect", (props: CommonFileSelectRowProps) => { return <CommonFileSelectRow {...props} /> })
    addToCatalog("modelUploadButtonRow2", (props: ModelUploadButtonRow2Props) => { return <ModelUploadButtonRow2 {...props} /> })

    addToCatalog("audioInput", (props: AudioInputRowProps) => { return <AudioInputRow {...props} /> })
    addToCatalog("audioOutput", (props: AudioOutputRowProps) => { return <AudioOutputRow {...props} /> })
@@ -0,0 +1,81 @@
import React, { useMemo } from "react"
import { fileSelector } from "@dannadori/voice-changer-client-js"
import { useAppState } from "../../../001_provider/001_AppStateProvider"
import { useGuiState } from "../001_GuiStateProvider"

export type CommonFileSelectRowProps = {
    title: string
    acceptExtentions: string[]
    fileKind: Filekinds
}

export const Filekinds = {
    "ddspSvcModel": "ddspSvcModel",
    "ddspSvcModelConfig": "ddspSvcModelConfig",
    "ddspSvcDiffusion": "ddspSvcDiffusion",
    "ddspSvcDiffusionConfig": "ddspSvcDiffusionConfig",
} as const
export type Filekinds = typeof Filekinds[keyof typeof Filekinds]


export const CommonFileSelectRow = (props: CommonFileSelectRowProps) => {
    const appState = useAppState()
    const guiState = useGuiState()

    const commonFileSelectRow = useMemo(() => {
        const slot = guiState.modelSlotNum

        const getTargetModelData = () => {
            const targetSlot = appState.serverSetting.fileUploadSettings[slot]
            if (!targetSlot) {
                return null
            }
            return targetSlot[props.fileKind]
        }

        const targetModel = getTargetModelData()
        const filenameText = targetModel?.filename || targetModel?.file?.name || ""

        const checkExtention = (filename: string) => {
            const ext = filename.split('.').pop();
            if (!ext) {
                return false
            }
            return props.acceptExtentions.includes(ext)
        }
        const onFileLoadClicked = async () => {
            const file = await fileSelector("")
            if (checkExtention(file.name) == false) {
                alert(`The model file extension must be one of ${props.acceptExtentions}.`)
                return
            }
            appState.serverSetting.fileUploadSettings[slot][props.fileKind]! = { file: file }
            appState.serverSetting.setFileUploadSetting(slot, {
                ...appState.serverSetting.fileUploadSettings[slot]
            })
        }
        const onFileClearClicked = () => {
            appState.serverSetting.fileUploadSettings[slot][props.fileKind] = null
            appState.serverSetting.setFileUploadSetting(slot, {
                ...appState.serverSetting.fileUploadSettings[slot],
            })
        }

        return (
            <div className="body-row split-3-3-4 left-padding-1 guided">
                <div className="body-item-title left-padding-2">{props.title}</div>
                <div className="body-item-text">
                    <div>{filenameText}</div>
                </div>
                <div className="body-button-container">
                    <div className="body-button" onClick={onFileLoadClicked}>select</div>
                    <div className="body-button left-margin-1" onClick={onFileClearClicked}>clear</div>
                </div>
            </div>
        )
    }, [appState.serverSetting.fileUploadSettings, appState.serverSetting.setFileUploadSetting, appState.serverSetting.serverSetting, appState.serverSetting.updateServerSettings, guiState.modelSlotNum])

    return commonFileSelectRow
}
@@ -0,0 +1,42 @@
import React, { useMemo } from "react"
import { useAppState } from "../../../001_provider/001_AppStateProvider"
import { useGuiState } from "../001_GuiStateProvider"


export type ModelUploadButtonRow2Props = {
}

export const ModelUploadButtonRow2 = (_props: ModelUploadButtonRow2Props) => {
    const appState = useAppState()
    const guiState = useGuiState()
    const modelUploadButtonRow = useMemo(() => {
        const slot = guiState.modelSlotNum
        const onModelUploadClicked = async () => {
            appState.serverSetting.loadModel(slot)
        }

        const uploadButtonClassName = appState.serverSetting.isUploading ? "body-button-disabled" : "body-button"
        const uploadButtonAction = appState.serverSetting.isUploading ? () => { } : onModelUploadClicked
        const uploadButtonLabel = appState.serverSetting.isUploading ? "wait..." : "upload"
        const uploadingStatus = appState.serverSetting.isUploading ?
            appState.serverSetting.uploadProgress == 0 ? `loading model...(wait about 20sec)` : `uploading.... ${appState.serverSetting.uploadProgress.toFixed(1)}%` : ""

        const uploadedText = appState.serverSetting.fileUploadSettings[slot] == undefined ? "" : appState.serverSetting.fileUploadSettings[slot].uploaded ? "" : "not uploaded"
        return (
            <div className="body-row split-3-3-4 left-padding-1 guided">
                <div className="body-item-title left-padding-2"></div>
                <div className="body-item-text">
                    {uploadingStatus}
                </div>
                <div className="body-button-container">
                    <div className={uploadButtonClassName} onClick={uploadButtonAction}>{uploadButtonLabel}</div>
                    <div className="body-item-text-em" >{uploadedText}</div>
                </div>
            </div>
        )
    }, [appState.serverSetting.isUploading, appState.serverSetting.uploadProgress, appState.serverSetting.loadModel, guiState.modelSlotNum, appState.serverSetting.fileUploadSettings])

    return modelUploadButtonRow
}
@@ -25,6 +25,11 @@ export type FileUploadSetting = {
    framework: Framework
    params: string

    ddspSvcModel: ModelData | null
    ddspSvcModelConfig: ModelData | null
    ddspSvcDiffusion: ModelData | null
    ddspSvcDiffusionConfig: ModelData | null

}

const InitialFileUploadSetting: FileUploadSetting = {

@@ -36,11 +41,17 @@ const InitialFileUploadSetting: FileUploadSetting = {
    feature: null,
    index: null,

    ddspSvcModel: null,
    ddspSvcModelConfig: null,
    ddspSvcDiffusion: null,
    ddspSvcDiffusionConfig: null,

    isHalf: true,
    uploaded: false,
    defaultTune: 0,
    framework: Framework.PyTorch,
    params: "{}"
    params: "{}",
}

export type UseServerSettingProps = {

@@ -191,22 +202,43 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta

    // (e) Model upload
    const _uploadFile = useMemo(() => {
        return async (modelData: ModelData, onprogress: (progress: number, end: boolean) => void) => {
        return async (modelData: ModelData, onprogress: (progress: number, end: boolean) => void, dir: string = "") => {
            if (!props.voiceChangerClient) return
            const num = await props.voiceChangerClient.uploadFile(modelData.data!, modelData.filename!, onprogress)
            const res = await props.voiceChangerClient.concatUploadedFile(modelData.filename!, num)
            const num = await props.voiceChangerClient.uploadFile(modelData.data!, dir + modelData.filename!, onprogress)
            const res = await props.voiceChangerClient.concatUploadedFile(dir + modelData.filename!, num)
            console.log("uploaded", num, res)
        }
    }, [props.voiceChangerClient])


    const loadModel = useMemo(() => {
        return async (slot: number) => {
            if (!fileUploadSettings[slot].pyTorchModel && !fileUploadSettings[slot].onnxModel) {
                alert("You must specify at least one of a PyTorch model or an ONNX model.")
                return
            }
            if (!fileUploadSettings[slot].configFile && props.clientType != "RVC") {
                alert("You must specify a config file.")
                return
            if (props.clientType == "DDSP-SVC") {
                if (!fileUploadSettings[slot].ddspSvcModel) {
                    alert("You must specify a DDSP model.")
                    return
                }
                if (!fileUploadSettings[slot].ddspSvcModelConfig) {
                    alert("You must specify a DDSP config file.")
                    return
                }
                if (!fileUploadSettings[slot].ddspSvcDiffusion) {
                    alert("You must specify a diffusion model.")
                    return
                }
                if (!fileUploadSettings[slot].ddspSvcDiffusionConfig) {
                    alert("You must specify a diffusion config file.")
                    return
                }
            } else {
                if (!fileUploadSettings[slot].pyTorchModel && !fileUploadSettings[slot].onnxModel) {
                    alert("You must specify at least one of a PyTorch model or an ONNX model.")
                    return
                }
                if (!fileUploadSettings[slot].configFile && props.clientType != "RVC") {
                    alert("You must specify a config file.")
                    return
                }
            }

            if (!props.voiceChangerClient) return

@@ -272,21 +304,52 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
                })
            }

            const configFileName = fileUploadSetting.configFile ? fileUploadSetting.configFile.filename || "-" : "-"
            // DDSP-SVC
            const ddspSvcModels = [fileUploadSetting.ddspSvcModel, fileUploadSetting.ddspSvcModelConfig, fileUploadSetting.ddspSvcDiffusion, fileUploadSetting.ddspSvcDiffusionConfig].filter(x => { return x != null }) as ModelData[]
            for (let i = 0; i < ddspSvcModels.length; i++) {
                if (!ddspSvcModels[i].data) {
                    ddspSvcModels[i].data = await ddspSvcModels[i].file!.arrayBuffer()
                    ddspSvcModels[i].filename = await ddspSvcModels[i].file!.name
                }
            }
            for (let i = 0; i < ddspSvcModels.length; i++) {
                const progRate = 1 / ddspSvcModels.length
                const progOffset = 100 * i * progRate
                const dir = i == 0 || i == 1 ? "ddsp_mod/" : "ddsp_diff/"
                await _uploadFile(ddspSvcModels[i], (progress: number, _end: boolean) => {
                    setUploadProgress(progress * progRate + progOffset)
                }, dir)
            }

            const configFileName = fileUploadSetting.configFile?.filename || "-"
            const params = JSON.stringify({
                trans: fileUploadSetting.defaultTune || 0
                trans: fileUploadSetting.defaultTune || 0,
                files: {
                    ddspSvcModel: fileUploadSetting.ddspSvcModel?.filename ? "ddsp_mod/" + fileUploadSetting.ddspSvcModel?.filename : "",
                    ddspSvcModelConfig: fileUploadSetting.ddspSvcModelConfig?.filename ? "ddsp_mod/" + fileUploadSetting.ddspSvcModelConfig?.filename : "",
                    ddspSvcDiffusion: fileUploadSetting.ddspSvcDiffusion?.filename ? "ddsp_diff/" + fileUploadSetting.ddspSvcDiffusion?.filename : "",
                    ddspSvcDiffusionConfig: fileUploadSetting.ddspSvcDiffusionConfig?.filename ? "ddsp_diff/" + fileUploadSetting.ddspSvcDiffusionConfig.filename : "",
                }
            })
            if (fileUploadSetting.isHalf == undefined) {
                fileUploadSetting.isHalf = false
            }

            const pyTorchModel = fileUploadSetting.pyTorchModel?.filename || null
            const onnxModel = fileUploadSetting.onnxModel?.filename || null
            const clusterTorchModel = fileUploadSetting.clusterTorchModel?.filename || null
            const feature = fileUploadSetting.feature?.filename || null
            const index = fileUploadSetting.index?.filename || null


            const loadPromise = props.voiceChangerClient.loadModel(
                slot,
                configFileName,
                fileUploadSetting.pyTorchModel?.filename || null,
                fileUploadSetting.onnxModel?.filename || null,
                fileUploadSetting.clusterTorchModel?.filename || null,
                fileUploadSetting.feature?.filename || null,
                fileUploadSetting.index?.filename || null,
                pyTorchModel,
                onnxModel,
                clusterTorchModel,
                feature,
                index,
                fileUploadSetting.isHalf,
                params,
            )

@@ -332,7 +395,11 @@
                    uploaded: false, // when loaded from cache, the files have not been uploaded yet
                    defaultTune: fileUploadSetting.defaultTune,
                    framework: fileUploadSetting.framework,
                    params: fileUploadSetting.params
                    params: fileUploadSetting.params,
                    ddspSvcModel: fileUploadSetting.ddspSvcModel ? { data: fileUploadSetting.ddspSvcModel.data, filename: fileUploadSetting.ddspSvcModel.filename } : null,
                    ddspSvcModelConfig: fileUploadSetting.ddspSvcModelConfig ? { data: fileUploadSetting.ddspSvcModelConfig.data, filename: fileUploadSetting.ddspSvcModelConfig.filename } : null,
                    ddspSvcDiffusion: fileUploadSetting.ddspSvcDiffusion ? { data: fileUploadSetting.ddspSvcDiffusion.data, filename: fileUploadSetting.ddspSvcDiffusion.filename } : null,
                    ddspSvcDiffusionConfig: fileUploadSetting.ddspSvcDiffusionConfig ? { data: fileUploadSetting.ddspSvcDiffusionConfig.data, filename: fileUploadSetting.ddspSvcDiffusionConfig.filename } : null,
                }
                setItem(`${INDEXEDDB_KEY_MODEL_DATA}_${slot}`, saveData)
            } catch (e) {
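
[Note] The hook above uploads the DDSP model and its config under ddsp_mod/, the diffusion pair under ddsp_diff/, and then echoes the same prefixed paths in the params JSON so the server can relocate the files. A minimal sketch of that payload (illustrative file names, not from the commit):

    import json

    # Hypothetical example of the "params" string built by loadModel() above.
    params = json.dumps({
        "trans": 0,  # default pitch shift in semitones
        "files": {
            "ddspSvcModel": "ddsp_mod/model.pt",
            "ddspSvcModelConfig": "ddsp_mod/config.yaml",
            "ddspSvcDiffusion": "ddsp_diff/diff_model.pt",
            "ddspSvcDiffusionConfig": "ddsp_diff/diff_config.yaml",
        },
    })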
server/.vscode/settings.json (vendored, 2 lines changed)
@@ -9,7 +9,7 @@
        "editor.formatOnSave": true // auto-format on save
    },
    "flake8.args": [
        "--ignore=E501,E402,E722,E741,W503"
        "--ignore=E501,E402,E722,E741,E203,W503"
        // "--max-line-length=150",
        // "--max-complexity=20"
    ]
@@ -170,6 +170,34 @@ if __name__ == "MMVCServerSIO":
            "position": 1,
        }
    )
    if os.path.exists(voiceChangerParams.hubert_soft) is False:
        downloadParams.append(
            {
                "url": "https://huggingface.co/wok000/weights/resolve/main/ddsp-svc30/embedder/hubert-soft-0d54a1f4.pt",
                "saveTo": voiceChangerParams.hubert_soft,
                "position": 2,
            }
        )
    if os.path.exists(voiceChangerParams.nsf_hifigan) is False:
        downloadParams.append(
            {
                "url": "https://huggingface.co/wok000/weights/resolve/main/ddsp-svc30/nsf_hifigan_20221211/model.bin",
                "saveTo": voiceChangerParams.nsf_hifigan,
                "position": 3,
            }
        )
    nsf_hifigan_config = os.path.join(
        os.path.dirname(voiceChangerParams.nsf_hifigan), "config.json"
    )

    if os.path.exists(nsf_hifigan_config) is False:
        downloadParams.append(
            {
                "url": "https://huggingface.co/wok000/weights/raw/main/ddsp-svc30/nsf_hifigan_20221211/config.json",
                "saveTo": nsf_hifigan_config,
                "position": 4,
            }
        )

    with ThreadPoolExecutor() as pool:
        pool.map(download, downloadParams)
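
[Note] The download helper itself is not shown in this diff. A minimal sketch of a worker compatible with the downloadParams entries above (assumed implementation; the real helper lives elsewhere in the repo, and "position" is presumably a progress-bar slot, ignored here):

    import os
    import urllib.request

    def download(param: dict):
        # Each entry carries "url" and "saveTo"; make sure the target dir exists.
        save_to = param["saveTo"]
        os.makedirs(os.path.dirname(save_to) or ".", exist_ok=True)
        urllib.request.urlretrieve(param["url"], save_to)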
@@ -19,7 +19,7 @@ class ValidationErrorLoggingRoute(APIRoute):
        async def custom_route_handler(request: Request) -> Response:
            try:
                return await original_route_handler(request)
            except RequestValidationError as exc:
            except RequestValidationError as exc:  # type: ignore
                print("Exception", request.url, str(exc))
                body = await request.body()
                detail = {"errors": exc.errors(), "body": body.decode()}
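
[Note] A minimal sketch (assumed usage, not shown in this commit) of how a custom APIRoute subclass like ValidationErrorLoggingRoute is typically attached in FastAPI, so every endpoint on the router gets the logging handler:

    from fastapi import APIRouter

    # Assumes ValidationErrorLoggingRoute is importable from the module above.
    router = APIRouter(route_class=ValidationErrorLoggingRoute)

    @router.post("/upload_file")
    async def upload_file_endpoint():  # placeholder endpoint
        return {"status": "OK"}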
@@ -1,3 +1,4 @@
import json
import os
import shutil
from typing import Union

@@ -49,10 +50,7 @@ class MMVC_Rest_Fileuploader:
    def post_concat_uploaded_file(
        self, filename: str = Form(...), filenameChunkNum: int = Form(...)
    ):
        slot = 0
        res = concat_file_chunks(
            slot, UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR
        )
        res = concat_file_chunks(UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR)
        json_compatible_item_data = jsonable_encoder(res)
        return JSONResponse(content=json_compatible_item_data)

@@ -94,23 +92,38 @@ class MMVC_Rest_Fileuploader:
            featureFilename=featureFilename,
            indexFilename=indexFilename,
        )
        props: LoadModelParams = LoadModelParams(
            slot=slot, isHalf=isHalf, params=params, files=files
        )

        paramDict = json.loads(params)
        print("paramDict", paramDict)

        # Change Filepath
        for field in fields(props.files):
        for field in fields(files):
            key = field.name
            val = getattr(props.files, key)
            val = getattr(files, key)
            if val != "-":
                uploadPath = os.path.join(UPLOAD_DIR, val)
                storeDir = os.path.join(UPLOAD_DIR, f"{slot}")
                storePath = os.path.join(UPLOAD_DIR, f"{slot}", val)
                storeDir = os.path.dirname(storePath)
                os.makedirs(storeDir, exist_ok=True)
                storePath = os.path.join(storeDir, val)
                shutil.move(uploadPath, storePath)
                setattr(props.files, key, storePath)
                setattr(files, key, storePath)
            else:
                setattr(props.files, key, None)
                setattr(files, key, None)

        newFilesDict = {}
        for key, val in paramDict["files"].items():
            if val != "-" and val != "":
                uploadPath = os.path.join(UPLOAD_DIR, val)
                storePath = os.path.join(UPLOAD_DIR, f"{slot}", val)
                storeDir = os.path.dirname(storePath)
                os.makedirs(storeDir, exist_ok=True)
                shutil.move(uploadPath, storePath)
                newFilesDict[key] = storePath
        paramDict["files"] = newFilesDict

        props: LoadModelParams = LoadModelParams(
            slot=slot, isHalf=isHalf, params=paramDict, files=files
        )

        info = self.voiceChangerManager.loadModel(props)
        json_compatible_item_data = jsonable_encoder(info)
@@ -8,7 +8,10 @@ from fastapi import UploadFile
def upload_file(upload_dirname: str, file: UploadFile, filename: str):
    if file and filename:
        fileobj = file.file
        upload_dir = open(os.path.join(upload_dirname, filename), 'wb+')
        target_path = os.path.join(upload_dirname, filename)
        target_dir = os.path.dirname(target_path)
        os.makedirs(target_dir, exist_ok=True)
        upload_dir = open(target_path, "wb+")
        shutil.copyfileobj(fileobj, upload_dir)
        upload_dir.close()

@@ -16,20 +19,21 @@ def upload_file(upload_dirname: str, file: UploadFile, filename: str):
    return {"status": "ERROR", "msg": "uploaded file is not found."}


def concat_file_chunks(slot: int, upload_dirname: str, filename: str, chunkNum: int, dest_dirname: str):
    # target_dir = os.path.join(dest_dirname, f"{slot}")
    target_dir = os.path.join(dest_dirname)
def concat_file_chunks(
    upload_dirname: str, filename: str, chunkNum: int, dest_dirname: str
):
    target_path = os.path.join(upload_dirname, filename)
    target_dir = os.path.dirname(target_path)
    os.makedirs(target_dir, exist_ok=True)
    target_file_name = os.path.join(target_dir, filename)
    if os.path.exists(target_file_name):
        os.remove(target_file_name)
    with open(target_file_name, "ab") as target_file:
    if os.path.exists(target_path):
        os.remove(target_path)
    with open(target_path, "ab") as out:
        for i in range(chunkNum):
            chunkName = f"{filename}_{i}"
            chunk_file_path = os.path.join(upload_dirname, chunkName)
            stored_chunk_file = open(chunk_file_path, 'rb')
            target_file.write(stored_chunk_file.read())
            stored_chunk_file = open(chunk_file_path, "rb")
            out.write(stored_chunk_file.read())
            stored_chunk_file.close()
            os.remove(chunk_file_path)
        target_file.close()
        return {"status": "OK", "msg": f"concat files {target_file_name} "}
        out.close()
        return {"status": "OK", "msg": f"concat files {out} "}
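
[Note] A small sketch (assumed client behavior, not part of the commit) of the chunk naming convention that concat_file_chunks() above relies on: the client uploads a file in pieces named {filename}_{i}, and the server reassembles them in index order.

    def split_into_chunks(data: bytes, chunk_size: int = 1024 * 1024):
        """Yield (suffix, chunk_bytes) pairs for sequential upload."""
        for i in range(0, len(data), chunk_size):
            yield f"_{i // chunk_size}", data[i:i + chunk_size]

    # A file uploaded as "ddsp_mod/model.pt" thus arrives in the upload dir as
    # "ddsp_mod/model.pt_0", "ddsp_mod/model.pt_1", ... before concatenation.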
@@ -1,4 +1,5 @@
import os,glob
import os
import glob


# def get_file_list(top_dir):

@@ -20,5 +21,6 @@ def get_dir_list(top_dir):
        dirlist.append(filename)
    return dirlist


def get_file_list(top_dir):
    return glob.glob(top_dir)
    return glob.glob(top_dir)
@@ -1,9 +1,11 @@
import json
import sys
import os
from voice_changer.utils.LoadModelParams import LoadModelParams

from voice_changer.utils.VoiceChangerModel import AudioInOut
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from dataclasses import asdict
import numpy as np
import torch
from torchaudio.transforms import Resample
from torch.nn import functional as F

if sys.platform.startswith("darwin"):
    baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]

@@ -15,14 +17,19 @@ if sys.platform.startswith("darwin"):
else:
    sys.path.append("DDSP-SVC")

from dataclasses import dataclass, asdict, field
import numpy as np
import torch
import ddsp.vocoder as vo  # type:ignore
from ddsp.core import upsample  # type:ignore
from enhancer import Enhancer  # type:ignore
from diffusion.infer_gt_mel import DiffGtMel  # type: ignore

from voice_changer.utils.VoiceChangerModel import AudioInOut
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from voice_changer.utils.LoadModelParams import LoadModelParams
from voice_changer.DDSP_SVC.DDSP_SVCSetting import DDSP_SVCSettings
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
from Exceptions import NoModeLoadedException
from voice_changer.DDSP_SVC.SvcDDSP import SvcDDSP


providers = [
    "OpenVINOExecutionProvider",

@@ -32,197 +39,192 @@ providers = [
]


@dataclass
class DDSP_SVCSettings:
    gpu: int = 0
    dstId: int = 0

    f0Detector: str = "dio"  # dio or harvest # parselmouth
    tran: int = 20
    predictF0: int = 0  # 0:False, 1:True
    silentThreshold: float = 0.00001
    extraConvertSize: int = 1024 * 32

    enableEnhancer: int = 0
    enhancerTune: int = 0

    framework: str = "PyTorch"  # PyTorch or ONNX
    pyTorchModelFile: str = ""
    onnxModelFile: str = ""
    configFile: str = ""

    speakers: dict[str, int] = field(default_factory=lambda: {})

    # list only the mutable fields below
    intData = [
        "gpu",
        "dstId",
        "tran",
        "predictF0",
        "extraConvertSize",
        "enableEnhancer",
        "enhancerTune",
    ]
    floatData = ["silentThreshold", "clusterInferRatio"]
    strData = ["framework", "f0Detector"]
def phase_vocoder(a, b, fade_out, fade_in):
    fa = torch.fft.rfft(a)
    fb = torch.fft.rfft(b)
    absab = torch.abs(fa) + torch.abs(fb)
    n = a.shape[0]
    if n % 2 == 0:
        absab[1:-1] *= 2
    else:
        absab[1:] *= 2
    phia = torch.angle(fa)
    phib = torch.angle(fb)
    deltaphase = phib - phia
    deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5)
    w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase
    t = torch.arange(n).unsqueeze(-1).to(a) / n
    result = (
        a * (fade_out**2)
        + b * (fade_in**2)
        + torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n
    )
    return result
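
[Note] A brief illustration (not part of the commit) of how phase_vocoder() above would be called to crossfade two overlapping audio blocks without phase discontinuities; the fade windows mirror the sin²-shaped windows used in the commented-out SOLA code later in this file, and all names are hypothetical:

    import math
    import torch

    n = 1024                                     # crossfade length in samples
    t = torch.arange(n) / n
    fade_in = torch.sin(math.pi * t / 2) ** 2    # equal-power fade-in window
    fade_out = 1 - fade_in                       # complementary fade-out window
    prev_tail = torch.randn(n)                   # placeholder: end of previous block
    next_head = torch.randn(n)                   # placeholder: start of next block
    blended = phase_vocoder(prev_tail, next_head, fade_out, fade_in)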

class DDSP_SVC:
    initialLoad: bool = True
    settings: DDSP_SVCSettings = DDSP_SVCSettings()
    diff_model: DiffGtMel = DiffGtMel()
    svc_model: SvcDDSP = SvcDDSP()
    # diff_model: DiffGtMel = DiffGtMel()

    audio_buffer: AudioInOut | None = None
    prevVol: float = 0
    # resample_kernel = {}

    def __init__(self, params: VoiceChangerParams):
        self.settings = DDSP_SVCSettings()
        self.net_g = None
        self.onnx_session = None

        self.gpu_num = torch.cuda.device_count()
        self.prevVol = 0
        self.params = params
        self.svc_model.setVCParams(params)
        EmbedderManager.initialize(params)
        print("DDSP-SVC initialization:", params)

    def useDevice(self):
        if self.settings.gpu >= 0 and torch.cuda.is_available():
            return torch.device("cuda", index=self.settings.gpu)
        else:
            return torch.device("cpu")
    # def useDevice(self):
    #     if self.settings.gpu >= 0 and torch.cuda.is_available():
    #         return torch.device("cuda", index=self.settings.gpu)
    #     else:
    #         return torch.device("cpu")

    def loadModel(self, props: LoadModelParams):
        self.settings.pyTorchModelFile = props.files.pyTorchModelFilename
        # model
        model, args = vo.load_model(
            self.settings.pyTorchModelFile, device=self.useDevice()
        )
        self.model = model
        self.args = args
        self.sampling_rate = args.data.sampling_rate
        self.hop_size = int(
            self.args.data.block_size
            * self.sampling_rate
            / self.args.data.sampling_rate
        )
        # target_slot_idx = props.slot
        self.device = torch.device("cuda", index=0)
        params = props.params

        # hubert
        self.vec_path = self.params.hubert_soft
        self.encoder = vo.Units_Encoder(
            self.args.data.encoder,
            self.vec_path,
            self.args.data.encoder_sample_rate,
            self.args.data.encoder_hop_size,
            device=self.useDevice(),
        )
        modelFile = params["files"]["ddspSvcModel"]
        diffusionFile = params["files"]["ddspSvcDiffusion"]
        self.svc_model.update_model(modelFile)

        # ort_options = onnxruntime.SessionOptions()
        # ort_options.intra_op_num_threads = 8
        # self.onnx_session = onnxruntime.InferenceSession(
        #     "model_DDSP-SVC/hubert4.0.onnx",
        #     providers=providers
        print("diffusion file", diffusionFile)
        self.diff_model.flush_model(diffusionFile, ddsp_config=self.svc_model.args)

        print("params:", params)
        # print("params_arg:", self.args)

        # self.settings.pyTorchModelFile = props.files.pyTorchModelFilename
        # # model
        # model, args = vo.load_model(
        #     self.settings.pyTorchModelFile, device=self.useDevice()
        # )
        # self.model = model
        # self.args = args
        # self.sampling_rate = args.data.sampling_rate
        # self.hop_size = int(
        #     self.args.data.block_size
        #     * self.sampling_rate
        #     / self.args.data.sampling_rate
        # )
        # inputs = self.onnx_session.get_inputs()
        # outputs = self.onnx_session.get_outputs()
        # for input in inputs:
        #     print("input::::", input)
        # for output in outputs:
        #     print("output::::", output)

        # f0dec
        self.f0_detector = vo.F0_Extractor(
            # "crepe",
            self.settings.f0Detector,
            self.sampling_rate,
            self.hop_size,
            float(50),
            float(1100),
        )
        # # hubert
        # self.vec_path = self.params.hubert_soft
        # self.encoder = vo.Units_Encoder(
        #     self.args.data.encoder,
        #     self.vec_path,
        #     self.args.data.encoder_sample_rate,
        #     self.args.data.encoder_hop_size,
        #     device=self.useDevice(),
        # )

        self.volume_extractor = vo.Volume_Extractor(self.hop_size)
        self.enhancer_path = self.params.nsf_hifigan
        self.enhancer = Enhancer(
            self.args.enhancer.type, self.enhancer_path, device=self.useDevice()
        )
        # # f0dec
        # self.f0_detector = vo.F0_Extractor(
        #     # "crepe",
        #     self.settings.f0Detector,
        #     self.sampling_rate,
        #     self.hop_size,
        #     float(50),
        #     float(1100),
        # )

        # self.volume_extractor = vo.Volume_Extractor(self.hop_size)
        # self.enhancer_path = self.params.nsf_hifigan
        # self.enhancer = Enhancer(
        #     self.args.enhancer.type, self.enhancer_path, device=self.useDevice()
        # )
        return self.get_info()

    def update_settings(self, key: str, val: int | float | str):
        if key == "onnxExecutionProvider" and self.onnx_session is not None:
            if val == "CUDAExecutionProvider":
                if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
                    self.settings.gpu = 0
                provider_options = [{"device_id": self.settings.gpu}]
                self.onnx_session.set_providers(
                    providers=[val], provider_options=provider_options
                )
            else:
                self.onnx_session.set_providers(providers=[val])
        elif key in self.settings.intData:
            val = int(val)
            setattr(self.settings, key, val)
            if (
                key == "gpu"
                and val >= 0
                and val < self.gpu_num
                and self.onnx_session is not None
            ):
                providers = self.onnx_session.get_providers()
                print("Providers:", providers)
                if "CUDAExecutionProvider" in providers:
                    provider_options = [{"device_id": self.settings.gpu}]
                    self.onnx_session.set_providers(
                        providers=["CUDAExecutionProvider"],
                        provider_options=provider_options,
                    )
            if key == "gpu" and len(self.settings.pyTorchModelFile) > 0:
                model, _args = vo.load_model(
                    self.settings.pyTorchModelFile, device=self.useDevice()
                )
                self.model = model
                self.enhancer = Enhancer(
                    self.args.enhancer.type, self.enhancer_path, device=self.useDevice()
                )
                self.encoder = vo.Units_Encoder(
                    self.args.data.encoder,
                    self.vec_path,
                    self.args.data.encoder_sample_rate,
                    self.args.data.encoder_hop_size,
                    device=self.useDevice(),
                )
        # if key == "onnxExecutionProvider" and self.onnx_session is not None:
        #     if val == "CUDAExecutionProvider":
        #         if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
        #             self.settings.gpu = 0
        #         provider_options = [{"device_id": self.settings.gpu}]
        #         self.onnx_session.set_providers(
        #             providers=[val], provider_options=provider_options
        #         )
        #     else:
        #         self.onnx_session.set_providers(providers=[val])
        # elif key in self.settings.intData:
        #     val = int(val)
        #     setattr(self.settings, key, val)
        #     if (
        #         key == "gpu"
        #         and val >= 0
        #         and val < self.gpu_num
        #         and self.onnx_session is not None
        #     ):
        #         providers = self.onnx_session.get_providers()
        #         print("Providers:", providers)
        #         if "CUDAExecutionProvider" in providers:
        #             provider_options = [{"device_id": self.settings.gpu}]
        #             self.onnx_session.set_providers(
        #                 providers=["CUDAExecutionProvider"],
        #                 provider_options=provider_options,
        #             )
        # if key == "gpu" and len(self.settings.pyTorchModelFile) > 0:
        #     model, _args = vo.load_model(
        #         self.settings.pyTorchModelFile, device=self.useDevice()
        #     )
        #     self.model = model
        #     self.enhancer = Enhancer(
        #         self.args.enhancer.type, self.enhancer_path, device=self.useDevice()
        #     )
        #     self.encoder = vo.Units_Encoder(
        #         self.args.data.encoder,
        #         self.vec_path,
        #         self.args.data.encoder_sample_rate,
        #         self.args.data.encoder_hop_size,
        #         device=self.useDevice(),
        #     )

        elif key in self.settings.floatData:
            setattr(self.settings, key, float(val))
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
            if key == "f0Detector":
                print("f0Detector update", val)
                # if val == "dio":
                #     val = "parselmouth"
        # elif key in self.settings.floatData:
        #     setattr(self.settings, key, float(val))
        # elif key in self.settings.strData:
        #     setattr(self.settings, key, str(val))
        #     if key == "f0Detector":
        #         print("f0Detector update", val)
        #         # if val == "dio":
        #         #     val = "parselmouth"

                if hasattr(self, "sampling_rate") is False:
                    self.sampling_rate = 44100
                    self.hop_size = 512
                # if hasattr(self, "sampling_rate") is False:
                #     self.sampling_rate = 44100
                #     self.hop_size = 512

                self.f0_detector = vo.F0_Extractor(
                    val, self.sampling_rate, self.hop_size, float(50), float(1100)
                )
        else:
            return False
        # self.f0_detector = vo.F0_Extractor(
        #     val, self.sampling_rate, self.hop_size, float(50), float(1100)
        # )
        # else:
        #     return False

        return True

    def get_info(self):
        data = asdict(self.settings)
        # data = asdict(self.settings)

        data["onnxExecutionProviders"] = (
            self.onnx_session.get_providers() if self.onnx_session is not None else []
        )
        files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
        for f in files:
            if data[f] is not None and os.path.exists(data[f]):
                data[f] = os.path.basename(data[f])
            else:
                data[f] = ""
        # data["onnxExecutionProviders"] = (
        #     self.onnx_session.get_providers() if self.onnx_session is not None else []
        # )
        # files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
        # for f in files:
        #     if data[f] is not None and os.path.exists(data[f]):
        #         data[f] = os.path.basename(data[f])
        #     else:
        #         data[f] = ""

        data = {}
        return data

    def get_processing_sampling_rate(self):
        return self.sampling_rate
        return self.svc_model.args.data.sampling_rate

    def generate_input(
        self,

@@ -232,6 +234,7 @@ class DDSP_SVC:
        solaSearchFrame: int = 0,
    ):
        newData = newData.astype(np.float32) / 32768.0
        # newData = newData.astype(np.float32)

        if self.audio_buffer is not None:
            self.audio_buffer = np.concatenate(

@@ -244,96 +247,186 @@ class DDSP_SVC:
            inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
        )

        if convertSize % self.hop_size != 0:  # pad out, since truncation happens at the model's output hop size
            convertSize = convertSize + (self.hop_size - (convertSize % self.hop_size))
        # if convertSize % self.hop_size != 0:  # pad out, since truncation happens at the model's output hop size
        #     convertSize = convertSize + (self.hop_size - (convertSize % self.hop_size))

        convertOffset = -1 * convertSize
        self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to convert

        # f0
        f0 = self.f0_detector.extract(
            self.audio_buffer * 32768.0,
            uv_interp=True,
            silence_front=self.settings.extraConvertSize / self.sampling_rate,
        )
        f0 = torch.from_numpy(f0).float().unsqueeze(-1).unsqueeze(0)
        f0 = f0 * 2 ** (float(self.settings.tran) / 12)
        # # f0
        # f0 = self.f0_detector.extract(
        #     self.audio_buffer * 32768.0,
        #     uv_interp=True,
        #     silence_front=self.settings.extraConvertSize / self.sampling_rate,
        # )
        # f0 = torch.from_numpy(f0).float().unsqueeze(-1).unsqueeze(0)
        # f0 = f0 * 2 ** (float(self.settings.tran) / 12)

        # volume, mask
        volume = self.volume_extractor.extract(self.audio_buffer)
        mask = (volume > 10 ** (float(-60) / 20)).astype("float")
        mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
        mask = np.array(
            [np.max(mask[n : n + 9]) for n in range(len(mask) - 8)]  # noqa: E203
        )
        mask = torch.from_numpy(mask).float().unsqueeze(-1).unsqueeze(0)
        mask = upsample(mask, self.args.data.block_size).squeeze(-1)
        volume = torch.from_numpy(volume).float().unsqueeze(-1).unsqueeze(0)
        # # volume, mask
        # volume = self.volume_extractor.extract(self.audio_buffer)
        # mask = (volume > 10 ** (float(-60) / 20)).astype("float")
        # mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
        # mask = np.array(
        #     [np.max(mask[n : n + 9]) for n in range(len(mask) - 8)]  # noqa: E203
        # )
        # mask = torch.from_numpy(mask).float().unsqueeze(-1).unsqueeze(0)
        # mask = upsample(mask, self.args.data.block_size).squeeze(-1)
        # volume = torch.from_numpy(volume).float().unsqueeze(-1).unsqueeze(0)

        # embed
        audio = (
            torch.from_numpy(self.audio_buffer)
            .float()
            .to(self.useDevice())
            .unsqueeze(0)
        )
        seg_units = self.encoder.encode(audio, self.sampling_rate, self.hop_size)
        # # embed
        # audio = (
        #     torch.from_numpy(self.audio_buffer)
        #     .float()
        #     .to(self.useDevice())
        #     .unsqueeze(0)
        # )
        # seg_units = self.encoder.encode(audio, self.sampling_rate, self.hop_size)

        cropOffset = -1 * (inputSize + crossfadeSize)
        cropEnd = -1 * (crossfadeSize)
        crop = self.audio_buffer[cropOffset:cropEnd]
        # cropOffset = -1 * (inputSize + crossfadeSize)
        # cropEnd = -1 * (crossfadeSize)
        # crop = self.audio_buffer[cropOffset:cropEnd]

        rms = np.sqrt(np.square(crop).mean(axis=0))
        vol = max(rms, self.prevVol * 0.0)
        self.prevVol = vol
        # rms = np.sqrt(np.square(crop).mean(axis=0))
        # vol = max(rms, self.prevVol * 0.0)
        # self.prevVol = vol

        return (seg_units, f0, volume, mask, convertSize, vol)
        return (self.audio_buffer, inputSize, crossfadeSize, solaSearchFrame)

    def _onnx_inference(self, data):
        if hasattr(self, "onnx_session") is False or self.onnx_session is None:
            print("[Voice Changer] No onnx session.")
            raise NoModeLoadedException("ONNX")
    # def _onnx_inference(self, data):
    #     if hasattr(self, "onnx_session") is False or self.onnx_session is None:
    #         print("[Voice Changer] No onnx session.")
    #         raise NoModeLoadedException("ONNX")

        raise NoModeLoadedException("ONNX")
        # raise NoModeLoadedException("ONNX")

    def _pyTorch_inference(self, data):
        if hasattr(self, "model") is False or self.model is None:
            print("[Voice Changer] No pyTorch session.")
            raise NoModeLoadedException("pytorch")
        # if hasattr(self, "model") is False or self.model is None:
        #     print("[Voice Changer] No pyTorch session.")
        #     raise NoModeLoadedException("pytorch")

        c = data[0].to(self.useDevice())
        f0 = data[1].to(self.useDevice())
        volume = data[2].to(self.useDevice())
        mask = data[3].to(self.useDevice())
        input_wav = data[0]
        # inputSize = data[1]
        # crossfadeSize = data[2]
        # solaSearchFrame = data[3]
        # last_delay_frame = int(0.02 * self.svc_model.args.data.sampling_rate)

        # convertSize = data[4]
        # vol = data[5]
        # if vol < self.settings.silentThreshold:
        #     print("threshold")
        #     return np.zeros(convertSize).astype(np.int16)
        # fade_in_window = (
        #     torch.sin(
        #         np.pi * torch.arange(0, 1, 1 / crossfadeSize, device=self.device) / 2
        #     )
        #     ** 2
        # )
        # fade_out_window = 1 - fade_in_window

        with torch.no_grad():
            spk_id = torch.LongTensor(np.array([[self.settings.dstId]])).to(
                self.useDevice()
            )
            seg_output, _, (s_h, s_n) = self.model(
                c, f0, volume, spk_id=spk_id, spk_mix_dict=None
            )
            seg_output *= mask
        _audio, _model_sr = self.svc_model.infer(
            input_wav,
            44100,
            spk_id=1,
            threhold=-45,
            pitch_adjust=10,
            use_spk_mix=False,
            spk_mix_dict=None,
            use_enhancer=False,
            pitch_extractor_type="harvest",
            f0_min=50,
            f0_max=1100,
            safe_prefix_pad_length=0,  # TBD: what is this?
            diff_model=self.diff_model,
            diff_acc=20,  # TBD: what is this?
            diff_spk_id=1,
            diff_use=True,
            diff_use_dpm=False,  # TBD: what is this?
            k_step=120,  # TBD: what is this?
            diff_silence=False,  # TBD: what is this?
        )

            if self.settings.enableEnhancer:
                seg_output, output_sample_rate = self.enhancer.enhance(
                    seg_output,
                    self.args.data.sampling_rate,
                    f0,
                    self.args.data.block_size,
                    # adaptive_key=float(self.settings.enhancerTune),
                    adaptive_key="auto",
                    silence_front=self.settings.extraConvertSize / self.sampling_rate,
                )
        print(" _model_sr", _model_sr)
        print("_audio", _audio.shape)
        print("_audio", _audio)
        return _audio.cpu().numpy() * 32768.0

        result = seg_output.squeeze().cpu().numpy() * 32768.0
        return np.array(result).astype(np.int16)
        # if _model_sr != self.svc_model.args.data.sampling_rate:
        #     key_str = str(_model_sr) + "_" + str(self.svc_model.args.data.sampling_rate)
        #     if key_str not in self.resample_kernel:
        #         self.resample_kernel[key_str] = Resample(
        #             _model_sr,
        #             self.svc_model.args.data.sampling_rate,
        #             lowpass_filter_width=128,
        #         ).to(self.device)
        #     _audio = self.resample_kernel[key_str](_audio)
        # temp_wav = _audio[
        #     -inputSize
        #     - crossfadeSize
        #     - solaSearchFrame
        #     - last_delay_frame : -last_delay_frame
        # ]

        # # sola shift
        # conv_input = temp_wav[None, None, : crossfadeSize + solaSearchFrame]
        # cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
        # cor_den = torch.sqrt(
        #     F.conv1d(
        #         conv_input**2,
        #         torch.ones(1, 1, crossfadeSize, device=self.device),
        #     )
        #     + 1e-8
        # )
        # sola_shift = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
        # temp_wav = temp_wav[sola_shift : sola_shift + inputSize + crossfadeSize]
        # print("sola_shift: " + str(int(sola_shift)))

        # # phase vocoder
        # # if self.config.use_phase_vocoder:
        # if False:
        #     temp_wav[:crossfadeSize] = phase_vocoder(
        #         self.sola_buffer,
        #         temp_wav[:crossfadeSize],
        #         fade_out_window,
        #         fade_in_window,
        #     )
        # else:
        #     temp_wav[:crossfadeSize] *= fade_in_window
        #     temp_wav[:crossfadeSize] += self.sola_buffer * fade_out_window

        # self.sola_buffer = temp_wav[-crossfadeSize:]

        # result = temp_wav[:-crossfadeSize, None].repeat(1, 2).cpu().numpy()

        ###########################################
        # c = data[0].to(self.useDevice())
        # f0 = data[1].to(self.useDevice())
        # volume = data[2].to(self.useDevice())
        # mask = data[3].to(self.useDevice())

        # # convertSize = data[4]
        # # vol = data[5]
        # # if vol < self.settings.silentThreshold:
        # #     print("threshold")
        # #     return np.zeros(convertSize).astype(np.int16)

        # with torch.no_grad():
        #     spk_id = torch.LongTensor(np.array([[self.settings.dstId]])).to(
        #         self.useDevice()
        #     )
        #     seg_output, _, (s_h, s_n) = self.model(
        #         c, f0, volume, spk_id=spk_id, spk_mix_dict=None
        #     )
        #     seg_output *= mask

        #     if self.settings.enableEnhancer:
        #         seg_output, output_sample_rate = self.enhancer.enhance(
        #             seg_output,
        #             self.args.data.sampling_rate,
        #             f0,
        #             self.args.data.block_size,
        #             # adaptive_key=float(self.settings.enhancerTune),
        #             adaptive_key="auto",
        #             silence_front=self.settings.extraConvertSize / self.sampling_rate,
        #         )

        # result = seg_output.squeeze().cpu().numpy() * 32768.0

        # return np.array(result).astype(np.int16)

    def inference(self, data):
        if self.settings.framework == "ONNX":

@@ -342,9 +435,9 @@ class DDSP_SVC:
            audio = self._pyTorch_inference(data)
        return audio

    def destroy(self):
        del self.net_g
        del self.onnx_session
    # def destroy(self):
    #     del self.net_g
    #     del self.onnx_session

    def __del__(self):
        del self.net_g
server/voice_changer/DDSP_SVC/DDSP_SVCSetting.py (new file, 36 lines)
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
|
||||
class DDSP_SVCSettings:
|
||||
gpu: int = 0
|
||||
dstId: int = 0
|
||||
|
||||
f0Detector: str = "dio" # dio or harvest # parselmouth
|
||||
tran: int = 20
|
||||
predictF0: int = 0 # 0:False, 1:True
|
||||
silentThreshold: float = 0.00001
|
||||
extraConvertSize: int = 1024 * 32
|
||||
|
||||
enableEnhancer: int = 0
|
||||
enhancerTune: int = 0
|
||||
|
||||
framework: str = "PyTorch" # PyTorch or ONNX
|
||||
pyTorchModelFile: str = ""
|
||||
onnxModelFile: str = ""
|
||||
configFile: str = ""
|
||||
|
||||
speakers: dict[str, int] = field(default_factory=lambda: {})
|
||||
|
||||
# ↓mutableな物だけ列挙
|
||||
intData = [
|
||||
"gpu",
|
||||
"dstId",
|
||||
"tran",
|
||||
"predictF0",
|
||||
"extraConvertSize",
|
||||
"enableEnhancer",
|
||||
"enhancerTune",
|
||||
]
|
||||
floatData = ["silentThreshold", "clusterInferRatio"]
|
||||
strData = ["framework", "f0Detector"]
|
server/voice_changer/DDSP_SVC/ModelSlot.py (new file, 16 lines)
|
||||
from const import EnumInferenceTypes, EnumEmbedderTypes
|
||||
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelSlot:
|
||||
pyTorchModelFile: str = ""
|
||||
pyTorchDiffusionModelFile: str = ""
|
||||
defaultTrans: int = 0
|
||||
# modelType: EnumDDSPSVCInferenceTypes = EnumDDSPSVCInferenceTypes.pyTorchRVC
|
||||
# samplingRate: int = -1
|
||||
# f0: bool = True
|
||||
# embChannels: int = 256
|
||||
# deprecated: bool = False
|
||||
# embedder: EnumEmbedderTypes = EnumEmbedderTypes.hubert
|
server/voice_changer/DDSP_SVC/ModelSlotGenerator.py (new file, 107 lines)
from const import EnumEmbedderTypes, EnumInferenceTypes
from voice_changer.RVC.ModelSlot import ModelSlot

from voice_changer.utils.LoadModelParams import FilePaths
import torch
import onnxruntime
import json


def generateModelSlot(files: FilePaths, params):
    modelSlot = ModelSlot()
    modelSlot.pyTorchModelFile = files.pyTorchModelFilename
    modelSlot.onnxModelFile = files.onnxModelFilename
    modelSlot.featureFile = files.featureFilename
    modelSlot.indexFile = files.indexFilename
    modelSlot.defaultTrans = params["trans"] if "trans" in params else 0

    modelSlot.isONNX = modelSlot.onnxModelFile is not None

    if modelSlot.isONNX:
        _setInfoByONNX(modelSlot, modelSlot.onnxModelFile)
    else:
        _setInfoByPytorch(modelSlot, modelSlot.pyTorchModelFile)
    return modelSlot


def _setInfoByPytorch(slot: ModelSlot, file: str):
    cpt = torch.load(file, map_location="cpu")
    config_len = len(cpt["config"])
    if config_len == 18:
        # original RVC checkpoint format
        slot.f0 = cpt["f0"] == 1
        slot.modelType = (
            EnumInferenceTypes.pyTorchRVC
            if slot.f0
            else EnumInferenceTypes.pyTorchRVCNono
        )
        slot.embChannels = 256
        slot.embedder = EnumEmbedderTypes.hubert
    else:
        # RVC-WebUI checkpoint format
        slot.f0 = cpt["f0"] == 1
        slot.modelType = (
            EnumInferenceTypes.pyTorchWebUI
            if slot.f0
            else EnumInferenceTypes.pyTorchWebUINono
        )
        slot.embChannels = cpt["config"][17]
        slot.embedder = cpt["embedder_name"]
        if slot.embedder.endswith("768"):
            slot.embedder = slot.embedder[:-3]

        if slot.embedder == EnumEmbedderTypes.hubert.value:
            slot.embedder = EnumEmbedderTypes.hubert
        elif slot.embedder == EnumEmbedderTypes.contentvec.value:
            slot.embedder = EnumEmbedderTypes.contentvec
        elif slot.embedder == EnumEmbedderTypes.hubert_jp.value:
            slot.embedder = EnumEmbedderTypes.hubert_jp
        else:
            raise RuntimeError("[Voice Changer][setInfoByPytorch] unknown embedder")

    slot.samplingRate = cpt["config"][-1]

    del cpt


def _setInfoByONNX(slot: ModelSlot, file: str):
    tmp_onnx_session = onnxruntime.InferenceSession(
        file, providers=["CPUExecutionProvider"]
    )
    modelmeta = tmp_onnx_session.get_modelmeta()
    try:
        metadata = json.loads(modelmeta.custom_metadata_map["metadata"])

        # slot.modelType = metadata["modelType"]
        slot.embChannels = metadata["embChannels"]

        if "embedder" not in metadata:
            slot.embedder = EnumEmbedderTypes.hubert
        elif metadata["embedder"] == EnumEmbedderTypes.hubert.value:
            slot.embedder = EnumEmbedderTypes.hubert
        elif metadata["embedder"] == EnumEmbedderTypes.contentvec.value:
            slot.embedder = EnumEmbedderTypes.contentvec
        elif metadata["embedder"] == EnumEmbedderTypes.hubert_jp.value:
            slot.embedder = EnumEmbedderTypes.hubert_jp
        else:
            raise RuntimeError("[Voice Changer][setInfoByONNX] unknown embedder")

        slot.f0 = metadata["f0"]
        slot.modelType = (
            EnumInferenceTypes.onnxRVC if slot.f0 else EnumInferenceTypes.onnxRVCNono
        )
        slot.samplingRate = metadata["samplingRate"]
        slot.deprecated = False

    except Exception as e:
        # metadata is missing or unreadable: fall back to legacy defaults
        slot.modelType = EnumInferenceTypes.onnxRVC
        slot.embChannels = 256
        slot.embedder = EnumEmbedderTypes.hubert
        slot.f0 = True
        slot.samplingRate = 48000
        slot.deprecated = True

        print("[Voice Changer] setInfoByONNX", e)
        print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")
        print("[Voice Changer] This onnx file is deprecated. Please regenerate the onnx file.")
        print("[Voice Changer] ############## !!!! CAUTION !!!! ####################")

    del tmp_onnx_session
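A usage sketch for the loader above (hypothetical inputs; FilePaths is assumed to expose at least the four fields generateModelSlot reads, and may carry more):

from voice_changer.utils.LoadModelParams import FilePaths
from voice_changer.DDSP_SVC.ModelSlotGenerator import generateModelSlot  # assumed import path

# hypothetical file names, for illustration only
files = FilePaths(
    pyTorchModelFilename="weights/model.pth",
    onnxModelFilename=None,
    featureFilename=None,
    indexFilename=None,
)
slot = generateModelSlot(files, {"trans": 0})
# onnxModelFilename is None, so _setInfoByPytorch inspects the .pth checkpoint
print(slot.modelType, slot.embChannels, slot.samplingRate)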
197
server/voice_changer/DDSP_SVC/SvcDDSP.py
Normal file
@ -0,0 +1,197 @@
# original from: https://raw.githubusercontent.com/yxlllc/DDSP-SVC/master/gui_diff.py

import torch
from ddsp.vocoder import load_model, F0_Extractor, Volume_Extractor, Units_Encoder  # type: ignore
from ddsp.core import upsample  # type: ignore
from enhancer import Enhancer  # type: ignore
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
import numpy as np


class SvcDDSP:
    def __init__(self) -> None:
        self.model = None
        self.units_encoder = None
        self.encoder_type = None
        self.encoder_ckpt = None
        self.enhancer = None
        self.enhancer_type = None
        self.enhancer_ckpt = None

    def setVCParams(self, params: VoiceChangerParams):
        self.params = params

    def update_model(self, model_path):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # load ddsp model
        if self.model is None or self.model_path != model_path:
            self.model, self.args = load_model(model_path, device=self.device)
            self.model_path = model_path

        print("ARGS:", self.args)

        # load units encoder
        if (
            self.units_encoder is None
            or self.args.data.encoder != self.encoder_type
            or self.args.data.encoder_ckpt != self.encoder_ckpt
        ):
            if self.args.data.encoder == "cnhubertsoftfish":
                cnhubertsoft_gate = self.args.data.cnhubertsoft_gate
            else:
                cnhubertsoft_gate = 10

            # if self.args.data.encoder == "hubertsoft":
            #     encoderPath = self.params.hubert_soft
            # elif self.args.data.encoder == "hubertbase":
            #     encoderPath = self.params.hubert_base
            # elif self.args.data.encoder == "hubertbase768":
            #     encoderPath = self.params.hubert_base
            # elif self.args.data.encoder == "hubertbase768l12":
            #     encoderPath = self.params.hubert_base
            # elif self.args.data.encoder == "hubertlarge1024l24":
            #     encoderPath = self.params.hubert_base
            # elif self.args.data.encoder == "contentvec":
            #     encoderPath = self.params.hubert_base
            # elif self.args.data.encoder == "contentvec768":
            #     encoderPath = self.params.hubert_base
            # elif self.args.data.encoder == "contentvec768l12":
            #     encoderPath = self.params.hubert_base

            self.units_encoder = Units_Encoder(
                self.args.data.encoder,
                # encoderPath,
                self.args.data.encoder_ckpt,
                self.args.data.encoder_sample_rate,
                self.args.data.encoder_hop_size,
                cnhubertsoft_gate=cnhubertsoft_gate,
                device=self.device,
            )
            self.encoder_type = self.args.data.encoder
            # self.encoder_ckpt = encoderPath
            self.encoder_ckpt = self.args.data.encoder_ckpt

        # load enhancer
        if (
            self.enhancer is None
            or self.args.enhancer.type != self.enhancer_type
            or self.args.enhancer.ckpt != self.enhancer_ckpt
        ):
            enhancerPath = self.params.nsf_hifigan
            self.enhancer = Enhancer(
                self.args.enhancer.type, enhancerPath, device=self.device
            )
            self.enhancer_type = self.args.enhancer.type
            self.enhancer_ckpt = enhancerPath

    def infer(
        self,
        audio,
        sample_rate,
        spk_id=1,
        threhold=-45,  # spelling kept as in the upstream DDSP-SVC source
        pitch_adjust=0,
        use_spk_mix=False,
        spk_mix_dict=None,
        use_enhancer=True,
        enhancer_adaptive_key="auto",
        pitch_extractor_type="crepe",
        f0_min=50,
        f0_max=1100,
        safe_prefix_pad_length=0,
        diff_model=None,
        diff_acc=None,
        diff_spk_id=None,
        diff_use=False,
        diff_use_dpm=False,
        k_step=None,
        diff_silence=False,
        audio_alignment=False,
    ):
        print("Inferring...")
        print("audio", audio)
        # load input
        # audio, sample_rate = librosa.load(input_wav, sr=None, mono=True)
        hop_size = (
            self.args.data.block_size * sample_rate / self.args.data.sampling_rate
        )
        if audio_alignment:
            audio_length = len(audio)
        # safe front silence
        if safe_prefix_pad_length > 0.03:
            silence_front = safe_prefix_pad_length - 0.03
        else:
            silence_front = 0
        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)

        # extract f0
        pitch_extractor = F0_Extractor(
            pitch_extractor_type, sample_rate, hop_size, float(f0_min), float(f0_max)
        )
        f0 = pitch_extractor.extract(
            audio, uv_interp=True, device=self.device, silence_front=silence_front
        )
        f0 = torch.from_numpy(f0).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        f0 = f0 * 2 ** (float(pitch_adjust) / 12)

        # extract volume
        volume_extractor = Volume_Extractor(hop_size)
        volume = volume_extractor.extract(audio)
        mask = (volume > 10 ** (float(threhold) / 20)).astype("float")
        mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
        mask = np.array([np.max(mask[n : n + 9]) for n in range(len(mask) - 8)])  # type: ignore
        mask = torch.from_numpy(mask).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        mask = upsample(mask, self.args.data.block_size).squeeze(-1)
        volume = (
            torch.from_numpy(volume).float().to(self.device).unsqueeze(-1).unsqueeze(0)
        )

        # extract units
        units = self.units_encoder.encode(audio_t, sample_rate, hop_size)

        # spk_id or spk_mix_dict
        spk_id = torch.LongTensor(np.array([[spk_id]])).to(self.device)
        diff_spk_id = torch.LongTensor(np.array([[diff_spk_id]])).to(self.device)
        dictionary = None

        if use_spk_mix:
            dictionary = spk_mix_dict

        # forward and return the output
        with torch.no_grad():
            output, _, (s_h, s_n) = self.model(
                units, f0, volume, spk_id=spk_id, spk_mix_dict=dictionary
            )

            if diff_use and diff_model is not None:
                output = diff_model.infer(
                    output,
                    f0,
                    units,
                    volume,
                    acc=diff_acc,
                    spk_id=diff_spk_id,
                    k_step=k_step,
                    use_dpm=diff_use_dpm,
                    silence_front=silence_front,
                    use_silence=diff_silence,
                    spk_mix_dict=dictionary,
                )
            output *= mask
            if use_enhancer and not diff_use:
                output, output_sample_rate = self.enhancer.enhance(
                    output,
                    self.args.data.sampling_rate,
                    f0,
                    self.args.data.block_size,
                    adaptive_key=enhancer_adaptive_key,
                    silence_front=silence_front,
                )
            else:
                output_sample_rate = self.args.data.sampling_rate

            output = output.squeeze()
            if audio_alignment:
                output = output[:audio_length]  # was a bare no-op expression; assign the trimmed result
            return output, output_sample_rate
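A rough end-to-end sketch of driving SvcDDSP (checkpoint path and buffer are hypothetical; assumes the DDSP-SVC packages imported above are installed and a VoiceChangerParams with nsf_hifigan configured is at hand):

import numpy as np

svc = SvcDDSP()
svc.setVCParams(vc_params)  # vc_params: VoiceChangerParams with nsf_hifigan set
svc.update_model("weights/ddsp_model.pt")  # hypothetical checkpoint path

audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
out, out_sr = svc.infer(audio, 16000, spk_id=1, pitch_adjust=0, diff_spk_id=1)
print(out.shape, out_sr)

Note that diff_spk_id should be passed as an int even when diff_use=False, since infer tensorizes it unconditionally and the None default would fail there; infer also converts the numpy buffer itself, so callers pass raw float32 audio rather than tensors.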
56
server/voice_changer/DDSP_SVC/deviceManager/DeviceManager.py
Normal file
@ -0,0 +1,56 @@
import torch


class DeviceManager(object):
    _instance = None

    @classmethod
    def get_instance(cls):
        if cls._instance is None:
            cls._instance = cls()
        return cls._instance

    def __init__(self):
        self.gpu_num = torch.cuda.device_count()
        self.mps_enabled: bool = (
            getattr(torch.backends, "mps", None) is not None
            and torch.backends.mps.is_available()
        )

    def getDevice(self, id: int):
        if id < 0 or (self.gpu_num == 0 and self.mps_enabled is False):
            dev = torch.device("cpu")
        elif self.mps_enabled:
            dev = torch.device("mps")
        else:
            dev = torch.device("cuda", index=id)
        return dev

    def halfPrecisionAvailable(self, id: int):
        if self.gpu_num == 0:
            return False
        if id < 0:
            return False

        try:
            # gpuName is already uppercased here, so no further .upper() is needed
            gpuName = torch.cuda.get_device_name(id).upper()
            if (
                ("16" in gpuName and "V100" not in gpuName)
                or "P40" in gpuName
                or "1070" in gpuName
                or "1080" in gpuName
            ):
                return False
        except Exception as e:
            print(e)
            return False

        return True

    def getDeviceMemory(self, id: int):
        try:
            return torch.cuda.get_device_properties(id).total_memory
        except Exception:
            # CUDA unavailable or invalid device id
            return 0
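Intended usage is through the singleton accessor, for example:

dm = DeviceManager.get_instance()
dev = dm.getDevice(0)  # cuda:0, mps, or cpu depending on availability
print(dev, dm.halfPrecisionAvailable(0), dm.getDeviceMemory(0))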
@ -20,7 +20,6 @@ def list_audio_device():
    audio = pyaudio.PyAudio()
    audio_input_devices: list[ServerAudioDevice] = []
    audio_output_devices: list[ServerAudioDevice] = []
    # audio_devices = {}
    host_apis = []

    for api_index in range(audio.get_host_api_count()):
@ -71,9 +71,8 @@ class RVC:

    def loadModel(self, props: LoadModelParams):
        target_slot_idx = props.slot
        params_str = props.params
        params = json.loads(params_str)

        params = props.params

        modelSlot = generateModelSlot(props.files, params)
        self.settings.modelSlots[target_slot_idx] = modelSlot
        print(
@ -1,4 +1,5 @@
from dataclasses import dataclass
from typing import Any


@dataclass
@ -16,4 +17,4 @@ class LoadModelParams:
    slot: int
    isHalf: bool
    files: FilePaths
    params: str
    params: Any
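These last two hunks change the contract of LoadModelParams.params from a JSON string to an already-parsed value, so RVC.loadModel above no longer calls json.loads itself. A before/after sketch (hypothetical values; assumes, for brevity, only the fields shown in the hunk):

# before: params carried a JSON string that each loader parsed itself
props = LoadModelParams(slot=0, isHalf=True, files=files, params='{"trans": 12}')

# after this commit: params carries the parsed object directly
props = LoadModelParams(slot=0, isHalf=True, files=files, params={"trans": 12})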