Author: wok
Date:   2024-07-02 00:49:12 +09:00
Parent: 1952c76533
Commit: 9dbbdcf89b
12 changed files with 5192 additions and 543 deletions


@@ -1 +1,10 @@
-<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
+<!DOCTYPE html>
+<html style="width: 100%; height: 100%; overflow: hidden">
+    <head>
+        <meta charset="utf-8" />
+        <title>Voice Changer Client Demo</title>
+        <script defer src="index.js"></script></head>
+    <body style="width: 100%; height: 100%; margin: 0px">
+        <div id="app" style="width: 100%; height: 100%"></div>
+    </body>
+</html>

File diff suppressed because one or more lines are too long


@@ -1,35 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
/*!**********************!*\
!*** ./src/index.ts ***!
\**********************/
/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/


@@ -1,11 +1,5 @@
import { VoiceChangerWorkletProcessorRequest } from "../@types/voice-changer-worklet-processor";
-import {
-    DefaultClientSettng,
-    DownSamplingMode,
-    VOICE_CHANGER_CLIENT_EXCEPTION,
-    WorkletNodeSetting,
-    WorkletSetting,
-} from "../const";
+import { DefaultClientSettng, DownSamplingMode, VOICE_CHANGER_CLIENT_EXCEPTION, WorkletNodeSetting, WorkletSetting } from "../const";
import { io, Socket } from "socket.io-client";
import { DefaultEventsMap } from "@socket.io/component-emitter";
import { ServerRestClient } from "./ServerRestClient";

@@ -14,10 +8,7 @@ export type VoiceChangerWorkletListener = {
    notifyVolume: (vol: number) => void;
    notifySendBufferingTime: (time: number) => void;
    notifyResponseTime: (time: number, perf?: number[]) => void;
-    notifyException: (
-        code: VOICE_CHANGER_CLIENT_EXCEPTION,
-        message: string
-    ) => void;
+    notifyException: (code: VOICE_CHANGER_CLIENT_EXCEPTION, message: string) => void;
};

export type InternalCallback = {

@@ -38,12 +29,8 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
    private outputNode: VoiceChangerWorkletNode | null = null;

    // Promises
-    private startPromiseResolve:
-        | ((value: void | PromiseLike<void>) => void)
-        | null = null;
-    private stopPromiseResolve:
-        | ((value: void | PromiseLike<void>) => void)
-        | null = null;
+    private startPromiseResolve: ((value: void | PromiseLike<void>) => void) | null = null;
+    private stopPromiseResolve: ((value: void | PromiseLike<void>) => void) | null = null;

    // InternalCallback
    private internalCallback: InternalCallback | null = null;

@@ -62,16 +49,9 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
    // Settings
    updateSetting = (setting: WorkletNodeSetting) => {
-        console.log(
-            `[WorkletNode] Updating WorkletNode Setting,`,
-            this.setting,
-            setting
-        );
+        console.log(`[WorkletNode] Updating WorkletNode Setting,`, this.setting, setting);
        let recreateSocketIoRequired = false;
-        if (
-            this.setting.serverUrl != setting.serverUrl ||
-            this.setting.protocol != setting.protocol
-        ) {
+        if (this.setting.serverUrl != setting.serverUrl || this.setting.protocol != setting.protocol) {
            recreateSocketIoRequired = true;
        }
        this.setting = setting;

@@ -100,10 +80,7 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
        if (this.setting.protocol === "sio") {
            this.socket = io(this.setting.serverUrl + "/test");
            this.socket.on("connect_error", (err) => {
-                this.listener.notifyException(
-                    VOICE_CHANGER_CLIENT_EXCEPTION.ERR_SIO_CONNECT_FAILED,
-                    `[SIO] rconnection failed ${err}`
-                );
+                this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_SIO_CONNECT_FAILED, `[SIO] rconnection failed ${err}`);
            });
            this.socket.on("connect", () => {
                console.log(`[SIO] connect to ${this.setting.serverUrl}`);

@@ -125,18 +102,12 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
                // Quick hack for server device mode
                if (response[0] == 0) {
-                    this.listener.notifyResponseTime(
-                        Math.round(perf[0] * 1000),
-                        perf.slice(1, 4)
-                    );
+                    this.listener.notifyResponseTime(Math.round(perf[0] * 1000), perf.slice(1, 4));
                    return;
                }
                if (result.byteLength < 128 * 2) {
-                    this.listener.notifyException(
-                        VOICE_CHANGER_CLIENT_EXCEPTION.ERR_SIO_INVALID_RESPONSE,
-                        `[SIO] recevied data is too short ${result.byteLength}`
-                    );
+                    this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_SIO_INVALID_RESPONSE, `[SIO] recevied data is too short ${result.byteLength}`);
                } else {
                    if (this.outputNode != null) {
                        this.outputNode.postReceivedVoice(response[1]);

@@ -151,13 +122,15 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
    postReceivedVoice = (data: ArrayBuffer) => {
        // Int16 to Float
-        const i16Data = new Int16Array(data);
-        const f32Data = new Float32Array(i16Data.length);
-        // console.log(`[worklet] f32DataLength${f32Data.length} i16DataLength${i16Data.length}`)
-        i16Data.forEach((x, i) => {
-            const float = x >= 0x8000 ? -(0x10000 - x) / 0x8000 : x / 0x7fff;
-            f32Data[i] = float;
-        });
+        // const i16Data = new Int16Array(data);
+        // const f32Data = new Float32Array(i16Data.length);
+        // // console.log(`[worklet] f32DataLength${f32Data.length} i16DataLength${i16Data.length}`)
+        // i16Data.forEach((x, i) => {
+        //     const float = x >= 0x8000 ? -(0x10000 - x) / 0x8000 : x / 0x7fff;
+        //     f32Data[i] = float;
+        // });
+        const f32Data = new Float32Array(data);

        // Upsampling
        let upSampledBuffer: Float32Array | null = null;

@@ -187,11 +160,7 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
        }
    };

-    private _averageDownsampleBuffer(
-        buffer: Float32Array,
-        originalSampleRate: number,
-        destinationSamplerate: number
-    ) {
+    private _averageDownsampleBuffer(buffer: Float32Array, originalSampleRate: number, destinationSamplerate: number) {
        if (originalSampleRate == destinationSamplerate) {
            return buffer;
        }

@@ -208,11 +177,7 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
            // Use average value of skipped samples
            var accum = 0,
                count = 0;
-            for (
-                var i = offsetBuffer;
-                i < nextOffsetBuffer && i < buffer.length;
-                i++
-            ) {
+            for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
                accum += buffer[i];
                count++;
            }

@@ -245,6 +210,7 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
        // Downsampling
        let downsampledBuffer: Float32Array | null = null;
        if (this.setting.sendingSampleRate == 48000) {
+            console.log("no downsample");
            downsampledBuffer = inputData;
        } else if (this.setting.downSamplingMode == DownSamplingMode.decimate) {
            //////// (Kind 1) Decimation //////////

@@ -258,29 +224,25 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
        } else {
            //////// (Kind 2) Averaging //////////
            // downsampledBuffer = this._averageDownsampleBuffer(buffer, 48000, 24000)
-            downsampledBuffer = this._averageDownsampleBuffer(
-                inputData,
-                48000,
-                this.setting.sendingSampleRate
-            );
+            downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
        }

        // Float to Int16 (in the internal case, stay in float.)
-        if (this.setting.protocol != "internal") {
-            const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
-            const dataView = new DataView(arrayBuffer);
-            for (let i = 0; i < downsampledBuffer.length; i++) {
-                let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
-                s = s < 0 ? s * 0x8000 : s * 0x7fff;
-                dataView.setInt16(i * 2, s, true);
-            }
-            // Buffering
-            this.requestChunks.push(arrayBuffer);
-        } else {
+        // if (this.setting.protocol != "internal") {
+        //     const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
+        //     const dataView = new DataView(arrayBuffer);
+        //     for (let i = 0; i < downsampledBuffer.length; i++) {
+        //         let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
+        //         s = s < 0 ? s * 0x8000 : s * 0x7fff;
+        //         dataView.setInt16(i * 2, s, true);
+        //     }
+        //     // Buffering
+        //     this.requestChunks.push(arrayBuffer);
+        // } else {
            // internal
            // console.log("downsampledBuffer.buffer", downsampledBuffer.buffer);
            this.requestChunks.push(downsampledBuffer.buffer);
-        }
+        // }

        //// If the request buffer does not yet hold the configured number of chunks, stop processing here.
        if (this.requestChunks.length < this.setting.inputChunkNum) {

@@ -305,10 +267,7 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
            this.listener.notifySendBufferingTime(Date.now() - this.bufferStart);
            this.bufferStart = Date.now();
        } else {
-            console.warn(
-                `[worklet_node][voice-changer-worklet-processor] unknown response ${event.data.responseType}`,
-                event.data
-            );
+            console.warn(`[worklet_node][voice-changer-worklet-processor] unknown response ${event.data.responseType}`, event.data);
        }
    }

@@ -325,10 +284,7 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
        const restClient = new ServerRestClient(this.setting.serverUrl);
        const res = await restClient.postVoice(timestamp, newBuffer.buffer);
        if (res.byteLength < 128 * 2) {
-            this.listener.notifyException(
-                VOICE_CHANGER_CLIENT_EXCEPTION.ERR_REST_INVALID_RESPONSE,
-                `[REST] recevied data is too short ${res.byteLength}`
-            );
+            this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_REST_INVALID_RESPONSE, `[REST] recevied data is too short ${res.byteLength}`);
        } else {
            if (this.outputNode != null) {
                this.outputNode.postReceivedVoice(res);

@@ -339,10 +295,7 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
            }
        } else if (this.setting.protocol == "internal") {
            if (!this.internalCallback) {
-                this.listener.notifyException(
-                    VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED,
-                    `[AudioWorkletNode] internal audio process callback is not initialized`
-                );
+                this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED, `[AudioWorkletNode] internal audio process callback is not initialized`);
                return;
            }
            // const res = await this.internalCallback.processAudio(newBuffer);
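The hunks above remove the client-side Int16-to-Float32 decode (and the matching Float-to-Int16 encode before sending), so the worklet node now passes float32 PCM buffers through unchanged. A minimal sketch of the two byte interpretations, written in Python/NumPy purely for illustration (names and sample values are assumptions, not the project's code):

    import numpy as np

    raw = np.array([0.25, -0.5, 1.0], dtype=np.float32).tobytes()

    # Post-commit behaviour: the received bytes already hold little-endian float32 samples.
    f32 = np.frombuffer(raw, dtype=np.float32)

    # Pre-commit behaviour: the bytes held int16 PCM that had to be rescaled to [-1, 1],
    # which is what the removed forEach loop in postReceivedVoice did.
    i16 = (np.clip(f32, -1.0, 1.0) * 32767).astype(np.int16)
    back_to_float = i16.astype(np.float32) / 32768.0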

server/.python-version (new file, 1 line)

@@ -0,0 +1 @@
+3.10.11


@@ -41,32 +41,122 @@ logger.debug(f"---------------- Booting PHASE :{__name__} -----------------")

def setupArgParser():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--logLevel", type=str, default="error", help="Log level info|critical|error. (default: error)")
+    parser.add_argument(
+        "--logLevel",
+        type=str,
+        default="error",
+        help="Log level info|critical|error. (default: error)",
+    )
    parser.add_argument("-p", type=int, default=18888, help="port")
    parser.add_argument("--https", type=strtobool, default=False, help="use https")
-    parser.add_argument("--test_connect", type=str, default="8.8.8.8", help="test connect to detect ip in https mode. default 8.8.8.8")
-    parser.add_argument("--httpsKey", type=str, default="ssl.key", help="path for the key of https")
-    parser.add_argument("--httpsCert", type=str, default="ssl.cert", help="path for the cert of https")
-    parser.add_argument("--httpsSelfSigned", type=strtobool, default=True, help="generate self-signed certificate")
+    parser.add_argument(
+        "--test_connect",
+        type=str,
+        default="8.8.8.8",
+        help="test connect to detect ip in https mode. default 8.8.8.8",
+    )
+    parser.add_argument(
+        "--httpsKey", type=str, default="ssl.key", help="path for the key of https"
+    )
+    parser.add_argument(
+        "--httpsCert", type=str, default="ssl.cert", help="path for the cert of https"
+    )
+    parser.add_argument(
+        "--httpsSelfSigned",
+        type=strtobool,
+        default=True,
+        help="generate self-signed certificate",
+    )

-    parser.add_argument("--model_dir", type=str, default="model_dir", help="path to model files")
-    parser.add_argument("--sample_mode", type=str, default="production", help="rvc_sample_mode")
+    parser.add_argument(
+        "--model_dir", type=str, default="model_dir", help="path to model files"
+    )
+    parser.add_argument(
+        "--sample_mode", type=str, default="production", help="rvc_sample_mode"
+    )

-    parser.add_argument("--content_vec_500", type=str, default="pretrain/checkpoint_best_legacy_500.pt", help="path to content_vec_500 model(pytorch)")
-    parser.add_argument("--content_vec_500_onnx", type=str, default="pretrain/content_vec_500.onnx", help="path to content_vec_500 model(onnx)")
-    parser.add_argument("--content_vec_500_onnx_on", type=strtobool, default=True, help="use or not onnx for content_vec_500")
-    parser.add_argument("--hubert_base", type=str, default="pretrain/hubert_base.pt", help="path to hubert_base model(pytorch)")
-    parser.add_argument("--hubert_base_jp", type=str, default="pretrain/rinna_hubert_base_jp.pt", help="path to hubert_base_jp model(pytorch)")
-    parser.add_argument("--hubert_soft", type=str, default="pretrain/hubert/hubert-soft-0d54a1f4.pt", help="path to hubert_soft model(pytorch)")
-    parser.add_argument("--whisper_tiny", type=str, default="pretrain/whisper_tiny.pt", help="path to hubert_soft model(pytorch)")
-    parser.add_argument("--nsf_hifigan", type=str, default="pretrain/nsf_hifigan/model", help="path to nsf_hifigan model(pytorch)")
-    parser.add_argument("--crepe_onnx_full", type=str, default="pretrain/crepe_onnx_full.onnx", help="path to crepe_onnx_full")
-    parser.add_argument("--crepe_onnx_tiny", type=str, default="pretrain/crepe_onnx_tiny.onnx", help="path to crepe_onnx_tiny")
-    parser.add_argument("--rmvpe", type=str, default="pretrain/rmvpe.pt", help="path to rmvpe")
-    parser.add_argument("--rmvpe_onnx", type=str, default="pretrain/rmvpe.onnx", help="path to rmvpe onnx")
+    parser.add_argument(
+        "--content_vec_500",
+        type=str,
+        default="pretrain/checkpoint_best_legacy_500.pt",
+        help="path to content_vec_500 model(pytorch)",
+    )
+    parser.add_argument(
+        "--content_vec_500_onnx",
+        type=str,
+        default="pretrain/content_vec_500.onnx",
+        help="path to content_vec_500 model(onnx)",
+    )
+    parser.add_argument(
+        "--content_vec_500_onnx_on",
+        type=strtobool,
+        default=True,
+        help="use or not onnx for content_vec_500",
+    )
+    parser.add_argument(
+        "--hubert_base",
+        type=str,
+        default="pretrain/hubert_base.pt",
+        help="path to hubert_base model(pytorch)",
+    )
+    parser.add_argument(
+        "--hubert_base_jp",
+        type=str,
+        default="pretrain/rinna_hubert_base_jp.pt",
+        help="path to hubert_base_jp model(pytorch)",
+    )
+    parser.add_argument(
+        "--hubert_soft",
+        type=str,
+        default="pretrain/hubert/hubert-soft-0d54a1f4.pt",
+        help="path to hubert_soft model(pytorch)",
+    )
+    parser.add_argument(
+        "--whisper_tiny",
+        type=str,
+        default="pretrain/whisper_tiny.pt",
+        help="path to hubert_soft model(pytorch)",
+    )
+    parser.add_argument(
+        "--nsf_hifigan",
+        type=str,
+        default="pretrain/nsf_hifigan/model",
+        help="path to nsf_hifigan model(pytorch)",
+    )
+    parser.add_argument(
+        "--crepe_onnx_full",
+        type=str,
+        default="pretrain/crepe_onnx_full.onnx",
+        help="path to crepe_onnx_full",
+    )
+    parser.add_argument(
+        "--crepe_onnx_tiny",
+        type=str,
+        default="pretrain/crepe_onnx_tiny.onnx",
+        help="path to crepe_onnx_tiny",
+    )
+    parser.add_argument(
+        "--rmvpe", type=str, default="pretrain/rmvpe.pt", help="path to rmvpe"
+    )
+    parser.add_argument(
+        "--rmvpe_onnx",
+        type=str,
+        default="pretrain/rmvpe.onnx",
+        help="path to rmvpe onnx",
+    )

-    parser.add_argument("--host", type=str, default='127.0.0.1', help="IP address of the network interface to listen for HTTP connections. Specify 0.0.0.0 to listen on all interfaces.")
-    parser.add_argument("--allowed-origins", action='append', default=[], help="List of URLs to allow connection from, i.e. https://example.com. Allows http(s)://127.0.0.1:{port} and http(s)://localhost:{port} by default.")
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="127.0.0.1",
+        help="IP address of the network interface to listen for HTTP connections. Specify 0.0.0.0 to listen on all interfaces.",
+    )
+    parser.add_argument(
+        "--allowed-origins",
+        action="append",
+        default=[],
+        help="List of URLs to allow connection from, i.e. https://example.com. Allows http(s)://127.0.0.1:{port} and http(s)://localhost:{port} by default.",
+    )
    return parser

@@ -121,7 +211,11 @@ HOST = args.host
PORT = args.p

-def localServer(logLevel: str = "critical", key_path: str | None = None, cert_path: str | None = None):
+def localServer(
+    logLevel: str = "critical",
+    key_path: str | None = None,
+    cert_path: str | None = None,
+):
    try:
        uvicorn.run(
            f"{os.path.basename(__file__)[:-3]}:app_socketio",

@@ -140,14 +234,19 @@ if __name__ == "MMVCServerSIO":
    mp.freeze_support()

    voiceChangerManager = VoiceChangerManager.get_instance(voiceChangerParams)
-    app_fastapi = MMVC_Rest.get_instance(voiceChangerManager, voiceChangerParams, args.allowed_origins, PORT)
-    app_socketio = MMVC_SocketIOApp.get_instance(app_fastapi, voiceChangerManager, args.allowed_origins, PORT)
+    app_fastapi = MMVC_Rest.get_instance(
+        voiceChangerManager, voiceChangerParams, args.allowed_origins, PORT
+    )
+    app_socketio = MMVC_SocketIOApp.get_instance(
+        app_fastapi, voiceChangerManager, args.allowed_origins, PORT
+    )

if __name__ == "__mp_main__":
    # printMessage("サーバプロセスを起動しています。", level=2)
    printMessage("The server process is starting up.", level=2)

if __name__ == "__main__":
    mp.freeze_support()

@@ -202,7 +301,9 @@ if __name__ == "__main__":
            )
            key_path = os.path.join(SSL_KEY_DIR, keyname)
            cert_path = os.path.join(SSL_KEY_DIR, certname)
-            printMessage(f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1)
+            printMessage(
+                f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1
+            )

        elif args.https and args.httpsSelfSigned == 0:
            # HTTPS

@@ -223,8 +324,13 @@ if __name__ == "__main__":
            printMessage("http://<IP>:<PORT>/", level=1)

        # printMessage("多くの場合は次のいずれかのURLにアクセスすると起動します。", level=2)
-        printMessage("In many cases, it will launch when you access any of the following URLs.", level=2)
-        if "EX_PORT" in locals() and "EX_IP" in locals():  # launched via shell script (docker)
+        printMessage(
+            "In many cases, it will launch when you access any of the following URLs.",
+            level=2,
+        )
+        if (
+            "EX_PORT" in locals() and "EX_IP" in locals()
+        ):  # launched via shell script (docker)
            if args.https == 1:
                printMessage(f"https://localhost:{EX_PORT}/", level=1)
                for ip in EX_IP.strip().split(" "):

@@ -254,12 +360,26 @@ if __name__ == "__main__":
        p.start()
        try:
            if sys.platform.startswith("win"):
-                process = subprocess.Popen([NATIVE_CLIENT_FILE_WIN, "--disable-gpu", "-u", f"http://localhost:{PORT}/"])
+                process = subprocess.Popen(
+                    [
+                        NATIVE_CLIENT_FILE_WIN,
+                        "--disable-gpu",
+                        "-u",
+                        f"http://localhost:{PORT}/",
+                    ]
+                )
                return_code = process.wait()
                logger.info("client closed.")
                p.terminate()
            elif sys.platform.startswith("darwin"):
-                process = subprocess.Popen([NATIVE_CLIENT_FILE_MAC, "--disable-gpu", "-u", f"http://localhost:{PORT}/"])
+                process = subprocess.Popen(
+                    [
+                        NATIVE_CLIENT_FILE_MAC,
+                        "--disable-gpu",
+                        "-u",
+                        f"http://localhost:{PORT}/",
+                    ]
+                )
                return_code = process.wait()
                logger.info("client closed.")
                p.terminate()
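Several of the flags above use type=strtobool so that strings such as "true"/"false" or "1"/"0" on the command line become usable truth values. A small, hedged sketch of that pattern (standalone; it assumes strtobool comes from distutils.util, which may differ from the server's actual import):

    import argparse
    from distutils.util import strtobool  # available on Python 3.10.11, which the repo pins

    parser = argparse.ArgumentParser()
    parser.add_argument("--https", type=strtobool, default=False, help="use https")
    parser.add_argument("-p", type=int, default=18888, help="port")

    args = parser.parse_args(["--https", "true", "-p", "18888"])
    print(args.https)  # 1 -- strtobool returns an int, truthy for "true"/"yes"/"1"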

server/poetry.lock (generated, new file, 3126 lines)

File diff suppressed because it is too large.

server/pyproject.toml (new file, 44 lines)

@@ -0,0 +1,44 @@
[tool.poetry]
name = "server"
version = "0.1.0"
description = ""
authors = ["wok <wok@local.com>"]
readme = "README.md"
[tool.poetry.dependencies]
python = "3.10.11"
uvicorn = "0.21.1"
pyOpenSSL = "23.1.1"
numpy = "1.23.5"
resampy = "0.4.2"
python-socketio = "5.8.0"
fastapi = "0.95.1"
python-multipart = "0.0.6"
onnxruntime-gpu = "1.13.1"
scipy = "1.10.1"
matplotlib = "3.7.1"
websockets = "11.0.2"
faiss-cpu = "1.7.3"
torchcrepe = "0.0.18"
librosa = "0.9.1"
gin = "0.1.6"
gin_config = "0.5.0"
einops = "0.6.0"
local_attention = "1.8.5"
sounddevice = "0.4.6"
dataclasses_json = "0.5.7"
onnxsim = "0.4.28"
torchfcpe = "0.0.3"
torchaudio = "2.3.1"
torch = "2.3.1"
fairseq = "0.12.2"
pyworld = "0.3.4"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.poetry.scripts]
main2 = "MMVCServerSIO:main"
test = "test.test:test"
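server/.python-version and the python = "3.10.11" pin in pyproject.toml both fix the interpreter version. A tiny, hypothetical startup guard (not part of the repo) that fails fast when the running interpreter drifts from that pin:

    import sys

    EXPECTED = (3, 10, 11)  # assumption: keep in sync with server/.python-version
    if sys.version_info[:3] != EXPECTED:
        raise SystemExit(f"Python {EXPECTED} required, got {sys.version_info[:3]}")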


@@ -39,13 +39,18 @@ class MMVC_Rest_VoiceChanger:
        #     struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)
        # )
-        unpackedData = np.array(struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)).astype(np.int16)
+        # unpackedData = np.array(struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)).astype(np.int16)
+        unpackedData = np.array(
+            struct.unpack("<%sf" % (len(wav) // struct.calcsize("<f")), wav)
+        ).astype(np.float32)
        # print(f"[REST] unpackedDataType {unpackedData.dtype}")

        self.tlock.acquire()
        changedVoice = self.voiceChangerManager.changeVoice(unpackedData)
        self.tlock.release()
+        print("", changedVoice[0].dtype)
        changedVoiceBase64 = base64.b64encode(changedVoice[0]).decode("utf-8")

        data = {"timestamp": timestamp, "changedVoiceBase64": changedVoiceBase64}
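The REST endpoint above now unpacks the request body as little-endian float32 ("<%sf") instead of int16 ("<%sh"), matching the client-side change. A standalone sketch of both wire formats (sample values are made up):

    import struct
    import numpy as np

    samples = np.array([0.0, 0.5, -0.25], dtype=np.float32)

    wav = samples.tobytes()                               # what the client now sends
    n = len(wav) // struct.calcsize("<f")
    decoded = np.array(struct.unpack("<%sf" % n, wav)).astype(np.float32)

    # Pre-change path, shown for contrast: int16 PCM scaled from [-1, 1].
    wav_i16 = (samples * 32767).astype(np.int16).tobytes()
    m = len(wav_i16) // struct.calcsize("<h")
    decoded_i16 = np.array(struct.unpack("<%sh" % m, wav_i16)).astype(np.int16)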


@@ -60,9 +60,13 @@ class RVCr2(VoiceChangerModel):

        # Create the pipeline
        try:
-            self.pipeline = createPipeline(self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector)
+            self.pipeline = createPipeline(
+                self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector
+            )
        except PipelineCreateException as e:  # NOQA
-            logger.error("[Voice Changer] pipeline create failed. check your model is valid.")
+            logger.error(
+                "[Voice Changer] pipeline create failed. check your model is valid."
+            )
            return

        # Other settings

@@ -88,7 +92,9 @@ class RVCr2(VoiceChangerModel):
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
            if key == "f0Detector" and self.pipeline is not None:
-                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
+                pitchExtractor = PitchExtractorManager.getPitchExtractor(
+                    self.settings.f0Detector, self.settings.gpu
+                )
                self.pipeline.setPitchExtractor(pitchExtractor)
        else:
            return False

@@ -115,14 +121,16 @@ class RVCr2(VoiceChangerModel):
    ):
        # Input arrives at 16k.
        inputSize = newData.shape[0]
-        newData = newData.astype(np.float32) / 32768.0
+        # newData = newData.astype(np.float32) / 32768.0
        newFeatureLength = inputSize // 160  # hopsize:=160

        if self.audio_buffer is not None:
            # Concatenate with the previous data
            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
            if self.slotInfo.f0:
-                self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(newFeatureLength)], 0)
+                self.pitchf_buffer = np.concatenate(
+                    [self.pitchf_buffer, np.zeros(newFeatureLength)], 0
+                )
            self.feature_buffer = np.concatenate(
                [
                    self.feature_buffer,

@@ -134,19 +142,29 @@ class RVCr2(VoiceChangerModel):
            self.audio_buffer = newData
            if self.slotInfo.f0:
                self.pitchf_buffer = np.zeros(newFeatureLength)
-            self.feature_buffer = np.zeros([newFeatureLength, self.slotInfo.embChannels])
+            self.feature_buffer = np.zeros(
+                [newFeatureLength, self.slotInfo.embChannels]
+            )

        convertSize = inputSize + crossfadeSize + solaSearchFrame + extra_frame

-        if convertSize % 160 != 0:  # Compensate for truncation at the model's output hop size.
+        if (
+            convertSize % 160 != 0
+        ):  # Compensate for truncation at the model's output hop size.
            convertSize = convertSize + (160 - (convertSize % 160))

-        outSize = int(((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate)
+        outSize = int(
+            ((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate
+        )

        # If the buffer has not filled up yet, pad with zeros
        if self.audio_buffer.shape[0] < convertSize:
-            self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
+            self.audio_buffer = np.concatenate(
+                [np.zeros([convertSize]), self.audio_buffer]
+            )
            if self.slotInfo.f0:
-                self.pitchf_buffer = np.concatenate([np.zeros([convertSize // 160]), self.pitchf_buffer])
+                self.pitchf_buffer = np.concatenate(
+                    [np.zeros([convertSize // 160]), self.pitchf_buffer]
+                )
            self.feature_buffer = np.concatenate(
                [
                    np.zeros([convertSize // 160, self.slotInfo.embChannels]),

@@ -179,27 +197,39 @@ class RVCr2(VoiceChangerModel):
            outSize,
        )

-    def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int):
+    def inference(
+        self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int
+    ):
        if self.pipeline is None:
            logger.info("[Voice Changer] Pipeline is not initialized.")
            raise PipelineNotInitializedException()

        # Processing runs at 16K (Pitch, embed, (infer))
-        receivedData = cast(
-            AudioInOut,
-            resampy.resample(
-                receivedData,
-                self.inputSampleRate,
-                16000,
-                filter="kaiser_fast",
-            ),
-        )
+        # receivedData = cast(
+        #     AudioInOut,
+        #     resampy.resample(
+        #         receivedData,
+        #         self.inputSampleRate,
+        #         16000,
+        #         filter="kaiser_fast",
+        #     ),
+        # )
+        receivedData = resampy.resample(
+            receivedData,
+            self.inputSampleRate,
+            16000,
+            filter="kaiser_fast",
+        )
        crossfade_frame = int((crossfade_frame / self.inputSampleRate) * 16000)
        sola_search_frame = int((sola_search_frame / self.inputSampleRate) * 16000)
-        extra_frame = int((self.settings.extraConvertSize / self.inputSampleRate) * 16000)
+        extra_frame = int(
+            (self.settings.extraConvertSize / self.inputSampleRate) * 16000
+        )

        # Generate input data
-        data = self.generate_input(receivedData, crossfade_frame, sola_search_frame, extra_frame)
+        data = self.generate_input(
+            receivedData, crossfade_frame, sola_search_frame, extra_frame
+        )

        audio = data[0]
        pitchf = data[1]

@@ -234,7 +264,11 @@ class RVCr2(VoiceChangerModel):
                index_rate,
                if_f0,
                # 0,
-                self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.0,  # Seconds of extra data size, computed at the input sampling rate
+                (
+                    self.settings.extraConvertSize / self.inputSampleRate
+                    if self.settings.silenceFront
+                    else 0.0
+                ),  # Seconds of extra data size, computed at the input sampling rate
                embOutputLayer,
                useFinalProj,
                repeat,

@@ -244,19 +278,27 @@ class RVCr2(VoiceChangerModel):
            # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
            result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)

-            result = cast(
-                AudioInOut,
-                resampy.resample(
-                    result,
-                    self.slotInfo.samplingRate,
-                    self.outputSampleRate,
-                    filter="kaiser_fast",
-                ),
-            )
+            # result = cast(
+            #     AudioInOut,
+            #     resampy.resample(
+            #         result,
+            #         self.slotInfo.samplingRate,
+            #         self.outputSampleRate,
+            #         filter="kaiser_fast",
+            #     ),
+            # )
+            result = resampy.resample(
+                result,
+                self.slotInfo.samplingRate,
+                self.outputSampleRate,
+                filter="kaiser_fast",
+            )

            return result
        except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
-            logger.warn("[Device Manager] Device cannot support half precision. Fallback to float....")
+            logger.warn(
+                "[Device Manager] Device cannot support half precision. Fallback to float...."
+            )
            self.deviceManager.setForceTensor(True)
            self.initialize()
            # raise e


@@ -55,7 +55,9 @@ class Pipeline(object):
        logger.info("GENERATE PITCH EXTRACTOR" + str(self.pitchExtractor))

        self.index = index
-        self.big_npy = index.reconstruct_n(0, index.ntotal) if index is not None else None
+        self.big_npy = (
+            index.reconstruct_n(0, index.ntotal) if index is not None else None
+        )
        # self.feature = feature

        self.targetSR = targetSR

@@ -69,7 +71,12 @@ class Pipeline(object):
        inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
        embedderInfo = self.embedder.getEmbedderInfo()
        pitchExtractorInfo = self.pitchExtractor.getPitchExtractorInfo()
-        return {"inferencer": inferencerInfo, "embedder": embedderInfo, "pitchExtractor": pitchExtractorInfo, "isHalf": self.isHalf}
+        return {
+            "inferencer": inferencerInfo,
+            "embedder": embedderInfo,
+            "pitchExtractor": pitchExtractorInfo,
+            "isHalf": self.isHalf,
+        }

    def setPitchExtractor(self, pitchExtractor: PitchExtractor):
        self.pitchExtractor = pitchExtractor

@@ -88,13 +95,16 @@ class Pipeline(object):
                # pitch = pitch[:p_len]
                # pitchf = pitchf[:p_len]
                pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
-                pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
+                pitchf = torch.tensor(
+                    pitchf, device=self.device, dtype=torch.float
+                ).unsqueeze(0)
            else:
                pitch = None
                pitchf = None
        except IndexError as e:  # NOQA
            print(e)
            import traceback
            traceback.print_exc()
            raise NotEnoughDataExtimateF0()

        return pitch, pitchf

@@ -102,7 +112,9 @@ class Pipeline(object):
    def extractFeatures(self, feats, embOutputLayer, useFinalProj):
        with autocast(enabled=self.isHalf):
            try:
-                feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
+                feats = self.embedder.extractFeatures(
+                    feats, embOutputLayer, useFinalProj
+                )
                if torch.isnan(feats).all():
                    raise DeviceCannotSupportHalfPrecisionException()
                return feats

@@ -118,8 +130,11 @@ class Pipeline(object):
        try:
            with torch.no_grad():
                with autocast(enabled=self.isHalf):
-                    audio1 = self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)
-                    audio1 = (audio1 * 32767.5).data.to(dtype=torch.int16)
+                    audio1 = self.inferencer.infer(
+                        feats, p_len, pitch, pitchf, sid, out_size
+                    )
+                    # audio1 = (audio1 * 32767.5).data.to(dtype=torch.int16)
+                    audio1 = (audio1).data
            return audio1
        except RuntimeError as e:
            if "HALF" in e.__str__().upper():

@@ -149,16 +164,24 @@ class Pipeline(object):
        with Timer2("Pipeline-Exec", False) as t:  # NOQA
            # Audio arrives at a 16000 sampling rate; everything from here on is processed at 16000.
-            search_index = self.index is not None and self.big_npy is not None and index_rate != 0
+            search_index = (
+                self.index is not None and self.big_npy is not None and index_rate != 0
+            )
            # self.t_pad = self.sr * repeat  # 1 second
            # self.t_pad_tgt = self.targetSR * repeat  # 1 second; trimming on output (emitted at the model's sampling rate)
            audio = audio.unsqueeze(0)
-            quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr  # The reflect padding size must be smaller than the original size.
+            quality_padding_sec = (
+                repeat * (audio.shape[1] - 1)
+            ) / self.sr  # The reflect padding size must be smaller than the original size.

            self.t_pad = round(self.sr * quality_padding_sec)  # Add audio before and after
-            self.t_pad_tgt = round(self.targetSR * quality_padding_sec)  # Add audio before and after; trimming on output (emitted at the model's sampling rate)
-            audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
+            self.t_pad_tgt = round(
+                self.targetSR * quality_padding_sec
+            )  # Add audio before and after; trimming on output (emitted at the model's sampling rate)
+            audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(
+                0
+            )
            p_len = audio_pad.shape[0] // self.window
            sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()

@@ -176,7 +199,9 @@ class Pipeline(object):
            t.record("pre-process")

            # Pitch detection
-            pitch, pitchf = self.extractPitch(audio_pad, if_f0, pitchf, f0_up_key, silence_front)
+            pitch, pitchf = self.extractPitch(
+                audio_pad, if_f0, pitchf, f0_up_key, silence_front
+            )
            t.record("extract-pitch")

            # embedding

@@ -203,12 +228,25 @@ class Pipeline(object):
                    score, ix = self.index.search(npy, k=8)
                    weight = np.square(1 / score)
                    weight /= weight.sum(axis=1, keepdims=True)
-                    npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+                    npy = np.sum(
+                        self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1
+                    )

                # recover silient font
-                npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:]
-                feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
-                feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+                npy = np.concatenate(
+                    [
+                        np.zeros([npyOffset, npy.shape[1]], dtype=np.float32),
+                        feature[:npyOffset:2].astype("float32"),
+                        npy,
+                    ]
+                )[-feats.shape[1] :]
+                feats = (
+                    torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                    + (1 - index_rate) * feats
+                )
+                feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
+                    0, 2, 1
+                )

            if protect < 0.5 and search_index:
                feats0 = feats.clone()

@@ -280,4 +318,4 @@ class Pipeline(object):
        del self.embedder
        del self.inferencer
        del self.pitchExtractor
-        print('Pipeline has been deleted')
+        print("Pipeline has been deleted")
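In Pipeline.infer the * 32767.5 conversion to torch.int16 is commented out, so the inferencer's float output is now passed through unchanged and any integer conversion happens at the edges of the system. A short sketch contrasting the two behaviours (tensor values are illustrative only, not taken from the repo):

    import torch

    audio_float = torch.rand(16000) * 2 - 1                     # model output in [-1, 1]

    old_style = (audio_float * 32767.5).to(dtype=torch.int16)   # pre-commit: int16 scaling
    new_style = audio_float.data                                # post-commit: stay in float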


@@ -90,15 +90,22 @@ class VoiceChangerV2(VoiceChangerIF):
        self.params = params
        self.gpu_num = torch.cuda.device_count()
        self.prev_audio = np.zeros(4096)
-        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
+        self.mps_enabled: bool = (
+            getattr(torch.backends, "mps", None) is not None
+            and torch.backends.mps.is_available()
+        )
        self.onnx_device = onnxruntime.get_device()
        self.noCrossFade = False

-        logger.info(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})")
+        logger.info(
+            f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})"
+        )

    def setModel(self, model: VoiceChangerModel):
        self.voiceChanger = model
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )
        # if model.voiceChangerType == "Beatrice" or model.voiceChangerType == "LLVC":
        if model.voiceChangerType == "Beatrice":
            self.noCrossFade = True

@@ -107,11 +114,15 @@ class VoiceChangerV2(VoiceChangerIF):
    def setInputSampleRate(self, sr: int):
        self.settings.inputSampleRate = sr
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )

    def setOutputSampleRate(self, sr: int):
        self.settings.outputSampleRate = sr
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )

    def get_info(self):
        data = asdict(self.settings)

@@ -130,7 +141,9 @@ class VoiceChangerV2(VoiceChangerIF):
            if key == "serverAudioStated" and val == 0:
                self.settings.inputSampleRate = 48000
                self.settings.outputSampleRate = 48000
-                self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+                self.voiceChanger.setSamplingRate(
+                    self.settings.inputSampleRate, self.settings.outputSampleRate
+                )

            if key in self.settings.intData:
                setattr(self.settings, key, int(val))

@@ -146,7 +159,9 @@ class VoiceChangerV2(VoiceChangerIF):
                        self.settings.outputSampleRate,
                        # 16000,
                    )
-                    print(f"-------------------------- - - - {self.settings.inputSampleRate}, {self.settings.outputSampleRate}")
+                    print(
+                        f"-------------------------- - - - {self.settings.inputSampleRate}, {self.settings.outputSampleRate}"
+                    )
                if key == "recordIO" and val == 0:
                    if hasattr(self, "ioRecorder"):
                        self.ioRecorder.close()

@@ -155,7 +170,9 @@ class VoiceChangerV2(VoiceChangerIF):
                    if hasattr(self, "ioRecorder"):
                        self.ioRecorder.close()
                if key == "inputSampleRate" or key == "outputSampleRate":
-                    self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+                    self.voiceChanger.setSamplingRate(
+                        self.settings.inputSampleRate, self.settings.outputSampleRate
+                    )
            elif key in self.settings.floatData:
                setattr(self.settings, key, float(val))
            elif key in self.settings.strData:

@@ -168,7 +185,12 @@ class VoiceChangerV2(VoiceChangerIF):
        return self.get_info()

    def _generate_strength(self, crossfadeSize: int):
-        if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
+        if (
+            self.crossfadeSize != crossfadeSize
+            or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate
+            or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate
+            or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize
+        ):
            self.crossfadeSize = crossfadeSize
            self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
            self.currentCrossFadeEndRate = self.settings.crossFadeEndRate

@@ -197,7 +219,9 @@ class VoiceChangerV2(VoiceChangerIF):
                ]
            )

-            logger.info(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")
+            logger.info(
+                f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}"
+            )

            # The size differs from the previous result, so clear the recorded buffer.
            if hasattr(self, "np_prev_audio1") is True:

@@ -212,13 +236,19 @@ class VoiceChangerV2(VoiceChangerIF):
        return self.voiceChanger.get_processing_sampling_rate()

    # receivedData: tuple of short
-    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
+    def on_request(
+        self, receivedData: AudioInOut
+    ) -> tuple[AudioInOut, list[Union[int, float]]]:
        try:
            if self.voiceChanger is None:
-                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
+                raise VoiceChangerIsNotSelectedException(
+                    "Voice Changer is not selected."
+                )

            enableMainprocessTimer = False
            with Timer2("main-process", enableMainprocessTimer) as t:
-                processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
+                processing_sampling_rate = (
+                    self.voiceChanger.get_processing_sampling_rate()
+                )

                if self.noCrossFade:  # Beatrice, LLVC
                    audio = self.voiceChanger.inference(

@@ -232,7 +262,9 @@ class VoiceChangerV2(VoiceChangerIF):
                else:
                    sola_search_frame = int(0.012 * processing_sampling_rate)
                    block_frame = receivedData.shape[0]
-                    crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
+                    crossfade_frame = min(
+                        self.settings.crossFadeOverlapSize, block_frame
+                    )
                    self._generate_strength(crossfade_frame)
                    t.record("generate_strength")

@@ -241,11 +273,14 @@ class VoiceChangerV2(VoiceChangerIF):
                        crossfade_frame=crossfade_frame,
                        sola_search_frame=sola_search_frame,
                    )
+                    print("output audio dtype", audio.dtype)
                    t.record("inference")

                    if hasattr(self, "sola_buffer") is True:
                        np.set_printoptions(threshold=10000)
-                        audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
+                        audio_offset = -1 * (
+                            sola_search_frame + crossfade_frame + block_frame
+                        )
                        audio = audio[audio_offset:]

                        # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI

@@ -264,24 +299,33 @@ class VoiceChangerV2(VoiceChangerIF):
                        )
                        sola_offset = int(np.argmax(cor_nom / cor_den))
                        sola_end = sola_offset + block_frame
-                        output_wav = audio[sola_offset:sola_end].astype(np.float64)
+                        output_wav = audio[sola_offset:sola_end].astype(np.float32)
                        output_wav[:crossfade_frame] *= self.np_cur_strength
                        output_wav[:crossfade_frame] += self.sola_buffer[:]

                        result = output_wav

                    else:
-                        logger.info("[Voice Changer] warming up... generating sola buffer.")
-                        result = np.zeros(4096).astype(np.int16)
+                        logger.info(
+                            "[Voice Changer] warming up... generating sola buffer."
+                        )
+                        result = np.zeros(4096).astype(np.float32)
                    t.record("sora")

-                    if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
-                        offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
+                    if (
+                        hasattr(self, "sola_buffer") is True
+                        and sola_offset < sola_search_frame
+                    ):
+                        offset = -1 * (
+                            sola_search_frame + crossfade_frame - sola_offset
+                        )
                        end = -1 * (sola_search_frame - sola_offset)
                        sola_buf_org = audio[offset:end]

                        self.sola_buffer = sola_buf_org * self.np_prev_strength
                    else:
-                        self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
+                        self.sola_buffer = (
+                            audio[-crossfade_frame:] * self.np_prev_strength
+                        )
                        # self.sola_buffer = audio[- crossfade_frame:]
                    t.record("post")

@@ -290,9 +334,11 @@ class VoiceChangerV2(VoiceChangerIF):

            # Post-processing
            with Timer2("post-process", False) as t:
-                result = result.astype(np.int16)
+                result = result.astype(np.float32)

-                print_convert_processing(f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz")
+                print_convert_processing(
+                    f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz"
+                )

                if receivedData.shape[0] != result.shape[0]:
                    # print("TODO FIX:::::PADDING", receivedData.shape[0], result.shape[0])

@@ -311,7 +357,9 @@ class VoiceChangerV2(VoiceChangerIF):
            postprocess_time = t.secs

-            print_convert_processing(f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
+            print_convert_processing(
+                f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}"
+            )
            perf = [0, mainprocess_time, postprocess_time]

            return outputData, perf

@@ -320,7 +368,9 @@ class VoiceChangerV2(VoiceChangerIF):
            logger.warn(f"[Voice Changer] [Exception], {e}")
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except ONNXInputArgumentException as e:
-            logger.warn(f"[Voice Changer] [Exception] onnx are waiting valid input., {e}")
+            logger.warn(
+                f"[Voice Changer] [Exception] onnx are waiting valid input., {e}"
+            )
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except HalfPrecisionChangingException:
            logger.warn("[Voice Changer] Switching model configuration....")

@@ -332,7 +382,9 @@ class VoiceChangerV2(VoiceChangerIF):
            logger.warn(f"[Voice Changer] embedder: {e}")
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except VoiceChangerIsNotSelectedException:
-            logger.warn("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
+            logger.warn(
+                "[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc."
+            )
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except DeviceCannotSupportHalfPrecisionException:
            # Fallback handling happens in RVC.py, so just return dummy data here.
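The SOLA block in on_request aligns each new inference result against the tail of the previous one (argmax of cor_nom / cor_den) and crossfades the overlap, now entirely in float32. A simplified, hypothetical NumPy sketch of that idea; the real code uses the project's precomputed np_prev_strength / np_cur_strength windows and its own correlation terms, so this is only an approximation under stated assumptions:

    import numpy as np

    def sola_merge(audio: np.ndarray, sola_buffer: np.ndarray, block: int, search: int) -> np.ndarray:
        # Assumes len(audio) >= search + block and block >= len(sola_buffer).
        cross = len(sola_buffer)
        head = audio[: cross + search]
        # Normalized cross-correlation of the previous tail against candidate offsets.
        cor_nom = np.convolve(head, np.flip(sola_buffer), "valid")
        cor_den = np.sqrt(np.convolve(head ** 2, np.ones(cross), "valid") + 1e-8)
        offset = int(np.argmax(cor_nom / cor_den))
        out = audio[offset : offset + block].astype(np.float32)
        # Linear crossfade between the previous tail and the newly aligned block.
        fade_in = np.linspace(0.0, 1.0, cross, dtype=np.float32)
        out[:cross] = out[:cross] * fade_in + sola_buffer * (1.0 - fade_in)
        return out

    # Example usage with made-up sizes: a 4096-sample block, 128-sample crossfade, 64-sample search window.
    prev_tail = np.zeros(128, dtype=np.float32)
    new_audio = np.random.uniform(-0.1, 0.1, 4096 + 128 + 64).astype(np.float32)
    merged = sola_merge(new_audio, prev_tail, block=4096, search=64)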