Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-02-09 03:37:51 +03:00)

Commit 9dbbdcf89b ("test 48k"), parent 1952c76533
client/demo/dist/index.html (vendored, 11 lines changed)
@@ -1 +1,10 @@
-<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
+<!DOCTYPE html>
+<html style="width: 100%; height: 100%; overflow: hidden">
+    <head>
+        <meta charset="utf-8" />
+        <title>Voice Changer Client Demo</title>
+        <script defer src="index.js"></script></head>
+    <body style="width: 100%; height: 100%; margin: 0px">
+        <div id="app" style="width: 100%; height: 100%"></div>
+    </body>
+</html>
client/demo/dist/index.js (vendored, 1298 lines changed)
File diff suppressed because one or more lines are too long.
client/demo/dist/index.js.LICENSE.txt (vendored, 35 lines removed)
@@ -1,35 +0,0 @@
-/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
-
-/*!**********************!*\
-  !*** ./src/index.ts ***!
-  \**********************/
-
-/**
- * @license React
- * react-dom.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @license React
- * react.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-/**
- * @license React
- * scheduler.production.min.js
- *
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
VoiceChangerWorkletNode (client TypeScript worklet node):

@@ -1,443 +1,396 @@

Most of this hunk is a pure re-wrap of the file to a wider print width: the imports, the VoiceChangerWorkletListener and InternalCallback types, the constructor, setOutputNode, updateSetting, setInternalAudioProcessCallback, getSettings, getSocketId, createSocketIO, sendBuffer, configure, start, stop, trancateBuffer, startOutputRecording, stopOutputRecording, and the private _averageDownsampleBuffer helper keep their behavior and change only line breaks. Three changes are behavioral.

First, postReceivedVoice no longer converts the received payload from Int16; it reads the bytes as Float32 samples directly (the old conversion stays in the file as comments):

     postReceivedVoice = (data: ArrayBuffer) => {
         // Int16 to Float
-        const i16Data = new Int16Array(data);
-        const f32Data = new Float32Array(i16Data.length);
-        // console.log(`[worklet] f32DataLength${f32Data.length} i16DataLength${i16Data.length}`)
-        i16Data.forEach((x, i) => {
-            const float = x >= 0x8000 ? -(0x10000 - x) / 0x8000 : x / 0x7fff;
-            f32Data[i] = float;
-        });
+        // const i16Data = new Int16Array(data);
+        // const f32Data = new Float32Array(i16Data.length);
+        // // console.log(`[worklet] f32DataLength${f32Data.length} i16DataLength${i16Data.length}`)
+        // i16Data.forEach((x, i) => {
+        //     const float = x >= 0x8000 ? -(0x10000 - x) / 0x8000 : x / 0x7fff;
+        //     f32Data[i] = float;
+        // });
+        const f32Data = new Float32Array(data);

         // Upsampling
         let upSampledBuffer: Float32Array | null = null;
         if (this.setting.sendingSampleRate == 48000) {
             upSampledBuffer = f32Data;
         } else {
             upSampledBuffer = new Float32Array(f32Data.length * 2);
             for (let i = 0; i < f32Data.length; i++) {
                 const currentFrame = f32Data[i];
                 const nextFrame = i + 1 < f32Data.length ? f32Data[i + 1] : f32Data[i];
                 upSampledBuffer[i * 2] = currentFrame;
                 upSampledBuffer[i * 2 + 1] = (currentFrame + nextFrame) / 2;
             }
         }

Second, in the "inputData" branch of handleMessage, the 48 kHz path now logs that it skips downsampling:

         // Downsampling
         let downsampledBuffer: Float32Array | null = null;
         if (this.setting.sendingSampleRate == 48000) {
+            console.log("no downsample");
             downsampledBuffer = inputData;
         } else if (this.setting.downSamplingMode == DownSamplingMode.decimate) {
             //////// (Kind 1) Decimation //////////
             //// Input arrives at 48000 Hz; drop every other sample to get 24000 Hz.
             downsampledBuffer = new Float32Array(inputData.length / 2);
             for (let i = 0; i < inputData.length; i++) {
                 if (i % 2 == 0) {
                     downsampledBuffer[i / 2] = inputData[i];
                 }
             }
         } else {
             //////// (Kind 2) Averaging //////////
             // downsampledBuffer = this._averageDownsampleBuffer(buffer, 48000, 24000)
             downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
         }

Third, the Float-to-Int16 step before buffering is commented out, so request chunks are buffered as Float32 for every protocol, not only "internal":

-        // Float to Int16 (the "internal" protocol keeps float)
-        if (this.setting.protocol != "internal") {
-            const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
-            const dataView = new DataView(arrayBuffer);
-            for (let i = 0; i < downsampledBuffer.length; i++) {
-                let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
-                s = s < 0 ? s * 0x8000 : s * 0x7fff;
-                dataView.setInt16(i * 2, s, true);
-            }
-            // Buffering
-            this.requestChunks.push(arrayBuffer);
-        } else {
-            // internal
-            this.requestChunks.push(downsampledBuffer.buffer);
-        }
+        // (Float-to-Int16 block kept as comments)
+        this.requestChunks.push(downsampledBuffer.buffer);
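For reference, the two PCM encodings involved here differ only in per-sample scaling. A minimal sketch of the Int16/Float32 round trip this commit drops (standard mapping, equivalent in intent to the removed code; helper names are illustrative, not from the repo):

    // Int16 -> Float32: negative samples scale by 1/0x8000, positive by 1/0x7fff.
    function int16ToFloat32(data: ArrayBuffer): Float32Array {
        const i16 = new Int16Array(data);
        const f32 = new Float32Array(i16.length);
        for (let i = 0; i < i16.length; i++) {
            f32[i] = i16[i] < 0 ? i16[i] / 0x8000 : i16[i] / 0x7fff;
        }
        return f32;
    }

    // Float32 -> Int16 (little-endian), mirroring the commented-out send-side block.
    function float32ToInt16LE(data: Float32Array): ArrayBuffer {
        const buf = new ArrayBuffer(data.length * 2);
        const view = new DataView(buf);
        for (let i = 0; i < data.length; i++) {
            const s = Math.max(-1, Math.min(1, data[i])); // clamp to [-1, 1]
            view.setInt16(i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true); // true = little-endian
        }
        return buf;
    }

With both conversions gone, client and server exchange raw Float32 PCM, which matches the struct.unpack("<%sf", ...) change on the server side below. The averaging downsampler kept by this hunk sets each output sample to the mean of the input samples it spans; a standalone sketch equivalent in spirit to _averageDownsampleBuffer (not the repo's exact code):

    // Average downsampling: each destination sample is the mean of the source
    // samples it covers, a crude but cheap form of anti-aliasing.
    function averageDownsample(buffer: Float32Array, srcRate: number, dstRate: number): Float32Array {
        if (dstRate === srcRate) return buffer;
        if (dstRate > srcRate) throw new Error("destination rate must not exceed source rate");
        const ratio = srcRate / dstRate;
        const out = new Float32Array(Math.round(buffer.length / ratio));
        let src = 0;
        for (let dst = 0; dst < out.length; dst++) {
            const next = Math.round((dst + 1) * ratio);
            let acc = 0;
            let n = 0;
            for (let i = src; i < next && i < buffer.length; i++) {
                acc += buffer[i];
                n++;
            }
            out[dst] = acc / n;
            src = next;
        }
        return out;
    }

    // Example: const mono16k = averageDownsample(mono48k, 48000, 16000);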
server/.python-version (new file, 1 line)

@@ -0,0 +1 @@
+3.10.11
server/MMVCServerSIO.py

@@ -41,32 +41,122 @@ logger.debug(f"---------------- Booting PHASE :{__name__} -----------------")

setupArgParser() is reformatted black-style: each parser.add_argument(...) call that was one long line is split across several lines. The flags, types, defaults, and help strings are unchanged; the old compact form, which carries the same information, was:

    def setupArgParser():
        parser = argparse.ArgumentParser()
        parser.add_argument("--logLevel", type=str, default="error", help="Log level info|critical|error. (default: error)")
        parser.add_argument("-p", type=int, default=18888, help="port")
        parser.add_argument("--https", type=strtobool, default=False, help="use https")
        parser.add_argument("--test_connect", type=str, default="8.8.8.8", help="test connect to detect ip in https mode. default 8.8.8.8")
        parser.add_argument("--httpsKey", type=str, default="ssl.key", help="path for the key of https")
        parser.add_argument("--httpsCert", type=str, default="ssl.cert", help="path for the cert of https")
        parser.add_argument("--httpsSelfSigned", type=strtobool, default=True, help="generate self-signed certificate")
        parser.add_argument("--model_dir", type=str, default="model_dir", help="path to model files")
        parser.add_argument("--sample_mode", type=str, default="production", help="rvc_sample_mode")
        parser.add_argument("--content_vec_500", type=str, default="pretrain/checkpoint_best_legacy_500.pt", help="path to content_vec_500 model(pytorch)")
        parser.add_argument("--content_vec_500_onnx", type=str, default="pretrain/content_vec_500.onnx", help="path to content_vec_500 model(onnx)")
        parser.add_argument("--content_vec_500_onnx_on", type=strtobool, default=True, help="use or not onnx for content_vec_500")
        parser.add_argument("--hubert_base", type=str, default="pretrain/hubert_base.pt", help="path to hubert_base model(pytorch)")
        parser.add_argument("--hubert_base_jp", type=str, default="pretrain/rinna_hubert_base_jp.pt", help="path to hubert_base_jp model(pytorch)")
        parser.add_argument("--hubert_soft", type=str, default="pretrain/hubert/hubert-soft-0d54a1f4.pt", help="path to hubert_soft model(pytorch)")
        parser.add_argument("--whisper_tiny", type=str, default="pretrain/whisper_tiny.pt", help="path to whisper_tiny model(pytorch)")
        parser.add_argument("--nsf_hifigan", type=str, default="pretrain/nsf_hifigan/model", help="path to nsf_hifigan model(pytorch)")
        parser.add_argument("--crepe_onnx_full", type=str, default="pretrain/crepe_onnx_full.onnx", help="path to crepe_onnx_full")
        parser.add_argument("--crepe_onnx_tiny", type=str, default="pretrain/crepe_onnx_tiny.onnx", help="path to crepe_onnx_tiny")
        parser.add_argument("--rmvpe", type=str, default="pretrain/rmvpe.pt", help="path to rmvpe")
        parser.add_argument("--rmvpe_onnx", type=str, default="pretrain/rmvpe.onnx", help="path to rmvpe onnx")
        parser.add_argument("--host", type=str, default="127.0.0.1", help="IP address of the network interface to listen for HTTP connections. Specify 0.0.0.0 to listen on all interfaces.")
        parser.add_argument("--allowed-origins", action="append", default=[], help="List of URLs to allow connection from, i.e. https://example.com. Allows http(s)://127.0.0.1:{port} and http(s)://localhost:{port} by default.")
        return parser
@@ -121,7 +211,11 @@ HOST = args.host
 PORT = args.p


-def localServer(logLevel: str = "critical", key_path: str | None = None, cert_path: str | None = None):
+def localServer(
+    logLevel: str = "critical",
+    key_path: str | None = None,
+    cert_path: str | None = None,
+):
     try:
         uvicorn.run(
             f"{os.path.basename(__file__)[:-3]}:app_socketio",
@@ -140,14 +234,19 @@ if __name__ == "MMVCServerSIO":
     mp.freeze_support()

     voiceChangerManager = VoiceChangerManager.get_instance(voiceChangerParams)
-    app_fastapi = MMVC_Rest.get_instance(voiceChangerManager, voiceChangerParams, args.allowed_origins, PORT)
-    app_socketio = MMVC_SocketIOApp.get_instance(app_fastapi, voiceChangerManager, args.allowed_origins, PORT)
+    app_fastapi = MMVC_Rest.get_instance(
+        voiceChangerManager, voiceChangerParams, args.allowed_origins, PORT
+    )
+    app_socketio = MMVC_SocketIOApp.get_instance(
+        app_fastapi, voiceChangerManager, args.allowed_origins, PORT
+    )


 if __name__ == "__mp_main__":
     # printMessage("サーバプロセスを起動しています。", level=2)
     printMessage("The server process is starting up.", level=2)


 if __name__ == "__main__":
     mp.freeze_support()
@@ -202,7 +301,9 @@ if __name__ == "__main__":
         )
         key_path = os.path.join(SSL_KEY_DIR, keyname)
         cert_path = os.path.join(SSL_KEY_DIR, certname)
-        printMessage(f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1)
+        printMessage(
+            f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1
+        )

     elif args.https and args.httpsSelfSigned == 0:
         # HTTPS
@@ -223,8 +324,13 @@ if __name__ == "__main__":
         printMessage("http://<IP>:<PORT>/", level=1)

     # printMessage("多くの場合は次のいずれかのURLにアクセスすると起動します。", level=2)
-    printMessage("In many cases, it will launch when you access any of the following URLs.", level=2)
-    if "EX_PORT" in locals() and "EX_IP" in locals():  # launched via shell script (docker)
+    printMessage(
+        "In many cases, it will launch when you access any of the following URLs.",
+        level=2,
+    )
+    if (
+        "EX_PORT" in locals() and "EX_IP" in locals()
+    ):  # launched via shell script (docker)
         if args.https == 1:
             printMessage(f"https://localhost:{EX_PORT}/", level=1)
             for ip in EX_IP.strip().split(" "):
@@ -254,12 +360,26 @@ if __name__ == "__main__":
         p.start()
         try:
             if sys.platform.startswith("win"):
-                process = subprocess.Popen([NATIVE_CLIENT_FILE_WIN, "--disable-gpu", "-u", f"http://localhost:{PORT}/"])
+                process = subprocess.Popen(
+                    [
+                        NATIVE_CLIENT_FILE_WIN,
+                        "--disable-gpu",
+                        "-u",
+                        f"http://localhost:{PORT}/",
+                    ]
+                )
                 return_code = process.wait()
                 logger.info("client closed.")
                 p.terminate()
             elif sys.platform.startswith("darwin"):
-                process = subprocess.Popen([NATIVE_CLIENT_FILE_MAC, "--disable-gpu", "-u", f"http://localhost:{PORT}/"])
+                process = subprocess.Popen(
+                    [
+                        NATIVE_CLIENT_FILE_MAC,
+                        "--disable-gpu",
+                        "-u",
+                        f"http://localhost:{PORT}/",
+                    ]
+                )
                 return_code = process.wait()
                 logger.info("client closed.")
                 p.terminate()
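Taken together with the flags above, a direct launch on all interfaces looks something like "python MMVCServerSIO.py -p 18888 --host 0.0.0.0" (the invocation form is assumed, not shown in this diff; the documented defaults bind to 127.0.0.1:18888 without HTTPS).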
server/poetry.lock (generated, new file, 3126 lines)
File diff suppressed because it is too large.
server/pyproject.toml (new file, 44 lines)

@@ -0,0 +1,44 @@
+[tool.poetry]
+name = "server"
+version = "0.1.0"
+description = ""
+authors = ["wok <wok@local.com>"]
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = "3.10.11"
+uvicorn = "0.21.1"
+pyOpenSSL = "23.1.1"
+numpy = "1.23.5"
+resampy = "0.4.2"
+python-socketio = "5.8.0"
+fastapi = "0.95.1"
+python-multipart = "0.0.6"
+onnxruntime-gpu = "1.13.1"
+scipy = "1.10.1"
+matplotlib = "3.7.1"
+websockets = "11.0.2"
+faiss-cpu = "1.7.3"
+torchcrepe = "0.0.18"
+librosa = "0.9.1"
+gin = "0.1.6"
+gin_config = "0.5.0"
+einops = "0.6.0"
+local_attention = "1.8.5"
+sounddevice = "0.4.6"
+dataclasses_json = "0.5.7"
+onnxsim = "0.4.28"
+torchfcpe = "0.0.3"
+torchaudio = "2.3.1"
+torch = "2.3.1"
+fairseq = "0.12.2"
+pyworld = "0.3.4"
+
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+main2 = "MMVCServerSIO:main"
+test = "test.test:test"
MMVC_Rest_VoiceChanger (server REST voice endpoint):

@@ -39,13 +39,18 @@ class MMVC_Rest_VoiceChanger:
         #     struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)
         # )

-        unpackedData = np.array(struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)).astype(np.int16)
+        # unpackedData = np.array(struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)).astype(np.int16)
+        unpackedData = np.array(
+            struct.unpack("<%sf" % (len(wav) // struct.calcsize("<f")), wav)
+        ).astype(np.float32)
         # print(f"[REST] unpackedDataType {unpackedData.dtype}")

         self.tlock.acquire()
         changedVoice = self.voiceChangerManager.changeVoice(unpackedData)
         self.tlock.release()

+        print("", changedVoice[0].dtype)
+
         changedVoiceBase64 = base64.b64encode(changedVoice[0]).decode("utf-8")
         data = {"timestamp": timestamp, "changedVoiceBase64": changedVoiceBase64}
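This is the server half of the Float32 switch: the REST body is now decoded as little-endian float32 samples ("<%sf") instead of int16 ("<%sh"), matching the client change above. A sketch of the corresponding client-side packing (illustrative; the actual client sends a Float32Array's underlying buffer directly):

    // Pack samples as the little-endian float32 byte stream that the server's
    // struct.unpack("<%sf" % n, wav) expects. Illustrative helper, not repo code.
    function packFloat32LE(samples: Float32Array): ArrayBuffer {
        const buf = new ArrayBuffer(samples.length * 4);
        const view = new DataView(buf);
        for (let i = 0; i < samples.length; i++) {
            view.setFloat32(i * 4, samples[i], true); // true = little-endian
        }
        return buf;
    }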
@ -60,9 +60,13 @@ class RVCr2(VoiceChangerModel):
|
|||||||
|
|
||||||
# pipelineの生成
|
# pipelineの生成
|
||||||
try:
|
try:
|
||||||
self.pipeline = createPipeline(self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector)
|
self.pipeline = createPipeline(
|
||||||
|
self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector
|
||||||
|
)
|
||||||
except PipelineCreateException as e: # NOQA
|
except PipelineCreateException as e: # NOQA
|
||||||
logger.error("[Voice Changer] pipeline create failed. check your model is valid.")
|
logger.error(
|
||||||
|
"[Voice Changer] pipeline create failed. check your model is valid."
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# その他の設定
|
# その他の設定
|
||||||
@ -88,7 +92,9 @@ class RVCr2(VoiceChangerModel):
|
|||||||
elif key in self.settings.strData:
|
elif key in self.settings.strData:
|
||||||
setattr(self.settings, key, str(val))
|
setattr(self.settings, key, str(val))
|
||||||
if key == "f0Detector" and self.pipeline is not None:
|
if key == "f0Detector" and self.pipeline is not None:
|
||||||
pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
|
pitchExtractor = PitchExtractorManager.getPitchExtractor(
|
||||||
|
self.settings.f0Detector, self.settings.gpu
|
||||||
|
)
|
||||||
self.pipeline.setPitchExtractor(pitchExtractor)
|
self.pipeline.setPitchExtractor(pitchExtractor)
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
@ -115,14 +121,16 @@ class RVCr2(VoiceChangerModel):
|
|||||||
):
|
):
|
||||||
# 16k で入ってくる。
|
# 16k で入ってくる。
|
||||||
inputSize = newData.shape[0]
|
inputSize = newData.shape[0]
|
||||||
newData = newData.astype(np.float32) / 32768.0
|
# newData = newData.astype(np.float32) / 32768.0
|
||||||
newFeatureLength = inputSize // 160 # hopsize:=160
|
newFeatureLength = inputSize // 160 # hopsize:=160
|
||||||
|
|
||||||
if self.audio_buffer is not None:
|
if self.audio_buffer is not None:
|
||||||
# 過去のデータに連結
|
# 過去のデータに連結
|
||||||
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
|
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
|
||||||
if self.slotInfo.f0:
|
if self.slotInfo.f0:
|
||||||
self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(newFeatureLength)], 0)
|
self.pitchf_buffer = np.concatenate(
|
||||||
|
[self.pitchf_buffer, np.zeros(newFeatureLength)], 0
|
||||||
|
)
|
||||||
self.feature_buffer = np.concatenate(
|
self.feature_buffer = np.concatenate(
|
||||||
[
|
[
|
||||||
self.feature_buffer,
|
self.feature_buffer,
|
||||||
@ -134,19 +142,29 @@ class RVCr2(VoiceChangerModel):
|
|||||||
self.audio_buffer = newData
|
self.audio_buffer = newData
|
||||||
if self.slotInfo.f0:
|
if self.slotInfo.f0:
|
||||||
self.pitchf_buffer = np.zeros(newFeatureLength)
|
self.pitchf_buffer = np.zeros(newFeatureLength)
|
||||||
self.feature_buffer = np.zeros([newFeatureLength, self.slotInfo.embChannels])
|
self.feature_buffer = np.zeros(
|
||||||
|
[newFeatureLength, self.slotInfo.embChannels]
|
||||||
|
)
|
||||||
|
|
||||||
convertSize = inputSize + crossfadeSize + solaSearchFrame + extra_frame
|
convertSize = inputSize + crossfadeSize + solaSearchFrame + extra_frame
|
||||||
|
|
||||||
if convertSize % 160 != 0: # モデルの出力のホップサイズで切り捨てが発生するので補う。
|
if (
|
||||||
|
convertSize % 160 != 0
|
||||||
|
): # モデルの出力のホップサイズで切り捨てが発生するので補う。
|
||||||
convertSize = convertSize + (160 - (convertSize % 160))
|
convertSize = convertSize + (160 - (convertSize % 160))
|
||||||
outSize = int(((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate)
|
outSize = int(
|
||||||
|
((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate
|
||||||
|
)
|
||||||
|
|
||||||
# バッファがたまっていない場合はzeroで補う
|
# バッファがたまっていない場合はzeroで補う
|
||||||
if self.audio_buffer.shape[0] < convertSize:
|
if self.audio_buffer.shape[0] < convertSize:
|
||||||
self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
|
self.audio_buffer = np.concatenate(
|
||||||
|
[np.zeros([convertSize]), self.audio_buffer]
|
||||||
|
)
|
||||||
if self.slotInfo.f0:
|
if self.slotInfo.f0:
|
||||||
self.pitchf_buffer = np.concatenate([np.zeros([convertSize // 160]), self.pitchf_buffer])
|
self.pitchf_buffer = np.concatenate(
|
||||||
|
[np.zeros([convertSize // 160]), self.pitchf_buffer]
|
||||||
|
)
|
||||||
self.feature_buffer = np.concatenate(
|
self.feature_buffer = np.concatenate(
|
||||||
[
|
[
|
||||||
np.zeros([convertSize // 160, self.slotInfo.embChannels]),
|
np.zeros([convertSize // 160, self.slotInfo.embChannels]),
|
||||||
@ -179,27 +197,39 @@ class RVCr2(VoiceChangerModel):
|
|||||||
outSize,
|
outSize,
|
||||||
)
|
)
|
||||||
|
|
||||||
def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int):
|
def inference(
|
||||||
|
self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int
|
||||||
|
):
|
||||||
if self.pipeline is None:
|
if self.pipeline is None:
|
||||||
logger.info("[Voice Changer] Pipeline is not initialized.")
|
logger.info("[Voice Changer] Pipeline is not initialized.")
|
||||||
raise PipelineNotInitializedException()
|
raise PipelineNotInitializedException()
|
||||||
|
|
||||||
# 処理は16Kで実施(Pitch, embed, (infer))
|
# 処理は16Kで実施(Pitch, embed, (infer))
|
||||||
receivedData = cast(
|
# receivedData = cast(
|
||||||
AudioInOut,
|
# AudioInOut,
|
||||||
resampy.resample(
|
# resampy.resample(
|
||||||
receivedData,
|
# receivedData,
|
||||||
self.inputSampleRate,
|
# self.inputSampleRate,
|
||||||
16000,
|
# 16000,
|
||||||
filter="kaiser_fast",
|
# filter="kaiser_fast",
|
||||||
),
|
# ),
|
||||||
|
# )
|
||||||
|
receivedData = resampy.resample(
|
||||||
|
receivedData,
|
||||||
|
self.inputSampleRate,
|
||||||
|
16000,
|
||||||
|
filter="kaiser_fast",
|
||||||
)
|
)
|
||||||
crossfade_frame = int((crossfade_frame / self.inputSampleRate) * 16000)
|
crossfade_frame = int((crossfade_frame / self.inputSampleRate) * 16000)
|
||||||
sola_search_frame = int((sola_search_frame / self.inputSampleRate) * 16000)
|
sola_search_frame = int((sola_search_frame / self.inputSampleRate) * 16000)
|
||||||
extra_frame = int((self.settings.extraConvertSize / self.inputSampleRate) * 16000)
|
extra_frame = int(
|
||||||
|
(self.settings.extraConvertSize / self.inputSampleRate) * 16000
|
||||||
|
)
|
||||||
|
|
||||||
# 入力データ生成
|
# 入力データ生成
|
||||||
data = self.generate_input(receivedData, crossfade_frame, sola_search_frame, extra_frame)
|
data = self.generate_input(
|
||||||
|
receivedData, crossfade_frame, sola_search_frame, extra_frame
|
||||||
|
)
|
||||||
|
|
||||||
audio = data[0]
|
audio = data[0]
|
||||||
pitchf = data[1]
|
pitchf = data[1]
|
||||||
@ -234,7 +264,11 @@ class RVCr2(VoiceChangerModel):
|
|||||||
index_rate,
|
index_rate,
|
||||||
if_f0,
|
if_f0,
|
||||||
# 0,
|
# 0,
|
||||||
self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.0, # extaraDataSizeの秒数。入力のサンプリングレートで算出
|
(
|
||||||
|
self.settings.extraConvertSize / self.inputSampleRate
|
||||||
|
if self.settings.silenceFront
|
||||||
|
else 0.0
|
||||||
|
), # extaraDataSizeの秒数。入力のサンプリングレートで算出
|
||||||
embOutputLayer,
|
embOutputLayer,
|
||||||
useFinalProj,
|
useFinalProj,
|
||||||
repeat,
|
repeat,
|
||||||
@@ -244,19 +278,27 @@ class RVCr2(VoiceChangerModel):
             # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
             result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
 
-            result = cast(
-                AudioInOut,
-                resampy.resample(
-                    result,
-                    self.slotInfo.samplingRate,
-                    self.outputSampleRate,
-                    filter="kaiser_fast",
-                ),
+            # result = cast(
+            #     AudioInOut,
+            #     resampy.resample(
+            #         result,
+            #         self.slotInfo.samplingRate,
+            #         self.outputSampleRate,
+            #         filter="kaiser_fast",
+            #     ),
+            # )
+            result = resampy.resample(
+                result,
+                self.slotInfo.samplingRate,
+                self.outputSampleRate,
+                filter="kaiser_fast",
             )
 
             return result
         except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
-            logger.warn("[Device Manager] Device cannot support half precision. Fallback to float....")
+            logger.warn(
+                "[Device Manager] Device cannot support half precision. Fallback to float...."
+            )
             self.deviceManager.setForceTensor(True)
             self.initialize()
             # raise e
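The except branch above is a fallback-and-retry pattern: when a device cannot run float16 inference, the model is re-initialized in full precision and processing continues. A condensed sketch of the pattern; infer() and HalfPrecisionError are hypothetical stand-ins for the project's inferencer and exception classes:

class HalfPrecisionError(Exception):
    pass

def infer(x, half: bool):
    # Stand-in: pretend half precision always fails on this device.
    if half:
        raise HalfPrecisionError()
    return x

def run_with_fallback(x):
    try:
        return infer(x, half=True)
    except HalfPrecisionError:
        # One retry in full precision, mirroring setForceTensor(True) + initialize().
        return infer(x, half=False)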
@@ -55,7 +55,9 @@ class Pipeline(object):
         logger.info("GENERATE PITCH EXTRACTOR" + str(self.pitchExtractor))
 
         self.index = index
-        self.big_npy = index.reconstruct_n(0, index.ntotal) if index is not None else None
+        self.big_npy = (
+            index.reconstruct_n(0, index.ntotal) if index is not None else None
+        )
         # self.feature = feature
 
         self.targetSR = targetSR
@@ -69,7 +71,12 @@ class Pipeline(object):
         inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
         embedderInfo = self.embedder.getEmbedderInfo()
         pitchExtractorInfo = self.pitchExtractor.getPitchExtractorInfo()
-        return {"inferencer": inferencerInfo, "embedder": embedderInfo, "pitchExtractor": pitchExtractorInfo, "isHalf": self.isHalf}
+        return {
+            "inferencer": inferencerInfo,
+            "embedder": embedderInfo,
+            "pitchExtractor": pitchExtractorInfo,
+            "isHalf": self.isHalf,
+        }
 
     def setPitchExtractor(self, pitchExtractor: PitchExtractor):
         self.pitchExtractor = pitchExtractor
@@ -88,13 +95,16 @@ class Pipeline(object):
                 # pitch = pitch[:p_len]
                 # pitchf = pitchf[:p_len]
                 pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
-                pitchf = torch.tensor(pitchf, device=self.device, dtype=torch.float).unsqueeze(0)
+                pitchf = torch.tensor(
+                    pitchf, device=self.device, dtype=torch.float
+                ).unsqueeze(0)
             else:
                 pitch = None
                 pitchf = None
         except IndexError as e:  # NOQA
             print(e)
             import traceback
 
             traceback.print_exc()
             raise NotEnoughDataExtimateF0()
         return pitch, pitchf
@@ -102,7 +112,9 @@ class Pipeline(object):
     def extractFeatures(self, feats, embOutputLayer, useFinalProj):
         with autocast(enabled=self.isHalf):
             try:
-                feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
+                feats = self.embedder.extractFeatures(
+                    feats, embOutputLayer, useFinalProj
+                )
                 if torch.isnan(feats).all():
                     raise DeviceCannotSupportHalfPrecisionException()
                 return feats
@@ -113,13 +125,16 @@ class Pipeline(object):
                     raise DeviceChangingException()
                 else:
                     raise e
 
     def infer(self, feats, p_len, pitch, pitchf, sid, out_size):
         try:
             with torch.no_grad():
                 with autocast(enabled=self.isHalf):
-                    audio1 = self.inferencer.infer(feats, p_len, pitch, pitchf, sid, out_size)
-                    audio1 = (audio1 * 32767.5).data.to(dtype=torch.int16)
+                    audio1 = self.inferencer.infer(
+                        feats, p_len, pitch, pitchf, sid, out_size
+                    )
+                    # audio1 = (audio1 * 32767.5).data.to(dtype=torch.int16)
+                    audio1 = (audio1).data
             return audio1
         except RuntimeError as e:
             if "HALF" in e.__str__().upper():
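The hunk above stops scaling the model output by 32767.5 into int16 and keeps the raw float tensor data; matching float32 changes appear later in this commit. The scaling it removed is the standard float-to-PCM mapping, sketched here with an illustrative full-scale factor of 32767:

import numpy as np

wav_f32 = np.array([-1.0, -0.5, 0.0, 0.5, 1.0], dtype=np.float32)

# Float in [-1, 1] maps to 16-bit PCM via the full-scale factor; clip guards overflow.
wav_i16 = np.clip(wav_f32 * 32767.0, -32768, 32767).astype(np.int16)
wav_back = wav_i16.astype(np.float32) / 32767.0  # round-trip loses only quantization

# Staying in float until the final output stage avoids a quantize/dequantize pair
# around every resample and crossfade.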
@@ -149,16 +164,24 @@ class Pipeline(object):
 
         with Timer2("Pipeline-Exec", False) as t:  # NOQA
             # Audio arrives at a 16000 Hz sampling rate; from here on everything is processed at 16000.
-            search_index = self.index is not None and self.big_npy is not None and index_rate != 0
+            search_index = (
+                self.index is not None and self.big_npy is not None and index_rate != 0
+            )
             # self.t_pad = self.sr * repeat  # 1 second
             # self.t_pad_tgt = self.targetSR * repeat  # 1 second; trimmed at output (emitted at the model's sampling rate)
             audio = audio.unsqueeze(0)
 
-            quality_padding_sec = (repeat * (audio.shape[1] - 1)) / self.sr  # the reflect padding size must be smaller than the original size.
+            quality_padding_sec = (
+                repeat * (audio.shape[1] - 1)
+            ) / self.sr  # the reflect padding size must be smaller than the original size.
 
             self.t_pad = round(self.sr * quality_padding_sec)  # add audio before and after
-            self.t_pad_tgt = round(self.targetSR * quality_padding_sec)  # add audio before and after; trimmed at output (emitted at the model's sampling rate)
-            audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(0)
+            self.t_pad_tgt = round(
+                self.targetSR * quality_padding_sec
+            )  # add audio before and after; trimmed at output (emitted at the model's sampling rate)
+            audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(
+                0
+            )
             p_len = audio_pad.shape[0] // self.window
             sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
 
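quality_padding_sec above is derived from repeat * (len - 1) because reflect padding in torch requires the pad on each side to be strictly smaller than the padded dimension. A small sketch of that constraint; the sizes are arbitrary:

import torch
import torch.nn.functional as F

audio = torch.randn(1, 1000)   # (batch, samples), as after audio.unsqueeze(0)
t_pad = audio.shape[1] - 1     # largest legal reflect pad per side
padded = F.pad(audio, (t_pad, t_pad), mode="reflect").squeeze(0)
assert padded.shape[0] == 1000 + 2 * t_pad
# F.pad(..., mode="reflect") raises a RuntimeError if t_pad >= audio.shape[1].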
@@ -176,7 +199,9 @@ class Pipeline(object):
 
             t.record("pre-process")
             # Pitch detection
-            pitch, pitchf = self.extractPitch(audio_pad, if_f0, pitchf, f0_up_key, silence_front)
+            pitch, pitchf = self.extractPitch(
+                audio_pad, if_f0, pitchf, f0_up_key, silence_front
+            )
             t.record("extract-pitch")
 
             # embedding
@@ -203,12 +228,25 @@ class Pipeline(object):
                 score, ix = self.index.search(npy, k=8)
                 weight = np.square(1 / score)
                 weight /= weight.sum(axis=1, keepdims=True)
-                npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)
+                npy = np.sum(
+                    self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1
+                )
 
                 # recover silent front
-                npy = np.concatenate([np.zeros([npyOffset, npy.shape[1]], dtype=np.float32), feature[:npyOffset:2].astype("float32"), npy])[-feats.shape[1]:]
-                feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
-            feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
+                npy = np.concatenate(
+                    [
+                        np.zeros([npyOffset, npy.shape[1]], dtype=np.float32),
+                        feature[:npyOffset:2].astype("float32"),
+                        npy,
+                    ]
+                )[-feats.shape[1] :]
+                feats = (
+                    torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
+                    + (1 - index_rate) * feats
+                )
+            feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
+                0, 2, 1
+            )
             if protect < 0.5 and search_index:
                 feats0 = feats.clone()
 
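The index-search block above pulls the k=8 nearest training features per frame and blends them with inverse-square-distance weights before mixing with the live features by index_rate. A numpy-only sketch with random data; the brute-force distance stands in for the faiss index.search call, and the epsilon guard is an addition for safety:

import numpy as np

rng = np.random.default_rng(0)
big_npy = rng.standard_normal((1024, 256)).astype(np.float32)  # index vectors
npy = rng.standard_normal((50, 256)).astype(np.float32)        # query frames

# Brute-force stand-in for faiss: squared L2 distance, keep the 8 nearest.
d = ((npy[:, None, :] - big_npy[None, :, :]) ** 2).sum(-1)
ix = np.argsort(d, axis=1)[:, :8]
score = np.take_along_axis(d, ix, axis=1)

weight = np.square(1 / np.maximum(score, 1e-9))  # inverse-square-distance weights
weight /= weight.sum(axis=1, keepdims=True)
blended = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1)

index_rate = 0.5
feats = npy * (1 - index_rate) + blended * index_rate  # same mix as the diff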
@@ -280,4 +318,4 @@ class Pipeline(object):
         del self.embedder
         del self.inferencer
         del self.pitchExtractor
-        print('Pipeline has been deleted')
+        print("Pipeline has been deleted")
@@ -90,15 +90,22 @@ class VoiceChangerV2(VoiceChangerIF):
         self.params = params
         self.gpu_num = torch.cuda.device_count()
         self.prev_audio = np.zeros(4096)
-        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
+        self.mps_enabled: bool = (
+            getattr(torch.backends, "mps", None) is not None
+            and torch.backends.mps.is_available()
+        )
         self.onnx_device = onnxruntime.get_device()
         self.noCrossFade = False
 
-        logger.info(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})")
+        logger.info(
+            f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})"
+        )
 
     def setModel(self, model: VoiceChangerModel):
         self.voiceChanger = model
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )
         # if model.voiceChangerType == "Beatrice" or model.voiceChangerType == "LLVC":
         if model.voiceChangerType == "Beatrice":
             self.noCrossFade = True
@@ -107,11 +114,15 @@ class VoiceChangerV2(VoiceChangerIF):
 
     def setInputSampleRate(self, sr: int):
         self.settings.inputSampleRate = sr
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )
 
     def setOutputSampleRate(self, sr: int):
         self.settings.outputSampleRate = sr
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )
 
     def get_info(self):
         data = asdict(self.settings)
@@ -130,7 +141,9 @@ class VoiceChangerV2(VoiceChangerIF):
             if key == "serverAudioStated" and val == 0:
                 self.settings.inputSampleRate = 48000
                 self.settings.outputSampleRate = 48000
-                self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+                self.voiceChanger.setSamplingRate(
+                    self.settings.inputSampleRate, self.settings.outputSampleRate
+                )
 
             if key in self.settings.intData:
                 setattr(self.settings, key, int(val))
@@ -146,7 +159,9 @@ class VoiceChangerV2(VoiceChangerIF):
                         self.settings.outputSampleRate,
                         # 16000,
                     )
-                    print(f"-------------------------- - - - {self.settings.inputSampleRate}, {self.settings.outputSampleRate}")
+                    print(
+                        f"-------------------------- - - - {self.settings.inputSampleRate}, {self.settings.outputSampleRate}"
+                    )
                 if key == "recordIO" and val == 0:
                     if hasattr(self, "ioRecorder"):
                         self.ioRecorder.close()
@@ -155,7 +170,9 @@ class VoiceChangerV2(VoiceChangerIF):
                     if hasattr(self, "ioRecorder"):
                         self.ioRecorder.close()
                 if key == "inputSampleRate" or key == "outputSampleRate":
-                    self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+                    self.voiceChanger.setSamplingRate(
+                        self.settings.inputSampleRate, self.settings.outputSampleRate
+                    )
             elif key in self.settings.floatData:
                 setattr(self.settings, key, float(val))
             elif key in self.settings.strData:
@@ -168,7 +185,12 @@ class VoiceChangerV2(VoiceChangerIF):
         return self.get_info()
 
     def _generate_strength(self, crossfadeSize: int):
-        if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
+        if (
+            self.crossfadeSize != crossfadeSize
+            or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate
+            or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate
+            or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize
+        ):
             self.crossfadeSize = crossfadeSize
             self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
             self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
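_generate_strength regenerates the crossfade envelopes only when the geometry (size, offset/end rates, overlap) actually changes, so the cached currentCrossFade* fields make repeat calls cheap. A sketch of the kind of complementary ramps such a method produces; the cos^2 shape is illustrative, not necessarily this repo's exact curve:

import numpy as np

def make_strengths(crossfade_frame: int):
    t = np.linspace(0.0, 1.0, crossfade_frame, dtype=np.float32)
    np_prev_strength = np.cos(t * 0.5 * np.pi) ** 2  # fades the previous block out
    np_cur_strength = 1.0 - np_prev_strength         # fades the current block in
    return np_prev_strength, np_cur_strength

prev_s, cur_s = make_strengths(1024)
assert np.allclose(prev_s + cur_s, 1.0)  # equal-sum crossfade keeps levels flat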
@@ -197,7 +219,9 @@ class VoiceChangerV2(VoiceChangerIF):
                 ]
             )
 
-            logger.info(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")
+            logger.info(
+                f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}"
+            )
 
             # The size differs from the previous result, so clear the stored record.
             if hasattr(self, "np_prev_audio1") is True:
@@ -212,13 +236,19 @@ class VoiceChangerV2(VoiceChangerIF):
         return self.voiceChanger.get_processing_sampling_rate()
 
     # receivedData: tuple of short
-    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
+    def on_request(
+        self, receivedData: AudioInOut
+    ) -> tuple[AudioInOut, list[Union[int, float]]]:
         try:
             if self.voiceChanger is None:
-                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
+                raise VoiceChangerIsNotSelectedException(
+                    "Voice Changer is not selected."
+                )
             enableMainprocessTimer = False
             with Timer2("main-process", enableMainprocessTimer) as t:
-                processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
+                processing_sampling_rate = (
+                    self.voiceChanger.get_processing_sampling_rate()
+                )
 
                 if self.noCrossFade:  # Beatrice, LLVC
                     audio = self.voiceChanger.inference(
@@ -232,7 +262,9 @@ class VoiceChangerV2(VoiceChangerIF):
                 else:
                     sola_search_frame = int(0.012 * processing_sampling_rate)
                     block_frame = receivedData.shape[0]
-                    crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
+                    crossfade_frame = min(
+                        self.settings.crossFadeOverlapSize, block_frame
+                    )
                     self._generate_strength(crossfade_frame)
                     t.record("generate_strength")
 
@@ -241,11 +273,14 @@ class VoiceChangerV2(VoiceChangerIF):
                         crossfade_frame=crossfade_frame,
                         sola_search_frame=sola_search_frame,
                     )
+                    print("output audio dtype", audio.dtype)
                     t.record("inference")
 
                     if hasattr(self, "sola_buffer") is True:
                         np.set_printoptions(threshold=10000)
-                        audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
+                        audio_offset = -1 * (
+                            sola_search_frame + crossfade_frame + block_frame
+                        )
                         audio = audio[audio_offset:]
 
                         # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
@@ -264,24 +299,33 @@ class VoiceChangerV2(VoiceChangerIF):
                         )
                         sola_offset = int(np.argmax(cor_nom / cor_den))
                         sola_end = sola_offset + block_frame
-                        output_wav = audio[sola_offset:sola_end].astype(np.float64)
+                        output_wav = audio[sola_offset:sola_end].astype(np.float32)
                         output_wav[:crossfade_frame] *= self.np_cur_strength
                         output_wav[:crossfade_frame] += self.sola_buffer[:]
 
                         result = output_wav
                     else:
-                        logger.info("[Voice Changer] warming up... generating sola buffer.")
-                        result = np.zeros(4096).astype(np.int16)
+                        logger.info(
+                            "[Voice Changer] warming up... generating sola buffer."
+                        )
+                        result = np.zeros(4096).astype(np.float32)
 
                     t.record("sora")
 
-                    if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
-                        offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
+                    if (
+                        hasattr(self, "sola_buffer") is True
+                        and sola_offset < sola_search_frame
+                    ):
+                        offset = -1 * (
+                            sola_search_frame + crossfade_frame - sola_offset
+                        )
                         end = -1 * (sola_search_frame - sola_offset)
                         sola_buf_org = audio[offset:end]
                         self.sola_buffer = sola_buf_org * self.np_prev_strength
                     else:
-                        self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
+                        self.sola_buffer = (
+                            audio[-crossfade_frame:] * self.np_prev_strength
+                        )
                         # self.sola_buffer = audio[- crossfade_frame:]
 
                     t.record("post")
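The SOLA branch above slides the fresh block against the tail kept in sola_buffer and picks the offset that maximizes the normalized cross-correlation (cor_nom / cor_den), then crossfades at that point so successive blocks stay phase-aligned. A compact numpy sketch of the offset search; it uses a plain loop where the upstream code uses a convolution trick, and all sizes and signals are illustrative:

import numpy as np

rng = np.random.default_rng(0)
sola_search_frame, crossfade_frame = 160, 1024
audio = rng.standard_normal(sola_search_frame + crossfade_frame + 4096).astype(np.float32)
sola_buffer = rng.standard_normal(crossfade_frame).astype(np.float32)

best, sola_offset = -np.inf, 0
for off in range(sola_search_frame):
    seg = audio[off : off + crossfade_frame]
    cor_nom = np.dot(seg, sola_buffer)                # correlation with the saved tail
    cor_den = np.sqrt(np.dot(seg, seg) + 1e-8)        # normalize by candidate energy
    if cor_nom / cor_den > best:
        best, sola_offset = cor_nom / cor_den, off

# sola_offset is where crossfading the two blocks lines up in phase.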
@@ -290,9 +334,11 @@ class VoiceChangerV2(VoiceChangerIF):
 
             # Post-processing
             with Timer2("post-process", False) as t:
-                result = result.astype(np.int16)
+                result = result.astype(np.float32)
 
-                print_convert_processing(f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz")
+                print_convert_processing(
+                    f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz"
+                )
 
                 if receivedData.shape[0] != result.shape[0]:
                     # print("TODO FIX:::::PADDING", receivedData.shape[0], result.shape[0])
@@ -311,7 +357,9 @@ class VoiceChangerV2(VoiceChangerIF):
 
                 postprocess_time = t.secs
 
-                print_convert_processing(f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
+                print_convert_processing(
+                    f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}"
+                )
                 perf = [0, mainprocess_time, postprocess_time]
 
                 return outputData, perf
@@ -320,7 +368,9 @@ class VoiceChangerV2(VoiceChangerIF):
             logger.warn(f"[Voice Changer] [Exception], {e}")
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except ONNXInputArgumentException as e:
-            logger.warn(f"[Voice Changer] [Exception] onnx are waiting valid input., {e}")
+            logger.warn(
+                f"[Voice Changer] [Exception] onnx are waiting valid input., {e}"
+            )
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except HalfPrecisionChangingException:
             logger.warn("[Voice Changer] Switching model configuration....")
@@ -332,7 +382,9 @@ class VoiceChangerV2(VoiceChangerIF):
             logger.warn(f"[Voice Changer] embedder: {e}")
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except VoiceChangerIsNotSelectedException:
-            logger.warn("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
+            logger.warn(
+                "[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc."
+            )
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except DeviceCannotSupportHalfPrecisionException:
             # RVC.py handles the fallback, so just return dummy data here.