Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-02-03 00:33:57 +03:00)

commit d03132d2ab (parent 3512bbb1eb)

    bugfix: beatrice load
@@ -4,7 +4,6 @@
 # cp -r ~/git-work/voice-changer-js/lib/package.json node_modules/@dannadori/voice-changer-js/
 # cp -r ~/git-work/voice-changer-js/lib/dist node_modules/@dannadori/voice-changer-js/
 
-
 cd ~/git-work/voice-changer-js/lib/ ; npm run build:prod; cd -
 rm -rf node_modules/@dannadori/voice-changer-js
 mkdir -p node_modules/@dannadori/voice-changer-js/dist
@@ -1,12 +1,11 @@
 import { ClientState } from "@dannadori/voice-changer-client-js";
+import { VoiceChangerJSClient } from "@dannadori/voice-changer-js";
 import React, { useContext, useEffect, useRef } from "react";
 import { ReactNode } from "react";
 import { useVCClient } from "../001_globalHooks/001_useVCClient";
 import { useAppRoot } from "./001_AppRootProvider";
 import { useMessageBuilder } from "../hooks/useMessageBuilder";
 
-import { VoiceChangerJSClient } from "./VoiceChangerJSClient";
-
 type Props = {
     children: ReactNode;
 };
@@ -58,22 +57,34 @@ export const AppStateProvider = ({ children }: Props) => {
 
     // useEffect(() => {
     //     if (clientState.clientState.initialized) {
+    //         const baseUrl = "https://192.168.0.247:18888";
+    //         // const modelUrl = `${baseUrl}/models/rvc2v_40k_f0_24000.bin`;
+    //         // const modelUrl = `${baseUrl}/models/rvc2v_40k_nof0_24000.bin`;
+    //         // const modelUrl = `${baseUrl}/models/rvc2v_16k_f0_24000.bin`;
+    //         // const modelUrl = `${baseUrl}/models/rvcv2_amitaro_v2_40k_f0_24000.bin`;
+    //         // const modelUrl = `${baseUrl}/models/rvcv2_amitaro_v2_40k_nof0_24000.bin`;
+    //         // const modelUrl = `${baseUrl}/models/rvcv2_amitaro_v2_32k_f0_24000.bin`;
+    //         // const modelUrl = `${baseUrl}/models/rvcv2_amitaro_v2_32k_nof0_24000.bin`;
+
+    //         // const modelUrl = `${baseUrl}/models/rvcv1_amitaro_v1_32k_f0_24000.bin`;
+    //         const modelUrl = `${baseUrl}/models/rvcv1_amitaro_v1_32k_nof0_24000.bin`;
+    //         // const modelUrl = `${baseUrl}/models/rvcv1_amitaro_v1_40k_f0_24000.bin`;
+    //         // const modelUrl = `${baseUrl}/models/rvcv1_amitaro_v1_40k_nof0_24000.bin`;
 
     //         voiceChangerJSClient.current = new VoiceChangerJSClient();
-    //         voiceChangerJSClient.current.initialize();
+    //         voiceChangerJSClient.current.initialize(
+    //             {
+    //                 baseUrl: baseUrl,
+    //                 inputSamplingRate: 48000,
+    //                 outputSamplingRate: 48000,
+    //             },
+    //             modelUrl,
+    //         );
    //         clientState.clientState.setInternalAudioProcessCallback({
     //             processAudio: async (data: Uint8Array) => {
-    //                 console.log("[CLIENTJS] start --------------------------------------");
     //                 const audioF32 = new Float32Array(data.buffer);
     //                 const converted = await voiceChangerJSClient.current!.convert(audioF32);
-    //                 let audio_int16_out = new Int16Array(converted.length);
-    //                 for (let i = 0; i < converted.length; i++) {
-    //                     audio_int16_out[i] = converted[i] * 32768.0;
-    //                 }
-    //                 const res = new Uint8Array(audio_int16_out.buffer);
-    //                 console.log("AUDIO::::audio_int16_out", audio_int16_out);
-
-    //                 console.log("[CLIENTJS] end --------------------------------------");
+    //                 const res = new Uint8Array(converted.buffer);
     //                 return res;
     //             },
     //         });
@@ -1,149 +0,0 @@
-import { create, ConverterType } from "@alexanderolsen/libsamplerate-js";
-import { BlockingQueue } from "./_BlockingQueue";
-import { WorkerManager, generateConfig, VoiceChangerProcessorInitializeParams, VoiceChangerProcessorConvertParams, FunctionType, VoiceChangerProcessorResult } from "@dannadori/voice-changer-js";
-
-export class VoiceChangerJSClient {
-    private wm = new WorkerManager();
-    private audioBuffer: Float32Array = new Float32Array(0);
-    private audioInputLength = 24000;
-
-    private inputSamplingRate = 48000;
-    private outputSamplingRate = 48000;
-    private modelInputSamplingRate = 16000;
-    private modelOutputSamplingRate = 40000;
-    private sem = new BlockingQueue<number>();
-    private crossfadeChunks = 1;
-    private solaChunks = 0.5;
-    constructor() {
-        this.sem.enqueue(0);
-    }
-    private lock = async () => {
-        const num = await this.sem.dequeue();
-        return num;
-    };
-    private unlock = (num: number) => {
-        this.sem.enqueue(num + 1);
-    };
-
-    initialize = async () => {
-        console.log("Voice Changer Initializing,,,");
-        const baseUrl = "http://127.0.0.1:18888";
-
-        this.wm = new WorkerManager();
-        const config = generateConfig();
-        config.processorURL = `${baseUrl}/process.js`;
-        config.onnxWasmPaths = `${baseUrl}/`;
-        await this.wm.init(config);
-
-        const initializeParams: VoiceChangerProcessorInitializeParams = {
-            type: FunctionType.initialize,
-            inputLength: 24000,
-            f0_min: 50,
-            f0_max: 1100,
-            embPitchUrl: "http://127.0.0.1:18888/models/emb_pit_24000.bin",
-            rvcv2InputLength: 148,
-            // rvcv2Url: "http://127.0.0.1:18888/models/rvc2v_24000.bin",
-            rvcv2Url: "http://127.0.0.1:18888/models/rvc2vnof0_24000.bin",
-            transfer: [],
-        };
-
-        const res = (await this.wm.execute(initializeParams)) as VoiceChangerProcessorResult;
-        console.log("Voice Changer Initialized..", res);
-    };
-
-    convert = async (audio: Float32Array): Promise<Float32Array> => {
-        console.log("convert start....", audio);
-        const lockNum = await this.lock();
-        //resample
-        const audio_16k = await this.resample(audio, this.inputSamplingRate, this.modelInputSamplingRate);
-        //store data and get target data
-        //// store
-        const newAudioBuffer = new Float32Array(this.audioBuffer.length + audio_16k.length);
-        newAudioBuffer.set(this.audioBuffer);
-        newAudioBuffer.set(audio_16k, this.audioBuffer.length);
-        this.audioBuffer = newAudioBuffer;
-
-        //// Buffering.....
-        if (this.audioBuffer.length < this.audioInputLength * 1) {
-            console.log(`skip covert length:${this.audioBuffer.length}, audio_16k:${audio_16k.length}`);
-            await this.unlock(lockNum);
-            return new Float32Array(1);
-        } else {
-            console.log(`--------------- convert start... length:${this.audioBuffer.length}, audio_16k:${audio_16k.length}`);
-        }
-
-        //// get chunks
-        let chunkIndex = 0;
-        const audioChunks: Float32Array[] = [];
-        while (true) {
-            const chunkOffset = chunkIndex * this.audioInputLength - (this.crossfadeChunks + this.solaChunks) * 320 * chunkIndex;
-            const chunkEnd = chunkOffset + this.audioInputLength;
-            if (chunkEnd > this.audioBuffer.length) {
-                this.audioBuffer = this.audioBuffer.slice(chunkOffset);
-                break;
-            } else {
-                const chunk = this.audioBuffer.slice(chunkOffset, chunkEnd);
-                audioChunks.push(chunk);
-            }
-            chunkIndex++;
-        }
-
-        if (audioChunks.length == 0) {
-            await this.unlock(lockNum);
-            console.log(`skip covert length:${this.audioBuffer.length}, audio_16k:${audio_16k.length}`);
-            return new Float32Array(1);
-        }
-
-        //convert (each)
-        const convetedAudioChunks: Float32Array[] = [];
-        for (let i = 0; i < audioChunks.length; i++) {
-            const convertParams: VoiceChangerProcessorConvertParams = {
-                type: FunctionType.convert,
-                transfer: [audioChunks[i].buffer],
-            };
-            const res = (await this.wm.execute(convertParams)) as VoiceChangerProcessorResult;
-            const converted = new Float32Array(res.transfer[0] as ArrayBuffer);
-            console.log(`converted.length:::${i}:${converted.length}`);
-
-            convetedAudioChunks.push(converted);
-        }
-
-        //concat
-        let totalLength = convetedAudioChunks.reduce((prev, cur) => prev + cur.length, 0);
-        let convetedAudio = new Float32Array(totalLength);
-        let offset = 0;
-        for (let chunk of convetedAudioChunks) {
-            convetedAudio.set(chunk, offset);
-            offset += chunk.length;
-        }
-        console.log(`converted.length:::convetedAudio:${convetedAudio.length}`);
-
-        //resample
-        // const response = await this.resample(convetedAudio, this.params.modelOutputSamplingRate, this.params.outputSamplingRate);
-
-        const outputDuration = (this.audioInputLength * audioChunks.length - this.crossfadeChunks * 320) / 16000;
-        const outputSamples = outputDuration * this.outputSamplingRate;
-        const convertedOutputRatio = outputSamples / convetedAudio.length;
-        const realOutputSamplingRate = this.modelOutputSamplingRate * convertedOutputRatio;
-        console.log(`realOutputSamplingRate:${realOutputSamplingRate}, `, this.modelOutputSamplingRate, convertedOutputRatio);
-
-        // const response2 = await this.resample(convetedAudio, this.params.modelOutputSamplingRate, realOutputSamplingRate);
-        const response2 = await this.resample(convetedAudio, this.modelOutputSamplingRate, this.outputSamplingRate);
-
-        console.log(`converted from :${audioChunks.length * this.audioInputLength} to:${convetedAudio.length} to:${response2.length}`);
-        console.log(`outputDuration :${outputDuration} outputSamples:${outputSamples}, convertedOutputRatio:${convertedOutputRatio}, realOutputSamplingRate:${realOutputSamplingRate}`);
-        await this.unlock(lockNum);
-        return response2;
-    };
-
-    // Utility
-    resample = async (data: Float32Array, srcSampleRate: number, dstSampleRate: number) => {
-        const converterType = ConverterType.SRC_SINC_BEST_QUALITY;
-        const nChannels = 1;
-        const converter = await create(nChannels, srcSampleRate, dstSampleRate, {
-            converterType: converterType, // default SRC_SINC_FASTEST. see API for more
-        });
-        const res = converter.simple(data);
-        return res;
-    };
-}
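Note on the deleted client: it sliced its 16 kHz buffer into fixed 24000-sample chunks that overlap by (crossfadeChunks + solaChunks) * 320 samples, so consecutive model outputs can be crossfaded (SOLA). Below is a minimal Python sketch of that slicing logic only; the function name and return shape are illustrative, while the field names and constants mirror the TypeScript class above.

import numpy as np

# Sketch of the overlapped chunking in the deleted VoiceChangerJSClient.convert():
# each chunk is audio_input_length samples, and successive chunks start
# (crossfade_chunks + sola_chunks) * 320 samples early to leave overlap for SOLA.
def take_chunks(buffer: np.ndarray,
                audio_input_length: int = 24000,
                crossfade_chunks: float = 1,
                sola_chunks: float = 0.5):
    overlap = int((crossfade_chunks + sola_chunks) * 320)  # 480 with the defaults
    chunks = []
    index = 0
    while True:
        offset = index * (audio_input_length - overlap)
        end = offset + audio_input_length
        if end > len(buffer):
            # keep the unconsumed tail for the next call
            return chunks, buffer[offset:]
        chunks.append(buffer[offset:end])
        index += 1

With the defaults, chunks start every 24000 - 480 = 23520 samples, so each chunk shares 480 samples with its predecessor for the crossfade and SOLA search.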
@@ -55,20 +55,44 @@ module.exports = {
             patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
         }),
 
+        new CopyPlugin({
+            patterns: [{ from: "./node_modules/@dannadori/voice-changer-js/dist/ort-wasm-simd.wasm", to: "ort-wasm-simd.wasm" }],
+        }),
         // new CopyPlugin({
-        //     patterns: [{ from: "./node_modules/@dannadori/voice-changer-js/dist/ort-wasm-simd.wasm", to: "ort-wasm-simd.wasm" }],
+        //     patterns: [{ from: "./node_modules/@dannadori/voice-changer-js/dist/tfjs-backend-wasm-simd.wasm", to: "tfjs-backend-wasm-simd.wasm" }],
         // }),
         // new CopyPlugin({
         //     patterns: [{ from: "./node_modules/@dannadori/voice-changer-js/dist/process.js", to: "process.js" }],
         // }),
         // new CopyPlugin({
-        //     patterns: [{ from: "public/models/emb_pit_24000.bin", to: "models/emb_pit_24000.bin" }],
+        //     patterns: [{ from: "public/models/rvcv2_emb_pit_24000.bin", to: "models/rvcv2_emb_pit_24000.bin" }],
         // }),
         // new CopyPlugin({
-        //     patterns: [{ from: "public/models/rvc2v_24000.bin", to: "models/rvc2v_24000.bin" }],
+        //     patterns: [{ from: "public/models/rvcv2_amitaro_v2_32k_f0_24000.bin", to: "models/rvcv2_amitaro_v2_32k_f0_24000.bin" }],
         // }),
         // new CopyPlugin({
-        //     patterns: [{ from: "public/models/rvc2vnof0_24000.bin", to: "models/rvc2vnof0_24000.bin" }],
+        //     patterns: [{ from: "public/models/rvcv2_amitaro_v2_32k_nof0_24000.bin", to: "models/rvcv2_amitaro_v2_32k_nof0_24000.bin" }],
+        // }),
+        // new CopyPlugin({
+        //     patterns: [{ from: "public/models/rvcv2_amitaro_v2_40k_f0_24000.bin", to: "models/rvcv2_amitaro_v2_40k_f0_24000.bin" }],
+        // }),
+        // new CopyPlugin({
+        //     patterns: [{ from: "public/models/rvcv2_amitaro_v2_40k_nof0_24000.bin", to: "models/rvcv2_amitaro_v2_40k_nof0_24000.bin" }],
+        // }),
+        // new CopyPlugin({
+        //     patterns: [{ from: "public/models/rvcv1_emb_pit_24000.bin", to: "models/rvcv1_emb_pit_24000.bin" }],
+        // }),
+        // new CopyPlugin({
+        //     patterns: [{ from: "public/models/rvcv1_amitaro_v1_32k_f0_24000.bin", to: "models/rvcv1_amitaro_v1_32k_f0_24000.bin" }],
+        // }),
+        // new CopyPlugin({
+        //     patterns: [{ from: "public/models/rvcv1_amitaro_v1_32k_nof0_24000.bin", to: "models/rvcv1_amitaro_v1_32k_nof0_24000.bin" }],
+        // }),
+        // new CopyPlugin({
+        //     patterns: [{ from: "public/models/rvcv1_amitaro_v1_40k_f0_24000.bin", to: "models/rvcv1_amitaro_v1_40k_f0_24000.bin" }],
+        // }),
+        // new CopyPlugin({
+        //     patterns: [{ from: "public/models/rvcv1_amitaro_v1_40k_nof0_24000.bin", to: "models/rvcv1_amitaro_v1_40k_nof0_24000.bin" }],
         // }),
     ],
 };
@@ -20,7 +20,7 @@ else:
 
 from .models.diffusion.infer_gt_mel import DiffGtMel
 
-from voice_changer.utils.VoiceChangerModel import AudioInOut
+from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
 from voice_changer.DDSP_SVC.DDSP_SVCSetting import DDSP_SVCSettings
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
@@ -44,15 +44,20 @@ def phase_vocoder(a, b, fade_out, fade_in):
     deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5)
     w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase
     t = torch.arange(n).unsqueeze(-1).to(a) / n
-    result = a * (fade_out**2) + b * (fade_in**2) + torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n
+    result = (
+        a * (fade_out**2)
+        + b * (fade_in**2)
+        + torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n
+    )
     return result
 
 
-class DDSP_SVC:
+class DDSP_SVC(VoiceChangerModel):
     initialLoad: bool = True
 
     def __init__(self, params: VoiceChangerParams, slotInfo: DDSPSVCModelSlot):
         print("[Voice Changer] [DDSP-SVC] Creating instance ")
+        self.voiceChangerType = "DDSP-SVC"
         self.deviceManager = DeviceManager.get_instance()
         self.gpu_num = torch.cuda.device_count()
         self.params = params
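The phase_vocoder hunk above only reformats the tail of the function, so for context here is a hedged sketch of the whole routine. The FFT prologue (everything producing absab, phia, and deltaphase) is an assumption reconstructed from typical DDSP-SVC ports and is not shown in this diff; the tail matches the diff. Squared fade curves keep the crossfade roughly energy-preserving, and the cosine term re-synthesizes the spectral content the two segments share, with each bin's phase advanced linearly across the window.

import numpy as np
import torch

# Hedged sketch: only the lines from "deltaphase = ..." onward are confirmed
# by the diff; the rFFT prologue below is assumed.
def phase_vocoder(a, b, fade_out, fade_in):
    n = a.shape[0]
    fa = torch.fft.rfft(a)                       # assumed prologue
    fb = torch.fft.rfft(b)
    absab = (torch.abs(fa) + torch.abs(fb)) / 2  # shared magnitude (assumed form)
    phia = torch.angle(fa)
    deltaphase = torch.angle(fb) - phia
    # wrap the per-bin phase difference into (-pi, pi]
    deltaphase = deltaphase - 2 * np.pi * torch.floor(deltaphase / 2 / np.pi + 0.5)
    w = 2 * np.pi * torch.arange(n // 2 + 1).to(a) + deltaphase
    t = torch.arange(n).unsqueeze(-1).to(a) / n
    result = (
        a * (fade_out**2)
        + b * (fade_in**2)
        + torch.sum(absab * torch.cos(w * t + phia), -1) * fade_out * fade_in / n
    )
    return result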
@@ -71,8 +76,18 @@ class DDSP_SVC:
     def initialize(self):
         self.device = self.deviceManager.getDevice(self.settings.gpu)
         vcparams = VoiceChangerParamsManager.get_instance().params
-        modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), "model", self.slotInfo.modelFile)
-        diffPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), "diff", self.slotInfo.diffModelFile)
+        modelPath = os.path.join(
+            vcparams.model_dir,
+            str(self.slotInfo.slotIndex),
+            "model",
+            self.slotInfo.modelFile,
+        )
+        diffPath = os.path.join(
+            vcparams.model_dir,
+            str(self.slotInfo.slotIndex),
+            "diff",
+            self.slotInfo.diffModelFile,
+        )
 
         self.svc_model = SvcDDSP()
         self.svc_model.setVCParams(self.params)
@@ -112,11 +127,15 @@ class DDSP_SVC:
         # newData = newData.astype(np.float32)
 
         if self.audio_buffer is not None:
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
+            self.audio_buffer = np.concatenate(
+                [self.audio_buffer, newData], 0
+            )  # concatenate with past data
         else:
             self.audio_buffer = newData
 
-        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
+        convertSize = (
+            inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
+        )
 
         # if convertSize % self.hop_size != 0:  # pad because truncation occurs at the model's output hop size
         #     convertSize = convertSize + (self.hop_size - (convertSize % self.hop_size))
|
|||||||
f0_min=50,
|
f0_min=50,
|
||||||
f0_max=1100,
|
f0_max=1100,
|
||||||
# safe_prefix_pad_length=0, # TBD なにこれ?
|
# safe_prefix_pad_length=0, # TBD なにこれ?
|
||||||
safe_prefix_pad_length=self.settings.extraConvertSize / self.svc_model.args.data.sampling_rate,
|
safe_prefix_pad_length=self.settings.extraConvertSize
|
||||||
|
/ self.svc_model.args.data.sampling_rate,
|
||||||
diff_model=self.diff_model,
|
diff_model=self.diff_model,
|
||||||
diff_acc=self.settings.diffAcc, # TBD なにこれ?
|
diff_acc=self.settings.diffAcc, # TBD なにこれ?
|
||||||
diff_spk_id=self.settings.diffSpkId,
|
diff_spk_id=self.settings.diffSpkId,
|
||||||
@ -155,7 +175,9 @@ class DDSP_SVC:
|
|||||||
# diff_use_dpm=True if self.settings.useDiffDpm == 1 else False, # TBD なにこれ?
|
# diff_use_dpm=True if self.settings.useDiffDpm == 1 else False, # TBD なにこれ?
|
||||||
method=self.settings.diffMethod,
|
method=self.settings.diffMethod,
|
||||||
k_step=self.settings.kStep, # TBD なにこれ?
|
k_step=self.settings.kStep, # TBD なにこれ?
|
||||||
diff_silence=True if self.settings.useDiffSilence == 1 else False, # TBD なにこれ?
|
diff_silence=True
|
||||||
|
if self.settings.useDiffSilence == 1
|
||||||
|
else False, # TBD なにこれ?
|
||||||
)
|
)
|
||||||
|
|
||||||
return _audio.cpu().numpy() * 32768.0
|
return _audio.cpu().numpy() * 32768.0
|
||||||
@ -182,5 +204,4 @@ class DDSP_SVC:
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def get_model_current(self):
|
def get_model_current(self):
|
||||||
return [
|
return []
|
||||||
]
|
|
||||||
|
@@ -6,16 +6,28 @@ from voice_changer.DiffusionSVC.DiffusionSVCSettings import DiffusionSVCSettings
 from voice_changer.DiffusionSVC.inferencer.InferencerManager import InferencerManager
 from voice_changer.DiffusionSVC.pipeline.Pipeline import Pipeline
 from voice_changer.DiffusionSVC.pipeline.PipelineGenerator import createPipeline
-from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
+from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractorManager import (
+    PitchExtractorManager,
+)
 from voice_changer.ModelSlotManager import ModelSlotManager
 
-from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
+from voice_changer.utils.VoiceChangerModel import (
+    AudioInOut,
+    PitchfInOut,
+    FeatureInOut,
+    VoiceChangerModel,
+)
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
 from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
 
 # from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 
-from Exceptions import DeviceCannotSupportHalfPrecisionException, PipelineCreateException, PipelineNotInitializedException
+from Exceptions import (
+    DeviceCannotSupportHalfPrecisionException,
+    PipelineCreateException,
+    PipelineNotInitializedException,
+)
 
 logger = VoiceChangaerLogger.get_instance().getLogger()
+
@@ -23,6 +35,7 @@ logger = VoiceChangaerLogger.get_instance().getLogger()
 class DiffusionSVC(VoiceChangerModel):
     def __init__(self, params: VoiceChangerParams, slotInfo: DiffusionSVCModelSlot):
         logger.info("[Voice Changer] [DiffusionSVC] Creating instance ")
+        self.voiceChangerType = "Diffusion-SVC"
         self.deviceManager = DeviceManager.get_instance()
         EmbedderManager.initialize(params)
         PitchExtractorManager.initialize(params)
@@ -46,9 +59,17 @@ class DiffusionSVC(VoiceChangerModel):
 
         # create the pipeline
         try:
-            self.pipeline = createPipeline(self.slotInfo, self.settings.gpu, self.settings.f0Detector, self.inputSampleRate, self.outputSampleRate)
+            self.pipeline = createPipeline(
+                self.slotInfo,
+                self.settings.gpu,
+                self.settings.f0Detector,
+                self.inputSampleRate,
+                self.outputSampleRate,
+            )
         except PipelineCreateException as e:  # NOQA
-            logger.error("[Voice Changer] pipeline create failed. check your model is valid.")
+            logger.error(
+                "[Voice Changer] pipeline create failed. check your model is valid."
+            )
             return
 
         # other settings
@@ -76,7 +97,9 @@ class DiffusionSVC(VoiceChangerModel):
         elif key in self.settings.strData:
             setattr(self.settings, key, str(val))
             if key == "f0Detector" and self.pipeline is not None:
-                pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
+                pitchExtractor = PitchExtractorManager.getPitchExtractor(
+                    self.settings.f0Detector, self.settings.gpu
+                )
                 self.pipeline.setPitchExtractor(pitchExtractor)
         else:
             return False
@@ -100,30 +123,65 @@ class DiffusionSVC(VoiceChangerModel):
         crossfadeSize: int,
         solaSearchFrame: int = 0,
     ):
-        newData = newData.astype(np.float32) / 32768.0  # arrives at the DiffusionSVC model's sampling rate (extraDataLength, crossfade, etc. are processed at the same SR) (★1)
-        new_feature_length = int(((newData.shape[0] / self.inputSampleRate) * self.slotInfo.samplingRate) / 512)  # the 100 comes from hubert's hop size (16000 / 160).
+        newData = (
+            newData.astype(np.float32) / 32768.0
+        )  # arrives at the DiffusionSVC model's sampling rate (extraDataLength, crossfade, etc. are processed at the same SR) (★1)
+        new_feature_length = int(
+            ((newData.shape[0] / self.inputSampleRate) * self.slotInfo.samplingRate)
+            / 512
+        )  # the 100 comes from hubert's hop size (16000 / 160).
         # ^ newData.shape[0] // sampleRate gives the data length in seconds; multiplied by 16000 it is the length in hubert terms; divided by the hop count (160) it is the feats size.
         if self.audio_buffer is not None:
             # concatenate with past data
             self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
-            self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(new_feature_length)], 0)
-            self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([new_feature_length, self.slotInfo.embChannels])], 0)
+            self.pitchf_buffer = np.concatenate(
+                [self.pitchf_buffer, np.zeros(new_feature_length)], 0
+            )
+            self.feature_buffer = np.concatenate(
+                [
+                    self.feature_buffer,
+                    np.zeros([new_feature_length, self.slotInfo.embChannels]),
+                ],
+                0,
+            )
         else:
             self.audio_buffer = newData
             self.pitchf_buffer = np.zeros(new_feature_length)
-            self.feature_buffer = np.zeros([new_feature_length, self.slotInfo.embChannels])
+            self.feature_buffer = np.zeros(
+                [new_feature_length, self.slotInfo.embChannels]
+            )
 
-        convertSize = newData.shape[0] + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
+        convertSize = (
+            newData.shape[0]
+            + crossfadeSize
+            + solaSearchFrame
+            + self.settings.extraConvertSize
+        )
 
         if convertSize % 128 != 0:  # pad because truncation occurs at the model's output hop size
             convertSize = convertSize + (128 - (convertSize % 128))
 
         # pad with zeros when the buffer has not accumulated enough
-        generateFeatureLength = int(((convertSize / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) + 1
+        generateFeatureLength = (
+            int(
+                ((convertSize / self.inputSampleRate) * self.slotInfo.samplingRate)
+                / 512
+            )
+            + 1
+        )
         if self.audio_buffer.shape[0] < convertSize:
-            self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
-            self.pitchf_buffer = np.concatenate([np.zeros(generateFeatureLength), self.pitchf_buffer])
-            self.feature_buffer = np.concatenate([np.zeros([generateFeatureLength, self.slotInfo.embChannels]), self.feature_buffer])
+            self.audio_buffer = np.concatenate(
+                [np.zeros([convertSize]), self.audio_buffer]
+            )
+            self.pitchf_buffer = np.concatenate(
+                [np.zeros(generateFeatureLength), self.pitchf_buffer]
+            )
+            self.feature_buffer = np.concatenate(
+                [
+                    np.zeros([generateFeatureLength, self.slotInfo.embChannels]),
+                    self.feature_buffer,
+                ]
+            )
 
         convertOffset = -1 * convertSize
         featureOffset = -1 * generateFeatureLength
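The feature lengths above rescale an audio sample count from the incoming rate to the model's rate and divide by the 512-sample hop. A worked example with hypothetical rates:

# Worked example (hypothetical rates): feature-frame count for an audio block.
inputSampleRate = 48000
model_sampling_rate = 44100          # stands in for slotInfo.samplingRate
n_samples = 48000                    # one second of incoming audio
new_feature_length = int((n_samples / inputSampleRate) * model_sampling_rate / 512)
assert new_feature_length == 86      # 44100 / 512 = 86.13... -> 86 frames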
@@ -139,9 +197,17 @@ class DiffusionSVC(VoiceChangerModel):
         vol = float(max(vol, self.prevVol * 0.0))
         self.prevVol = vol
 
-        return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol)
+        return (
+            self.audio_buffer,
+            self.pitchf_buffer,
+            self.feature_buffer,
+            convertSize,
+            vol,
+        )
 
-    def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int):
+    def inference(
+        self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int
+    ):
         if self.pipeline is None:
             logger.info("[Voice Changer] Pipeline is not initialized.")
             raise PipelineNotInitializedException()
@@ -169,7 +235,11 @@ class DiffusionSVC(VoiceChangerModel):
         speedUp = self.settings.speedUp
         embOutputLayer = 12
         useFinalProj = False
-        silenceFrontSec = self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.  # seconds of extraConvertSize (already resampled to the model's sampling rate); processed at the model's sampling rate (★1).
+        silenceFrontSec = (
+            self.settings.extraConvertSize / self.inputSampleRate
+            if self.settings.silenceFront
+            else 0.0
+        )  # seconds of extraConvertSize (already resampled to the model's sampling rate); processed at the model's sampling rate (★1).
 
         try:
             audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
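silenceFrontSec converts the extra left-context sample count into seconds so the pipeline can skip synthesizing that prefix. A worked example with hypothetical values:

# Worked example (hypothetical values): extraConvertSize is a sample count at
# the incoming rate, so dividing by that rate yields seconds of skippable prefix.
extraConvertSize = 8192
inputSampleRate = 44100
silenceFront = True
silenceFrontSec = extraConvertSize / inputSampleRate if silenceFront else 0.0
print(round(silenceFrontSec, 4))  # 0.1858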
@@ -190,7 +260,9 @@ class DiffusionSVC(VoiceChangerModel):
             result = audio_out.detach().cpu().numpy()
             return result
         except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
-            logger.warn("[Device Manager] Device cannot support half precision. Fallback to float....")
+            logger.warn(
+                "[Device Manager] Device cannot support half precision. Fallback to float...."
+            )
             self.deviceManager.setForceTensor(True)
             self.initialize()
             # raise e
@@ -3,7 +3,7 @@ import os
 from data.ModelSlot import MMVCv13ModelSlot
 from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager
 
-from voice_changer.utils.VoiceChangerModel import AudioInOut
+from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
 
 if sys.platform.startswith("darwin"):
     baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
@@ -48,9 +48,10 @@ class MMVCv13Settings:
     strData: list[str] = field(default_factory=lambda: [])
 
 
-class MMVCv13:
+class MMVCv13(VoiceChangerModel):
     def __init__(self, slotInfo: MMVCv13ModelSlot):
         print("[Voice Changer] [MMVCv13] Creating instance ")
+        self.voiceChangerType = "MMVCv13"
         self.settings = MMVCv13Settings()
         self.net_g = None
         self.onnx_session = None
@@ -65,8 +66,12 @@ class MMVCv13:
     def initialize(self):
         print("[Voice Changer] [MMVCv13] Initializing... ")
         vcparams = VoiceChangerParamsManager.get_instance().params
-        configPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile)
-        modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile)
+        configPath = os.path.join(
+            vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile
+        )
+        modelPath = os.path.join(
+            vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile
+        )
 
         self.hps = get_hparams_from_file(configPath)
         if self.slotInfo.isONNX:
@@ -77,7 +82,13 @@ class MMVCv13:
                 provider_options=options,
             )
         else:
-            self.net_g = SynthesizerTrn(len(symbols), self.hps.data.filter_length // 2 + 1, self.hps.train.segment_size // self.hps.data.hop_length, n_speakers=self.hps.data.n_speakers, **self.hps.model)
+            self.net_g = SynthesizerTrn(
+                len(symbols),
+                self.hps.data.filter_length // 2 + 1,
+                self.hps.train.segment_size // self.hps.data.hop_length,
+                n_speakers=self.hps.data.n_speakers,
+                **self.hps.model
+            )
             self.net_g.eval()
             load_checkpoint(modelPath, self.net_g, None)
 
@@ -89,7 +100,11 @@ class MMVCv13:
     def getOnnxExecutionProvider(self):
         availableProviders = onnxruntime.get_available_providers()
         devNum = torch.cuda.device_count()
-        if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0:
+        if (
+            self.settings.gpu >= 0
+            and "CUDAExecutionProvider" in availableProviders
+            and devNum > 0
+        ):
             return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}]
         elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders:
             return ["DmlExecutionProvider"], [{}]
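getOnnxExecutionProvider prefers CUDA when a GPU index was requested, the CUDA provider is built into onnxruntime, and torch can see a device; otherwise it tries DirectML. A standalone sketch of the same policy (the CPU fallback branch lies outside this hunk and is assumed):

import onnxruntime
import torch

# Sketch of the provider-selection policy above; the final CPU fallback is an
# assumption, since that branch is not shown in the diff.
def pick_providers(gpu: int):
    available = onnxruntime.get_available_providers()
    if gpu >= 0 and "CUDAExecutionProvider" in available and torch.cuda.device_count() > 0:
        return ["CUDAExecutionProvider"], [{"device_id": gpu}]
    if gpu >= 0 and "DmlExecutionProvider" in available:
        return ["DmlExecutionProvider"], [{}]
    return ["CPUExecutionProvider"], [{}]

# The pair feeds straight into the session constructor, as in the diff:
# onnxruntime.InferenceSession(model_path, providers=providers, provider_options=options)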
@@ -110,7 +125,11 @@ class MMVCv13:
         if key == "gpu" and self.slotInfo.isONNX:
             providers, options = self.getOnnxExecutionProvider()
             vcparams = VoiceChangerParamsManager.get_instance().params
-            modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile)
+            modelPath = os.path.join(
+                vcparams.model_dir,
+                str(self.slotInfo.slotIndex),
+                self.slotInfo.modelFile,
+            )
             self.onnx_session = onnxruntime.InferenceSession(
                 modelPath,
                 providers=providers,
@@ -136,7 +155,9 @@ class MMVCv13:
     def get_info(self):
         data = asdict(self.settings)
 
-        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
+        data["onnxExecutionProviders"] = (
+            self.onnx_session.get_providers() if self.onnx_session is not None else []
+        )
         return data
 
     def get_processing_sampling_rate(self):
@@ -166,7 +187,9 @@ class MMVCv13:
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value
 
         if self.audio_buffer is not None:
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
+            self.audio_buffer = np.concatenate(
+                [self.audio_buffer, newData], 0
+            )  # concatenate with past data
         else:
             self.audio_buffer = newData
 
@@ -175,7 +198,9 @@ class MMVCv13:
         # if convertSize < 8192:
         #     convertSize = 8192
         if convertSize % self.hps.data.hop_length != 0:  # pad because truncation occurs at the model's output hop size
-            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
+            convertSize = convertSize + (
+                self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)
+            )
 
         convertOffset = -1 * convertSize
         self.audio_buffer = self.audio_buffer[convertOffset:]  # extract only the portion to be converted
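The rounding above pads convertSize up to the next multiple of the model's hop length so the vocoder does not truncate the final frame. A worked example with a hypothetical hop:

# Worked example (hypothetical hop): round convertSize up to a hop multiple.
hop_length = 256
convertSize = 5000
if convertSize % hop_length != 0:
    convertSize = convertSize + (hop_length - (convertSize % hop_length))
assert convertSize == 5120 and convertSize % hop_length == 0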
@@ -207,9 +232,7 @@ class MMVCv13:
                     "sid_src": sid_src.numpy(),
                     "sid_tgt": sid_tgt1.numpy(),
                 },
-            )[
-                0
-            ][0, 0]
+            )[0][0, 0]
             * self.hps.data.max_wav_value
         )
         return audio1
@@ -225,10 +248,19 @@ class MMVCv13:
             dev = torch.device("cuda", index=self.settings.gpu)
 
         with torch.no_grad():
-            x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.to(dev) for x in data]
+            x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [
+                x.to(dev) for x in data
+            ]
             sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
 
-            audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_target)[0, 0].data * self.hps.data.max_wav_value
+            audio1 = (
+                self.net_g.to(dev)
+                .voice_conversion(
+                    spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_target
+                )[0, 0]
+                .data
+                * self.hps.data.max_wav_value
+            )
             result = audio1.float().cpu().numpy()
 
             return result
@@ -265,5 +297,5 @@ class MMVCv13:
             {
                 "key": "dstId",
                 "val": self.settings.dstId,
-            }
+            },
         ]
@@ -2,7 +2,7 @@ import sys
 import os
 from data.ModelSlot import MMVCv15ModelSlot
 from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager
-from voice_changer.utils.VoiceChangerModel import AudioInOut
+from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
 
 if sys.platform.startswith("darwin"):
     baseDir = [x for x in sys.path if x.endswith("Contents/MacOS")]
@@ -56,9 +56,10 @@ class MMVCv15Settings:
     strData = ["f0Detector"]
 
 
-class MMVCv15:
+class MMVCv15(VoiceChangerModel):
     def __init__(self, slotInfo: MMVCv15ModelSlot):
         print("[Voice Changer] [MMVCv15] Creating instance ")
+        self.voiceChangerType = "MMVCv15"
         self.settings = MMVCv15Settings()
         self.net_g = None
         self.onnx_session: onnxruntime.InferenceSession | None = None
@@ -72,8 +73,12 @@ class MMVCv15:
     def initialize(self):
         print("[Voice Changer] [MMVCv15] Initializing... ")
         vcparams = VoiceChangerParamsManager.get_instance().params
-        configPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile)
-        modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile)
+        configPath = os.path.join(
+            vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile
+        )
+        modelPath = os.path.join(
+            vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile
+        )
 
         self.hps = get_hparams_from_file(configPath)
 
@@ -110,7 +115,11 @@ class MMVCv15:
                 # print("ONNX INPUT SHAPE", i.name, i.shape)
                 if i.name == "sin":
                     self.onxx_input_length = i.shape[2]
-            self.settings.maxInputLength = self.onxx_input_length - (0.012 * self.hps.data.sampling_rate) - 1024  # for ONNX the input length is fixed (the 1024 for crossfade is provisional)  # NOQA
+            self.settings.maxInputLength = (
+                self.onxx_input_length
+                - (0.012 * self.hps.data.sampling_rate)
+                - 1024
+            )  # for ONNX the input length is fixed (the 1024 for crossfade is provisional)  # NOQA
         else:
             self.net_g.eval()
             load_checkpoint(modelPath, self.net_g, None)
@@ -125,7 +134,11 @@ class MMVCv15:
     def getOnnxExecutionProvider(self):
         availableProviders = onnxruntime.get_available_providers()
         devNum = torch.cuda.device_count()
-        if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0:
+        if (
+            self.settings.gpu >= 0
+            and "CUDAExecutionProvider" in availableProviders
+            and devNum > 0
+        ):
             return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}]
         elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders:
             return ["DmlExecutionProvider"], [{}]
@@ -145,7 +158,11 @@ class MMVCv15:
         if key == "gpu" and self.slotInfo.isONNX:
             providers, options = self.getOnnxExecutionProvider()
             vcparams = VoiceChangerParamsManager.get_instance().params
-            modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile)
+            modelPath = os.path.join(
+                vcparams.model_dir,
+                str(self.slotInfo.slotIndex),
+                self.slotInfo.modelFile,
+            )
             self.onnx_session = onnxruntime.InferenceSession(
                 modelPath,
                 providers=providers,
@@ -155,7 +172,11 @@ class MMVCv15:
             for i in inputs_info:
                 if i.name == "sin":
                     self.onxx_input_length = i.shape[2]
-            self.settings.maxInputLength = self.onxx_input_length - (0.012 * self.hps.data.sampling_rate) - 1024  # for ONNX the input length is fixed (the 1024 for crossfade is provisional)  # NOQA
+            self.settings.maxInputLength = (
+                self.onxx_input_length
+                - (0.012 * self.hps.data.sampling_rate)
+                - 1024
+            )  # for ONNX the input length is fixed (the 1024 for crossfade is provisional)  # NOQA
         elif key in self.settings.floatData:
             setattr(self.settings, key, float(val))
         elif key in self.settings.strData:
@@ -168,7 +189,9 @@ class MMVCv15:
     def get_info(self):
         data = asdict(self.settings)
 
-        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
+        data["onnxExecutionProviders"] = (
+            self.onnx_session.get_providers() if self.onnx_session is not None else []
+        )
         return data
 
     def get_processing_sampling_rate(self):
@@ -179,7 +202,9 @@ class MMVCv15:
     def _get_f0(self, detector: str, newData: AudioInOut):
         audio_norm_np = newData.astype(np.float64)
         if detector == "dio":
-            _f0, _time = pw.dio(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5)
+            _f0, _time = pw.dio(
+                audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5
+            )
             f0 = pw.stonemask(audio_norm_np, _f0, _time, self.hps.data.sampling_rate)
         else:
             f0, t = pw.harvest(
@@ -189,7 +214,9 @@ class MMVCv15:
                 f0_floor=71.0,
                 f0_ceil=1000.0,
             )
-        f0 = convert_continuos_f0(f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length))
+        f0 = convert_continuos_f0(
+            f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length)
+        )
         f0 = torch.from_numpy(f0.astype(np.float32))
         return f0
 
@@ -216,12 +243,16 @@ class MMVCv15:
     ):
         # update maxInputLength (inefficient to do here, but fine for now)
        if self.slotInfo.isONNX:
-            self.settings.maxInputLength = self.onxx_input_length - crossfadeSize - solaSearchFrame  # for ONNX the input length is fixed (the 1024 for crossfade is provisional)  # NOQA value returned by get_info; not used inside this function.
+            self.settings.maxInputLength = (
+                self.onxx_input_length - crossfadeSize - solaSearchFrame
+            )  # for ONNX the input length is fixed (the 1024 for crossfade is provisional)  # NOQA value returned by get_info; not used inside this function.
 
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value
 
         if self.audio_buffer is not None:
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
+            self.audio_buffer = np.concatenate(
+                [self.audio_buffer, newData], 0
+            )  # concatenate with past data
         else:
             self.audio_buffer = newData
 
@@ -230,7 +261,9 @@ class MMVCv15:
         # if convertSize < 8192:
         #     convertSize = 8192
         if convertSize % self.hps.data.hop_length != 0:  # pad because truncation occurs at the model's output hop size
-            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
+            convertSize = convertSize + (
+                self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)
+            )
 
         # ONNX input length is fixed
         if self.slotInfo.isONNX:
@@ -266,9 +299,7 @@ class MMVCv15:
                     "sid_src": sid_src.numpy(),
                     "sid_tgt": sid_tgt1.numpy(),
                 },
-            )[
-                0
-            ][0, 0]
+            )[0][0, 0]
             * self.hps.data.max_wav_value
         )
         return audio1
@@ -287,7 +318,12 @@ class MMVCv15:
             sid_src = sid_src.to(dev)
             sid_target = torch.LongTensor([self.settings.dstId]).to(dev)
 
-            audio1 = self.net_g.to(dev).voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0, 0].data * self.hps.data.max_wav_value
+            audio1 = (
+                self.net_g.to(dev)
+                .voice_conversion(spec, spec_lengths, f0, sid_src, sid_target)[0, 0]
+                .data
+                * self.hps.data.max_wav_value
+            )
             result = audio1.float().cpu().numpy()
             return result
 
@@ -332,5 +368,5 @@ class MMVCv15:
             {
                 "key": "f0Factor",
                 "val": self.settings.f0Factor,
-            }
+            },
         ]
@ -1,6 +1,6 @@
|
|||||||
'''
|
"""
|
||||||
VoiceChangerV2向け
|
VoiceChangerV2向け
|
||||||
'''
|
"""
|
||||||
from dataclasses import asdict
|
from dataclasses import asdict
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
@ -9,7 +9,12 @@ from mods.log_control import VoiceChangaerLogger
|
|||||||
|
|
||||||
from voice_changer.RVC.RVCSettings import RVCSettings
|
from voice_changer.RVC.RVCSettings import RVCSettings
|
||||||
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
|
from voice_changer.RVC.embedder.EmbedderManager import EmbedderManager
|
||||||
from voice_changer.utils.VoiceChangerModel import AudioInOut, PitchfInOut, FeatureInOut, VoiceChangerModel
|
from voice_changer.utils.VoiceChangerModel import (
|
||||||
|
AudioInOut,
|
||||||
|
PitchfInOut,
|
||||||
|
FeatureInOut,
|
||||||
|
VoiceChangerModel,
|
||||||
|
)
|
||||||
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
||||||
from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
|
from voice_changer.RVC.onnxExporter.export2onnx import export2onnx
|
||||||
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
|
from voice_changer.RVC.pitchExtractor.PitchExtractorManager import PitchExtractorManager
|
||||||
@ -17,7 +22,11 @@ from voice_changer.RVC.pipeline.PipelineGenerator import createPipeline
|
|||||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||||
from voice_changer.RVC.pipeline.Pipeline import Pipeline
|
from voice_changer.RVC.pipeline.Pipeline import Pipeline
|
||||||
|
|
||||||
from Exceptions import DeviceCannotSupportHalfPrecisionException, PipelineCreateException, PipelineNotInitializedException
|
from Exceptions import (
|
||||||
|
DeviceCannotSupportHalfPrecisionException,
|
||||||
|
PipelineCreateException,
|
||||||
|
PipelineNotInitializedException,
|
||||||
|
)
|
||||||
import resampy
|
import resampy
|
||||||
from typing import cast
|
from typing import cast
|
||||||
|
|
||||||
@ -27,6 +36,8 @@ logger = VoiceChangaerLogger.get_instance().getLogger()
|
|||||||
class RVCr2(VoiceChangerModel):
|
class RVCr2(VoiceChangerModel):
|
||||||
def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot):
|
def __init__(self, params: VoiceChangerParams, slotInfo: RVCModelSlot):
|
||||||
logger.info("[Voice Changer] [RVCr2] Creating instance ")
|
logger.info("[Voice Changer] [RVCr2] Creating instance ")
|
||||||
|
self.voiceChangerType = "RVC"
|
||||||
|
|
||||||
self.deviceManager = DeviceManager.get_instance()
|
self.deviceManager = DeviceManager.get_instance()
|
||||||
EmbedderManager.initialize(params)
|
EmbedderManager.initialize(params)
|
||||||
PitchExtractorManager.initialize(params)
|
PitchExtractorManager.initialize(params)
|
||||||
@ -48,9 +59,13 @@ class RVCr2(VoiceChangerModel):
|
|||||||
|
|
||||||
# pipelineの生成
|
# pipelineの生成
|
||||||
try:
|
try:
|
||||||
self.pipeline = createPipeline(self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector)
|
self.pipeline = createPipeline(
|
||||||
|
self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector
|
||||||
|
)
|
||||||
except PipelineCreateException as e: # NOQA
|
except PipelineCreateException as e: # NOQA
|
||||||
logger.error("[Voice Changer] pipeline create failed. check your model is valid.")
|
logger.error(
|
||||||
|
"[Voice Changer] pipeline create failed. check your model is valid."
|
||||||
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
# その他の設定
|
# その他の設定
|
||||||
@ -76,7 +91,9 @@ class RVCr2(VoiceChangerModel):
|
|||||||
elif key in self.settings.strData:
|
elif key in self.settings.strData:
|
||||||
setattr(self.settings, key, str(val))
|
setattr(self.settings, key, str(val))
|
||||||
if key == "f0Detector" and self.pipeline is not None:
|
if key == "f0Detector" and self.pipeline is not None:
|
||||||
pitchExtractor = PitchExtractorManager.getPitchExtractor(self.settings.f0Detector, self.settings.gpu)
|
pitchExtractor = PitchExtractorManager.getPitchExtractor(
|
||||||
|
self.settings.f0Detector, self.settings.gpu
|
||||||
|
)
|
||||||
self.pipeline.setPitchExtractor(pitchExtractor)
|
self.pipeline.setPitchExtractor(pitchExtractor)
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
@@ -99,7 +116,7 @@ class RVCr2(VoiceChangerModel):
         newData: AudioInOut,
         crossfadeSize: int,
         solaSearchFrame: int,
-        extra_frame: int
+        extra_frame: int,
     ):
         # Input arrives at 16k.
         inputSize = newData.shape[0]
@@ -110,26 +127,47 @@ class RVCr2(VoiceChangerModel):
             # Concatenate with past data
             self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
             if self.slotInfo.f0:
-                self.pitchf_buffer = np.concatenate([self.pitchf_buffer, np.zeros(newFeatureLength)], 0)
-                self.feature_buffer = np.concatenate([self.feature_buffer, np.zeros([newFeatureLength, self.slotInfo.embChannels])], 0)
+                self.pitchf_buffer = np.concatenate(
+                    [self.pitchf_buffer, np.zeros(newFeatureLength)], 0
+                )
+                self.feature_buffer = np.concatenate(
+                    [
+                        self.feature_buffer,
+                        np.zeros([newFeatureLength, self.slotInfo.embChannels]),
+                    ],
+                    0,
+                )
         else:
             self.audio_buffer = newData
             if self.slotInfo.f0:
                 self.pitchf_buffer = np.zeros(newFeatureLength)
-                self.feature_buffer = np.zeros([newFeatureLength, self.slotInfo.embChannels])
+                self.feature_buffer = np.zeros(
+                    [newFeatureLength, self.slotInfo.embChannels]
+                )
 
         convertSize = inputSize + crossfadeSize + solaSearchFrame + extra_frame
 
         if convertSize % 160 != 0:  # The model output's hop size causes truncation, so compensate.
             convertSize = convertSize + (160 - (convertSize % 160))
-        outSize = int(((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate)
+        outSize = int(
+            ((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate
+        )
 
         # Pad with zeros when the buffer has not accumulated enough
         if self.audio_buffer.shape[0] < convertSize:
-            self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
+            self.audio_buffer = np.concatenate(
+                [np.zeros([convertSize]), self.audio_buffer]
+            )
             if self.slotInfo.f0:
-                self.pitchf_buffer = np.concatenate([np.zeros([convertSize // 160]), self.pitchf_buffer])
-                self.feature_buffer = np.concatenate([np.zeros([convertSize // 160, self.slotInfo.embChannels]), self.feature_buffer])
+                self.pitchf_buffer = np.concatenate(
+                    [np.zeros([convertSize // 160]), self.pitchf_buffer]
+                )
+                self.feature_buffer = np.concatenate(
+                    [
+                        np.zeros([convertSize // 160, self.slotInfo.embChannels]),
+                        self.feature_buffer,
+                    ]
+                )
 
         # Trim the unneeded part
         convertOffset = -1 * convertSize
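generate_input rounds the conversion window up to the 160-sample feature hop (16 kHz at 100 frames/s) so the model's frame grid lines up with the audio, then rescales the output length to the model's native sampling rate. A standalone sketch of that arithmetic, with an assumed 40 kHz model rate:

    HOP = 160        # feature hop at 16 kHz
    IN_SR = 16000    # processing rate

    def window_sizes(input_size, crossfade, sola_search, extra, model_sr=40000):
        convert = input_size + crossfade + sola_search + extra
        if convert % HOP != 0:
            convert += HOP - (convert % HOP)  # round up to the hop grid
        out = int(((convert - extra) / IN_SR) * model_sr)
        return convert, out

    # e.g. a 4096-sample block, 1024 crossfade, 192 SOLA search, 8000 extra:
    print(window_sizes(4096, 1024, 192, 8000))  # -> (13440, 13600)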
@@ -147,9 +185,18 @@ class RVCr2(VoiceChangerModel):
         vol = max(vol, self.prevVol * 0.0)
         self.prevVol = vol
 
-        return (self.audio_buffer, self.pitchf_buffer, self.feature_buffer, convertSize, vol, outSize)
+        return (
+            self.audio_buffer,
+            self.pitchf_buffer,
+            self.feature_buffer,
+            convertSize,
+            vol,
+            outSize,
+        )
 
-    def inference(self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int):
+    def inference(
+        self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int
+    ):
         if self.pipeline is None:
             logger.info("[Voice Changer] Pipeline is not initialized.")
             raise PipelineNotInitializedException()
@@ -165,10 +212,14 @@ class RVCr2(VoiceChangerModel):
         )
         crossfade_frame = int((crossfade_frame / self.inputSampleRate) * 16000)
         sola_search_frame = int((sola_search_frame / self.inputSampleRate) * 16000)
-        extra_frame = int((self.settings.extraConvertSize / self.inputSampleRate) * 16000)
+        extra_frame = int(
+            (self.settings.extraConvertSize / self.inputSampleRate) * 16000
+        )
 
         # Generate input data
-        data = self.generate_input(receivedData, crossfade_frame, sola_search_frame, extra_frame)
+        data = self.generate_input(
+            receivedData, crossfade_frame, sola_search_frame, extra_frame
+        )
 
         audio = data[0]
         pitchf = data[1]
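Frame counts arrive at the device sampling rate and are rescaled to the 16 kHz processing rate before windowing. A one-line helper capturing the conversion (the name is illustrative):

    def to_16k_frames(frames: int, input_sample_rate: int) -> int:
        # e.g. 4800 frames at 48 kHz correspond to 1600 frames at 16 kHz
        return int((frames / input_sample_rate) * 16000)

    assert to_16k_frames(4800, 48000) == 1600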
@@ -203,12 +254,14 @@ class RVCr2(VoiceChangerModel):
                 index_rate,
                 if_f0,
                 # 0,
-                self.settings.extraConvertSize / self.inputSampleRate if self.settings.silenceFront else 0.,  # seconds of extraDataSize, computed at the input sampling rate
+                self.settings.extraConvertSize / self.inputSampleRate
+                if self.settings.silenceFront
+                else 0.0,  # seconds of extraDataSize, computed at the input sampling rate
                 embOutputLayer,
                 useFinalProj,
                 repeat,
                 protect,
-                outSize
+                outSize,
             )
             # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
             result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)
@@ -224,7 +277,9 @@ class RVCr2(VoiceChangerModel):
 
             return result
         except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
-            logger.warn("[Device Manager] Device cannot support half precision. Fallback to float....")
+            logger.warn(
+                "[Device Manager] Device cannot support half precision. Fallback to float...."
+            )
             self.deviceManager.setForceTensor(True)
             self.initialize()
             # raise e
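When a device rejects fp16 kernels mid-stream, the handler above flags the device manager to force full precision and rebuilds the pipeline instead of killing the stream; the current block is lost, but the next one succeeds in fp32. A minimal sketch of that retry-once shape (the exception name mirrors the one above; the model interface is hypothetical):

    class HalfPrecisionError(RuntimeError):
        """Stands in for DeviceCannotSupportHalfPrecisionException."""

    def run_with_fp32_fallback(model, audio):
        try:
            return model.infer(audio)  # first attempt, possibly fp16
        except HalfPrecisionError:
            model.force_fp32 = True    # equivalent of setForceTensor(True)
            model.initialize()         # rebuild the pipeline in fp32
            return model.infer(audio)  # retry once in full precision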
@@ -3,7 +3,7 @@ import os
 from data.ModelSlot import SoVitsSvc40ModelSlot
 from voice_changer.VoiceChangerParamsManager import VoiceChangerParamsManager
 
-from voice_changer.utils.VoiceChangerModel import AudioInOut
+from voice_changer.utils.VoiceChangerModel import AudioInOut, VoiceChangerModel
 from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
 
 if sys.platform.startswith("darwin"):
@@ -27,7 +27,13 @@ import pyworld as pw
 
 # from models import SynthesizerTrn  # type:ignore
 from .models.models import SynthesizerTrn
-from .models.utils import interpolate_f0, get_hparams_from_file, load_checkpoint, repeat_expand_2d, get_hubert_content
+from .models.utils import (
+    interpolate_f0,
+    get_hparams_from_file,
+    load_checkpoint,
+    repeat_expand_2d,
+    get_hubert_content,
+)
 from .models.cluster import get_cluster_model, get_cluster_center_result
 from fairseq import checkpoint_utils
 import librosa
@@ -64,9 +70,10 @@ class SoVitsSvc40Settings:
     strData = ["f0Detector"]
 
 
-class SoVitsSvc40:
+class SoVitsSvc40(VoiceChangerModel):
     def __init__(self, params: VoiceChangerParams, slotInfo: SoVitsSvc40ModelSlot):
         print("[Voice Changer] [so-vits-svc40] Creating instance ")
+        self.voiceChangerType = "so-vits-svc-40"
         self.settings = SoVitsSvc40Settings()
         self.net_g = None
         self.onnx_session = None
@@ -94,20 +101,31 @@ class SoVitsSvc40:
     def initialize(self):
         print("[Voice Changer] [so-vits-svc40] Initializing... ")
         vcparams = VoiceChangerParamsManager.get_instance().params
-        configPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile)
-        modelPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile)
+        configPath = os.path.join(
+            vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.configFile
+        )
+        modelPath = os.path.join(
+            vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.modelFile
+        )
         self.hps = get_hparams_from_file(configPath)
         self.settings.speakers = self.hps.spk
 
         # cluster
         try:
             if self.slotInfo.clusterFile is not None:
-                clusterPath = os.path.join(vcparams.model_dir, str(self.slotInfo.slotIndex), self.slotInfo.clusterFile)
+                clusterPath = os.path.join(
+                    vcparams.model_dir,
+                    str(self.slotInfo.slotIndex),
+                    self.slotInfo.clusterFile,
+                )
                 self.cluster_model = get_cluster_model(clusterPath)
             else:
                 self.cluster_model = None
         except Exception as e:
-            print("[Voice Changer] [so-vits-svc40] EXCEPTION during loading cluster model ", e)
+            print(
+                "[Voice Changer] [so-vits-svc40] EXCEPTION during loading cluster model ",
+                e,
+            )
             print("[Voice Changer] [so-vits-svc40] fallback to without cluster")
             self.cluster_model = None
 
@@ -132,7 +150,11 @@ class SoVitsSvc40:
     def getOnnxExecutionProvider(self):
        availableProviders = onnxruntime.get_available_providers()
        devNum = torch.cuda.device_count()
-        if self.settings.gpu >= 0 and "CUDAExecutionProvider" in availableProviders and devNum > 0:
+        if (
+            self.settings.gpu >= 0
+            and "CUDAExecutionProvider" in availableProviders
+            and devNum > 0
+        ):
             return ["CUDAExecutionProvider"], [{"device_id": self.settings.gpu}]
         elif self.settings.gpu >= 0 and "DmlExecutionProvider" in availableProviders:
             return ["DmlExecutionProvider"], [{}]
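The provider ladder prefers CUDA, then DirectML, and (in the elided tail of this method) falls back to CPU. The same selection as a free function; the CPU branch is assumed from that elided tail, and the method above additionally requires torch.cuda.device_count() > 0 for the CUDA case:

    import onnxruntime

    def pick_providers(gpu: int):
        available = onnxruntime.get_available_providers()
        if gpu >= 0 and "CUDAExecutionProvider" in available:
            return ["CUDAExecutionProvider"], [{"device_id": gpu}]
        if gpu >= 0 and "DmlExecutionProvider" in available:
            return ["DmlExecutionProvider"], [{}]
        # CPU is always available as the last resort
        return ["CPUExecutionProvider"], [{}]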
@@ -170,7 +192,9 @@ class SoVitsSvc40:
     def get_info(self):
         data = asdict(self.settings)
 
-        data["onnxExecutionProviders"] = self.onnx_session.get_providers() if self.onnx_session is not None else []
+        data["onnxExecutionProviders"] = (
+            self.onnx_session.get_providers() if self.onnx_session is not None else []
+        )
 
         return data
 
@@ -196,7 +220,9 @@ class SoVitsSvc40:
         )
 
         if wav_44k.shape[0] % self.hps.data.hop_length != 0:
-            print(f" !!! !!! !!! wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}")
+            print(
+                f" !!! !!! !!! wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}"
+            )
 
         f0, uv = interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
@@ -205,7 +231,9 @@ class SoVitsSvc40:
         f0 = f0.unsqueeze(0)
         uv = uv.unsqueeze(0)
 
-        wav16k_numpy = librosa.resample(audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000)
+        wav16k_numpy = librosa.resample(
+            audio_buffer, orig_sr=self.hps.data.sampling_rate, target_sr=16000
+        )
         wav16k_tensor = torch.from_numpy(wav16k_numpy)
 
         if (self.settings.gpu < 0 or self.gpu_num == 0) or self.slotInfo.isONNX:
@@ -226,7 +254,9 @@ class SoVitsSvc40:
                 if self.hps.model.ssl_dim == 768:
                     self.hubert_model = self.hubert_model.to(dev)
                     wav16k_tensor = wav16k_tensor.to(dev)
-                    c = get_hubert_content_layer9(self.hubert_model, wav_16k_tensor=wav16k_tensor)
+                    c = get_hubert_content_layer9(
+                        self.hubert_model, wav_16k_tensor=wav16k_tensor
+                    )
                 else:
                     self.hubert_model = self.hubert_model.to(dev)
                     wav16k_tensor = wav16k_tensor.to(dev)
@@ -237,16 +267,29 @@ class SoVitsSvc40:
 
         c = repeat_expand_2d(c.squeeze(0), f0.shape[1])
 
-        if self.settings.clusterInferRatio != 0 and hasattr(self, "cluster_model") and self.cluster_model is not None:
-            speaker = [key for key, value in self.settings.speakers.items() if value == self.settings.dstId]
+        if (
+            self.settings.clusterInferRatio != 0
+            and hasattr(self, "cluster_model")
+            and self.cluster_model is not None
+        ):
+            speaker = [
+                key
+                for key, value in self.settings.speakers.items()
+                if value == self.settings.dstId
+            ]
             if len(speaker) != 1:
                 pass
                 # print("not only one speaker found.", speaker)
             else:
-                cluster_c = get_cluster_center_result(self.cluster_model, c.cpu().numpy().T, speaker[0]).T
+                cluster_c = get_cluster_center_result(
+                    self.cluster_model, c.cpu().numpy().T, speaker[0]
+                ).T
                 cluster_c = torch.FloatTensor(cluster_c).to(dev)
                 c = c.to(dev)
-                c = self.settings.clusterInferRatio * cluster_c + (1 - self.settings.clusterInferRatio) * c
+                c = (
+                    self.settings.clusterInferRatio * cluster_c
+                    + (1 - self.settings.clusterInferRatio) * c
+                )
 
         c = c.unsqueeze(0)
         return c, f0, uv
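The cluster model snaps each content frame to the target speaker's nearest cluster center; clusterInferRatio then blends the snapped features back with the raw ones, trading timbre similarity against articulation. A NumPy sketch of that blend with the cluster lookup inlined as a nearest-center search:

    import numpy as np

    def blend_with_cluster(c, centers, ratio):
        # c: (frames, dims) content features; centers: (k, dims) cluster centers
        idx = ((c[:, None, :] - centers[None, :, :]) ** 2).sum(-1).argmin(axis=1)
        snapped = centers[idx]
        # ratio=0 keeps the raw features, ratio=1 fully snaps to the centers
        return ratio * snapped + (1 - ratio) * c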
@@ -261,14 +304,20 @@ class SoVitsSvc40:
         newData = newData.astype(np.float32) / self.hps.data.max_wav_value
 
         if self.audio_buffer is not None:
-            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # Concatenate with past data
+            self.audio_buffer = np.concatenate(
+                [self.audio_buffer, newData], 0
+            )  # Concatenate with past data
         else:
             self.audio_buffer = newData
 
-        convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
+        convertSize = (
+            inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
+        )
 
         if convertSize % self.hps.data.hop_length != 0:  # The model output's hop size causes truncation, so compensate.
-            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
+            convertSize = convertSize + (
+                self.hps.data.hop_length - (convertSize % self.hps.data.hop_length)
+            )
 
         convertOffset = -1 * convertSize
         self.audio_buffer = self.audio_buffer[convertOffset:]  # Extract only the portion to convert
@@ -306,7 +355,9 @@ class SoVitsSvc40:
                     "f0": f0.astype(np.float32),
                     "uv": uv.astype(np.float32),
                     "g": sid_target.astype(np.int64),
-                    "noise_scale": np.array([self.settings.noiseScale]).astype(np.float32),
+                    "noise_scale": np.array([self.settings.noiseScale]).astype(
+                        np.float32
+                    ),
                     # "predict_f0": np.array([self.settings.dstId]).astype(np.int64),
                 },
             )[0][0, 0]
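The ONNX path feeds named NumPy tensors and unpacks the first output's first sample row. A sketch of that call shape; the file name and tensor shapes are hypothetical, and the input names follow the feed dict above:

    import numpy as np
    import onnxruntime

    sess = onnxruntime.InferenceSession("sovits40.onnx")  # hypothetical model file
    audio = sess.run(
        None,  # return all outputs
        {
            "c": np.zeros((1, 256, 100), dtype=np.float32),
            "f0": np.zeros((1, 100), dtype=np.float32),
            "uv": np.zeros((1, 100), dtype=np.float32),
            "g": np.array([0], dtype=np.int64),
            "noise_scale": np.array([0.3], dtype=np.float32),
        },
    )[0][0, 0]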
@@ -385,8 +436,7 @@ class SoVitsSvc40:
         pass
 
     def get_model_current(self):
-        return [
-        ]
+        return []
 
 
 def resize_f0(x, target_len):
@@ -1,4 +1,4 @@
-'''
+"""
 ■ VoiceChangerV2
 - Differences from VoiceChanger
 ・To avoid redundant resampling, the resampling step is delegated to VoiceChangerModel
@@ -7,7 +7,7 @@
 - Applicable VoiceChangerModel
 ・DiffusionSVC
 ・RVC
-'''
+"""
 
 from typing import Any, Union
 
@@ -18,7 +18,8 @@ import numpy as np
 from dataclasses import dataclass, asdict, field
 import onnxruntime
 from mods.log_control import VoiceChangaerLogger
-from voice_changer.Beatrice.Beatrice import Beatrice
+
+# from voice_changer.Beatrice.Beatrice import Beatrice
 
 from voice_changer.IORecorder import IORecorder
 
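This import change is the load-order fix itself: VoiceChangerV2 no longer imports the Beatrice class at module import time, so the module loads even when the Beatrice extension cannot. Identification moves to the voiceChangerType string checked in setModel below. The decoupling in miniature:

    # Before: a hard import ties module load to the optional dependency.
    # from voice_changer.Beatrice.Beatrice import Beatrice
    # ... if isinstance(model, Beatrice): ...

    # After: compare a plain attribute; the class never needs importing.
    def is_beatrice(model) -> bool:
        return getattr(model, "voiceChangerType", None) == "Beatrice"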
@@ -89,27 +90,38 @@ class VoiceChangerV2(VoiceChangerIF):
         self.params = params
         self.gpu_num = torch.cuda.device_count()
         self.prev_audio = np.zeros(4096)
-        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
+        self.mps_enabled: bool = (
+            getattr(torch.backends, "mps", None) is not None
+            and torch.backends.mps.is_available()
+        )
         self.onnx_device = onnxruntime.get_device()
         self.noCrossFade = False
 
-        logger.info(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})")
+        logger.info(
+            f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})"
+        )
 
     def setModel(self, model: VoiceChangerModel):
         self.voiceChanger = model
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
-        if isinstance(model, Beatrice):
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )
+        if model.voiceChangerType == "Beatrice":
             self.noCrossFade = True
         else:
             self.noCrossFade = False
 
     def setInputSampleRate(self, sr: int):
         self.settings.inputSampleRate = sr
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )
 
     def setOutputSampleRate(self, sr: int):
         self.settings.outputSampleRate = sr
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )
 
     def get_info(self):
         data = asdict(self.settings)
@@ -128,7 +140,9 @@ class VoiceChangerV2(VoiceChangerIF):
             if key == "serverAudioStated" and val == 0:
                 self.settings.inputSampleRate = 48000
                 self.settings.outputSampleRate = 48000
-                self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+                self.voiceChanger.setSamplingRate(
+                    self.settings.inputSampleRate, self.settings.outputSampleRate
+                )
 
             if key in self.settings.intData:
                 setattr(self.settings, key, int(val))
@@ -137,7 +151,12 @@ class VoiceChangerV2(VoiceChangerIF):
                 if key == "recordIO" and val == 1:
                     if hasattr(self, "ioRecorder"):
                         self.ioRecorder.close()
-                    self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate, self.settings.outputSampleRate)
+                    self.ioRecorder = IORecorder(
+                        STREAM_INPUT_FILE,
+                        STREAM_OUTPUT_FILE,
+                        self.settings.inputSampleRate,
+                        self.settings.outputSampleRate,
+                    )
                 if key == "recordIO" and val == 0:
                     if hasattr(self, "ioRecorder"):
                         self.ioRecorder.close()
@@ -146,7 +165,9 @@ class VoiceChangerV2(VoiceChangerIF):
                     if hasattr(self, "ioRecorder"):
                         self.ioRecorder.close()
                 if key == "inputSampleRate" or key == "outputSampleRate":
-                    self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+                    self.voiceChanger.setSamplingRate(
+                        self.settings.inputSampleRate, self.settings.outputSampleRate
+                    )
             elif key in self.settings.floatData:
                 setattr(self.settings, key, float(val))
             elif key in self.settings.strData:
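Toggling recordIO always closes any live recorder before opening a fresh one, so the WAV headers are written for the sample rates in force at that moment. The toggle pattern with the recorder stubbed out (IORecorder's real signature is the four-argument form shown above; everything else is illustrative):

    class Recorder:  # stand-in for IORecorder
        def __init__(self, in_path, out_path, in_sr, out_sr):
            self.args = (in_path, out_path, in_sr, out_sr)

        def close(self):
            pass

    def toggle_record(state, enable: bool):
        # Always close the old recorder first; sample rates may have changed.
        if hasattr(state, "ioRecorder"):
            state.ioRecorder.close()
        if enable:
            state.ioRecorder = Recorder(
                "in.wav", "out.wav",
                state.settings.inputSampleRate, state.settings.outputSampleRate,
            )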
@@ -159,7 +180,12 @@ class VoiceChangerV2(VoiceChangerIF):
         return self.get_info()
 
     def _generate_strength(self, crossfadeSize: int):
-        if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
+        if (
+            self.crossfadeSize != crossfadeSize
+            or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate
+            or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate
+            or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize
+        ):
             self.crossfadeSize = crossfadeSize
             self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
             self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
@@ -188,7 +214,9 @@ class VoiceChangerV2(VoiceChangerIF):
             ]
         )
 
-        logger.info(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")
+        logger.info(
+            f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}"
+        )
 
         # The size changes from the previous result, so clear the record.
         if hasattr(self, "np_prev_audio1") is True:
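_generate_strength rebuilds the fade-out/fade-in envelopes only when the crossfade geometry actually changed, and clears the held-over audio because its length no longer matches. The curve construction itself is elided from this hunk; a plausible raised-cosine version, offered as an assumption rather than the project's exact formula (the real one also honors crossFadeOffsetRate and crossFadeEndRate):

    import numpy as np

    def make_strengths(crossfade_size: int):
        t = np.arange(crossfade_size) / crossfade_size
        cur = 0.5 - 0.5 * np.cos(np.pi * t)  # fade-in: 0 -> 1
        prev = 1.0 - cur                     # fade-out: 1 -> 0; the pair sums to 1
        return prev, cur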
@@ -203,13 +231,19 @@ class VoiceChangerV2(VoiceChangerIF):
         return self.voiceChanger.get_processing_sampling_rate()
 
     # receivedData: tuple of short
-    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
+    def on_request(
+        self, receivedData: AudioInOut
+    ) -> tuple[AudioInOut, list[Union[int, float]]]:
         try:
             if self.voiceChanger is None:
-                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
+                raise VoiceChangerIsNotSelectedException(
+                    "Voice Changer is not selected."
+                )
 
             with Timer("main-process") as t:
-                processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
+                processing_sampling_rate = (
+                    self.voiceChanger.get_processing_sampling_rate()
+                )
 
                 if self.noCrossFade:  # Beatrice
                     audio = self.voiceChanger.inference(
@@ -223,18 +257,22 @@ class VoiceChangerV2(VoiceChangerIF):
                 else:
                     sola_search_frame = int(0.012 * processing_sampling_rate)
                     block_frame = receivedData.shape[0]
-                    crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
+                    crossfade_frame = min(
+                        self.settings.crossFadeOverlapSize, block_frame
+                    )
                     self._generate_strength(crossfade_frame)
 
                     audio = self.voiceChanger.inference(
                         receivedData,
                         crossfade_frame=crossfade_frame,
-                        sola_search_frame=sola_search_frame
+                        sola_search_frame=sola_search_frame,
                     )
 
                     if hasattr(self, "sola_buffer") is True:
                         np.set_printoptions(threshold=10000)
-                        audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
+                        audio_offset = -1 * (
+                            sola_search_frame + crossfade_frame + block_frame
+                        )
                         audio = audio[audio_offset:]
 
                         # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
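SOLA searches the small window at the head of the new chunk for the shift that best lines up with the tail held over from the previous chunk, and starts the crossfade there, eliminating phase discontinuities at block borders. A compact NumPy sketch of the offset search as a normalized cross-correlation; the referenced projects implement the equivalent in torch:

    import numpy as np

    def sola_offset(audio, sola_buffer, search_frame):
        n = len(sola_buffer)
        best, best_corr = 0, -np.inf
        for off in range(search_frame):
            frame = audio[off : off + n]
            corr = frame @ sola_buffer / (np.linalg.norm(frame) + 1e-8)
            if corr > best_corr:
                best, best_corr = off, corr
        return best  # the crossfade starts at this offset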
@@ -259,16 +297,25 @@ class VoiceChangerV2(VoiceChangerIF):
 
                         result = output_wav
                     else:
-                        logger.info("[Voice Changer] warming up... generating sola buffer.")
+                        logger.info(
+                            "[Voice Changer] warming up... generating sola buffer."
+                        )
                         result = np.zeros(4096).astype(np.int16)
 
-                    if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
-                        offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
+                    if (
+                        hasattr(self, "sola_buffer") is True
+                        and sola_offset < sola_search_frame
+                    ):
+                        offset = -1 * (
+                            sola_search_frame + crossfade_frame - sola_offset
+                        )
                         end = -1 * (sola_search_frame - sola_offset)
                         sola_buf_org = audio[offset:end]
                         self.sola_buffer = sola_buf_org * self.np_prev_strength
                     else:
-                        self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
+                        self.sola_buffer = (
+                            audio[-crossfade_frame:] * self.np_prev_strength
+                        )
                     # self.sola_buffer = audio[- crossfade_frame:]
 
                 mainprocess_time = t.secs
@@ -277,7 +324,9 @@ class VoiceChangerV2(VoiceChangerIF):
             with Timer("post-process") as t:
                 result = result.astype(np.int16)
 
-                print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz")
+                print_convert_processing(
+                    f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result .shape[0]}/{self.settings.outputSampleRate}hz"
+                )
 
                 if receivedData.shape[0] != result.shape[0]:
                     outputData = pad_array(result, receivedData.shape[0])
@@ -291,7 +340,9 @@ class VoiceChangerV2(VoiceChangerIF):
 
                 postprocess_time = t.secs
 
-                print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
+                print_convert_processing(
+                    f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}"
+                )
                 perf = [0, mainprocess_time, postprocess_time]
 
                 return outputData, perf
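The converted block must leave on_request with exactly the input block's length before it returns to the audio device; pad_array (defined elsewhere in this file) absorbs the mismatch. A plausible stand-in, offered as an assumption about its behavior:

    import numpy as np

    def pad_array(arr: np.ndarray, target: int) -> np.ndarray:
        # Zero-pad when too short, crop when too long.
        diff = target - arr.shape[0]
        if diff > 0:
            return np.pad(arr, (diff // 2, diff - diff // 2))
        if diff < 0:
            start = (-diff) // 2
            return arr[start : start + target]
        return arr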
@@ -300,7 +351,9 @@ class VoiceChangerV2(VoiceChangerIF):
             logger.warn(f"[Voice Changer] [Exception], {e}")
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except ONNXInputArgumentException as e:
-            logger.warn(f"[Voice Changer] [Exception] onnx are waiting valid input., {e}")
+            logger.warn(
+                f"[Voice Changer] [Exception] onnx are waiting valid input., {e}"
+            )
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except HalfPrecisionChangingException:
             logger.warn("[Voice Changer] Switching model configuration....")
@@ -312,7 +365,9 @@ class VoiceChangerV2(VoiceChangerIF):
             logger.warn(f"[Voice Changer] embedder: {e}")
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except VoiceChangerIsNotSelectedException:
-            logger.warn("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
+            logger.warn(
+                "[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc."
+            )
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except DeviceCannotSupportHalfPrecisionException:
             # RVC.py performs the fallback, so just return dummy data here.
@@ -1,5 +1,6 @@
 from typing import Any, Protocol, TypeAlias
 import numpy as np
+from const import VoiceChangerType
 
 from voice_changer.utils.LoadModelParams import LoadModelParams
 
@@ -10,6 +11,8 @@ FeatureInOut: TypeAlias = np.ndarray[Any, np.dtype[np.int16]]
 
 
 class VoiceChangerModel(Protocol):
+    voiceChangerType: VoiceChangerType = "RVC"
+
     # loadModel: Callable[..., dict[str, Any]]
     def loadModel(self, params: LoadModelParams):
         ...
@@ -23,7 +26,13 @@ class VoiceChangerModel(Protocol):
     def inference(self, data: tuple[Any, ...]) -> Any:
         ...
 
-    def generate_input(self, newData: AudioInOut, inputSize: int, crossfadeSize: int, solaSearchFrame: int) -> tuple[Any, ...]:
+    def generate_input(
+        self,
+        newData: AudioInOut,
+        inputSize: int,
+        crossfadeSize: int,
+        solaSearchFrame: int,
+    ) -> tuple[Any, ...]:
         ...
 
     def update_settings(self, key: str, val: int | float | str) -> bool:
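Because VoiceChangerModel is a typing.Protocol, conformance is structural: any class that exposes voiceChangerType and these methods type-checks as a VoiceChangerModel whether or not it inherits from it. That is what lets VoiceChangerV2.setModel compare the tag without importing any concrete class. A minimal, self-contained illustration (the Literal stands in for the alias in const.py):

    from typing import Literal, Protocol

    VoiceChangerType = Literal["RVC", "so-vits-svc-40", "Beatrice"]

    class VoiceChangerModel(Protocol):
        voiceChangerType: VoiceChangerType

    class Dummy:  # no inheritance needed; the attribute alone satisfies the Protocol
        voiceChangerType: VoiceChangerType = "Beatrice"

    def needs_no_crossfade(model: VoiceChangerModel) -> bool:
        return model.voiceChangerType == "Beatrice"

    print(needs_no_crossfade(Dummy()))  # True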