diff --git a/.gitignore b/.gitignore
index e6bacf72..f9fdae91 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,9 @@ server/memo.md
 
 client/lib/dist
 client/lib/worklet/dist
+client/demo/public/models
+client/demo/dist/models
+client/demo/src/001_provider/backup
 # client/demo/dist/ # keep for the demo
 
 docker/cudnn/
diff --git a/client/demo/webpack.common.js b/client/demo/webpack.common.js
index ea92cc80..d61423f8 100644
--- a/client/demo/webpack.common.js
+++ b/client/demo/webpack.common.js
@@ -1,11 +1,15 @@
 const path = require("path");
 const HtmlWebpackPlugin = require("html-webpack-plugin");
 const CopyPlugin = require("copy-webpack-plugin");
+const webpack = require("webpack");
 module.exports = {
     mode: "production",
     entry: "./src/000_index.tsx",
     resolve: {
         extensions: [".ts", ".tsx", ".js"],
+        fallback: {
+            buffer: require.resolve("buffer/"),
+        },
     },
     module: {
         rules: [
@@ -29,7 +33,7 @@ module.exports = {
                 test: /\.css$/,
                 use: ["style-loader", { loader: "css-loader", options: { importLoaders: 1 } }, "postcss-loader"],
             },
-
+            { test: /\.json$/, type: "asset/inline" },
         ],
     },
     output: {
@@ -37,6 +41,9 @@ module.exports = {
         path: path.resolve(__dirname, "dist"),
     },
     plugins: [
+        new webpack.ProvidePlugin({
+            Buffer: ["buffer", "Buffer"],
+        }),
         new HtmlWebpackPlugin({
             template: path.resolve(__dirname, "public/index.html"),
             filename: "./index.html",
@@ -47,5 +54,5 @@ module.exports = {
         new CopyPlugin({
             patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
         }),
-    ]
+    ],
 };
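A note on the webpack changes above: webpack 5 no longer ships automatic polyfills for Node core modules, so browser code that references the Node-style `Buffer` global fails at runtime unless `resolve.fallback` maps `buffer` to the npm `buffer` package and `ProvidePlugin` injects the global. A minimal sketch of the kind of call this enables (the variable name and data here are illustrative, not part of the patch):

```ts
// With the fallback + ProvidePlugin in place, webpack rewrites bare
// `Buffer` references to the `buffer` package, so Node-style code works
// in the browser bundle. Hypothetical usage:
const modelBytes: Buffer = Buffer.from("UklGRg==", "base64"); // decode base64 ("RIFF") to bytes
console.log(modelBytes.length); // 4
```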
diff --git a/client/lib/src/client/VoiceChangerWorkletNode.ts b/client/lib/src/client/VoiceChangerWorkletNode.ts
index 5b2521e0..b07360f0 100644
--- a/client/lib/src/client/VoiceChangerWorkletNode.ts
+++ b/client/lib/src/client/VoiceChangerWorkletNode.ts
@@ -12,7 +12,7 @@ export type VoiceChangerWorkletListener = {
 };
 
 export type InternalCallback = {
-    processAudio: (data: Uint8Array) => Uint8Array;
+    processAudio: (data: Uint8Array) => Promise<Uint8Array>;
 };
 
 export class VoiceChangerWorkletNode extends AudioWorkletNode {
@@ -224,18 +224,23 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
             downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
         }
 
-        // Float to Int16
-        const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
-        const dataView = new DataView(arrayBuffer);
-        for (let i = 0; i < downsampledBuffer.length; i++) {
-            let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
-            s = s < 0 ? s * 0x8000 : s * 0x7fff;
-            dataView.setInt16(i * 2, s, true);
+        // Float to Int16 (for the "internal" protocol, the data stays float)
+        if (this.setting.protocol != "internal") {
+            const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
+            const dataView = new DataView(arrayBuffer);
+            for (let i = 0; i < downsampledBuffer.length; i++) {
+                let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
+                s = s < 0 ? s * 0x8000 : s * 0x7fff;
+                dataView.setInt16(i * 2, s, true);
+            }
+            // buffering
+            this.requestChunks.push(arrayBuffer);
+        } else {
+            // internal
+            // console.log("downsampledBuffer.buffer", downsampledBuffer.buffer);
+            this.requestChunks.push(downsampledBuffer.buffer);
         }
-        // buffering
-        this.requestChunks.push(arrayBuffer);
-
         //// Stop here until the request buffer holds the configured number of chunks.
         if (this.requestChunks.length < this.setting.inputChunkNum) {
             return;
         }
@@ -290,7 +295,10 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
             this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED, `[AudioWorkletNode] internal audio process callback is not initialized`);
             return;
         }
-        const res = this.internalCallback.processAudio(newBuffer);
+        const res = await this.internalCallback.processAudio(newBuffer);
+        if (res.length == 0) {
+            return;
+        }
         if (this.outputNode != null) {
             this.outputNode.postReceivedVoice(res.buffer);
         } else {
diff --git a/client/lib/worklet/src/voice-changer-worklet-processor.ts b/client/lib/worklet/src/voice-changer-worklet-processor.ts
index 123e8a15..1b663aae 100644
--- a/client/lib/worklet/src/voice-changer-worklet-processor.ts
+++ b/client/lib/worklet/src/voice-changer-worklet-processor.ts
@@ -42,6 +42,7 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
     private isRecording = false;
 
     playBuffer: Float32Array[] = [];
+    unpushedF32Data: Float32Array = new Float32Array(0);
     /**
     * @constructor
     */
@@ -105,11 +106,16 @@
         }
 
         const f32Data = request.voice;
-        const chunkNum = f32Data.length / this.BLOCK_SIZE;
+        const concatedF32Data = new Float32Array(this.unpushedF32Data.length + f32Data.length);
+        concatedF32Data.set(this.unpushedF32Data);
+        concatedF32Data.set(f32Data, this.unpushedF32Data.length);
+
+        const chunkNum = Math.floor(concatedF32Data.length / this.BLOCK_SIZE);
         for (let i = 0; i < chunkNum; i++) {
-            const block = f32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
+            const block = concatedF32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
             this.playBuffer.push(block);
         }
+        this.unpushedF32Data = concatedF32Data.slice(chunkNum * this.BLOCK_SIZE);
     }
 
     pushData = (inputData: Float32Array) => {
@@ -133,10 +139,10 @@
         }
 
         if (this.playBuffer.length === 0) {
-            // console.log("[worklet] no play buffer")
+            // console.log("[worklet] no play buffer");
             return true;
         }
-
+        // console.log("[worklet] play buffer");
         //// Skip when silence has continued for a certain period.
         // let voice: Float32Array | undefined
        // while (true) {
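The worklet-processor change above exists because received voice chunks are not guaranteed to be a multiple of `BLOCK_SIZE`: the old code computed a fractional `chunkNum` and could push a short final block, while the new code carries the remainder over in `unpushedF32Data`. The same carry-over scheme as a self-contained sketch (the block size value and these names are stand-ins for the processor's fields):

```ts
// Incoming Float32 samples are prepended with the previous remainder,
// split into fixed-size blocks, and the tail that does not fill a whole
// block is kept for the next call.
const BLOCK_SIZE = 128;
let unpushed = new Float32Array(0);
const playBuffer: Float32Array[] = [];

function pushVoice(f32Data: Float32Array): void {
    // Concatenate remainder + new data.
    const concat = new Float32Array(unpushed.length + f32Data.length);
    concat.set(unpushed);
    concat.set(f32Data, unpushed.length);

    // Emit only whole blocks.
    const chunkNum = Math.floor(concat.length / BLOCK_SIZE);
    for (let i = 0; i < chunkNum; i++) {
        playBuffer.push(concat.slice(i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE));
    }

    // Carry the partial tail into the next call.
    unpushed = concat.slice(chunkNum * BLOCK_SIZE);
}
```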
diff --git a/server/sio/MMVC_SocketIOApp.py b/server/sio/MMVC_SocketIOApp.py
index d51ec601..30a97c68 100644
--- a/server/sio/MMVC_SocketIOApp.py
+++ b/server/sio/MMVC_SocketIOApp.py
@@ -40,6 +40,10 @@
                 "filename": f"{getFrontendPath()}/assets/buymeacoffee.png",
                 "content_type": "image/png",
             },
+            "/ort-wasm-simd.wasm": {
+                "filename": f"{getFrontendPath()}/ort-wasm-simd.wasm",
+                "content_type": "application/wasm",
+            },
             "": f"{getFrontendPath()}",
             "/": f"{getFrontendPath()}/index.html",
         },
diff --git a/server/voice_changer/RVC/pipeline/Pipeline.py b/server/voice_changer/RVC/pipeline/Pipeline.py
index 05a17177..e047f6b5 100644
--- a/server/voice_changer/RVC/pipeline/Pipeline.py
+++ b/server/voice_changer/RVC/pipeline/Pipeline.py
@@ -93,9 +93,9 @@ class Pipeline(object):
                 pitch = None
                 pitchf = None
         except IndexError as e:  # NOQA
-            # print(e)
-            # import traceback
-            # traceback.print_exc()
+            print(e)
+            import traceback
+            traceback.print_exc()
             raise NotEnoughDataExtimateF0()
 
         return pitch, pitchf
diff --git a/server/voice_changer/RVC/pitchExtractor/CrepeOnnxPitchExtractor.py b/server/voice_changer/RVC/pitchExtractor/CrepeOnnxPitchExtractor.py
index 64b1ed42..14a31304 100644
--- a/server/voice_changer/RVC/pitchExtractor/CrepeOnnxPitchExtractor.py
+++ b/server/voice_changer/RVC/pitchExtractor/CrepeOnnxPitchExtractor.py
@@ -21,7 +21,6 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
         )
 
     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
-        n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
 
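For orientation before the new file below: CREPE quantizes pitch into 360 bins of 20 cents each, anchored so that bin 0 sits 1997.3794084376191 cents above a 10 Hz reference. The tensor helpers at the bottom of `TorchCrepe2.py` implement exactly this mapping; here is the same arithmetic as a plain scalar sketch (the function names are mine, the constants come from the file, and 50/1100 Hz are just example bounds):

```ts
// Scalar version of the bin <-> cents <-> Hz mapping used by TorchCrepe2.py.
const CENTS_PER_BIN = 20;                 // width of one pitch bin, in cents
const CENTS_OFFSET = 1997.3794084376191;  // cents value of bin 0, relative to 10 Hz

const binsToCents = (bin: number): number => CENTS_PER_BIN * bin + CENTS_OFFSET;
const centsToFrequency = (cents: number): number => 10 * 2 ** (cents / 1200);
const frequencyToCents = (hz: number): number => 1200 * Math.log2(hz / 10);
const frequencyToBins = (hz: number, quantize = Math.floor): number =>
    quantize((frequencyToCents(hz) - CENTS_OFFSET) / CENTS_PER_BIN);

// forward() uses this mapping to mask bins outside [f0_min, f0_max]:
console.log(frequencyToBins(50));              // -> 39 (lowest allowed bin)
console.log(frequencyToBins(1100, Math.ceil)); // -> 308 (highest allowed bin)
console.log(centsToFrequency(binsToCents(0))); // -> ~31.7 Hz (bottom of CREPE's range)
```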
diff --git a/server/voice_changer/RVC/pitchExtractor/torchcrepe2/TorchCrepe2.py b/server/voice_changer/RVC/pitchExtractor/torchcrepe2/TorchCrepe2.py
new file mode 100644
index 00000000..5ed8fa0b
--- /dev/null
+++ b/server/voice_changer/RVC/pitchExtractor/torchcrepe2/TorchCrepe2.py
@@ -0,0 +1,160 @@
+# For experiments. Not used.
+
+import torch
+from torchcrepe.model import Crepe
+import os
+CENTS_PER_BIN = 20
+PITCH_BINS = 360
+
+
+class TorchCrepe2(torch.nn.Module):
+
+    def __init__(self, model='full'):
+        super().__init__()
+        self.crepe = Crepe(model)
+        file = os.path.join(os.path.dirname(__file__), f'{model}.pth')
+        self.crepe.load_state_dict(torch.load(file, map_location="cpu"))
+        self.crepe = self.crepe.to(torch.device("cpu"))
+        self.crepe.eval()
+
+        self.sample_rate = 16000
+        self.hop_length = 160
+        self.window_size = 1024
+
+    def forward(self, audio, f0_min: int, f0_max: int):
+        # total_frames = 1 + int(audio.size(1) // self.hop_length)
+        audio = torch.nn.functional.pad(audio, (self.window_size // 2, self.window_size // 2))
+        # batch_size = total_frames
+
+        start = 0
+        end = audio.size(1)
+
+        # Chunk
+        frames = torch.nn.functional.unfold(
+            audio[:, None, None, start:end],
+            kernel_size=(1, self.window_size),
+            stride=(1, self.hop_length))
+
+        frames = frames.transpose(1, 2).reshape(-1, self.window_size)
+
+        # Place on device
+        # frames = frames.to(device)
+
+        # Mean-center
+        frames -= frames.mean(dim=1, keepdim=True)
+
+        # Scale
+        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
+                            frames.std(dim=1, keepdim=True))
+
+        probabilities = self.crepe(frames.to(torch.device("cpu")))
+        probabilities = probabilities.reshape(
+            audio.size(0), -1, PITCH_BINS).transpose(1, 2)
+
+        minidx = frequency_to_bins(torch.tensor(f0_min))
+        maxidx = frequency_to_bins(torch.tensor(f0_max), torch.ceil)
+
+        probabilities[:, :minidx] = -float('inf')
+        probabilities[:, maxidx:] = -float('inf')
+
+        bins, pitch = weighted_argmax(probabilities)
+
+        return pitch, periodicity(probabilities, bins)
+
+
+def weighted_argmax(logits):
+    """Sample observations using weighted sum near the argmax"""
+    # Find center of analysis window
+    bins = logits.argmax(dim=1)
+
+    # Find bounds of analysis window
+    start = torch.max(torch.tensor(0, device=logits.device), bins - 4)
+    end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5)
+
+    # Mask out everything outside of window
+    for batch in range(logits.size(0)):
+        for time in range(logits.size(2)):
+            logits[batch, :start[batch, time], time] = -float('inf')
+            logits[batch, end[batch, time]:, time] = -float('inf')
+
+    # Construct weights
+    if not hasattr(weighted_argmax, 'weights'):
+        weights = bins_to_cents(torch.arange(360))
+        weighted_argmax.weights = weights[None, :, None]
+
+    # Ensure devices are the same (no-op if they are)
+    weighted_argmax.weights = weighted_argmax.weights.to(logits.device)
+
+    # Convert to probabilities
+    with torch.no_grad():
+        probs = torch.sigmoid(logits)
+
+    # Apply weights
+    cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1)
+
+    # Convert to frequency in Hz
+    return bins, cents_to_frequency(cents)
+
+
+def bins_to_cents(bins):
+    """Converts pitch bins to cents"""
+    cents = CENTS_PER_BIN * bins + 1997.3794084376191
+
+    # Trade quantization error for noise
+    return dither(cents)
+
+
+def dither(cents):
+    """Dither the predicted pitch in cents to remove quantization error"""
+    # noise = scipy.stats.triang.rvs(c=0.5,
+    #                                loc=-CENTS_PER_BIN,
+    #                                scale=2 * CENTS_PER_BIN,
+    #                                size=cents.size())
+
+    # Torch rewrite of the triangular distribution. Exact only when c=0.5; an approximation otherwise.
+    c = 0.5
+    loc = -CENTS_PER_BIN
+    scale = 2 * CENTS_PER_BIN
+    u = torch.rand(cents.size())
+    # f = (c - u) / (scale / 2) if u < c else (u - c) / (scale / 2)
+    f = torch.where(u < c, (c - u) / (scale / 2), (u - c) / (scale / 2))
+    noise = 2 * scale * ((1 - f.abs()) ** 0.5) + loc
+    mask = u >= c
+    noise[mask] = 2 * (scale - noise[mask])
+    return cents + cents.new_tensor(noise)
+
+
+def cents_to_frequency(cents):
+    """Converts cents to frequency in Hz"""
+    return 10 * 2 ** (cents / 1200)
+
+
+def periodicity(probabilities, bins):
+    """Computes the periodicity from the network output and pitch bins"""
+    # shape=(batch * time / hop_length, 360)
+    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)
+
+    # shape=(batch * time / hop_length, 1)
+    bins_stacked = bins.reshape(-1, 1).to(torch.int64)
+
+    # Use maximum logit over pitch bins as periodicity
+    periodicity = probs_stacked.gather(1, bins_stacked)
+
+    # shape=(batch, time / hop_length)
+    return periodicity.reshape(probabilities.size(0), probabilities.size(2))
+
+
+def cents_to_bins(cents, quantize_fn=torch.floor):
+    """Converts cents to pitch bins"""
+    bins = (cents - 1997.3794084376191) / CENTS_PER_BIN
+    return quantize_fn(bins).int()
+
+
+def frequency_to_bins(frequency, quantize_fn=torch.floor):
+    """Convert frequency in Hz to pitch bins"""
+    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
+
+
+def frequency_to_cents(frequency):
+    """Convert frequency in Hz to cents"""
+    return 1200 * torch.log2(frequency / 10.)
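A final note on the decoding in `weighted_argmax` above: rather than returning the raw argmax bin (20-cent resolution), it keeps the bins in a [argmax-4, argmax+5) window, turns their logits into probabilities with a sigmoid, and returns the probability-weighted mean of those bins' cent values, giving sub-bin pitch resolution. The same idea for a single frame, as a scalar sketch (dither and the f0 mask are omitted for clarity; this is not part of the patch):

```ts
// Per-frame version of the weighted-argmax decoding: a local expected
// value over pitch bins around the argmax, in cents, then converted to Hz.
// `logits` is one frame of 360 bin logits.
const CENTS_PER_BIN = 20;
const CENTS_OFFSET = 1997.3794084376191;

function decodeFrame(logits: number[]): number {
    // Center of the analysis window
    let argmax = 0;
    for (let i = 1; i < logits.length; i++) {
        if (logits[i] > logits[argmax]) argmax = i;
    }
    // Window bounds, clamped to the valid bin range
    const start = Math.max(0, argmax - 4);
    const end = Math.min(logits.length, argmax + 5);

    // Probability-weighted mean of the windowed bins' cent values
    let weighted = 0;
    let total = 0;
    for (let i = start; i < end; i++) {
        const p = 1 / (1 + Math.exp(-logits[i])); // sigmoid
        weighted += p * (CENTS_PER_BIN * i + CENTS_OFFSET);
        total += p;
    }
    return 10 * 2 ** (weighted / total / 1200); // cents -> Hz
}
```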