w-okada 2023-10-09 12:15:03 +09:00
parent 2b9a497eb0
commit 782d003a91
8 changed files with 209 additions and 22 deletions

.gitignore (vendored): 3 additions
View File

@ -36,6 +36,9 @@ server/memo.md
client/lib/dist
client/lib/worklet/dist
client/demo/public/models
client/demo/dist/models
client/demo/src/001_provider/backup
# client/demo/dist/ # kept for the demo
docker/cudnn/

View File

@ -1,11 +1,15 @@
const path = require("path");
const HtmlWebpackPlugin = require("html-webpack-plugin");
const CopyPlugin = require("copy-webpack-plugin");
const webpack = require("webpack");
module.exports = {
    mode: "production",
    entry: "./src/000_index.tsx",
    resolve: {
        extensions: [".ts", ".tsx", ".js"],
        fallback: {
            buffer: require.resolve("buffer/"),
        },
    },
    module: {
        rules: [
@ -29,7 +33,7 @@ module.exports = {
                test: /\.css$/,
                use: ["style-loader", { loader: "css-loader", options: { importLoaders: 1 } }, "postcss-loader"],
            },
            { test: /\.json$/, type: "asset/inline" },
        ],
    },
    output: {
@ -37,6 +41,9 @@ module.exports = {
        path: path.resolve(__dirname, "dist"),
    },
    plugins: [
        new webpack.ProvidePlugin({
            Buffer: ["buffer", "Buffer"],
        }),
        new HtmlWebpackPlugin({
            template: path.resolve(__dirname, "public/index.html"),
            filename: "./index.html",
@ -47,5 +54,5 @@ module.exports = {
        new CopyPlugin({
            patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
        }),
    ]
    ],
};
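Context for the additions above: webpack 5 no longer ships polyfills for Node core modules, so the resolve.fallback entry for buffer together with the ProvidePlugin shim is what makes Buffer usable from browser code. A minimal sketch of the consuming side (illustrative only, not part of this commit):

import { Buffer } from "buffer"; // resolved to the npm "buffer" package via the fallback above

// With the ProvidePlugin entry, the explicit import can even be omitted;
// webpack injects the polyfill wherever Buffer is referenced.
const riffHeader = Buffer.from("RIFF", "ascii");
console.log(riffHeader.length); // 4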

View File

@ -12,7 +12,7 @@ export type VoiceChangerWorkletListener = {
};
export type InternalCallback = {
    processAudio: (data: Uint8Array) => Uint8Array;
    processAudio: (data: Uint8Array) => Promise<Uint8Array>;
};
export class VoiceChangerWorkletNode extends AudioWorkletNode {
@ -224,18 +224,23 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
            downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
        }
        // Float to Int16
        const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
        const dataView = new DataView(arrayBuffer);
        for (let i = 0; i < downsampledBuffer.length; i++) {
            let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
            s = s < 0 ? s * 0x8000 : s * 0x7fff;
            dataView.setInt16(i * 2, s, true);
        // Float to Int16 (for the internal protocol, the data stays as float.)
        if (this.setting.protocol != "internal") {
            const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
            const dataView = new DataView(arrayBuffer);
            for (let i = 0; i < downsampledBuffer.length; i++) {
                let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
                s = s < 0 ? s * 0x8000 : s * 0x7fff;
                dataView.setInt16(i * 2, s, true);
            }
            // Buffering
            this.requestChunks.push(arrayBuffer);
        } else {
            // internal
            // console.log("downsampledBuffer.buffer", downsampledBuffer.buffer);
            this.requestChunks.push(downsampledBuffer.buffer);
        }
        // Buffering
        this.requestChunks.push(arrayBuffer);
        //// If the request buffer does not yet hold the number of chunks to send per request, stop processing here.
        if (this.requestChunks.length < this.setting.inputChunkNum) {
            return;
@ -290,7 +295,10 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
            this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED, `[AudioWorkletNode] internal audio process callback is not initialized`);
            return;
        }
        const res = this.internalCallback.processAudio(newBuffer);
        const res = await this.internalCallback.processAudio(newBuffer);
        if (res.length == 0) {
            return;
        }
        if (this.outputNode != null) {
            this.outputNode.postReceivedVoice(res.buffer);
        } else {
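Context for the hunks above: the InternalCallback type now returns a Promise, the node awaits it, and a zero-length result is treated as "nothing to play". A minimal sketch of a conforming callback (illustrative only; fakeInference is a hypothetical stand-in for the real conversion step):

type InternalCallback = {
    processAudio: (data: Uint8Array) => Promise<Uint8Array>;
};

// Hypothetical async stand-in for the actual voice conversion.
const fakeInference = async (data: Uint8Array): Promise<Uint8Array> => data;

const callback: InternalCallback = {
    processAudio: async (data) => {
        const converted = await fakeInference(data);
        // Returning a zero-length array makes the node skip output for this chunk (see the res.length check above).
        return converted;
    },
};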

View File

@ -42,6 +42,7 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
    private isRecording = false;
    playBuffer: Float32Array[] = [];
    unpushedF32Data: Float32Array = new Float32Array(0);
    /**
     * @constructor
     */
@ -105,11 +106,16 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
        }
        const f32Data = request.voice;
        const chunkNum = f32Data.length / this.BLOCK_SIZE;
        const concatedF32Data = new Float32Array(this.unpushedF32Data.length + f32Data.length);
        concatedF32Data.set(this.unpushedF32Data);
        concatedF32Data.set(f32Data, this.unpushedF32Data.length);
        const chunkNum = Math.floor(concatedF32Data.length / this.BLOCK_SIZE);
        for (let i = 0; i < chunkNum; i++) {
            const block = f32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
            const block = concatedF32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
            this.playBuffer.push(block);
        }
        this.unpushedF32Data = concatedF32Data.slice(chunkNum * this.BLOCK_SIZE);
    }
    pushData = (inputData: Float32Array) => {
@ -133,10 +139,10 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
        }
        if (this.playBuffer.length === 0) {
            // console.log("[worklet] no play buffer")
            // console.log("[worklet] no play buffer");
            return true;
        }
        // console.log("[worklet] play buffer");
        //// Skip when silence has continued for a certain period.
        // let voice: Float32Array | undefined
        // while (true) {
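Context for the hunk above: incoming voice data is now prepended with any samples left over from the previous message, chunked into whole blocks, and the remainder is kept in unpushedF32Data, so nothing is dropped when the payload length is not a multiple of BLOCK_SIZE. The same carry-over logic in isolation (a sketch; the 128-sample block size is an assumption matching the Web Audio render quantum):

const BLOCK_SIZE = 128; // assumed render-quantum size
let carry = new Float32Array(0);

const chunkWithCarry = (incoming: Float32Array): Float32Array[] => {
    // Prepend the leftover samples from the previous call.
    const merged = new Float32Array(carry.length + incoming.length);
    merged.set(carry);
    merged.set(incoming, carry.length);
    // Emit only whole blocks and keep the remainder for next time.
    const blocks: Float32Array[] = [];
    const n = Math.floor(merged.length / BLOCK_SIZE);
    for (let i = 0; i < n; i++) {
        blocks.push(merged.slice(i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE));
    }
    carry = merged.slice(n * BLOCK_SIZE);
    return blocks;
};

// With an empty carry and 300 incoming samples this yields 2 blocks and keeps 44 samples for the next call.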

View File

@ -40,6 +40,10 @@ class MMVC_SocketIOApp:
"filename": f"{getFrontendPath()}/assets/buymeacoffee.png",
"content_type": "image/png",
},
"/ort-wasm-simd.wasm": {
"filename": f"{getFrontendPath()}/ort-wasm-simd.wasm",
"content_type": "application/wasm",
},
"": f"{getFrontendPath()}",
"/": f"{getFrontendPath()}/index.html",
},
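The new static-file entry above serves onnxruntime-web's ort-wasm-simd.wasm with the application/wasm content type, which is what the browser's streaming WebAssembly APIs require to compile the module straight from the HTTP response. A sketch of the client side (illustrative only):

// Compiles the module while it downloads; this only works when the server
// responds with Content-Type: application/wasm, as configured above.
const module = await WebAssembly.compileStreaming(fetch("/ort-wasm-simd.wasm"));
console.log(WebAssembly.Module.imports(module).length);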

View File

@ -93,9 +93,9 @@ class Pipeline(object):
                pitch = None
                pitchf = None
        except IndexError as e:  # NOQA
            # print(e)
            # import traceback
            # traceback.print_exc()
            print(e)
            import traceback
            traceback.print_exc()
            raise NotEnoughDataExtimateF0()
        return pitch, pitchf

View File

@ -21,7 +21,6 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
        )
    def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
        n_frames = int(len(audio) // window) + 1
        start_frame = int(silence_front * sr / window)
        real_silence_front = start_frame * window / sr

View File

@ -0,0 +1,160 @@
# For experiments. Not used.
import torch
from torchcrepe.model import Crepe
import os

CENTS_PER_BIN = 20
PITCH_BINS = 360


class TorchCrepe2(torch.nn.Module):
    def __init__(self, model='full'):
        super().__init__()

        self.crepe = Crepe(model)
        file = os.path.join(os.path.dirname(__file__), f'{model}.pth')
        self.crepe.load_state_dict(torch.load(file, map_location="cpu"))
        self.crepe = self.crepe.to(torch.device("cpu"))
        self.crepe.eval()

        self.sample_rate = 16000
        self.hop_length = 160
        self.window_size = 1024

    def forward(self, audio, f0_min: int, f0_max: int):
        # total_frames = 1 + int(audio.size(1) // self.hop_length)
        audio = torch.nn.functional.pad(audio, (self.window_size // 2, self.window_size // 2))
        # batch_size = total_frames
        start = 0
        end = audio.size(1)

        # Chunk
        frames = torch.nn.functional.unfold(
            audio[:, None, None, start:end],
            kernel_size=(1, self.window_size),
            stride=(1, self.hop_length))
        frames = frames.transpose(1, 2).reshape(-1, self.window_size)

        # Place on device
        # frames = frames.to(device)

        # Mean-center
        frames -= frames.mean(dim=1, keepdim=True)

        # Scale
        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
                            frames.std(dim=1, keepdim=True))

        probabilities = self.crepe(frames.to(torch.device("cpu")))
        probabilities = probabilities.reshape(
            audio.size(0), -1, PITCH_BINS).transpose(1, 2)

        minidx = frequency_to_bins(torch.tensor(f0_min))
        maxidx = frequency_to_bins(torch.tensor(f0_max), torch.ceil)

        probabilities[:, :minidx] = -float('inf')
        probabilities[:, maxidx:] = -float('inf')

        bins, pitch = weighted_argmax(probabilities)

        return pitch, periodicity(probabilities, bins)


def weighted_argmax(logits):
    """Sample observations using weighted sum near the argmax"""
    # Find center of analysis window
    bins = logits.argmax(dim=1)

    # Find bounds of analysis window
    start = torch.max(torch.tensor(0, device=logits.device), bins - 4)
    end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5)

    # Mask out everything outside of window
    for batch in range(logits.size(0)):
        for time in range(logits.size(2)):
            logits[batch, :start[batch, time], time] = -float('inf')
            logits[batch, end[batch, time]:, time] = -float('inf')

    # Construct weights
    if not hasattr(weighted_argmax, 'weights'):
        weights = bins_to_cents(torch.arange(360))
        weighted_argmax.weights = weights[None, :, None]

    # Ensure devices are the same (no-op if they are)
    weighted_argmax.weights = weighted_argmax.weights.to(logits.device)

    # Convert to probabilities
    with torch.no_grad():
        probs = torch.sigmoid(logits)

    # Apply weights
    cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1)

    # Convert to frequency in Hz
    return bins, cents_to_frequency(cents)


def bins_to_cents(bins):
    """Converts pitch bins to cents"""
    cents = CENTS_PER_BIN * bins + 1997.3794084376191

    # Trade quantization error for noise
    return dither(cents)


def dither(cents):
    """Dither the predicted pitch in cents to remove quantization error"""
    # noise = scipy.stats.triang.rvs(c=0.5,
    #                                loc=-CENTS_PER_BIN,
    #                                scale=2 * CENTS_PER_BIN,
    #                                size=cents.size())

    # Torch rewrite of the triangular distribution. The value is exact only when c=0.5; otherwise it is an approximation.
    c = 0.5
    loc = -CENTS_PER_BIN
    scale = 2 * CENTS_PER_BIN
    u = torch.rand(cents.size())
    # f = (c - u) / (scale / 2) if u < c else (u - c) / (scale / 2)
    f = torch.where(u < c, (c - u) / (scale / 2), (u - c) / (scale / 2))
    noise = 2 * scale * ((1 - f.abs()) ** 0.5) + loc
    mask = u >= c
    noise[mask] = 2 * (scale - noise[mask])
    return cents + cents.new_tensor(noise)


def cents_to_frequency(cents):
    """Converts cents to frequency in Hz"""
    return 10 * 2 ** (cents / 1200)


def periodicity(probabilities, bins):
    """Computes the periodicity from the network output and pitch bins"""
    # shape=(batch * time / hop_length, 360)
    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)

    # shape=(batch * time / hop_length, 1)
    bins_stacked = bins.reshape(-1, 1).to(torch.int64)

    # Use maximum logit over pitch bins as periodicity
    periodicity = probs_stacked.gather(1, bins_stacked)

    # shape=(batch, time / hop_length)
    return periodicity.reshape(probabilities.size(0), probabilities.size(2))


def cents_to_bins(cents, quantize_fn=torch.floor):
    """Converts cents to pitch bins"""
    bins = (cents - 1997.3794084376191) / CENTS_PER_BIN
    return quantize_fn(bins).int()


def frequency_to_bins(frequency, quantize_fn=torch.floor):
    """Convert frequency in Hz to pitch bins"""
    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)


def frequency_to_cents(frequency):
    """Convert frequency in Hz to cents"""
    return 1200 * torch.log2(frequency / 10.)
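The conversion helpers at the end follow torchcrepe's convention of 20-cent pitch bins referenced to 10 Hz; in equation form (matching CENTS_PER_BIN = 20 and the 1997.3794084376191 offset above, with floor as the default quantizer):

\mathrm{cents}(f) = 1200 \log_2\!\left(\frac{f}{10\,\mathrm{Hz}}\right), \qquad
f(\mathrm{cents}) = 10 \cdot 2^{\mathrm{cents}/1200}, \qquad
\mathrm{bin}(\mathrm{cents}) = \left\lfloor \frac{\mathrm{cents} - 1997.3794084376191}{20} \right\rfloor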