mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-02-02 16:23:58 +03:00
update
This commit is contained in:
parent
2b9a497eb0
commit
782d003a91
3
.gitignore
vendored
3
.gitignore
vendored
@ -36,6 +36,9 @@ server/memo.md
|
||||
|
||||
client/lib/dist
|
||||
client/lib/worklet/dist
|
||||
client/demo/public/models
|
||||
client/demo/dist/models
|
||||
client/demo/src/001_provider/backup
|
||||
# client/demo/dist/ # demo用に残す
|
||||
|
||||
docker/cudnn/
|
||||
|
@ -1,11 +1,15 @@
|
||||
const path = require("path");
|
||||
const HtmlWebpackPlugin = require("html-webpack-plugin");
|
||||
const CopyPlugin = require("copy-webpack-plugin");
|
||||
const webpack = require("webpack");
|
||||
module.exports = {
|
||||
mode: "production",
|
||||
entry: "./src/000_index.tsx",
|
||||
resolve: {
|
||||
extensions: [".ts", ".tsx", ".js"],
|
||||
fallback: {
|
||||
buffer: require.resolve("buffer/"),
|
||||
},
|
||||
},
|
||||
module: {
|
||||
rules: [
|
||||
@ -29,7 +33,7 @@ module.exports = {
|
||||
test: /\.css$/,
|
||||
use: ["style-loader", { loader: "css-loader", options: { importLoaders: 1 } }, "postcss-loader"],
|
||||
},
|
||||
|
||||
{ test: /\.json$/, type: "asset/inline" },
|
||||
],
|
||||
},
|
||||
output: {
|
||||
@ -37,6 +41,9 @@ module.exports = {
|
||||
path: path.resolve(__dirname, "dist"),
|
||||
},
|
||||
plugins: [
|
||||
new webpack.ProvidePlugin({
|
||||
Buffer: ["buffer", "Buffer"],
|
||||
}),
|
||||
new HtmlWebpackPlugin({
|
||||
template: path.resolve(__dirname, "public/index.html"),
|
||||
filename: "./index.html",
|
||||
@ -47,5 +54,5 @@ module.exports = {
|
||||
new CopyPlugin({
|
||||
patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
|
||||
}),
|
||||
]
|
||||
],
|
||||
};
|
||||
|
@ -12,7 +12,7 @@ export type VoiceChangerWorkletListener = {
|
||||
};
|
||||
|
||||
export type InternalCallback = {
|
||||
processAudio: (data: Uint8Array) => Uint8Array;
|
||||
processAudio: (data: Uint8Array) => Promise<Uint8Array>;
|
||||
};
|
||||
|
||||
export class VoiceChangerWorkletNode extends AudioWorkletNode {
|
||||
@ -224,18 +224,23 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
|
||||
downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
|
||||
}
|
||||
|
||||
// Float to Int16
|
||||
const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
|
||||
const dataView = new DataView(arrayBuffer);
|
||||
for (let i = 0; i < downsampledBuffer.length; i++) {
|
||||
let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
|
||||
s = s < 0 ? s * 0x8000 : s * 0x7fff;
|
||||
dataView.setInt16(i * 2, s, true);
|
||||
// Float to Int16 (internalの場合はfloatのまま行く。)
|
||||
if (this.setting.protocol != "internal") {
|
||||
const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
|
||||
const dataView = new DataView(arrayBuffer);
|
||||
for (let i = 0; i < downsampledBuffer.length; i++) {
|
||||
let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
|
||||
s = s < 0 ? s * 0x8000 : s * 0x7fff;
|
||||
dataView.setInt16(i * 2, s, true);
|
||||
}
|
||||
// バッファリング
|
||||
this.requestChunks.push(arrayBuffer);
|
||||
} else {
|
||||
// internal
|
||||
// console.log("downsampledBuffer.buffer", downsampledBuffer.buffer);
|
||||
this.requestChunks.push(downsampledBuffer.buffer);
|
||||
}
|
||||
|
||||
// バッファリング
|
||||
this.requestChunks.push(arrayBuffer);
|
||||
|
||||
//// リクエストバッファの中身が、リクエスト送信数と違う場合は処理終了。
|
||||
if (this.requestChunks.length < this.setting.inputChunkNum) {
|
||||
return;
|
||||
@ -290,7 +295,10 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
|
||||
this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED, `[AudioWorkletNode] internal audio process callback is not initialized`);
|
||||
return;
|
||||
}
|
||||
const res = this.internalCallback.processAudio(newBuffer);
|
||||
const res = await this.internalCallback.processAudio(newBuffer);
|
||||
if (res.length == 0) {
|
||||
return;
|
||||
}
|
||||
if (this.outputNode != null) {
|
||||
this.outputNode.postReceivedVoice(res.buffer);
|
||||
} else {
|
||||
|
@ -42,6 +42,7 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
|
||||
private isRecording = false;
|
||||
|
||||
playBuffer: Float32Array[] = [];
|
||||
unpushedF32Data: Float32Array = new Float32Array(0);
|
||||
/**
|
||||
* @constructor
|
||||
*/
|
||||
@ -105,11 +106,16 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
|
||||
}
|
||||
|
||||
const f32Data = request.voice;
|
||||
const chunkNum = f32Data.length / this.BLOCK_SIZE;
|
||||
const concatedF32Data = new Float32Array(this.unpushedF32Data.length + f32Data.length);
|
||||
concatedF32Data.set(this.unpushedF32Data);
|
||||
concatedF32Data.set(f32Data, this.unpushedF32Data.length);
|
||||
|
||||
const chunkNum = Math.floor(concatedF32Data.length / this.BLOCK_SIZE);
|
||||
for (let i = 0; i < chunkNum; i++) {
|
||||
const block = f32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
|
||||
const block = concatedF32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
|
||||
this.playBuffer.push(block);
|
||||
}
|
||||
this.unpushedF32Data = concatedF32Data.slice(chunkNum * this.BLOCK_SIZE);
|
||||
}
|
||||
|
||||
pushData = (inputData: Float32Array) => {
|
||||
@ -133,10 +139,10 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
|
||||
}
|
||||
|
||||
if (this.playBuffer.length === 0) {
|
||||
// console.log("[worklet] no play buffer")
|
||||
// console.log("[worklet] no play buffer");
|
||||
return true;
|
||||
}
|
||||
|
||||
// console.log("[worklet] play buffer");
|
||||
//// 一定期間無音状態が続いている場合はスキップ。
|
||||
// let voice: Float32Array | undefined
|
||||
// while (true) {
|
||||
|
@ -40,6 +40,10 @@ class MMVC_SocketIOApp:
|
||||
"filename": f"{getFrontendPath()}/assets/buymeacoffee.png",
|
||||
"content_type": "image/png",
|
||||
},
|
||||
"/ort-wasm-simd.wasm": {
|
||||
"filename": f"{getFrontendPath()}/ort-wasm-simd.wasm",
|
||||
"content_type": "application/wasm",
|
||||
},
|
||||
"": f"{getFrontendPath()}",
|
||||
"/": f"{getFrontendPath()}/index.html",
|
||||
},
|
||||
|
@ -93,9 +93,9 @@ class Pipeline(object):
|
||||
pitch = None
|
||||
pitchf = None
|
||||
except IndexError as e: # NOQA
|
||||
# print(e)
|
||||
# import traceback
|
||||
# traceback.print_exc()
|
||||
print(e)
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
raise NotEnoughDataExtimateF0()
|
||||
return pitch, pitchf
|
||||
|
||||
|
@ -21,7 +21,6 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
|
||||
)
|
||||
|
||||
def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
|
||||
n_frames = int(len(audio) // window) + 1
|
||||
start_frame = int(silence_front * sr / window)
|
||||
real_silence_front = start_frame * window / sr
|
||||
|
||||
|
@ -0,0 +1,160 @@
|
||||
# Experimental only; not used.
|
||||
|
||||
import torch
|
||||
from torchcrepe.model import Crepe
|
||||
import os
|
||||
CENTS_PER_BIN = 20
|
||||
PITCH_BINS = 360
|
||||
|
||||
|
||||
class TorchCrepe2(torch.nn.Module):
    """CPU-only module wrapping a torchcrepe ``Crepe`` network.

    Construction loads ``<model>.pth`` from the directory that contains this
    file. ``forward`` frames the audio, runs the network and decodes the
    result into pitch and periodicity.
    """

    def __init__(self, model='full'):
        super().__init__()

        weights_path = os.path.join(os.path.dirname(__file__), f'{model}.pth')
        network = Crepe(model)
        network.load_state_dict(torch.load(weights_path, map_location="cpu"))
        self.crepe = network.to(torch.device("cpu"))
        self.crepe.eval()

        self.sample_rate = 16000
        self.hop_length = 160
        self.window_size = 1024

    def forward(self, audio, f0_min: int, f0_max: int):
        """Return ``(pitch, periodicity)`` for ``audio``.

        ``audio`` is indexed as (batch, samples). ``f0_min`` / ``f0_max`` are
        converted to pitch bins and every bin outside that range is set to
        -inf before decoding.
        """
        # Pad half a window on each side before framing
        half_window = self.window_size // 2
        audio = torch.nn.functional.pad(audio, (half_window, half_window))

        # Cut the padded signal into overlapping windows, one row per frame
        windows = torch.nn.functional.unfold(
            audio[:, None, None, 0:audio.size(1)],
            kernel_size=(1, self.window_size),
            stride=(1, self.hop_length))
        frames = windows.transpose(1, 2).reshape(-1, self.window_size)

        # Mean-center, then scale by the standard deviation (floored at 1e-10)
        frames -= frames.mean(dim=1, keepdim=True)
        std_floor = torch.tensor(1e-10, device=frames.device)
        frames /= torch.max(std_floor, frames.std(dim=1, keepdim=True))

        # Network output is rearranged to (batch, PITCH_BINS, frames)
        probabilities = self.crepe(frames.to(torch.device("cpu")))
        probabilities = probabilities.reshape(audio.size(0), -1, PITCH_BINS).transpose(1, 2)

        # Suppress bins outside the requested frequency range
        lowest_bin = frequency_to_bins(torch.tensor(f0_min))
        highest_bin = frequency_to_bins(torch.tensor(f0_max), torch.ceil)
        probabilities[:, :lowest_bin] = -float('inf')
        probabilities[:, highest_bin:] = -float('inf')

        # weighted_argmax masks ``probabilities`` in place before periodicity reads it
        bins, pitch = weighted_argmax(probabilities)
        return pitch, periodicity(probabilities, bins)
|
||||
|
||||
|
||||
def weighted_argmax(logits):
    """Decode pitch as a weighted average of the bins around each argmax.

    ``logits`` is indexed as (batch, bin, time) and is modified in place:
    every bin outside the window ``[argmax - 4, argmax + 5)`` (clamped to the
    valid bin range) is set to -inf.

    Returns the argmax bins and the decoded frequency in Hz.
    """
    device = logits.device
    bin_count = logits.size(1)

    # Center of the analysis window for every (batch, time) position
    bins = logits.argmax(dim=1)

    # Window bounds, clamped to the valid bin range
    lower = torch.max(torch.tensor(0, device=device), bins - 4)
    upper = torch.min(torch.tensor(bin_count, device=device), bins + 5)

    # Set everything outside the window to -inf, in place
    bin_index = torch.arange(bin_count, device=device)[None, :, None]
    outside_window = (bin_index < lower[:, None, :]) | (bin_index >= upper[:, None, :])
    logits.masked_fill_(outside_window, -float('inf'))

    # The per-bin cents weights are built once and cached on the function
    # object; bins_to_cents dithers, so the cache keeps the first draw.
    if not hasattr(weighted_argmax, 'weights'):
        weighted_argmax.weights = bins_to_cents(torch.arange(360))[None, :, None]

    # Keep the cached weights on the same device as the input (no-op if equal)
    weighted_argmax.weights = weighted_argmax.weights.to(device)

    # Convert to probabilities
    with torch.no_grad():
        probs = torch.sigmoid(logits)

    # Probability-weighted mean of the cents values
    weighted_total = (weighted_argmax.weights * probs).sum(dim=1)
    cents = weighted_total / probs.sum(dim=1)

    # Convert to frequency in Hz
    return bins, cents_to_frequency(cents)
|
||||
|
||||
|
||||
def bins_to_cents(bins):
    """Map pitch-bin indices to cents and dither the result."""
    # Bin 0 sits at 1997.3794084376191 cents; each bin adds CENTS_PER_BIN
    undithered = CENTS_PER_BIN * bins + 1997.3794084376191

    # Trade quantization error for noise
    return dither(undithered)
|
||||
|
||||
|
||||
def dither(cents):
    """Dither the predicted pitch in cents to remove quantization error.

    Adds noise drawn from a triangular distribution on
    ``[-CENTS_PER_BIN, +CENTS_PER_BIN]`` with its mode at 0. This is the
    torch equivalent of::

        scipy.stats.triang.rvs(c=0.5,
                               loc=-CENTS_PER_BIN,
                               scale=2 * CENTS_PER_BIN,
                               size=cents.size())

    Args:
        cents: tensor of pitch values in cents.

    Returns:
        A tensor with the shape of ``cents`` holding ``cents + noise``.
    """
    c = 0.5  # mode of the unit triangular distribution
    loc = -CENTS_PER_BIN
    scale = 2 * CENTS_PER_BIN

    u = torch.rand(cents.size(), device=cents.device)

    # Inverse-CDF sampling of the triangular distribution on [0, 1] with
    # mode c (exact for any 0 < c < 1):
    #   u <  c : x = sqrt(u * c)
    #   u >= c : x = 1 - sqrt((1 - u) * (1 - c))
    unit_sample = torch.where(
        u < c,
        torch.sqrt(u * c),
        1 - torch.sqrt((1 - u) * (1 - c)),
    )

    # Shift and scale onto [loc, loc + scale]
    noise = loc + scale * unit_sample

    # Cast to the input dtype, as the original new_tensor() call did
    return cents + noise.to(dtype=cents.dtype)
|
||||
|
||||
|
||||
def cents_to_frequency(cents):
    """Convert cents to frequency in Hz (0 cents is 10 Hz, 1200 cents per octave)."""
    octaves = cents / 1200
    return 10 * 2 ** octaves
|
||||
|
||||
|
||||
def periodicity(probabilities, bins):
    """Compute the periodicity from the network output and pitch bins.

    Args:
        probabilities: tensor indexed as (batch, bin, time).
        bins: selected bin index for every (batch, time) position.

    Returns:
        Tensor of shape (batch, time) holding, for each position, the value
        of ``probabilities`` at the selected bin.
    """
    batch_size, bin_count, frame_count = probabilities.size()

    # shape=(batch * time, bin_count); the bin count is read from the tensor
    # itself so it always agrees with the final reshape below
    probs_stacked = probabilities.transpose(1, 2).reshape(-1, bin_count)

    # shape=(batch * time, 1); gather requires int64 indices
    bins_stacked = bins.reshape(-1, 1).to(torch.int64)

    # Use the value at the selected pitch bin as periodicity
    selected = probs_stacked.gather(1, bins_stacked)

    # shape=(batch, time)
    return selected.reshape(batch_size, frame_count)
|
||||
|
||||
|
||||
def cents_to_bins(cents, quantize_fn=torch.floor):
    """Convert cents to integer pitch bins, quantizing with ``quantize_fn``."""
    fractional_bins = (cents - 1997.3794084376191) / CENTS_PER_BIN
    quantized = quantize_fn(fractional_bins)
    return quantized.int()
|
||||
|
||||
|
||||
def frequency_to_bins(frequency, quantize_fn=torch.floor):
    """Convert frequency in Hz to pitch bins, quantizing with ``quantize_fn``."""
    cents = frequency_to_cents(frequency)
    return cents_to_bins(cents, quantize_fn)
|
||||
|
||||
|
||||
def frequency_to_cents(frequency):
    """Convert frequency in Hz to cents (10 Hz is 0 cents, 1200 cents per octave)."""
    ratio_to_reference = frequency / 10.
    return 1200 * torch.log2(ratio_to_reference)
|
Loading…
Reference in New Issue
Block a user