Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-02-02 16:23:58 +03:00)
commit 782d003a91 (parent 2b9a497eb0)

    update
.gitignore (vendored) | 3 +++
@@ -36,6 +36,9 @@ server/memo.md
 
 client/lib/dist
 client/lib/worklet/dist
+client/demo/public/models
+client/demo/dist/models
+client/demo/src/001_provider/backup
 # client/demo/dist/ # keep for the demo
 
 docker/cudnn/
@@ -1,11 +1,15 @@
 const path = require("path");
 const HtmlWebpackPlugin = require("html-webpack-plugin");
 const CopyPlugin = require("copy-webpack-plugin");
+const webpack = require("webpack");
 module.exports = {
     mode: "production",
     entry: "./src/000_index.tsx",
     resolve: {
         extensions: [".ts", ".tsx", ".js"],
+        fallback: {
+            buffer: require.resolve("buffer/"),
+        },
     },
     module: {
         rules: [
@@ -29,7 +33,7 @@ module.exports = {
                 test: /\.css$/,
                 use: ["style-loader", { loader: "css-loader", options: { importLoaders: 1 } }, "postcss-loader"],
             },
+            { test: /\.json$/, type: "asset/inline" },
         ],
     },
     output: {
@@ -37,6 +41,9 @@ module.exports = {
         path: path.resolve(__dirname, "dist"),
     },
     plugins: [
+        new webpack.ProvidePlugin({
+            Buffer: ["buffer", "Buffer"],
+        }),
         new HtmlWebpackPlugin({
             template: path.resolve(__dirname, "public/index.html"),
             filename: "./index.html",
@@ -47,5 +54,5 @@ module.exports = {
         new CopyPlugin({
             patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
         }),
-    ]
+    ],
 };
@@ -12,7 +12,7 @@ export type VoiceChangerWorkletListener = {
 };
 
 export type InternalCallback = {
-    processAudio: (data: Uint8Array) => Uint8Array;
+    processAudio: (data: Uint8Array) => Promise<Uint8Array>;
 };
 
 export class VoiceChangerWorkletNode extends AudioWorkletNode {
@@ -224,18 +224,23 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
             downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
         }
 
-        // Float to Int16
-        const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
-        const dataView = new DataView(arrayBuffer);
-        for (let i = 0; i < downsampledBuffer.length; i++) {
-            let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
-            s = s < 0 ? s * 0x8000 : s * 0x7fff;
-            dataView.setInt16(i * 2, s, true);
-        }
-
-        // Buffering
-        this.requestChunks.push(arrayBuffer);
+        // Float to Int16 (for the "internal" protocol, keep the data as float)
+        if (this.setting.protocol != "internal") {
+            const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
+            const dataView = new DataView(arrayBuffer);
+            for (let i = 0; i < downsampledBuffer.length; i++) {
+                let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
+                s = s < 0 ? s * 0x8000 : s * 0x7fff;
+                dataView.setInt16(i * 2, s, true);
+            }
+            // Buffering
+            this.requestChunks.push(arrayBuffer);
+        } else {
+            // internal
+            // console.log("downsampledBuffer.buffer", downsampledBuffer.buffer);
+            this.requestChunks.push(downsampledBuffer.buffer);
+        }
 
         //// Stop here if the request buffer does not yet hold the configured number of chunks.
         if (this.requestChunks.length < this.setting.inputChunkNum) {
             return;
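The Int16 branch above maps each float sample in [-1, 1] to a signed 16-bit little-endian integer, scaling negatives by 0x8000 and positives by 0x7fff so that both endpoints stay in range. A numpy sketch of the same conversion (illustrative only; float_block stands in for downsampledBuffer):

import numpy as np

def float32_to_int16le(float_block: np.ndarray) -> bytes:
    # Same mapping as the worklet's DataView loop: clamp to [-1, 1],
    # scale negatives by 0x8000 and positives by 0x7fff, pack little-endian.
    s = np.clip(float_block, -1.0, 1.0)
    scaled = np.where(s < 0, s * 0x8000, s * 0x7fff)
    return scaled.astype("<i2").tobytes()  # "<i2" = little-endian int16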
@@ -290,7 +295,10 @@ export class VoiceChangerWorkletNode extends AudioWorkletNode {
                 this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED, `[AudioWorkletNode] internal audio process callback is not initialized`);
                 return;
             }
-            const res = this.internalCallback.processAudio(newBuffer);
+            const res = await this.internalCallback.processAudio(newBuffer);
+            if (res.length == 0) {
+                return;
+            }
             if (this.outputNode != null) {
                 this.outputNode.postReceivedVoice(res.buffer);
             } else {
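processAudio is now awaited, and an empty result is skipped rather than forwarded to the output node. A hypothetical Python analogue of such an awaitable callback; convert() is a stand-in for the real, blocking voice-conversion call:

import asyncio
import numpy as np

def convert(block: np.ndarray) -> np.ndarray:
    return block  # placeholder for the actual (blocking) conversion

async def process_audio(data: np.ndarray) -> np.ndarray:
    # Run the blocking conversion off the event loop; returning an empty
    # array tells the caller there is nothing to play back yet.
    out = await asyncio.to_thread(convert, data)
    return out if out.size else np.empty(0, dtype=np.float32)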
@@ -42,6 +42,7 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
     private isRecording = false;
 
     playBuffer: Float32Array[] = [];
+    unpushedF32Data: Float32Array = new Float32Array(0);
     /**
      * @constructor
      */
@@ -105,11 +106,16 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
         }
 
         const f32Data = request.voice;
-        const chunkNum = f32Data.length / this.BLOCK_SIZE;
+        const concatedF32Data = new Float32Array(this.unpushedF32Data.length + f32Data.length);
+        concatedF32Data.set(this.unpushedF32Data);
+        concatedF32Data.set(f32Data, this.unpushedF32Data.length);
+
+        const chunkNum = Math.floor(concatedF32Data.length / this.BLOCK_SIZE);
         for (let i = 0; i < chunkNum; i++) {
-            const block = f32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
+            const block = concatedF32Data.slice(i * this.BLOCK_SIZE, (i + 1) * this.BLOCK_SIZE);
             this.playBuffer.push(block);
         }
+        this.unpushedF32Data = concatedF32Data.slice(chunkNum * this.BLOCK_SIZE);
     }
 
     pushData = (inputData: Float32Array) => {
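Before this change, chunkNum could be fractional and any tail shorter than BLOCK_SIZE was silently dropped; the fix prepends the leftover from the previous message, emits only whole blocks, and carries the remainder forward. A numpy sketch of that carry-over logic (names illustrative; 128 assumes the Web Audio render-quantum block size):

import numpy as np

BLOCK_SIZE = 128  # assumed render-quantum size

def push_voice(leftover: np.ndarray, f32_data: np.ndarray):
    # Returns (blocks, new_leftover): whole BLOCK_SIZE blocks plus the remainder.
    buf = np.concatenate([leftover, f32_data])
    n_blocks = len(buf) // BLOCK_SIZE  # floor division, as in the fixed code
    blocks = [buf[i * BLOCK_SIZE:(i + 1) * BLOCK_SIZE] for i in range(n_blocks)]
    return blocks, buf[n_blocks * BLOCK_SIZE:]

blocks, rest = push_voice(np.zeros(0, np.float32), np.ones(300, np.float32))
assert len(blocks) == 2 and len(rest) == 44  # 300 = 2 * 128 + 44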
@@ -133,10 +139,10 @@ class VoiceChangerWorkletProcessor extends AudioWorkletProcessor {
         }
 
         if (this.playBuffer.length === 0) {
-            // console.log("[worklet] no play buffer")
+            // console.log("[worklet] no play buffer");
             return true;
         }
-
+        // console.log("[worklet] play buffer");
         //// Skip if silence has continued for a while.
         // let voice: Float32Array | undefined
         // while (true) {
@@ -40,6 +40,10 @@ class MMVC_SocketIOApp:
                 "filename": f"{getFrontendPath()}/assets/buymeacoffee.png",
                 "content_type": "image/png",
             },
+            "/ort-wasm-simd.wasm": {
+                "filename": f"{getFrontendPath()}/ort-wasm-simd.wasm",
+                "content_type": "application/wasm",
+            },
             "": f"{getFrontendPath()}",
             "/": f"{getFrontendPath()}/index.html",
         },
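The added route serves onnxruntime-web's ort-wasm-simd.wasm with the application/wasm content type, which browsers require before WebAssembly.instantiateStreaming will accept the response. A minimal sketch of the same idea (assuming a FastAPI/Starlette app; the file path is illustrative):

from fastapi import FastAPI
from fastapi.responses import FileResponse

app = FastAPI()

@app.get("/ort-wasm-simd.wasm")
def ort_wasm():
    # Without media_type="application/wasm" the browser's streaming-compile
    # path rejects the response.
    return FileResponse("frontend/ort-wasm-simd.wasm", media_type="application/wasm")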
@@ -93,9 +93,9 @@ class Pipeline(object):
                 pitch = None
                 pitchf = None
         except IndexError as e:  # NOQA
-            # print(e)
-            # import traceback
-            # traceback.print_exc()
+            print(e)
+            import traceback
+            traceback.print_exc()
             raise NotEnoughDataExtimateF0()
         return pitch, pitchf
 
@@ -21,7 +21,6 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
         )
 
     def extract(self, audio, pitchf, f0_up_key, sr, window, silence_front=0):
-        n_frames = int(len(audio) // window) + 1
         start_frame = int(silence_front * sr / window)
         real_silence_front = start_frame * window / sr
 
@@ -0,0 +1,160 @@
+# Experimental. Not used.
+
+import torch
+from torchcrepe.model import Crepe
+import os
+
+CENTS_PER_BIN = 20
+PITCH_BINS = 360
+
+
+class TorchCrepe2(torch.nn.Module):
+
+    def __init__(self, model='full'):
+        super().__init__()
+        self.crepe = Crepe(model)
+        file = os.path.join(os.path.dirname(__file__), f'{model}.pth')
+        self.crepe.load_state_dict(torch.load(file, map_location="cpu"))
+        self.crepe = self.crepe.to(torch.device("cpu"))
+        self.crepe.eval()
+
+        self.sample_rate = 16000
+        self.hop_length = 160
+        self.window_size = 1024
+
+    def forward(self, audio, f0_min: int, f0_max: int):
+        # total_frames = 1 + int(audio.size(1) // self.hop_length)
+        audio = torch.nn.functional.pad(audio, (self.window_size // 2, self.window_size // 2))
+        # batch_size = total_frames
+
+        start = 0
+        end = audio.size(1)
+
+        # Chunk
+        frames = torch.nn.functional.unfold(
+            audio[:, None, None, start:end],
+            kernel_size=(1, self.window_size),
+            stride=(1, self.hop_length))
+
+        frames = frames.transpose(1, 2).reshape(-1, self.window_size)
+
+        # Place on device
+        # frames = frames.to(device)
+
+        # Mean-center
+        frames -= frames.mean(dim=1, keepdim=True)
+
+        # Scale
+        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
+                            frames.std(dim=1, keepdim=True))
+
+        probabilities = self.crepe(frames.to(torch.device("cpu")))
+        probabilities = probabilities.reshape(
+            audio.size(0), -1, PITCH_BINS).transpose(1, 2)
+
+        minidx = frequency_to_bins(torch.tensor(f0_min))
+        maxidx = frequency_to_bins(torch.tensor(f0_max), torch.ceil)
+
+        probabilities[:, :minidx] = -float('inf')
+        probabilities[:, maxidx:] = -float('inf')
+
+        bins, pitch = weighted_argmax(probabilities)
+
+        return pitch, periodicity(probabilities, bins)
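A hypothetical usage sketch for TorchCrepe2 (assumes torchcrepe is installed and a full.pth checkpoint sits next to the module, as __init__ expects). With hop_length = 160, one second of 16 kHz audio yields 16000 / 160 + 1 = 101 pitch frames:

import torch

model = TorchCrepe2(model='full')      # loads <module dir>/full.pth
audio = torch.randn(1, 16000)          # (batch, samples) at 16 kHz
with torch.no_grad():
    pitch, confidence = model(audio, f0_min=50, f0_max=1100)
print(pitch.shape, confidence.shape)    # both (1, 101): one value per hop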
+def weighted_argmax(logits):
+    """Sample observations using weighted sum near the argmax"""
+    # Find center of analysis window
+    bins = logits.argmax(dim=1)
+
+    # Find bounds of analysis window
+    start = torch.max(torch.tensor(0, device=logits.device), bins - 4)
+    end = torch.min(torch.tensor(logits.size(1), device=logits.device), bins + 5)
+
+    # Mask out everything outside of window
+    for batch in range(logits.size(0)):
+        for time in range(logits.size(2)):
+            logits[batch, :start[batch, time], time] = -float('inf')
+            logits[batch, end[batch, time]:, time] = -float('inf')
+
+    # Construct weights
+    if not hasattr(weighted_argmax, 'weights'):
+        weights = bins_to_cents(torch.arange(360))
+        weighted_argmax.weights = weights[None, :, None]
+
+    # Ensure devices are the same (no-op if they are)
+    weighted_argmax.weights = weighted_argmax.weights.to(logits.device)
+
+    # Convert to probabilities
+    with torch.no_grad():
+        probs = torch.sigmoid(logits)
+
+    # Apply weights
+    cents = (weighted_argmax.weights * probs).sum(dim=1) / probs.sum(dim=1)
+
+    # Convert to frequency in Hz
+    return bins, cents_to_frequency(cents)
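weighted_argmax refines the hard argmax to sub-bin resolution: logits outside the window [argmax - 4, argmax + 5) are set to -inf, the remainder are squashed with a sigmoid, and the estimate is the probability-weighted mean of the bin centers in cents. A toy single-frame illustration (made-up values, and without the dither that bins_to_cents adds to the real weights):

import torch

CENTS_PER_BIN = 20
logits = torch.full((360,), -float('inf'))
logits[100:105] = torch.tensor([0.1, 1.0, 3.0, 1.0, 0.1])  # peak at bin 102

probs = torch.sigmoid(logits)  # sigmoid(-inf) == 0, so masked bins drop out
centers = CENTS_PER_BIN * torch.arange(360) + 1997.3794084376191
cents = (centers * probs).sum() / probs.sum()
print(float(cents))  # exactly the bin-102 center here, since the neighbors are symmetric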
+def bins_to_cents(bins):
+    """Converts pitch bins to cents"""
+    cents = CENTS_PER_BIN * bins + 1997.3794084376191
+
+    # Trade quantization error for noise
+    return dither(cents)
+
+
+def dither(cents):
+    """Dither the predicted pitch in cents to remove quantization error"""
+    # noise = scipy.stats.triang.rvs(c=0.5,
+    #                                loc=-CENTS_PER_BIN,
+    #                                scale=2 * CENTS_PER_BIN,
+    #                                size=cents.size())
+
+    # Torch rewrite of the triangular distribution. Exact only for c=0.5; approximate otherwise.
+    c = 0.5
+    loc = -CENTS_PER_BIN
+    scale = 2 * CENTS_PER_BIN
+    u = torch.rand(cents.size())
+    # f = (c - u) / (scale / 2) if u < c else (u - c) / (scale / 2)
+    f = torch.where(u < c, (c - u) / (scale / 2), (u - c) / (scale / 2))
+    noise = 2 * scale * ((1 - f.abs()) ** 0.5) + loc
+    mask = u >= c
+    noise[mask] = 2 * (scale - noise[mask])
+    return cents + cents.new_tensor(noise)
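The file is marked experimental, and the comment only claims the triangular rewrite is exact at c = 0.5, so it is worth checking empirically what dither() actually adds before reusing it: a faithful sample of triang(c=0.5, loc=-CENTS_PER_BIN, scale=2*CENTS_PER_BIN) must stay within plus or minus CENTS_PER_BIN of zero with mean near 0. A sketch of such a check (hypothetical test, using dither from above):

import torch

noise = dither(torch.zeros(100_000))  # zero input isolates the noise term
print(float(noise.min()), float(noise.max()), float(noise.mean()))
# Expected for the scipy reference: min near -20, max near +20, mean near 0.
# Values outside that range mean the rewrite deviates and needs revisiting.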
+def cents_to_frequency(cents):
+    """Converts cents to frequency in Hz"""
+    return 10 * 2 ** (cents / 1200)
+
+
+def periodicity(probabilities, bins):
+    """Computes the periodicity from the network output and pitch bins"""
+    # shape=(batch * time / hop_length, 360)
+    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)
+
+    # shape=(batch * time / hop_length, 1)
+    bins_stacked = bins.reshape(-1, 1).to(torch.int64)
+
+    # Use maximum logit over pitch bins as periodicity
+    periodicity = probs_stacked.gather(1, bins_stacked)
+
+    # shape=(batch, time / hop_length)
+    return periodicity.reshape(probabilities.size(0), probabilities.size(2))
+
+
+def cents_to_bins(cents, quantize_fn=torch.floor):
+    """Converts cents to pitch bins"""
+    bins = (cents - 1997.3794084376191) / CENTS_PER_BIN
+    return quantize_fn(bins).int()
+
+
+def frequency_to_bins(frequency, quantize_fn=torch.floor):
+    """Convert frequency in Hz to pitch bins"""
+    return cents_to_bins(frequency_to_cents(frequency), quantize_fn)
+
+
+def frequency_to_cents(frequency):
+    """Convert frequency in Hz to cents"""
+    return 1200 * torch.log2(frequency / 10.)
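The conversion helpers are inverses up to quantization: frequency_to_cents(f) = 1200 * log2(f / 10) and cents_to_frequency undo each other exactly, while the bin helpers quantize to CENTS_PER_BIN = 20 cents (and bins_to_cents re-adds dither, so that leg is stochastic). A round-trip sketch using the functions above:

import torch

f = torch.tensor(440.0)           # A4
cents = frequency_to_cents(f)      # 1200 * log2(44), about 6551.3 cents
back = cents_to_frequency(cents)   # exactly 440.0 again
bin_idx = frequency_to_bins(f)     # floor((6551.3 - 1997.4) / 20) = 227
print(float(cents), float(back), int(bin_idx))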