Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-02-02 16:23:58 +03:00)

Commit 9dbbdcf89b "test 48k" (parent 1952c76533)
client/demo/dist/index.html (vendored), 11 lines changed
@@ -1 +1,10 @@
<!DOCTYPE html>
<html style="width: 100%; height: 100%; overflow: hidden">
    <head>
        <meta charset="utf-8" />
        <title>Voice Changer Client Demo</title>
        <script defer src="index.js"></script></head>
    <body style="width: 100%; height: 100%; margin: 0px">
        <div id="app" style="width: 100%; height: 100%"></div>
    </body>
</html>
client/demo/dist/index.js (vendored), 1298 lines changed
File diff suppressed because one or more lines are too long
client/demo/dist/index.js.LICENSE.txt (vendored), 35 lines removed
@@ -1,35 +0,0 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */

/*!**********************!*\
  !*** ./src/index.ts ***!
  \**********************/

/**
 * @license React
 * react-dom.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @license React
 * react.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @license React
 * scheduler.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */
@@ -1,443 +1,396 @@
import { VoiceChangerWorkletProcessorRequest } from "../@types/voice-changer-worklet-processor";
import { DefaultClientSettng, DownSamplingMode, VOICE_CHANGER_CLIENT_EXCEPTION, WorkletNodeSetting, WorkletSetting } from "../const";
import { io, Socket } from "socket.io-client";
import { DefaultEventsMap } from "@socket.io/component-emitter";
import { ServerRestClient } from "./ServerRestClient";

export type VoiceChangerWorkletListener = {
    notifyVolume: (vol: number) => void;
    notifySendBufferingTime: (time: number) => void;
    notifyResponseTime: (time: number, perf?: number[]) => void;
    notifyException: (code: VOICE_CHANGER_CLIENT_EXCEPTION, message: string) => void;
};

export type InternalCallback = {
    processAudio: (data: Uint8Array) => Promise<Uint8Array>;
};
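For orientation, a minimal sketch of how this node might be wired up from application code. This is an illustration, not part of the commit; it assumes the worklet module "voice-changer-worklet-processor" has already been registered via audioContext.audioWorklet.addModule, and the listener callbacks are placeholders:

const audioContext = new AudioContext({ sampleRate: 48000 });

const listener: VoiceChangerWorkletListener = {
    notifyVolume: (vol) => console.log(`input volume: ${vol}`),
    notifySendBufferingTime: (time) => console.log(`buffered for ${time} ms`),
    notifyResponseTime: (time, perf) => console.log(`server round trip: ${time} ms`, perf),
    notifyException: (code, message) => console.error(code, message),
};

const node = new VoiceChangerWorkletNode(audioContext, listener);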
export class VoiceChangerWorkletNode extends AudioWorkletNode {
    private listener: VoiceChangerWorkletListener;

    private setting: WorkletNodeSetting = DefaultClientSettng.workletNodeSetting;
    private requestChunks: ArrayBuffer[] = [];
    private socket: Socket<DefaultEventsMap, DefaultEventsMap> | null = null;
    // performance monitor
    private bufferStart = 0;

    private isOutputRecording = false;
    private recordingOutputChunk: Float32Array[] = [];
    private outputNode: VoiceChangerWorkletNode | null = null;

    // Promises
    private startPromiseResolve: ((value: void | PromiseLike<void>) => void) | null = null;
    private stopPromiseResolve: ((value: void | PromiseLike<void>) => void) | null = null;

    // InternalCallback
    private internalCallback: InternalCallback | null = null;

    constructor(context: AudioContext, listener: VoiceChangerWorkletListener) {
        super(context, "voice-changer-worklet-processor");
        this.port.onmessage = this.handleMessage.bind(this);
        this.listener = listener;
        this.createSocketIO();
        console.log(`[worklet_node][voice-changer-worklet-processor] created.`);
    }

    setOutputNode = (outputNode: VoiceChangerWorkletNode | null) => {
        this.outputNode = outputNode;
    };

    // Settings
    updateSetting = (setting: WorkletNodeSetting) => {
        console.log(`[WorkletNode] Updating WorkletNode Setting,`, this.setting, setting);
        let recreateSocketIoRequired = false;
        if (this.setting.serverUrl != setting.serverUrl || this.setting.protocol != setting.protocol) {
            recreateSocketIoRequired = true;
        }
        this.setting = setting;
        if (recreateSocketIoRequired) {
            this.createSocketIO();
        }
    };

    setInternalAudioProcessCallback = (internalCallback: InternalCallback) => {
        this.internalCallback = internalCallback;
    };

    getSettings = (): WorkletNodeSetting => {
        return this.setting;
    };

    getSocketId = () => {
        return this.socket?.id;
    };

    // Processing
    private createSocketIO = () => {
        if (this.socket) {
            this.socket.close();
        }
        if (this.setting.protocol === "sio") {
            this.socket = io(this.setting.serverUrl + "/test");
            this.socket.on("connect_error", (err) => {
                this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_SIO_CONNECT_FAILED, `[SIO] connection failed ${err}`);
            });
            this.socket.on("connect", () => {
                console.log(`[SIO] connect to ${this.setting.serverUrl}`);
                console.log(`[SIO] ${this.socket?.id}`);
            });
            this.socket.on("close", function (socket) {
                console.log(`[SIO] close ${socket.id}`);
            });

            this.socket.on("message", (response: any[]) => {
                console.log("message:", response);
            });

            this.socket.on("response", (response: any[]) => {
                const cur = Date.now();
                const responseTime = cur - response[0];
                const result = response[1] as ArrayBuffer;
                const perf = response[2];

                // Quick hack for server device mode
                if (response[0] == 0) {
                    this.listener.notifyResponseTime(Math.round(perf[0] * 1000), perf.slice(1, 4));
                    return;
                }

                if (result.byteLength < 128 * 2) {
                    this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_SIO_INVALID_RESPONSE, `[SIO] received data is too short ${result.byteLength}`);
                } else {
                    if (this.outputNode != null) {
                        this.outputNode.postReceivedVoice(response[1]);
                    } else {
                        this.postReceivedVoice(response[1]);
                    }
                    this.listener.notifyResponseTime(responseTime, perf);
                }
            });
        }
    };

    postReceivedVoice = (data: ArrayBuffer) => {
        // Int16 to Float
        // const i16Data = new Int16Array(data);
        // const f32Data = new Float32Array(i16Data.length);

        // // console.log(`[worklet] f32DataLength${f32Data.length} i16DataLength${i16Data.length}`)
        // i16Data.forEach((x, i) => {
        //     const float = x >= 0x8000 ? -(0x10000 - x) / 0x8000 : x / 0x7fff;
        //     f32Data[i] = float;
        // });
        const f32Data = new Float32Array(data);

        // Upsampling
        let upSampledBuffer: Float32Array | null = null;
        if (this.setting.sendingSampleRate == 48000) {
            upSampledBuffer = f32Data;
        } else {
            upSampledBuffer = new Float32Array(f32Data.length * 2);
            for (let i = 0; i < f32Data.length; i++) {
                const currentFrame = f32Data[i];
                const nextFrame = i + 1 < f32Data.length ? f32Data[i + 1] : f32Data[i];
                upSampledBuffer[i * 2] = currentFrame;
                upSampledBuffer[i * 2 + 1] = (currentFrame + nextFrame) / 2;
            }
        }

        const req: VoiceChangerWorkletProcessorRequest = {
            requestType: "voice",
            voice: upSampledBuffer,
            numTrancateTreshold: 0,
            volTrancateThreshold: 0,
            volTrancateLength: 0,
        };
        this.port.postMessage(req);

        if (this.isOutputRecording) {
            this.recordingOutputChunk.push(upSampledBuffer);
        }
    };
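The upsampling branch above doubles the rate by linear interpolation: each input sample is kept, and the midpoint between it and the next sample is inserted after it (the last sample is simply repeated at the edge). The same idea as a standalone sketch with a worked example; this is an illustration, not code from the commit:

// Doubles the sample rate by midpoint (linear) interpolation.
function upsample2x(input: Float32Array): Float32Array {
    const out = new Float32Array(input.length * 2);
    for (let i = 0; i < input.length; i++) {
        const cur = input[i];
        const next = i + 1 < input.length ? input[i + 1] : cur; // repeat the last sample at the edge
        out[i * 2] = cur;
        out[i * 2 + 1] = (cur + next) / 2;
    }
    return out;
}

// upsample2x(Float32Array.of(0, 1, 0)) => [0, 0.5, 1, 0.5, 0, 0]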

    private _averageDownsampleBuffer(buffer: Float32Array, originalSampleRate: number, destinationSamplerate: number) {
        if (originalSampleRate == destinationSamplerate) {
            return buffer;
        }
        if (destinationSamplerate > originalSampleRate) {
            throw "downsampling rate should be smaller than original sample rate";
        }
        const sampleRateRatio = originalSampleRate / destinationSamplerate;
        const newLength = Math.round(buffer.length / sampleRateRatio);
        const result = new Float32Array(newLength);
        let offsetResult = 0;
        let offsetBuffer = 0;
        while (offsetResult < result.length) {
            var nextOffsetBuffer = Math.round((offsetResult + 1) * sampleRateRatio);
            // Use average value of skipped samples
            var accum = 0,
                count = 0;
            for (var i = offsetBuffer; i < nextOffsetBuffer && i < buffer.length; i++) {
                accum += buffer[i];
                count++;
            }
            result[offsetResult] = accum / count;
            // Or you can simply get rid of the skipped samples:
            // result[offsetResult] = buffer[nextOffsetBuffer];
            offsetResult++;
            offsetBuffer = nextOffsetBuffer;
        }
        return result;
    }
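A quick worked example for the averaging path: converting 48000 Hz to 16000 Hz gives sampleRateRatio = 3, so each output sample is the mean of three consecutive input samples:

// _averageDownsampleBuffer(Float32Array.of(3, 6, 9, 1, 2, 3), 48000, 16000)
//   sampleRateRatio = 3, newLength = 2
//   result[0] = (3 + 6 + 9) / 3 = 6
//   result[1] = (1 + 2 + 3) / 3 = 2
//   => Float32Array [6, 2]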

    handleMessage(event: any) {
        // console.log(`[Node:handleMessage_] `, event.data.volume);
        if (event.data.responseType === "start_ok") {
            if (this.startPromiseResolve) {
                this.startPromiseResolve();
                this.startPromiseResolve = null;
            }
        } else if (event.data.responseType === "stop_ok") {
            if (this.stopPromiseResolve) {
                this.stopPromiseResolve();
                this.stopPromiseResolve = null;
            }
        } else if (event.data.responseType === "volume") {
            this.listener.notifyVolume(event.data.volume as number);
        } else if (event.data.responseType === "inputData") {
            const inputData = event.data.inputData as Float32Array;
            // console.log("receive input data", inputData);

            // Downsampling
            let downsampledBuffer: Float32Array | null = null;
            if (this.setting.sendingSampleRate == 48000) {
                console.log("no downsample");
                downsampledBuffer = inputData;
            } else if (this.setting.downSamplingMode == DownSamplingMode.decimate) {
                //////// (Kind 1) Decimation //////////
                //// Input arrives at 48000 Hz; drop every other sample to convert to 24000 Hz.
                downsampledBuffer = new Float32Array(inputData.length / 2);
                for (let i = 0; i < inputData.length; i++) {
                    if (i % 2 == 0) {
                        downsampledBuffer[i / 2] = inputData[i];
                    }
                }
            } else {
                //////// (Kind 2) Averaging //////////
                // downsampledBuffer = this._averageDownsampleBuffer(buffer, 48000, 24000)
                downsampledBuffer = this._averageDownsampleBuffer(inputData, 48000, this.setting.sendingSampleRate);
            }

            // Float to Int16 (in the "internal" case it stays float.)
            // if (this.setting.protocol != "internal") {
            //     const arrayBuffer = new ArrayBuffer(downsampledBuffer.length * 2);
            //     const dataView = new DataView(arrayBuffer);
            //     for (let i = 0; i < downsampledBuffer.length; i++) {
            //         let s = Math.max(-1, Math.min(1, downsampledBuffer[i]));
            //         s = s < 0 ? s * 0x8000 : s * 0x7fff;
            //         dataView.setInt16(i * 2, s, true);
            //     }
            //     // Buffering
            //     this.requestChunks.push(arrayBuffer);
            // } else {
            // internal
            // console.log("downsampledBuffer.buffer", downsampledBuffer.buffer);
            this.requestChunks.push(downsampledBuffer.buffer);
            // }

            //// If the request buffer has not yet accumulated the configured number of chunks, stop here.
            if (this.requestChunks.length < this.setting.inputChunkNum) {
                return;
            }

            // Create the container for the request
            const windowByteLength = this.requestChunks.reduce((prev, cur) => {
                return prev + cur.byteLength;
            }, 0);
            const newBuffer = new Uint8Array(windowByteLength);

            // Set the request data
            this.requestChunks.reduce((prev, cur) => {
                newBuffer.set(new Uint8Array(cur), prev);
                return prev + cur.byteLength;
            }, 0);

            this.sendBuffer(newBuffer);
            this.requestChunks = [];

            this.listener.notifySendBufferingTime(Date.now() - this.bufferStart);
            this.bufferStart = Date.now();
        } else {
            console.warn(`[worklet_node][voice-changer-worklet-processor] unknown response ${event.data.responseType}`, event.data);
        }
    }
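The buffering step above accumulates inputChunkNum chunks and then packs them into one contiguous byte array before sending. The same reduce-based concatenation in isolation, as an illustrative sketch:

// Concatenates a list of ArrayBuffers into a single Uint8Array.
function concatChunks(chunks: ArrayBuffer[]): Uint8Array {
    const total = chunks.reduce((prev, cur) => prev + cur.byteLength, 0);
    const merged = new Uint8Array(total);
    chunks.reduce((offset, cur) => {
        merged.set(new Uint8Array(cur), offset); // copy each chunk at its running byte offset
        return offset + cur.byteLength;
    }, 0);
    return merged;
}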

    private sendBuffer = async (newBuffer: Uint8Array) => {
        const timestamp = Date.now();
        if (this.setting.protocol === "sio") {
            if (!this.socket) {
                console.warn(`sio is not initialized`);
                return;
            }
            // console.log("emit!")
            this.socket.emit("request_message", [timestamp, newBuffer.buffer]);
        } else if (this.setting.protocol === "rest") {
            const restClient = new ServerRestClient(this.setting.serverUrl);
            const res = await restClient.postVoice(timestamp, newBuffer.buffer);
            if (res.byteLength < 128 * 2) {
                this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_REST_INVALID_RESPONSE, `[REST] received data is too short ${res.byteLength}`);
            } else {
                if (this.outputNode != null) {
                    this.outputNode.postReceivedVoice(res);
                } else {
                    this.postReceivedVoice(res);
                }
                this.listener.notifyResponseTime(Date.now() - timestamp);
            }
        } else if (this.setting.protocol == "internal") {
            if (!this.internalCallback) {
                this.listener.notifyException(VOICE_CHANGER_CLIENT_EXCEPTION.ERR_INTERNAL_AUDIO_PROCESS_CALLBACK_IS_NOT_INITIALIZED, `[AudioWorkletNode] internal audio process callback is not initialized`);
                return;
            }
            // const res = await this.internalCallback.processAudio(newBuffer);
            // if (res.length < 128 * 2) {
            //     return;
            // }
            // if (this.outputNode != null) {
            //     this.outputNode.postReceivedVoice(res.buffer);
            // } else {
            //     this.postReceivedVoice(res.buffer);
            // }
            this.internalCallback.processAudio(newBuffer).then((res) => {
                if (res.length < 128 * 2) {
                    return;
                }
                if (this.outputNode != null) {
                    this.outputNode.postReceivedVoice(res.buffer);
                } else {
                    this.postReceivedVoice(res.buffer);
                }
            });
        } else {
            throw "unknown protocol";
        }
    };
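With the Float-to-Int16 conversion commented out, the chunks sent by sendBuffer now appear to carry raw Float32 samples (4 bytes each) rather than int16 PCM, while the 128 * 2 byte-length checks are unchanged. A sketch of reading such a response buffer back as samples, assuming the server echoes little-endian float32 PCM (an assumption, not confirmed by this diff):

// Interprets a response ArrayBuffer as float32 PCM samples.
function decodeFloat32Pcm(buf: ArrayBuffer): Float32Array {
    if (buf.byteLength % 4 !== 0) {
        throw new Error(`unexpected byte length ${buf.byteLength} for float32 data`);
    }
    return new Float32Array(buf); // assumes platform and wire format are both little-endian
}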

    // Worklet operations
    configure = (setting: WorkletSetting) => {
        const req: VoiceChangerWorkletProcessorRequest = {
            requestType: "config",
            voice: new Float32Array(1),
            numTrancateTreshold: setting.numTrancateTreshold,
            volTrancateThreshold: setting.volTrancateThreshold,
            volTrancateLength: setting.volTrancateLength,
        };
        this.port.postMessage(req);
    };

    start = async () => {
        const p = new Promise<void>((resolve) => {
            this.startPromiseResolve = resolve;
        });
        const req: VoiceChangerWorkletProcessorRequest = {
            requestType: "start",
            voice: new Float32Array(1),
            numTrancateTreshold: 0,
            volTrancateThreshold: 0,
            volTrancateLength: 0,
        };
        this.port.postMessage(req);
        await p;
    };

    stop = async () => {
        const p = new Promise<void>((resolve) => {
            this.stopPromiseResolve = resolve;
        });
        const req: VoiceChangerWorkletProcessorRequest = {
            requestType: "stop",
            voice: new Float32Array(1),
            numTrancateTreshold: 0,
            volTrancateThreshold: 0,
            volTrancateLength: 0,
        };
        this.port.postMessage(req);
        await p;
    };

    trancateBuffer = () => {
        const req: VoiceChangerWorkletProcessorRequest = {
            requestType: "trancateBuffer",
            voice: new Float32Array(1),
            numTrancateTreshold: 0,
            volTrancateThreshold: 0,
            volTrancateLength: 0,
        };
        this.port.postMessage(req);
    };
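Every control operation reuses the same VoiceChangerWorkletProcessorRequest shape, varying only requestType (with dummy voice/threshold fields for non-voice requests). A hypothetical start/record/stop sequence from application code; the configure values are illustrative, not defaults from this repository:

async function runSession(node: VoiceChangerWorkletNode) {
    node.configure({ numTrancateTreshold: 100, volTrancateThreshold: 0.0005, volTrancateLength: 32 } as WorkletSetting);
    await node.start(); // resolves when the processor posts "start_ok"
    node.startOutputRecording();
    // ... audio flows through the worklet ...
    await node.stop(); // resolves when the processor posts "stop_ok"
    const samples = node.stopOutputRecording(); // Float32Array of everything recorded
    console.log(`recorded ${samples.length} samples`);
}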

    startOutputRecording = () => {
        this.recordingOutputChunk = [];
        this.isOutputRecording = true;
    };

    stopOutputRecording = () => {
        this.isOutputRecording = false;

        const dataSize = this.recordingOutputChunk.reduce((prev, cur) => {
            return prev + cur.length;
        }, 0);
        const samples = new Float32Array(dataSize);
        let sampleIndex = 0;
        for (let i = 0; i < this.recordingOutputChunk.length; i++) {
            for (let j = 0; j < this.recordingOutputChunk[i].length; j++) {
                samples[sampleIndex] = this.recordingOutputChunk[i][j];
                sampleIndex++;
            }
        }
        return samples;
    };
}
server/.python-version (new file)

@@ -0,0 +1 @@
3.10.11
@@ -41,32 +41,122 @@ logger.debug(f"---------------- Booting PHASE :{__name__} -----------------")

def setupArgParser():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--logLevel",
        type=str,
        default="error",
        help="Log level info|critical|error. (default: error)",
    )
    parser.add_argument("-p", type=int, default=18888, help="port")
    parser.add_argument("--https", type=strtobool, default=False, help="use https")
    parser.add_argument(
        "--test_connect",
        type=str,
        default="8.8.8.8",
        help="test connect to detect ip in https mode. default 8.8.8.8",
    )
    parser.add_argument(
        "--httpsKey", type=str, default="ssl.key", help="path for the key of https"
    )
    parser.add_argument(
        "--httpsCert", type=str, default="ssl.cert", help="path for the cert of https"
    )
    parser.add_argument(
        "--httpsSelfSigned",
        type=strtobool,
        default=True,
        help="generate self-signed certificate",
    )

    parser.add_argument(
        "--model_dir", type=str, default="model_dir", help="path to model files"
    )
    parser.add_argument(
        "--sample_mode", type=str, default="production", help="rvc_sample_mode"
    )

    parser.add_argument(
        "--content_vec_500",
        type=str,
        default="pretrain/checkpoint_best_legacy_500.pt",
        help="path to content_vec_500 model(pytorch)",
    )
    parser.add_argument(
        "--content_vec_500_onnx",
        type=str,
        default="pretrain/content_vec_500.onnx",
        help="path to content_vec_500 model(onnx)",
    )
    parser.add_argument(
        "--content_vec_500_onnx_on",
        type=strtobool,
        default=True,
        help="use or not onnx for content_vec_500",
    )
    parser.add_argument(
        "--hubert_base",
        type=str,
        default="pretrain/hubert_base.pt",
        help="path to hubert_base model(pytorch)",
    )
    parser.add_argument(
        "--hubert_base_jp",
        type=str,
        default="pretrain/rinna_hubert_base_jp.pt",
        help="path to hubert_base_jp model(pytorch)",
    )
    parser.add_argument(
        "--hubert_soft",
        type=str,
        default="pretrain/hubert/hubert-soft-0d54a1f4.pt",
        help="path to hubert_soft model(pytorch)",
    )
    parser.add_argument(
        "--whisper_tiny",
        type=str,
        default="pretrain/whisper_tiny.pt",
        help="path to whisper_tiny model(pytorch)",
    )
    parser.add_argument(
        "--nsf_hifigan",
        type=str,
        default="pretrain/nsf_hifigan/model",
        help="path to nsf_hifigan model(pytorch)",
    )
    parser.add_argument(
        "--crepe_onnx_full",
        type=str,
        default="pretrain/crepe_onnx_full.onnx",
        help="path to crepe_onnx_full",
    )
    parser.add_argument(
        "--crepe_onnx_tiny",
        type=str,
        default="pretrain/crepe_onnx_tiny.onnx",
        help="path to crepe_onnx_tiny",
    )
    parser.add_argument(
        "--rmvpe", type=str, default="pretrain/rmvpe.pt", help="path to rmvpe"
    )
    parser.add_argument(
        "--rmvpe_onnx",
        type=str,
        default="pretrain/rmvpe.onnx",
        help="path to rmvpe onnx",
    )

    parser.add_argument(
        "--host",
        type=str,
        default="127.0.0.1",
        help="IP address of the network interface to listen for HTTP connections. Specify 0.0.0.0 to listen on all interfaces.",
    )
    parser.add_argument(
        "--allowed-origins",
        action="append",
        default=[],
        help="List of URLs to allow connection from, i.e. https://example.com. Allows http(s)://127.0.0.1:{port} and http(s)://localhost:{port} by default.",
    )

    return parser
@@ -121,7 +211,11 @@ HOST = args.host
PORT = args.p


def localServer(
    logLevel: str = "critical",
    key_path: str | None = None,
    cert_path: str | None = None,
):
    try:
        uvicorn.run(
            f"{os.path.basename(__file__)[:-3]}:app_socketio",
@@ -140,14 +234,19 @@ if __name__ == "MMVCServerSIO":
    mp.freeze_support()

    voiceChangerManager = VoiceChangerManager.get_instance(voiceChangerParams)
    app_fastapi = MMVC_Rest.get_instance(
        voiceChangerManager, voiceChangerParams, args.allowed_origins, PORT
    )
    app_socketio = MMVC_SocketIOApp.get_instance(
        app_fastapi, voiceChangerManager, args.allowed_origins, PORT
    )


if __name__ == "__mp_main__":
    # printMessage("サーバプロセスを起動しています。", level=2)
    printMessage("The server process is starting up.", level=2)


if __name__ == "__main__":
    mp.freeze_support()
@@ -202,7 +301,9 @@ if __name__ == "__main__":
        )
        key_path = os.path.join(SSL_KEY_DIR, keyname)
        cert_path = os.path.join(SSL_KEY_DIR, certname)
        printMessage(
            f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1
        )

    elif args.https and args.httpsSelfSigned == 0:
        # HTTPS
@@ -223,8 +324,13 @@ if __name__ == "__main__":
        printMessage("http://<IP>:<PORT>/", level=1)

    # printMessage("多くの場合は次のいずれかのURLにアクセスすると起動します。", level=2)
    printMessage(
        "In many cases, it will launch when you access any of the following URLs.",
        level=2,
    )
    if (
        "EX_PORT" in locals() and "EX_IP" in locals()
    ):  # launched via shell script (docker)
        if args.https == 1:
            printMessage(f"https://localhost:{EX_PORT}/", level=1)
            for ip in EX_IP.strip().split(" "):
@@ -254,12 +360,26 @@ if __name__ == "__main__":
        p.start()
        try:
            if sys.platform.startswith("win"):
                process = subprocess.Popen(
                    [
                        NATIVE_CLIENT_FILE_WIN,
                        "--disable-gpu",
                        "-u",
                        f"http://localhost:{PORT}/",
                    ]
                )
                return_code = process.wait()
                logger.info("client closed.")
                p.terminate()
            elif sys.platform.startswith("darwin"):
                process = subprocess.Popen(
                    [
                        NATIVE_CLIENT_FILE_MAC,
                        "--disable-gpu",
                        "-u",
                        f"http://localhost:{PORT}/",
                    ]
                )
                return_code = process.wait()
                logger.info("client closed.")
                p.terminate()
server/poetry.lock (generated, new file), 3126 lines
File diff suppressed because it is too large
server/pyproject.toml (new file), 44 lines

@@ -0,0 +1,44 @@
[tool.poetry]
name = "server"
version = "0.1.0"
description = ""
authors = ["wok <wok@local.com>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "3.10.11"
uvicorn = "0.21.1"
pyOpenSSL = "23.1.1"
numpy = "1.23.5"
resampy = "0.4.2"
python-socketio = "5.8.0"
fastapi = "0.95.1"
python-multipart = "0.0.6"
onnxruntime-gpu = "1.13.1"
scipy = "1.10.1"
matplotlib = "3.7.1"
websockets = "11.0.2"
faiss-cpu = "1.7.3"
torchcrepe = "0.0.18"
librosa = "0.9.1"
gin = "0.1.6"
gin_config = "0.5.0"
einops = "0.6.0"
local_attention = "1.8.5"
sounddevice = "0.4.6"
dataclasses_json = "0.5.7"
onnxsim = "0.4.28"
torchfcpe = "0.0.3"
torchaudio = "2.3.1"
torch = "2.3.1"
fairseq = "0.12.2"
pyworld = "0.3.4"


[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.scripts]
main2 = "MMVCServerSIO:main"
test = "test.test:test"
@@ -39,13 +39,18 @@ class MMVC_Rest_VoiceChanger:
        # struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)
        # )

        # unpackedData = np.array(struct.unpack("<%sh" % (len(wav) // struct.calcsize("<h")), wav)).astype(np.int16)
        unpackedData = np.array(
            struct.unpack("<%sf" % (len(wav) // struct.calcsize("<f")), wav)
        ).astype(np.float32)
        # print(f"[REST] unpackedDataType {unpackedData.dtype}")

        self.tlock.acquire()
        changedVoice = self.voiceChangerManager.changeVoice(unpackedData)
        self.tlock.release()

        print("", changedVoice[0].dtype)

        changedVoiceBase64 = base64.b64encode(changedVoice[0]).decode("utf-8")
        data = {"timestamp": timestamp, "changedVoiceBase64": changedVoiceBase64}
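The REST handler now unpacks the request body as little-endian float32 ("<f") instead of int16 ("<h"), matching the client-side change that sends raw Float32 samples. For illustration, a TypeScript sketch of the encoder that would produce bytes this unpack accepts (not code from this commit):

// Encodes samples as the little-endian float32 bytes the server now unpacks with "<%sf".
function encodeFloat32Pcm(samples: Float32Array): ArrayBuffer {
    const buf = new ArrayBuffer(samples.length * 4);
    const view = new DataView(buf);
    samples.forEach((s, i) => view.setFloat32(i * 4, s, true)); // true = little-endian
    return buf;
}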
@@ -60,9 +60,13 @@ class RVCr2(VoiceChangerModel):

        # Create the pipeline
        try:
            self.pipeline = createPipeline(
                self.params, self.slotInfo, self.settings.gpu, self.settings.f0Detector
            )
        except PipelineCreateException as e:  # NOQA
            logger.error(
                "[Voice Changer] pipeline create failed. check your model is valid."
            )
            return

        # Other settings
@@ -88,7 +92,9 @@ class RVCr2(VoiceChangerModel):
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
            if key == "f0Detector" and self.pipeline is not None:
                pitchExtractor = PitchExtractorManager.getPitchExtractor(
                    self.settings.f0Detector, self.settings.gpu
                )
                self.pipeline.setPitchExtractor(pitchExtractor)
        else:
            return False
@@ -115,14 +121,16 @@ class RVCr2(VoiceChangerModel):
    ):
        # Input arrives at 16 kHz.
        inputSize = newData.shape[0]
        # newData = newData.astype(np.float32) / 32768.0
        newFeatureLength = inputSize // 160  # hopsize:=160

        if self.audio_buffer is not None:
            # Concatenate with past data
            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
            if self.slotInfo.f0:
                self.pitchf_buffer = np.concatenate(
                    [self.pitchf_buffer, np.zeros(newFeatureLength)], 0
                )
            self.feature_buffer = np.concatenate(
                [
                    self.feature_buffer,
@@ -134,19 +142,29 @@ class RVCr2(VoiceChangerModel):
            self.audio_buffer = newData
            if self.slotInfo.f0:
                self.pitchf_buffer = np.zeros(newFeatureLength)
            self.feature_buffer = np.zeros(
                [newFeatureLength, self.slotInfo.embChannels]
            )

        convertSize = inputSize + crossfadeSize + solaSearchFrame + extra_frame

        if (
            convertSize % 160 != 0
        ):  # Compensate for truncation at the model's output hop size.
            convertSize = convertSize + (160 - (convertSize % 160))
        outSize = int(
            ((convertSize - extra_frame) / 16000) * self.slotInfo.samplingRate
        )

        # If the buffer has not filled up yet, pad with zeros
        if self.audio_buffer.shape[0] < convertSize:
            self.audio_buffer = np.concatenate(
                [np.zeros([convertSize]), self.audio_buffer]
            )
            if self.slotInfo.f0:
                self.pitchf_buffer = np.concatenate(
                    [np.zeros([convertSize // 160]), self.pitchf_buffer]
                )
            self.feature_buffer = np.concatenate(
                [
                    np.zeros([convertSize // 160, self.slotInfo.embChannels]),
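The convertSize adjustment above rounds the conversion window up to the next multiple of the 160-sample hop so the model's output is not truncated. The same arithmetic in isolation, as a TypeScript sketch (illustrative only, not code from this repository):

// Rounds a sample count up to the next multiple of the hop size.
function padToHop(convertSize: number, hop: number = 160): number {
    return convertSize % hop === 0 ? convertSize : convertSize + (hop - (convertSize % hop));
}

// padToHop(4410) => 4480  (4410 % 160 = 90, so 70 samples are added)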
@@ -179,27 +197,39 @@ class RVCr2(VoiceChangerModel):
            outSize,
        )

    def inference(
        self, receivedData: AudioInOut, crossfade_frame: int, sola_search_frame: int
    ):
        if self.pipeline is None:
            logger.info("[Voice Changer] Pipeline is not initialized.")
            raise PipelineNotInitializedException()

        # Processing runs at 16 kHz (pitch, embed, (infer))
        # receivedData = cast(
        #     AudioInOut,
        #     resampy.resample(
        #         receivedData,
        #         self.inputSampleRate,
        #         16000,
        #         filter="kaiser_fast",
        #     ),
        # )
        receivedData = resampy.resample(
            receivedData,
            self.inputSampleRate,
            16000,
            filter="kaiser_fast",
        )
        crossfade_frame = int((crossfade_frame / self.inputSampleRate) * 16000)
        sola_search_frame = int((sola_search_frame / self.inputSampleRate) * 16000)
        extra_frame = int(
            (self.settings.extraConvertSize / self.inputSampleRate) * 16000
        )

        # Generate the input data
        data = self.generate_input(
            receivedData, crossfade_frame, sola_search_frame, extra_frame
        )

        audio = data[0]
        pitchf = data[1]
@@ -234,7 +264,11 @@ class RVCr2(VoiceChangerModel):
                    index_rate,
                    if_f0,
                    # 0,
                    (
                        self.settings.extraConvertSize / self.inputSampleRate
                        if self.settings.silenceFront
                        else 0.0
                    ),  # Seconds of extraDataSize, computed at the input sampling rate
                    embOutputLayer,
                    useFinalProj,
                    repeat,
@@ -244,19 +278,27 @@ class RVCr2(VoiceChangerModel):
                # result = audio_out.detach().cpu().numpy() * np.sqrt(vol)
                result = audio_out[-outSize:].detach().cpu().numpy() * np.sqrt(vol)

                # result = cast(
                #     AudioInOut,
                #     resampy.resample(
                #         result,
                #         self.slotInfo.samplingRate,
                #         self.outputSampleRate,
                #         filter="kaiser_fast",
                #     ),
                # )
                result = resampy.resample(
                    result,
                    self.slotInfo.samplingRate,
                    self.outputSampleRate,
                    filter="kaiser_fast",
                )

                return result
            except DeviceCannotSupportHalfPrecisionException as e:  # NOQA
                logger.warn(
                    "[Device Manager] Device cannot support half precision. Fallback to float...."
                )
                self.deviceManager.setForceTensor(True)
                self.initialize()
                # raise e
@@ -55,7 +55,9 @@ class Pipeline(object):
        logger.info("GENERATE PITCH EXTRACTOR" + str(self.pitchExtractor))

        self.index = index
        self.big_npy = (
            index.reconstruct_n(0, index.ntotal) if index is not None else None
        )
        # self.feature = feature

        self.targetSR = targetSR
@@ -69,7 +71,12 @@ class Pipeline(object):
        inferencerInfo = self.inferencer.getInferencerInfo() if self.inferencer else {}
        embedderInfo = self.embedder.getEmbedderInfo()
        pitchExtractorInfo = self.pitchExtractor.getPitchExtractorInfo()
        return {
            "inferencer": inferencerInfo,
            "embedder": embedderInfo,
            "pitchExtractor": pitchExtractorInfo,
            "isHalf": self.isHalf,
        }

    def setPitchExtractor(self, pitchExtractor: PitchExtractor):
        self.pitchExtractor = pitchExtractor
@@ -88,13 +95,16 @@ class Pipeline(object):
                # pitch = pitch[:p_len]
                # pitchf = pitchf[:p_len]
                pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
                pitchf = torch.tensor(
                    pitchf, device=self.device, dtype=torch.float
                ).unsqueeze(0)
            else:
                pitch = None
                pitchf = None
        except IndexError as e:  # NOQA
            print(e)
            import traceback

            traceback.print_exc()
            raise NotEnoughDataExtimateF0()
        return pitch, pitchf
@@ -102,7 +112,9 @@ class Pipeline(object):
    def extractFeatures(self, feats, embOutputLayer, useFinalProj):
        with autocast(enabled=self.isHalf):
            try:
                feats = self.embedder.extractFeatures(
                    feats, embOutputLayer, useFinalProj
                )
                if torch.isnan(feats).all():
                    raise DeviceCannotSupportHalfPrecisionException()
                return feats
@@ -118,8 +130,11 @@ class Pipeline(object):
        try:
            with torch.no_grad():
                with autocast(enabled=self.isHalf):
                    audio1 = self.inferencer.infer(
                        feats, p_len, pitch, pitchf, sid, out_size
                    )
                    # audio1 = (audio1 * 32767.5).data.to(dtype=torch.int16)
                    audio1 = (audio1).data
            return audio1
        except RuntimeError as e:
            if "HALF" in e.__str__().upper():
@@ -149,16 +164,24 @@ class Pipeline(object):

        with Timer2("Pipeline-Exec", False) as t:  # NOQA
            # Audio arrives at a 16000 Hz sampling rate; from here on everything is processed at 16000 Hz.
            search_index = (
                self.index is not None and self.big_npy is not None and index_rate != 0
            )
            # self.t_pad = self.sr * repeat  # 1 second
            # self.t_pad_tgt = self.targetSR * repeat  # 1 second; trimmed at output (emitted at the model's sampling rate)
            audio = audio.unsqueeze(0)

            quality_padding_sec = (
                repeat * (audio.shape[1] - 1)
            ) / self.sr  # The reflect padding size must be smaller than the original size.

            self.t_pad = round(self.sr * quality_padding_sec)  # Add audio before and after
            self.t_pad_tgt = round(
                self.targetSR * quality_padding_sec
            )  # Add audio before and after; trimmed at output (emitted at the model's sampling rate)
            audio_pad = F.pad(audio, (self.t_pad, self.t_pad), mode="reflect").squeeze(
                0
            )
            p_len = audio_pad.shape[0] // self.window
            sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()

@@ -176,7 +199,9 @@ class Pipeline(object):

            t.record("pre-process")
            # Pitch detection
            pitch, pitchf = self.extractPitch(
                audio_pad, if_f0, pitchf, f0_up_key, silence_front
            )
            t.record("extract-pitch")

            # embedding
@@ -203,12 +228,25 @@ class Pipeline(object):
                    score, ix = self.index.search(npy, k=8)
                    weight = np.square(1 / score)
                    weight /= weight.sum(axis=1, keepdims=True)
                    npy = np.sum(
                        self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1
                    )

                    # recover silent front
                    npy = np.concatenate(
                        [
                            np.zeros([npyOffset, npy.shape[1]], dtype=np.float32),
                            feature[:npyOffset:2].astype("float32"),
                            npy,
                        ]
                    )[-feats.shape[1] :]
                    feats = (
                        torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate
                        + (1 - index_rate) * feats
                    )
                feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(
                    0, 2, 1
                )
                if protect < 0.5 and search_index:
                    feats0 = feats.clone()
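The index lookup above blends each frame's k = 8 nearest neighbors with inverse-square-distance weights ((1/score)^2, normalized to sum to 1) before mixing the result back with the original features by index_rate. The weighting for a single frame, mirrored as a TypeScript sketch (illustrative only, not code from this repository):

// Inverse-square-distance weighted average of k retrieved neighbor vectors.
function blendNeighbors(neighbors: number[][], scores: number[]): number[] {
    const w = scores.map((s) => (1 / s) ** 2);
    const wSum = w.reduce((a, b) => a + b, 0);
    const out = new Array<number>(neighbors[0].length).fill(0);
    neighbors.forEach((vec, k) => {
        vec.forEach((v, d) => (out[d] += (w[k] / wSum) * v));
    });
    return out;
}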
@@ -280,4 +318,4 @@ class Pipeline(object):
        del self.embedder
        del self.inferencer
        del self.pitchExtractor
        print("Pipeline has been deleted")
@@ -90,15 +90,22 @@ class VoiceChangerV2(VoiceChangerIF):
         self.params = params
         self.gpu_num = torch.cuda.device_count()
         self.prev_audio = np.zeros(4096)
-        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
+        self.mps_enabled: bool = (
+            getattr(torch.backends, "mps", None) is not None
+            and torch.backends.mps.is_available()
+        )
         self.onnx_device = onnxruntime.get_device()
         self.noCrossFade = False

-        logger.info(f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})")
+        logger.info(
+            f"VoiceChangerV2 Initialized (GPU_NUM(cuda):{self.gpu_num}, mps_enabled:{self.mps_enabled}, onnx_device:{self.onnx_device})"
+        )

     def setModel(self, model: VoiceChangerModel):
         self.voiceChanger = model
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )
         # if model.voiceChangerType == "Beatrice" or model.voiceChangerType == "LLVC":
         if model.voiceChangerType == "Beatrice":
             self.noCrossFade = True
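The constructor probes each backend once at startup and logs the result. A compact sketch of equivalent capability detection; the `probe_devices` helper is hypothetical, while `torch.cuda.device_count()`, the `torch.backends.mps` check, and `onnxruntime.get_device()` (which returns "CPU" or "GPU" depending on the installed package) are the same calls the hunk uses:

```python
import torch


def probe_devices() -> dict:
    # MPS only exists on Apple-silicon builds of PyTorch, hence the getattr guard.
    mps_enabled = (
        getattr(torch.backends, "mps", None) is not None
        and torch.backends.mps.is_available()
    )
    info = {
        "gpu_num_cuda": torch.cuda.device_count(),
        "mps_enabled": mps_enabled,
    }
    try:
        import onnxruntime
        info["onnx_device"] = onnxruntime.get_device()  # "CPU" or "GPU"
    except ImportError:
        info["onnx_device"] = "unavailable"
    return info


print(probe_devices())
```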
@@ -107,11 +114,15 @@ class VoiceChangerV2(VoiceChangerIF):

     def setInputSampleRate(self, sr: int):
         self.settings.inputSampleRate = sr
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )

     def setOutputSampleRate(self, sr: int):
         self.settings.outputSampleRate = sr
-        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+        self.voiceChanger.setSamplingRate(
+            self.settings.inputSampleRate, self.settings.outputSampleRate
+        )

     def get_info(self):
         data = asdict(self.settings)
@@ -130,7 +141,9 @@ class VoiceChangerV2(VoiceChangerIF):
         if key == "serverAudioStated" and val == 0:
             self.settings.inputSampleRate = 48000
             self.settings.outputSampleRate = 48000
-            self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+            self.voiceChanger.setSamplingRate(
+                self.settings.inputSampleRate, self.settings.outputSampleRate
+            )

         if key in self.settings.intData:
             setattr(self.settings, key, int(val))
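Turning server audio off resets both sample rates to 48000 (the "test 48k" change) before the generic key dispatch runs. That dispatch sorts incoming keys into declared int/float/string sets and coerces the value accordingly; a minimal sketch of the pattern, with a hypothetical cut-down `Settings` dataclass in place of the project's real one:

```python
from dataclasses import dataclass


@dataclass
class Settings:
    """Hypothetical reduced settings object; fields mirror the dispatch idea only."""
    inputSampleRate: int = 48000
    outputSampleRate: int = 48000
    crossFadeOverlapSize: int = 4096
    intData: tuple = ("inputSampleRate", "outputSampleRate", "crossFadeOverlapSize")
    floatData: tuple = ()
    strData: tuple = ()


def update_setting(settings: Settings, key: str, val) -> None:
    # Coerce based on which declared set the key belongs to.
    if key in settings.intData:
        setattr(settings, key, int(val))
    elif key in settings.floatData:
        setattr(settings, key, float(val))
    elif key in settings.strData:
        setattr(settings, key, str(val))


s = Settings()
update_setting(s, "inputSampleRate", "44100")
print(s.inputSampleRate)  # 44100
```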
@@ -146,7 +159,9 @@ class VoiceChangerV2(VoiceChangerIF):
                     self.settings.outputSampleRate,
                     # 16000,
                 )
-                print(f"-------------------------- - - - {self.settings.inputSampleRate}, {self.settings.outputSampleRate}")
+                print(
+                    f"-------------------------- - - - {self.settings.inputSampleRate}, {self.settings.outputSampleRate}"
+                )
             if key == "recordIO" and val == 0:
                 if hasattr(self, "ioRecorder"):
                     self.ioRecorder.close()
@@ -155,7 +170,9 @@ class VoiceChangerV2(VoiceChangerIF):
                 if hasattr(self, "ioRecorder"):
                     self.ioRecorder.close()
             if key == "inputSampleRate" or key == "outputSampleRate":
-                self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
+                self.voiceChanger.setSamplingRate(
+                    self.settings.inputSampleRate, self.settings.outputSampleRate
+                )
         elif key in self.settings.floatData:
             setattr(self.settings, key, float(val))
         elif key in self.settings.strData:
@@ -168,7 +185,12 @@ class VoiceChangerV2(VoiceChangerIF):
         return self.get_info()

     def _generate_strength(self, crossfadeSize: int):
-        if self.crossfadeSize != crossfadeSize or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
+        if (
+            self.crossfadeSize != crossfadeSize
+            or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate
+            or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate
+            or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize
+        ):
             self.crossfadeSize = crossfadeSize
             self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
             self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
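`_generate_strength` only rebuilds the crossfade envelopes when the size or one of the offset/end/overlap rates actually changed, since regeneration also invalidates the cached previous block. The curve itself is not visible in this hunk; the sketch below assumes simple complementary linear ramps (the real project may use a different shape), just to show how `np_prev_strength` and `np_cur_strength` relate:

```python
import numpy as np


def generate_strengths(crossfade_frame: int, offset_rate: float = 0.0, end_rate: float = 1.0):
    # Ramp from 0 to 1 between offset_rate and end_rate of the window.
    start = int(crossfade_frame * offset_rate)
    end = int(crossfade_frame * end_rate)
    ramp = np.concatenate([
        np.zeros(start),
        np.linspace(0.0, 1.0, max(end - start, 1)),
        np.ones(max(crossfade_frame - end, 0)),
    ])[:crossfade_frame]
    np_cur_strength = ramp.astype(np.float32)            # fade-in for the new block
    np_prev_strength = (1.0 - ramp).astype(np.float32)   # complementary fade-out
    return np_prev_strength, np_cur_strength


prev_s, cur_s = generate_strengths(8)
print(prev_s + cur_s)  # ~1.0 everywhere across the overlap
```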
@@ -197,7 +219,9 @@ class VoiceChangerV2(VoiceChangerIF):
                 ]
             )

-            logger.info(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")
+            logger.info(
+                f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}"
+            )

             # The size differs from the previous result, so clear the stored buffers.
             if hasattr(self, "np_prev_audio1") is True:
@@ -212,13 +236,19 @@ class VoiceChangerV2(VoiceChangerIF):
         return self.voiceChanger.get_processing_sampling_rate()

     # receivedData: tuple of short
-    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
+    def on_request(
+        self, receivedData: AudioInOut
+    ) -> tuple[AudioInOut, list[Union[int, float]]]:
         try:
             if self.voiceChanger is None:
-                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
+                raise VoiceChangerIsNotSelectedException(
+                    "Voice Changer is not selected."
+                )
             enableMainprocessTimer = False
             with Timer2("main-process", enableMainprocessTimer) as t:
-                processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
+                processing_sampling_rate = (
+                    self.voiceChanger.get_processing_sampling_rate()
+                )

                 if self.noCrossFade:  # Beatrice, LLVC
                     audio = self.voiceChanger.inference(
@@ -232,7 +262,9 @@ class VoiceChangerV2(VoiceChangerIF):
                 else:
                     sola_search_frame = int(0.012 * processing_sampling_rate)
                     block_frame = receivedData.shape[0]
-                    crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
+                    crossfade_frame = min(
+                        self.settings.crossFadeOverlapSize, block_frame
+                    )
                     self._generate_strength(crossfade_frame)
                     t.record("generate_strength")
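Frame sizing in this branch is straight arithmetic on the processing rate: the SOLA search window is 12 ms of audio, and the crossfade can never exceed one incoming block. A worked example with assumed values (the rate and block size here are illustrative, not taken from the diff):

```python
processing_sampling_rate = 40000   # assumed model processing rate
received_samples = 16384           # assumed size of one incoming block
cross_fade_overlap_size = 4096     # assumed configured overlap

sola_search_frame = int(0.012 * processing_sampling_rate)     # 480 samples = 12 ms
block_frame = received_samples
crossfade_frame = min(cross_fade_overlap_size, block_frame)   # capped at one block
print(sola_search_frame, block_frame, crossfade_frame)        # 480 16384 4096
```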
@@ -241,11 +273,14 @@ class VoiceChangerV2(VoiceChangerIF):
                         crossfade_frame=crossfade_frame,
                         sola_search_frame=sola_search_frame,
                     )
+                    print("output audio dtype", audio.dtype)
                     t.record("inference")

                     if hasattr(self, "sola_buffer") is True:
                         np.set_printoptions(threshold=10000)
-                        audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
+                        audio_offset = -1 * (
+                            sola_search_frame + crossfade_frame + block_frame
+                        )
                         audio = audio[audio_offset:]

                         # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
@@ -264,24 +299,33 @@ class VoiceChangerV2(VoiceChangerIF):
                         )
                         sola_offset = int(np.argmax(cor_nom / cor_den))
                         sola_end = sola_offset + block_frame
-                        output_wav = audio[sola_offset:sola_end].astype(np.float64)
+                        output_wav = audio[sola_offset:sola_end].astype(np.float32)
                         output_wav[:crossfade_frame] *= self.np_cur_strength
                         output_wav[:crossfade_frame] += self.sola_buffer[:]

                         result = output_wav
                     else:
-                        logger.info("[Voice Changer] warming up... generating sola buffer.")
-                        result = np.zeros(4096).astype(np.int16)
+                        logger.info(
+                            "[Voice Changer] warming up... generating sola buffer."
+                        )
+                        result = np.zeros(4096).astype(np.float32)

                     t.record("sora")

-                    if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
-                        offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
+                    if (
+                        hasattr(self, "sola_buffer") is True
+                        and sola_offset < sola_search_frame
+                    ):
+                        offset = -1 * (
+                            sola_search_frame + crossfade_frame - sola_offset
+                        )
                         end = -1 * (sola_search_frame - sola_offset)
                         sola_buf_org = audio[offset:end]
                         self.sola_buffer = sola_buf_org * self.np_prev_strength
                     else:
-                        self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
+                        self.sola_buffer = (
+                            audio[-crossfade_frame:] * self.np_prev_strength
+                        )
                         # self.sola_buffer = audio[- crossfade_frame:]

                     t.record("post")
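The SOLA step above (credited in the code to DDSP-SVC and the RVC WebUI) aligns each new block against the saved `sola_buffer` by maximizing a normalized cross-correlation over the search window, then applies the fade-in/fade-out strengths. A condensed, loop-based NumPy sketch of the alignment step; the pipeline's actual code computes the same quantity with vectorized correlation, and the signal sizes here are illustrative:

```python
import numpy as np


def sola_align(audio: np.ndarray, sola_buffer: np.ndarray, search_frame: int) -> int:
    """Return the offset in [0, search_frame) maximizing normalized correlation."""
    n = len(sola_buffer)
    # Numerator: correlation of each candidate window with the stored buffer.
    cor_nom = np.array([
        np.dot(audio[off:off + n], sola_buffer) for off in range(search_frame)
    ])
    # Denominator: energy of each candidate window (epsilon avoids divide-by-zero).
    cor_den = np.sqrt(np.array([
        np.sum(audio[off:off + n] ** 2) + 1e-8 for off in range(search_frame)
    ]))
    return int(np.argmax(cor_nom / cor_den))


rng = np.random.default_rng(0)
sola_buffer = rng.standard_normal(64).astype(np.float32)
# Plant the buffer 10 samples in, so the best alignment is offset 10.
audio = np.concatenate(
    [rng.standard_normal(10), sola_buffer, rng.standard_normal(200)]
).astype(np.float32)
print(sola_align(audio, sola_buffer, search_frame=32))  # expected: 10
```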
@@ -290,9 +334,11 @@ class VoiceChangerV2(VoiceChangerIF):

             # post-processing
             with Timer2("post-process", False) as t:
-                result = result.astype(np.int16)
+                result = result.astype(np.float32)

-                print_convert_processing(f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result.shape[0]}/{self.settings.outputSampleRate}hz")
+                print_convert_processing(
+                    f"  Output data size of {result.shape[0]}/{processing_sampling_rate}hz {result.shape[0]}/{self.settings.outputSampleRate}hz"
+                )

                 if receivedData.shape[0] != result.shape[0]:
                     # print("TODO FIX:::::PADDING", receivedData.shape[0], result.shape[0])
@@ -311,7 +357,9 @@ class VoiceChangerV2(VoiceChangerIF):

             postprocess_time = t.secs

-            print_convert_processing(f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
+            print_convert_processing(
+                f"  [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}"
+            )
             perf = [0, mainprocess_time, postprocess_time]

             return outputData, perf
@@ -320,7 +368,9 @@ class VoiceChangerV2(VoiceChangerIF):
             logger.warn(f"[Voice Changer] [Exception], {e}")
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except ONNXInputArgumentException as e:
-            logger.warn(f"[Voice Changer] [Exception] onnx are waiting valid input., {e}")
+            logger.warn(
+                f"[Voice Changer] [Exception] onnx are waiting valid input., {e}"
+            )
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except HalfPrecisionChangingException:
             logger.warn("[Voice Changer] Switching model configuration....")
@@ -332,7 +382,9 @@ class VoiceChangerV2(VoiceChangerIF):
             logger.warn(f"[Voice Changer] embedder: {e}")
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except VoiceChangerIsNotSelectedException:
-            logger.warn("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
+            logger.warn(
+                "[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc."
+            )
             return np.zeros(1).astype(np.int16), [0, 0, 0]
         except DeviceCannotSupportHalfPrecisionException:
             # RVC.py performs the fallback handling, so just return dummy data here.