From b9429c7655e1c27e014450c809bee2eae65ce9d6 Mon Sep 17 00:00:00 2001
From: w-okada <you@example.com>
Date: Mon, 17 Jul 2023 07:21:06 +0900
Subject: [PATCH] WIP: diffusion svc refining

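Refine the Diffusion-SVC path (still work in progress):

- client: replace the "speedup" slider with a "skip" dropdown whose options
  are the proper divisors of the current k-step, raise the k-step minimum
  to 2, and reset speedUp to the largest divisor whenever k-step changes
  (e.g. k-step 20 offers skips 1, 2, 4, 5, 10 and resets speedUp to 10).
- server: return the correct defaultKstep/defaultSpeedup values (kStep /
  speedUp), pass the 16 kHz input audio through to the inferencer, wrap the
  pipeline stages in Timer blocks for profiling, and print a traceback in
  post_update_model_default.
- add first-draft ONNX export scaffolding for Diffusion-SVC
  (DiffusionSVC_ONNX.py, export2onnx.py); large parts are still carried
  over from the RVC exporter.
- DioPitchExtractor: force silence_front to 0 as a workaround for pitch
  extraction failing on small chunks.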
---
 client/demo/dist/index.js                     |   2 +-
 .../101-7_diffusion-svcSettingArea.tsx        |  47 ++++--
 server/restapi/MMVC_Rest_Fileuploader.py      |   2 +
 .../DiffusionSVC/DiffusionSVC.py              |   4 +-
 .../inferencer/DiffusionSVCInferencer.py      |  24 ++-
 .../onnxExporter/DiffusionSVC_ONNX.py         |  90 +++++++++++
 .../DiffusionSVC/onnxExporter/export2onnx.py  | 125 ++++++++++++++
 .../DiffusionSVC/pipeline/Pipeline.py         | 152 ++++++++++--------
 .../pitchExtractor/DioPitchExtractor.py       |   1 +
 server/voice_changer/utils/VoiceChangerIF.py  |   3 +-
 10 files changed, 361 insertions(+), 89 deletions(-)
 create mode 100644 server/voice_changer/DiffusionSVC/onnxExporter/DiffusionSVC_ONNX.py
 create mode 100644 server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py

diff --git a/client/demo/dist/index.js b/client/demo/dist/index.js
index df39b33d..2d71a580 100644
--- a/client/demo/dist/index.js
+++ b/client/demo/dist/index.js
@@ -389,7 +389,7 @@ eval("__webpack_require__.r(__webpack_exports__);\n/* harmony export */ __webpac
 /***/ ((__unused_webpack_module, __webpack_exports__, __webpack_require__) => {
 
 "use strict";
-eval("__webpack_require__.r(__webpack_exports__);\n/* harmony export */ __webpack_require__.d(__webpack_exports__, {\n/* harmony export */   DiffusionSVCSettingArea: () => (/* binding */ DiffusionSVCSettingArea)\n/* harmony export */ });\n/* harmony import */ var _babel_runtime_helpers_defineProperty__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! @babel/runtime/helpers/defineProperty */ \"./node_modules/@babel/runtime/helpers/esm/defineProperty.js\");\n/* harmony import */ var react__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! react */ \"./node_modules/react/index.js\");\n/* harmony import */ var react__WEBPACK_IMPORTED_MODULE_1___default = /*#__PURE__*/__webpack_require__.n(react__WEBPACK_IMPORTED_MODULE_1__);\n/* harmony import */ var _001_provider_001_AppStateProvider__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ../../../001_provider/001_AppStateProvider */ \"./src/001_provider/001_AppStateProvider.tsx\");\n\nfunction ownKeys(object, enumerableOnly) { var keys = Object.keys(object); if (Object.getOwnPropertySymbols) { var symbols = Object.getOwnPropertySymbols(object); enumerableOnly && (symbols = symbols.filter(function (sym) { return Object.getOwnPropertyDescriptor(object, sym).enumerable; })), keys.push.apply(keys, symbols); } return keys; }\nfunction _objectSpread(target) { for (var i = 1; i < arguments.length; i++) { var source = null != arguments[i] ? arguments[i] : {}; i % 2 ? ownKeys(Object(source), !0).forEach(function (key) { (0,_babel_runtime_helpers_defineProperty__WEBPACK_IMPORTED_MODULE_0__[\"default\"])(target, key, source[key]); }) : Object.getOwnPropertyDescriptors ? Object.defineProperties(target, Object.getOwnPropertyDescriptors(source)) : ownKeys(Object(source)).forEach(function (key) { Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key)); }); } return target; }\n\n\nvar DiffusionSVCSettingArea = function DiffusionSVCSettingArea(_props) {\n  var _useAppState = (0,_001_provider_001_AppStateProvider__WEBPACK_IMPORTED_MODULE_2__.useAppState)(),\n    serverSetting = _useAppState.serverSetting;\n  var selected = (0,react__WEBPACK_IMPORTED_MODULE_1__.useMemo)(function () {\n    if (serverSetting.serverSetting.modelSlotIndex == undefined) {\n      return;\n    }\n    return serverSetting.serverSetting.modelSlots[serverSetting.serverSetting.modelSlotIndex];\n  }, [serverSetting.serverSetting.modelSlotIndex, serverSetting.serverSetting.modelSlots]);\n  var settingArea = (0,react__WEBPACK_IMPORTED_MODULE_1__.useMemo)(function () {\n    if (!selected) {\n      return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement((react__WEBPACK_IMPORTED_MODULE_1___default().Fragment), null);\n    }\n    if (selected.voiceChangerType != \"Diffusion-SVC\") {\n      return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement((react__WEBPACK_IMPORTED_MODULE_1___default().Fragment), null);\n    }\n    var kStepRow = /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n      className: \"character-area-control\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n      className: \"character-area-control-title\"\n    }, \"k-step:\"), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n      className: \"character-area-control-field\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n      className: \"character-area-slider-control\"\n    }, 
/*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n      className: \"character-area-slider-control-kind\"\n    }), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n      className: \"character-area-slider-control-slider\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"input\", {\n      type: \"range\",\n      min: \"0\",\n      max: selected.kStepMax,\n      step: \"1\",\n      value: serverSetting.serverSetting.kStep,\n      onChange: function onChange(e) {\n        serverSetting.updateServerSettings(_objectSpread(_objectSpread({}, serverSetting.serverSetting), {}, {\n          kStep: Number(e.target.value)\n        }));\n      }\n    })), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n      className: \"character-area-slider-control-val\"\n    }, serverSetting.serverSetting.kStep))));\n    var speedUpRow = /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n      className: \"character-area-control\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n      className: \"character-area-control-title\"\n    }, \"speedup\"), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n      className: \"character-area-control-field\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n      className: \"character-area-slider-control\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n      className: \"character-area-slider-control-kind\"\n    }), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n      className: \"character-area-slider-control-slider\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"input\", {\n      type: \"range\",\n      min: \"0\",\n      max: serverSetting.serverSetting.kStep,\n      step: \"1\",\n      value: serverSetting.serverSetting.speedUp,\n      onChange: function onChange(e) {\n        serverSetting.updateServerSettings(_objectSpread(_objectSpread({}, serverSetting.serverSetting), {}, {\n          speedUp: Number(e.target.value)\n        }));\n      }\n    })), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n      className: \"character-area-slider-control-val\"\n    }, serverSetting.serverSetting.speedUp))));\n    return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement((react__WEBPACK_IMPORTED_MODULE_1___default().Fragment), null, kStepRow, speedUpRow);\n  }, [serverSetting.serverSetting, serverSetting.updateServerSettings, selected]);\n  return settingArea;\n};\n\n//# sourceURL=webpack://demo/./src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx?");
+eval("__webpack_require__.r(__webpack_exports__);\n/* harmony export */ __webpack_require__.d(__webpack_exports__, {\n/* harmony export */   DiffusionSVCSettingArea: () => (/* binding */ DiffusionSVCSettingArea)\n/* harmony export */ });\n/* harmony import */ var _babel_runtime_helpers_toConsumableArray__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! @babel/runtime/helpers/toConsumableArray */ \"./node_modules/@babel/runtime/helpers/esm/toConsumableArray.js\");\n/* harmony import */ var _babel_runtime_helpers_defineProperty__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! @babel/runtime/helpers/defineProperty */ \"./node_modules/@babel/runtime/helpers/esm/defineProperty.js\");\n/* harmony import */ var react__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! react */ \"./node_modules/react/index.js\");\n/* harmony import */ var react__WEBPACK_IMPORTED_MODULE_2___default = /*#__PURE__*/__webpack_require__.n(react__WEBPACK_IMPORTED_MODULE_2__);\n/* harmony import */ var _001_provider_001_AppStateProvider__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ../../../001_provider/001_AppStateProvider */ \"./src/001_provider/001_AppStateProvider.tsx\");\n\n\nfunction ownKeys(object, enumerableOnly) { var keys = Object.keys(object); if (Object.getOwnPropertySymbols) { var symbols = Object.getOwnPropertySymbols(object); enumerableOnly && (symbols = symbols.filter(function (sym) { return Object.getOwnPropertyDescriptor(object, sym).enumerable; })), keys.push.apply(keys, symbols); } return keys; }\nfunction _objectSpread(target) { for (var i = 1; i < arguments.length; i++) { var source = null != arguments[i] ? arguments[i] : {}; i % 2 ? ownKeys(Object(source), !0).forEach(function (key) { (0,_babel_runtime_helpers_defineProperty__WEBPACK_IMPORTED_MODULE_1__[\"default\"])(target, key, source[key]); }) : Object.getOwnPropertyDescriptors ? 
Object.defineProperties(target, Object.getOwnPropertyDescriptors(source)) : ownKeys(Object(source)).forEach(function (key) { Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key)); }); } return target; }\n\n\nvar DiffusionSVCSettingArea = function DiffusionSVCSettingArea(_props) {\n  var _useAppState = (0,_001_provider_001_AppStateProvider__WEBPACK_IMPORTED_MODULE_3__.useAppState)(),\n    serverSetting = _useAppState.serverSetting;\n  var selected = (0,react__WEBPACK_IMPORTED_MODULE_2__.useMemo)(function () {\n    if (serverSetting.serverSetting.modelSlotIndex == undefined) {\n      return;\n    }\n    return serverSetting.serverSetting.modelSlots[serverSetting.serverSetting.modelSlotIndex];\n  }, [serverSetting.serverSetting.modelSlotIndex, serverSetting.serverSetting.modelSlots]);\n  var settingArea = (0,react__WEBPACK_IMPORTED_MODULE_2__.useMemo)(function () {\n    if (!selected) {\n      return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement((react__WEBPACK_IMPORTED_MODULE_2___default().Fragment), null);\n    }\n    if (selected.voiceChangerType != \"Diffusion-SVC\") {\n      return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement((react__WEBPACK_IMPORTED_MODULE_2___default().Fragment), null);\n    }\n    var skipValues = getDivisors(serverSetting.serverSetting.kStep);\n    skipValues.pop();\n    var kStepRow = /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n      className: \"character-area-control\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n      className: \"character-area-control-title\"\n    }, \"k-step:\"), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n      className: \"character-area-control-field\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n      className: \"character-area-slider-control\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n      className: \"character-area-slider-control-kind\"\n    }), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n      className: \"character-area-slider-control-slider\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"input\", {\n      type: \"range\",\n      min: \"2\",\n      max: selected.kStepMax,\n      step: \"1\",\n      value: serverSetting.serverSetting.kStep,\n      onChange: function onChange(e) {\n        var newKStep = Number(e.target.value);\n        var newSkipValues = getDivisors(Number(e.target.value));\n        newSkipValues.pop();\n        serverSetting.updateServerSettings(_objectSpread(_objectSpread({}, serverSetting.serverSetting), {}, {\n          speedUp: Math.max.apply(Math, (0,_babel_runtime_helpers_toConsumableArray__WEBPACK_IMPORTED_MODULE_0__[\"default\"])(newSkipValues)),\n          kStep: newKStep\n        }));\n      }\n    })), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n      className: \"character-area-slider-control-val\"\n    }, serverSetting.serverSetting.kStep))));\n    var speedUpRow = /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n      className: \"character-area-control\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n      className: \"character-area-control-title\"\n    }, \"skip\"), 
/*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n      className: \"character-area-control-field\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n      className: \"character-area-slider-control\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n      className: \"character-area-slider-control-kind\"\n    }), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n      className: \"character-area-slider-control-slider\"\n    }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"select\", {\n      name: \"\",\n      id: \"\",\n      value: serverSetting.serverSetting.speedUp,\n      onChange: function onChange(e) {\n        serverSetting.updateServerSettings(_objectSpread(_objectSpread({}, serverSetting.serverSetting), {}, {\n          speedUp: Number(e.target.value)\n        }));\n      }\n    }, skipValues.map(function (v) {\n      return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"option\", {\n        value: v,\n        key: v\n      }, v);\n    }))))));\n    return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement((react__WEBPACK_IMPORTED_MODULE_2___default().Fragment), null, kStepRow, speedUpRow);\n  }, [serverSetting.serverSetting, serverSetting.updateServerSettings, selected]);\n  return settingArea;\n};\nvar getDivisors = function getDivisors(num) {\n  var divisors = [];\n  var end = Math.sqrt(num);\n  for (var i = 1; i <= end; i++) {\n    if (num % i === 0) {\n      divisors.push(i);\n      if (i !== num / i) {\n        divisors.push(num / i);\n      }\n    }\n  }\n  return divisors.sort(function (a, b) {\n    return a - b;\n  });\n};\n\n//# sourceURL=webpack://demo/./src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx?");
 
 /***/ }),
 
diff --git a/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx b/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx
index fc998ede..dc14a12b 100644
--- a/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx
+++ b/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx
@@ -23,6 +23,9 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
             return <></>;
         }
 
+        const skipValues = getDivisors(serverSetting.serverSetting.kStep);
+        skipValues.pop();
+
         const kStepRow = (
             <div className="character-area-control">
                 <div className="character-area-control-title">k-step:</div>
@@ -32,12 +35,15 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
                         <span className="character-area-slider-control-slider">
                             <input
                                 type="range"
-                                min="0"
+                                min="2"
                                 max={(selected as DiffusionSVCModelSlot).kStepMax}
                                 step="1"
                                 value={serverSetting.serverSetting.kStep}
                                 onChange={(e) => {
-                                    serverSetting.updateServerSettings({ ...serverSetting.serverSetting, kStep: Number(e.target.value) });
+                                    const newKStep = Number(e.target.value);
+                                    const newSkipValues = getDivisors(Number(e.target.value));
+                                    newSkipValues.pop();
+                                    serverSetting.updateServerSettings({ ...serverSetting.serverSetting, speedUp: Math.max(...newSkipValues), kStep: newKStep });
                                 }}
                             ></input>
                         </span>
@@ -48,23 +54,28 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
         );
         const speedUpRow = (
             <div className="character-area-control">
-                <div className="character-area-control-title">speedup</div>
+                <div className="character-area-control-title">skip</div>
                 <div className="character-area-control-field">
                     <div className="character-area-slider-control">
                         <span className="character-area-slider-control-kind"></span>
                         <span className="character-area-slider-control-slider">
-                            <input
-                                type="range"
-                                min="0"
-                                max={serverSetting.serverSetting.kStep}
-                                step="1"
+                            <select
+                                name=""
+                                id=""
                                 value={serverSetting.serverSetting.speedUp}
                                 onChange={(e) => {
                                     serverSetting.updateServerSettings({ ...serverSetting.serverSetting, speedUp: Number(e.target.value) });
                                 }}
-                            ></input>
+                            >
+                                {skipValues.map((v) => {
+                                    return (
+                                        <option value={v} key={v}>
+                                            {v}
+                                        </option>
+                                    );
+                                })}
+                            </select>
                         </span>
-                        <span className="character-area-slider-control-val">{serverSetting.serverSetting.speedUp}</span>
                     </div>
                 </div>
             </div>
@@ -79,3 +90,19 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) =>
 
     return settingArea;
 };
+
+const getDivisors = (num: number) => {
+    const divisors: number[] = [];
+    const end = Math.sqrt(num);
+
+    for (let i = 1; i <= end; i++) {
+        if (num % i === 0) {
+            divisors.push(i);
+            if (i !== num / i) {
+                divisors.push(num / i);
+            }
+        }
+    }
+
+    return divisors.sort((a, b) => a - b);
+};
diff --git a/server/restapi/MMVC_Rest_Fileuploader.py b/server/restapi/MMVC_Rest_Fileuploader.py
index 160136e5..5361d6df 100644
--- a/server/restapi/MMVC_Rest_Fileuploader.py
+++ b/server/restapi/MMVC_Rest_Fileuploader.py
@@ -119,6 +119,8 @@ class MMVC_Rest_Fileuploader:
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
             print("[Voice Changer] post_update_model_default ex:", e)
+            import traceback
+            traceback.print_exc()
 
     def post_update_model_info(self, newData: str = Form(...)):
         try:
diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
index 079d4697..98377a3e 100644
--- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py
+++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
@@ -202,10 +202,10 @@ class DiffusionSVC(VoiceChangerModel):
             },
             {
                 "key": "defaultKstep",
-                "val": self.settings.kstep,
+                "val": self.settings.kStep,
             },
             {
                 "key": "defaultSpeedup",
-                "val": self.settings.speedup,
+                "val": self.settings.speedUp,
             },
         ]
diff --git a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
index 49bbf4cc..eeb3e43b 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
@@ -6,6 +6,7 @@ from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2me
 from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
 
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.utils.Timer import Timer
 
 
 class DiffusionSVCInferencer(Inferencer):
@@ -100,6 +101,7 @@ class DiffusionSVCInferencer(Inferencer):
     @torch.no_grad()
     def infer(
         self,
+        audio_t: torch.Tensor,
         feats: torch.Tensor,
         pitch: torch.Tensor,
         volume: torch.Tensor,
@@ -109,10 +111,22 @@ class DiffusionSVCInferencer(Inferencer):
         infer_speedup: int,
         silence_front: float,
     ) -> torch.Tensor:
-        gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
-        out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
-        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
-        out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+        with Timer("pre-process") as t:
+            gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
+            # gt_spec = self.vocoder.extract(audio_t, 16000)
+            # gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
 
-        out_wav *= mask
+        # print("[    ----Timer::1: ]", t.secs)
+
+        with Timer("pre-process") as t:
+            out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+
+        # print("[    ----Timer::2: ]", t.secs)
+        with Timer("pre-process") as t:  # NOQA
+            start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
+            out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+
+            out_wav *= mask
+        # print("[    ----Timer::3: ]", t.secs, start_frame, out_mel.shape)
+
         return out_wav.squeeze()
diff --git a/server/voice_changer/DiffusionSVC/onnxExporter/DiffusionSVC_ONNX.py b/server/voice_changer/DiffusionSVC/onnxExporter/DiffusionSVC_ONNX.py
new file mode 100644
index 00000000..fecfbccd
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/onnxExporter/DiffusionSVC_ONNX.py
@@ -0,0 +1,90 @@
+import numpy as np
+import torch
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import load_model_vocoder_from_combo
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+
+
+class DiffusionSVC_ONNX:
+    def __init__(self, file: str, gpu: int):
+        self.dev = DeviceManager.get_instance().getDevice(gpu)
+        diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
+        self.diff_model = diff_model
+        self.naive_model = naive_model
+        self.vocoder = vocoder
+        self.diff_args = diff_args
+        self.naive_args = naive_args
+
+    def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None):  # FIXME: looks copied from the RVC ONNX wrapper; emb_g/enc_p/flow/dec are not defined on this class
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+
+
+    @torch.no_grad()  # Most basic inference code: normalize the inputs to tensors and work only with mel spectrograms
+    def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
+                 gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
+                 spk_emb=None):
+
+
+        aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
+
+        # spk_id
+        spk_emb_dict = None
+        if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
+            spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
+        # without speaker encoder
+        else:
+            spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
+
+        return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+
+    @torch.no_grad()
+    def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, spk_emb=None):
+        spk_emb_dict = None  # spk_id / speaker-embedding handling
+        if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
+            spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
+        else:  # without speaker encoder
+            spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
+        aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
+        out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, infer=True, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+        return out_spec
+
+    @torch.no_grad()
+    def mel2wav(self, mel, f0, start_frame=0):
+        if start_frame == 0:
+            return self.vocoder.infer(mel, f0)
+        else:  # for realtime speedup
+            mel = mel[:, start_frame:, :]
+            f0 = f0[:, start_frame:, :]
+            out_wav = self.vocoder.infer(mel, f0)
+            return torch.nn.functional.pad(out_wav, (start_frame * self.vocoder.vocoder_hop_size, 0))
+
+    @torch.no_grad()
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch: torch.Tensor,
+        volume: torch.Tensor,
+        mask: torch.Tensor,
+        sid: torch.Tensor,
+        k_step: int,
+        infer_speedup: int,
+        silence_front: float,
+    ) -> torch.Tensor:
+        # NOTE: an earlier draft called naive_model directly here; naive_model_call()
+        # below does the same thing and also handles the spk_id conversion.
+        # aug_shift = torch.LongTensor([0]).to(feats.device)
+        # out_spec = self.naive_model(feats, pitch, volume, sid, spk_mix_dict=None,
+        #                             aug_shift=aug_shift, infer=True,
+        #                             spk_emb=None, spk_emb_dict=None)
+
+        gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
+        out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
+        out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+
+        out_wav *= mask
+        return out_wav.squeeze()
diff --git a/server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py b/server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py
new file mode 100644
index 00000000..421550d8
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py
@@ -0,0 +1,125 @@
+import os
+import json
+import torch
+from onnxsim import simplify
+import onnx
+from const import TMP_DIR, EnumInferenceTypes
+from data.ModelSlot import DiffusionSVCModelSlot
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+
+
+def export2onnx(gpu: int, modelSlot: DiffusionSVCModelSlot):
+    modelFile = modelSlot.modelFile
+
+    output_file = os.path.splitext(os.path.basename(modelFile))[0] + ".onnx"
+    output_file_simple = os.path.splitext(os.path.basename(modelFile))[0] + "_simple.onnx"
+    output_path = os.path.join(TMP_DIR, output_file)
+    output_path_simple = os.path.join(TMP_DIR, output_file_simple)
+    metadata = {
+        "application": "VC_CLIENT",
+        "version": "3",
+        "voiceChangerType": modelSlot.voiceChangerType,
+        "modelType": modelSlot.modelType,
+        "samplingRate": modelSlot.samplingRate,
+        "embChannels": modelSlot.embChannels,
+        "embedder": modelSlot.embedder
+    }
+    gpuMemory = DeviceManager.get_instance().getDeviceMemory(gpu)
+    print(f"[Voice Changer] exporting onnx... gpu_id:{gpu} gpu_mem:{gpuMemory}")
+
+    if gpuMemory > 0:
+        _export2onnx(modelFile, output_path, output_path_simple, True, metadata)
+    else:
+        print("[Voice Changer] Warning!!! ONNX export falls back to float32; the exported file may be about twice as large.")
+        _export2onnx(modelFile, output_path, output_path_simple, False, metadata)
+    return output_file_simple
+
+
+def _export2onnx(input_model, output_model, output_model_simple, is_half, metadata):
+    cpt = torch.load(input_model, map_location="cpu")
+    if is_half:
+        dev = torch.device("cuda", index=0)
+    else:
+        dev = torch.device("cpu")
+    # FIXME: the SynthesizerTrn*_ONNX branches below are carried over from the RVC
+    # exporter; those classes are not imported here and do not load Diffusion-SVC
+    # checkpoints (the DiffusionSVC_ONNX wrapper added in this patch is likely the intended replacement).
+
+    # EnumInferenceTypes cannot be serialized as-is, so it is compared/stored as text
+    if metadata["modelType"] == EnumInferenceTypes.pyTorchRVC.value:
+        net_g_onnx = SynthesizerTrnMs256NSFsid_ONNX(*cpt["config"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchWebUI.value:
+        net_g_onnx = SynthesizerTrnMsNSFsid_webui_ONNX(**cpt["params"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCNono.value:
+        net_g_onnx = SynthesizerTrnMs256NSFsid_nono_ONNX(*cpt["config"])
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchWebUINono.value:
+        net_g_onnx = SynthesizerTrnMsNSFsidNono_webui_ONNX(**cpt["params"])
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCv2.value:
+        net_g_onnx = SynthesizerTrnMs768NSFsid_ONNX(*cpt["config"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCv2Nono.value:
+        net_g_onnx = SynthesizerTrnMs768NSFsid_nono_ONNX(*cpt["config"])
+    else:
+        print(
+            "unknown::::: ",
+            metadata["modelType"],
+            EnumInferenceTypes.pyTorchRVCv2.value,
+        )  # NOTE: net_g_onnx is left undefined on this path
+
+    net_g_onnx.eval().to(dev)
+    net_g_onnx.load_state_dict(cpt["weight"], strict=False)
+    if is_half:
+        net_g_onnx = net_g_onnx.half()
+
+    if is_half:
+        feats = torch.HalfTensor(1, 2192, metadata["embChannels"]).to(dev)
+    else:
+        feats = torch.FloatTensor(1, 2192, metadata["embChannels"]).to(dev)
+    p_len = torch.LongTensor([2192]).to(dev)
+    sid = torch.LongTensor([0]).to(dev)
+
+    if metadata["f0"] is True:  # FIXME: "f0" is not set in the metadata dict built above (RVC leftover)
+        pitch = torch.zeros(1, 2192, dtype=torch.int64).to(dev)
+        pitchf = torch.FloatTensor(1, 2192).to(dev)
+        input_names = ["feats", "p_len", "pitch", "pitchf", "sid"]
+        inputs = (
+            feats,
+            p_len,
+            pitch,
+            pitchf,
+            sid,
+        )
+
+    else:
+        input_names = ["feats", "p_len", "sid"]
+        inputs = (
+            feats,
+            p_len,
+            sid,
+        )
+
+    output_names = [
+        "audio",
+    ]
+
+    torch.onnx.export(
+        net_g_onnx,
+        inputs,
+        output_model,
+        dynamic_axes={
+            "feats": [1],
+            "pitch": [1],
+            "pitchf": [1],
+        },
+        do_constant_folding=False,
+        opset_version=17,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names,
+    )
+
+    model_onnx2 = onnx.load(output_model)
+    model_simp, check = simplify(model_onnx2)
+    meta = model_simp.metadata_props.add()
+    meta.key = "metadata"
+    meta.value = json.dumps(metadata)
+    onnx.save(model_simp, output_model_simple)
diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
index 53f236e7..d3962531 100644
--- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
+++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
@@ -16,6 +16,8 @@ from voice_changer.RVC.embedder.Embedder import Embedder
 from voice_changer.common.VolumeExtractor import VolumeExtractor
 from torchaudio.transforms import Resample
 
+from voice_changer.utils.Timer import Timer
+
 
 class Pipeline(object):
     embedder: Embedder
@@ -112,83 +114,95 @@ class Pipeline(object):
         useFinalProj,
         protect=0.5
     ):
-        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-        audio16k = self.resamplerIn(audio_t)
-        volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
-        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
-        n_frames = int(audio16k.size(-1) // self.hop_size + 1)
+        # print("---------- pipe line --------------------")
+        with Timer("pre-process") as t:
+            audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
+            audio16k = self.resamplerIn(audio_t)
+            volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
+            sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+            n_frames = int(audio16k.size(-1) // self.hop_size + 1)
+        # print("[Timer::1: ]", t.secs)
 
-        # ピッチ検出
-        try:
-            pitch = self.pitchExtractor.extract(
-                audio16k.squeeze(),
-                pitchf,
-                f0_up_key,
-                int(self.hop_size),    # 処理のwindowサイズ (44100における512)
-                silence_front=silence_front,
-            )
-
-            pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
-        except IndexError as e:  # NOQA
-            raise NotEnoughDataExtimateF0()
-
-        # tensor型調整
-        feats = audio16k.squeeze()
-        if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1)
-        feats = feats.view(1, -1)
-
-        # embedding
-        with autocast(enabled=self.isHalf):
+        with Timer("pre-process") as t:
+            # pitch detection
             try:
-                feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
-                if torch.isnan(feats).all():
-                    raise DeviceCannotSupportHalfPrecisionException()
+                pitch = self.pitchExtractor.extract(
+                    audio16k.squeeze(),
+                    pitchf,
+                    f0_up_key,
+                    int(self.hop_size),    # processing window size (512 at 44100 Hz)
+                    silence_front=silence_front,
+                )
+
+                pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
+            except IndexError as e:  # NOQA
+                raise NotEnoughDataExtimateF0()
+
+            # adjust tensor shape/type
+            feats = audio16k.squeeze()
+            if feats.dim() == 2:  # double channels
+                feats = feats.mean(-1)
+            feats = feats.view(1, -1)
+        # print("[Timer::2: ]", t.secs)
+
+        with Timer("pre-process") as t:
+
+            # embedding
+            with autocast(enabled=self.isHalf):
+                try:
+                    feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
+                    if torch.isnan(feats).all():
+                        raise DeviceCannotSupportHalfPrecisionException()
+                except RuntimeError as e:
+                    if "HALF" in e.__str__().upper():
+                        raise HalfPrecisionChangingException()
+                    elif "same device" in e.__str__():
+                        raise DeviceChangingException()
+                    else:
+                        raise e
+            feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+        # print("[Timer::3: ]", t.secs)
+
+        with Timer("pre-process") as t:
+            # run inference
+            try:
+                with torch.no_grad():
+                    with autocast(enabled=self.isHalf):
+                        audio1 = (
+                            torch.clip(
+                                self.inferencer.infer(
+                                    audio16k,
+                                    feats,
+                                    pitch.unsqueeze(-1),
+                                    volume,
+                                    mask,
+                                    sid,
+                                    k_step,
+                                    infer_speedup,
+                                    silence_front=silence_front
+                                    ).to(dtype=torch.float32),
+                                -1.0,
+                                1.0,
+                            )
+                            * 32767.5
+                        ).data.to(dtype=torch.int16)
             except RuntimeError as e:
                 if "HALF" in e.__str__().upper():
+                    print("11", e)
                     raise HalfPrecisionChangingException()
-                elif "same device" in e.__str__():
-                    raise DeviceChangingException()
                 else:
                     raise e
-        feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+        # print("[Timer::4: ]", t.secs)
 
-        # 推論実行
-        try:
-            with torch.no_grad():
-                with autocast(enabled=self.isHalf):
-                    print("[EMBEDDER EXTRACT:::]", feats.shape, pitch.unsqueeze(-1).shape, volume.shape, mask.shape)
-                    audio1 = (
-                        torch.clip(
-                            self.inferencer.infer(
-                                feats,
-                                pitch.unsqueeze(-1),
-                                volume,
-                                mask,
-                                sid,
-                                k_step,
-                                infer_speedup,
-                                silence_front=silence_front
-                                ).to(dtype=torch.float32),
-                            -1.0,
-                            1.0,
-                        )
-                        * 32767.5
-                    ).data.to(dtype=torch.int16)
-        except RuntimeError as e:
-            if "HALF" in e.__str__().upper():
-                print("11", e)
-                raise HalfPrecisionChangingException()
+        with Timer("pre-process") as t:  # NOQA
+            feats_buffer = feats.squeeze(0).detach().cpu()
+            if pitch is not None:
+                pitch_buffer = pitch.squeeze(0).detach().cpu()
             else:
-                raise e
+                pitch_buffer = None
 
-        feats_buffer = feats.squeeze(0).detach().cpu()
-        if pitch is not None:
-            pitch_buffer = pitch.squeeze(0).detach().cpu()
-        else:
-            pitch_buffer = None
-
-        del pitch, pitchf, feats, sid
-        torch.cuda.empty_cache()
-        audio1 = self.resamplerOut(audio1.float())
+            del pitch, pitchf, feats, sid
+            torch.cuda.empty_cache()
+            audio1 = self.resamplerOut(audio1.float())
+        # print("[Timer::5: ]", t.secs)
         return audio1, pitch_buffer, feats_buffer
diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py
index 5c7724f7..8223c44f 100644
--- a/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py
@@ -18,6 +18,7 @@ class DioPitchExtractor(PitchExtractor):
 
     def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
+        silence_front = 0  # TODO: workaround for pitch extraction failing when the chunk size is small
         start_frame = int(silence_front * self.sapmle_rate / window)
         real_silence_front = start_frame * window / self.sapmle_rate
         audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
diff --git a/server/voice_changer/utils/VoiceChangerIF.py b/server/voice_changer/utils/VoiceChangerIF.py
index 7bd77074..8063528e 100644
--- a/server/voice_changer/utils/VoiceChangerIF.py
+++ b/server/voice_changer/utils/VoiceChangerIF.py
@@ -10,7 +10,7 @@ class VoiceChangerIF(Protocol):
 
     def get_info(self) -> dict[str, Any]:
         ...
-        
+
     def get_performance(self) -> list[int]:
         ...
 
@@ -25,4 +25,3 @@ class VoiceChangerIF(Protocol):
 
     def export2onnx() -> Any:
         ...
-