From b9429c7655e1c27e014450c809bee2eae65ce9d6 Mon Sep 17 00:00:00 2001
From: w-okada <you@example.com>
Date: Mon, 17 Jul 2023 07:21:06 +0900
Subject: [PATCH] WIP: diffusion svc refining

---
 client/demo/dist/index.js                     |   2 +-
 .../101-7_diffusion-svcSettingArea.tsx        |  47 ++++--
 server/restapi/MMVC_Rest_Fileuploader.py      |   2 +
 .../DiffusionSVC/DiffusionSVC.py              |   4 +-
 .../inferencer/DiffusionSVCInferencer.py      |  24 ++-
 .../onnxExporter/DiffusionSVC_ONNX.py         |  90 +++++++++++
 .../DiffusionSVC/onnxExporter/export2onnx.py  | 125 ++++++++++++++
 .../DiffusionSVC/pipeline/Pipeline.py         | 152 ++++++++++--------
 .../pitchExtractor/DioPitchExtractor.py       |   1 +
 server/voice_changer/utils/VoiceChangerIF.py  |   3 +-
 10 files changed, 361 insertions(+), 89 deletions(-)
 create mode 100644 server/voice_changer/DiffusionSVC/onnxExporter/DiffusionSVC_ONNX.py
 create mode 100644 server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py

diff --git a/client/demo/dist/index.js b/client/demo/dist/index.js
index df39b33d..2d71a580 100644
--- a/client/demo/dist/index.js
+++ b/client/demo/dist/index.js
@@ -389,7 +389,7 @@ eval("__webpack_require__.r(__webpack_exports__);\n/* harmony export */ __webpac
 /***/ ((__unused_webpack_module, __webpack_exports__, __webpack_require__) => {
 
 "use strict";
-eval("__webpack_require__.r(__webpack_exports__);\n/* harmony export */ __webpack_require__.d(__webpack_exports__, {\n/* harmony export */ DiffusionSVCSettingArea: () => (/* binding */ DiffusionSVCSettingArea)\n/* harmony export */ });\n/* harmony import */ var _babel_runtime_helpers_defineProperty__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! @babel/runtime/helpers/defineProperty */ \"./node_modules/@babel/runtime/helpers/esm/defineProperty.js\");\n/* harmony import */ var react__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! react */ \"./node_modules/react/index.js\");\n/* harmony import */ var react__WEBPACK_IMPORTED_MODULE_1___default = /*#__PURE__*/__webpack_require__.n(react__WEBPACK_IMPORTED_MODULE_1__);\n/* harmony import */ var _001_provider_001_AppStateProvider__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! ../../../001_provider/001_AppStateProvider */ \"./src/001_provider/001_AppStateProvider.tsx\");\n\nfunction ownKeys(object, enumerableOnly) { var keys = Object.keys(object); if (Object.getOwnPropertySymbols) { var symbols = Object.getOwnPropertySymbols(object); enumerableOnly && (symbols = symbols.filter(function (sym) { return Object.getOwnPropertyDescriptor(object, sym).enumerable; })), keys.push.apply(keys, symbols); } return keys; }\nfunction _objectSpread(target) { for (var i = 1; i < arguments.length; i++) { var source = null != arguments[i] ? arguments[i] : {}; i % 2 ? ownKeys(Object(source), !0).forEach(function (key) { (0,_babel_runtime_helpers_defineProperty__WEBPACK_IMPORTED_MODULE_0__[\"default\"])(target, key, source[key]); }) : Object.getOwnPropertyDescriptors ? 
Object.defineProperties(target, Object.getOwnPropertyDescriptors(source)) : ownKeys(Object(source)).forEach(function (key) { Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key)); }); } return target; }\n\n\nvar DiffusionSVCSettingArea = function DiffusionSVCSettingArea(_props) {\n var _useAppState = (0,_001_provider_001_AppStateProvider__WEBPACK_IMPORTED_MODULE_2__.useAppState)(),\n serverSetting = _useAppState.serverSetting;\n var selected = (0,react__WEBPACK_IMPORTED_MODULE_1__.useMemo)(function () {\n if (serverSetting.serverSetting.modelSlotIndex == undefined) {\n return;\n }\n return serverSetting.serverSetting.modelSlots[serverSetting.serverSetting.modelSlotIndex];\n }, [serverSetting.serverSetting.modelSlotIndex, serverSetting.serverSetting.modelSlots]);\n var settingArea = (0,react__WEBPACK_IMPORTED_MODULE_1__.useMemo)(function () {\n if (!selected) {\n return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement((react__WEBPACK_IMPORTED_MODULE_1___default().Fragment), null);\n }\n if (selected.voiceChangerType != \"Diffusion-SVC\") {\n return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement((react__WEBPACK_IMPORTED_MODULE_1___default().Fragment), null);\n }\n var kStepRow = /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n className: \"character-area-control\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n className: \"character-area-control-title\"\n }, \"k-step:\"), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n className: \"character-area-control-field\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n className: \"character-area-slider-control\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n className: \"character-area-slider-control-kind\"\n }), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n className: \"character-area-slider-control-slider\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"input\", {\n type: \"range\",\n min: \"0\",\n max: selected.kStepMax,\n step: \"1\",\n value: serverSetting.serverSetting.kStep,\n onChange: function onChange(e) {\n serverSetting.updateServerSettings(_objectSpread(_objectSpread({}, serverSetting.serverSetting), {}, {\n kStep: Number(e.target.value)\n }));\n }\n })), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n className: \"character-area-slider-control-val\"\n }, serverSetting.serverSetting.kStep))));\n var speedUpRow = /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n className: \"character-area-control\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n className: \"character-area-control-title\"\n }, \"speedup\"), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n className: \"character-area-control-field\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"div\", {\n className: \"character-area-slider-control\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n className: \"character-area-slider-control-kind\"\n }), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n className: \"character-area-slider-control-slider\"\n }, 
/*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"input\", {\n type: \"range\",\n min: \"0\",\n max: serverSetting.serverSetting.kStep,\n step: \"1\",\n value: serverSetting.serverSetting.speedUp,\n onChange: function onChange(e) {\n serverSetting.updateServerSettings(_objectSpread(_objectSpread({}, serverSetting.serverSetting), {}, {\n speedUp: Number(e.target.value)\n }));\n }\n })), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement(\"span\", {\n className: \"character-area-slider-control-val\"\n }, serverSetting.serverSetting.speedUp))));\n return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_1___default().createElement((react__WEBPACK_IMPORTED_MODULE_1___default().Fragment), null, kStepRow, speedUpRow);\n }, [serverSetting.serverSetting, serverSetting.updateServerSettings, selected]);\n return settingArea;\n};\n\n//# sourceURL=webpack://demo/./src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx?"); +eval("__webpack_require__.r(__webpack_exports__);\n/* harmony export */ __webpack_require__.d(__webpack_exports__, {\n/* harmony export */ DiffusionSVCSettingArea: () => (/* binding */ DiffusionSVCSettingArea)\n/* harmony export */ });\n/* harmony import */ var _babel_runtime_helpers_toConsumableArray__WEBPACK_IMPORTED_MODULE_0__ = __webpack_require__(/*! @babel/runtime/helpers/toConsumableArray */ \"./node_modules/@babel/runtime/helpers/esm/toConsumableArray.js\");\n/* harmony import */ var _babel_runtime_helpers_defineProperty__WEBPACK_IMPORTED_MODULE_1__ = __webpack_require__(/*! @babel/runtime/helpers/defineProperty */ \"./node_modules/@babel/runtime/helpers/esm/defineProperty.js\");\n/* harmony import */ var react__WEBPACK_IMPORTED_MODULE_2__ = __webpack_require__(/*! react */ \"./node_modules/react/index.js\");\n/* harmony import */ var react__WEBPACK_IMPORTED_MODULE_2___default = /*#__PURE__*/__webpack_require__.n(react__WEBPACK_IMPORTED_MODULE_2__);\n/* harmony import */ var _001_provider_001_AppStateProvider__WEBPACK_IMPORTED_MODULE_3__ = __webpack_require__(/*! ../../../001_provider/001_AppStateProvider */ \"./src/001_provider/001_AppStateProvider.tsx\");\n\n\nfunction ownKeys(object, enumerableOnly) { var keys = Object.keys(object); if (Object.getOwnPropertySymbols) { var symbols = Object.getOwnPropertySymbols(object); enumerableOnly && (symbols = symbols.filter(function (sym) { return Object.getOwnPropertyDescriptor(object, sym).enumerable; })), keys.push.apply(keys, symbols); } return keys; }\nfunction _objectSpread(target) { for (var i = 1; i < arguments.length; i++) { var source = null != arguments[i] ? arguments[i] : {}; i % 2 ? ownKeys(Object(source), !0).forEach(function (key) { (0,_babel_runtime_helpers_defineProperty__WEBPACK_IMPORTED_MODULE_1__[\"default\"])(target, key, source[key]); }) : Object.getOwnPropertyDescriptors ? 
Object.defineProperties(target, Object.getOwnPropertyDescriptors(source)) : ownKeys(Object(source)).forEach(function (key) { Object.defineProperty(target, key, Object.getOwnPropertyDescriptor(source, key)); }); } return target; }\n\n\nvar DiffusionSVCSettingArea = function DiffusionSVCSettingArea(_props) {\n var _useAppState = (0,_001_provider_001_AppStateProvider__WEBPACK_IMPORTED_MODULE_3__.useAppState)(),\n serverSetting = _useAppState.serverSetting;\n var selected = (0,react__WEBPACK_IMPORTED_MODULE_2__.useMemo)(function () {\n if (serverSetting.serverSetting.modelSlotIndex == undefined) {\n return;\n }\n return serverSetting.serverSetting.modelSlots[serverSetting.serverSetting.modelSlotIndex];\n }, [serverSetting.serverSetting.modelSlotIndex, serverSetting.serverSetting.modelSlots]);\n var settingArea = (0,react__WEBPACK_IMPORTED_MODULE_2__.useMemo)(function () {\n if (!selected) {\n return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement((react__WEBPACK_IMPORTED_MODULE_2___default().Fragment), null);\n }\n if (selected.voiceChangerType != \"Diffusion-SVC\") {\n return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement((react__WEBPACK_IMPORTED_MODULE_2___default().Fragment), null);\n }\n var skipValues = getDivisors(serverSetting.serverSetting.kStep);\n skipValues.pop();\n var kStepRow = /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n className: \"character-area-control\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n className: \"character-area-control-title\"\n }, \"k-step:\"), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n className: \"character-area-control-field\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n className: \"character-area-slider-control\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n className: \"character-area-slider-control-kind\"\n }), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n className: \"character-area-slider-control-slider\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"input\", {\n type: \"range\",\n min: \"2\",\n max: selected.kStepMax,\n step: \"1\",\n value: serverSetting.serverSetting.kStep,\n onChange: function onChange(e) {\n var newKStep = Number(e.target.value);\n var newSkipValues = getDivisors(Number(e.target.value));\n newSkipValues.pop();\n serverSetting.updateServerSettings(_objectSpread(_objectSpread({}, serverSetting.serverSetting), {}, {\n speedUp: Math.max.apply(Math, (0,_babel_runtime_helpers_toConsumableArray__WEBPACK_IMPORTED_MODULE_0__[\"default\"])(newSkipValues)),\n kStep: newKStep\n }));\n }\n })), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n className: \"character-area-slider-control-val\"\n }, serverSetting.serverSetting.kStep))));\n var speedUpRow = /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n className: \"character-area-control\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n className: \"character-area-control-title\"\n }, \"skip\"), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n className: \"character-area-control-field\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"div\", {\n className: 
\"character-area-slider-control\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n className: \"character-area-slider-control-kind\"\n }), /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"span\", {\n className: \"character-area-slider-control-slider\"\n }, /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"select\", {\n name: \"\",\n id: \"\",\n value: serverSetting.serverSetting.speedUp,\n onChange: function onChange(e) {\n serverSetting.updateServerSettings(_objectSpread(_objectSpread({}, serverSetting.serverSetting), {}, {\n speedUp: Number(e.target.value)\n }));\n }\n }, skipValues.map(function (v) {\n return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement(\"option\", {\n value: v,\n key: v\n }, v);\n }))))));\n return /*#__PURE__*/react__WEBPACK_IMPORTED_MODULE_2___default().createElement((react__WEBPACK_IMPORTED_MODULE_2___default().Fragment), null, kStepRow, speedUpRow);\n }, [serverSetting.serverSetting, serverSetting.updateServerSettings, selected]);\n return settingArea;\n};\nvar getDivisors = function getDivisors(num) {\n var divisors = [];\n var end = Math.sqrt(num);\n for (var i = 1; i <= end; i++) {\n if (num % i === 0) {\n divisors.push(i);\n if (i !== num / i) {\n divisors.push(num / i);\n }\n }\n }\n return divisors.sort(function (a, b) {\n return a - b;\n });\n};\n\n//# sourceURL=webpack://demo/./src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx?"); /***/ }), diff --git a/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx b/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx index fc998ede..dc14a12b 100644 --- a/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx +++ b/client/demo/src/components/demo/components2/101-7_diffusion-svcSettingArea.tsx @@ -23,6 +23,9 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) => return <></>; } + const skipValues = getDivisors(serverSetting.serverSetting.kStep); + skipValues.pop(); + const kStepRow = ( <div className="character-area-control"> <div className="character-area-control-title">k-step:</div> @@ -32,12 +35,15 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) => <span className="character-area-slider-control-slider"> <input type="range" - min="0" + min="2" max={(selected as DiffusionSVCModelSlot).kStepMax} step="1" value={serverSetting.serverSetting.kStep} onChange={(e) => { - serverSetting.updateServerSettings({ ...serverSetting.serverSetting, kStep: Number(e.target.value) }); + const newKStep = Number(e.target.value); + const newSkipValues = getDivisors(Number(e.target.value)); + newSkipValues.pop(); + serverSetting.updateServerSettings({ ...serverSetting.serverSetting, speedUp: Math.max(...newSkipValues), kStep: newKStep }); }} ></input> </span> @@ -48,23 +54,28 @@ export const DiffusionSVCSettingArea = (_props: DiffusionSVCSettingAreaProps) => ); const speedUpRow = ( <div className="character-area-control"> - <div className="character-area-control-title">speedup</div> + <div className="character-area-control-title">skip</div> <div className="character-area-control-field"> <div className="character-area-slider-control"> <span className="character-area-slider-control-kind"></span> <span className="character-area-slider-control-slider"> - <input - type="range" - min="0" - max={serverSetting.serverSetting.kStep} - step="1" + <select + name="" + id="" 
diff --git a/server/restapi/MMVC_Rest_Fileuploader.py b/server/restapi/MMVC_Rest_Fileuploader.py
index 160136e5..5361d6df 100644
--- a/server/restapi/MMVC_Rest_Fileuploader.py
+++ b/server/restapi/MMVC_Rest_Fileuploader.py
@@ -119,6 +119,8 @@ class MMVC_Rest_Fileuploader:
             return JSONResponse(content=json_compatible_item_data)
         except Exception as e:
             print("[Voice Changer] post_update_model_default ex:", e)
+            import traceback
+            traceback.print_exc()
 
     def post_update_model_info(self, newData: str = Form(...)):
         try:
diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
index 079d4697..98377a3e 100644
--- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py
+++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py
@@ -202,10 +202,10 @@ class DiffusionSVC(VoiceChangerModel):
             },
             {
                 "key": "defaultKstep",
-                "val": self.settings.kstep,
+                "val": self.settings.kStep,
             },
             {
                 "key": "defaultSpeedup",
-                "val": self.settings.speedup,
+                "val": self.settings.speedUp,
             },
         ]
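The DiffusionSVC.py hunk fixes a casing mismatch: the settings object exposes kStep and speedUp, but the defaults were read as kstep and speedup. An illustrative sketch, assuming a dataclass-style settings holder (the project's actual DiffusionSVCSettings class may differ), of why the wrong casing fails fast at attribute access:

```python
# Illustrative only: attribute access on a dataclass surfaces the
# kstep/speedup vs. kStep/speedUp mismatch immediately.
from dataclasses import dataclass

@dataclass
class DiffusionSVCSettings:
    kStep: int = 20
    speedUp: int = 10

settings = DiffusionSVCSettings()
print(settings.kStep)    # ok
# print(settings.kstep)  # AttributeError: no such attribute
```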
diff --git a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
index 49bbf4cc..eeb3e43b 100644
--- a/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
+++ b/server/voice_changer/DiffusionSVC/inferencer/DiffusionSVCInferencer.py
@@ -6,6 +6,7 @@ from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2me
 from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.vocoder import Vocoder
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+from voice_changer.utils.Timer import Timer
 
 
 class DiffusionSVCInferencer(Inferencer):
@@ -100,6 +101,7 @@ class DiffusionSVCInferencer(Inferencer):
     @torch.no_grad()
     def infer(
         self,
+        audio_t: torch.Tensor,
         feats: torch.Tensor,
         pitch: torch.Tensor,
         volume: torch.Tensor,
@@ -109,10 +111,22 @@ class DiffusionSVCInferencer(Inferencer):
         infer_speedup: int,
         silence_front: float,
     ) -> torch.Tensor:
-        gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
-        out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
-        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
-        out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+        with Timer("naive-model") as t:
+            gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
+            # gt_spec = self.vocoder.extract(audio_t, 16000)
+            # gt_spec = torch.cat((gt_spec, gt_spec[:, -1:, :]), 1)
+        # print("[ ----Timer::1: ]", t.secs)
 
-        out_wav *= mask
+        with Timer("diffusion") as t:
+            out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+        # print("[ ----Timer::2: ]", t.secs)
+
+        with Timer("vocoder") as t:  # NOQA
+            start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
+            out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+            out_wav *= mask
+        # print("[ ----Timer::3: ]", t.secs, start_frame, out_mel.shape)
+
         return out_wav.squeeze()
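The new audio_t parameter is currently consumed only by the commented-out vocoder.extract experiment; the live path still builds gt_spec from the naive model. Each stage is wrapped in a Timer context manager used as `with Timer("label") as t:` followed by `t.secs`. A minimal sketch of a Timer compatible with that usage (the project's voice_changer/utils/Timer.py may be implemented differently):

```python
# Minimal Timer context manager matching the usage in the diff.
import time

class Timer:
    def __init__(self, title: str):
        self.title = title
        self.secs = 0.0

    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.secs = time.perf_counter() - self._start
        return False  # do not swallow exceptions

# usage:
with Timer("inference") as t:
    time.sleep(0.01)  # stand-in for real work
print(f"[{t.title}] {t.secs:.3f}s")
```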
diff --git a/server/voice_changer/DiffusionSVC/onnxExporter/DiffusionSVC_ONNX.py b/server/voice_changer/DiffusionSVC/onnxExporter/DiffusionSVC_ONNX.py
new file mode 100644
index 00000000..fecfbccd
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/onnxExporter/DiffusionSVC_ONNX.py
@@ -0,0 +1,90 @@
+import numpy as np
+import torch
+from voice_changer.DiffusionSVC.inferencer.diffusion_svc_model.diffusion.unit2mel import load_model_vocoder_from_combo
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+
+
+class DiffusionSVC_ONNX:
+    def __init__(self, file: str, gpu: int):
+        self.dev = DeviceManager.get_instance().getDevice(gpu)
+        diff_model, diff_args, naive_model, naive_args, vocoder = load_model_vocoder_from_combo(file, device=self.dev)
+        self.diff_model = diff_model
+        self.naive_model = naive_model
+        self.vocoder = vocoder
+        self.diff_args = diff_args
+        self.naive_args = naive_args
+
+    # NOTE: leftover from the RVC ONNX exporter; self.emb_g, self.enc_p,
+    # self.flow and self.dec are not defined on this class, so this method
+    # cannot run as written.
+    def forward(self, phone, phone_lengths, sid, max_len=None, convert_length=None):
+        g = self.emb_g(sid).unsqueeze(-1)
+        m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        o = self.dec.infer_realtime((z * x_mask)[:, :, :max_len], g=g, convert_length=convert_length)
+        return o, x_mask, (z, z_p, m_p, logs_p)
+
+    @torch.no_grad()  # most basic inference code: normalize the inputs to tensors and deal only with mel
+    def __call__(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0,
+                 gt_spec=None, infer_speedup=10, method='dpm-solver', k_step=None, use_tqdm=True,
+                 spk_emb=None):
+        aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
+
+        # spk_id
+        spk_emb_dict = None
+        if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
+            spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
+        else:  # without speaker encoder
+            spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
+
+        return self.diff_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict, aug_shift=aug_shift, gt_spec=gt_spec, infer=True, infer_speedup=infer_speedup, method=method, k_step=k_step, use_tqdm=use_tqdm, spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+
+    @torch.no_grad()
+    def naive_model_call(self, units, f0, volume, spk_id=1, spk_mix_dict=None, aug_shift=0, spk_emb=None):
+        # spk_id
+        spk_emb_dict = None
+        if self.diff_args.model.use_speaker_encoder:  # with speaker encoder
+            spk_mix_dict, spk_emb = self.pre_spk_emb(spk_id, spk_mix_dict, len(units), spk_emb)
+        else:  # without speaker encoder
+            spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(self.dev)
+        aug_shift = torch.from_numpy(np.array([[float(aug_shift)]])).float().to(self.dev)
+        out_spec = self.naive_model(units, f0, volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict,
+                                    aug_shift=aug_shift, infer=True,
+                                    spk_emb=spk_emb, spk_emb_dict=spk_emb_dict)
+        return out_spec
+
+    @torch.no_grad()
+    def mel2wav(self, mel, f0, start_frame=0):
+        if start_frame == 0:
+            return self.vocoder.infer(mel, f0)
+        else:  # for realtime speedup
+            mel = mel[:, start_frame:, :]
+            f0 = f0[:, start_frame:, :]
+            out_wav = self.vocoder.infer(mel, f0)
+            return torch.nn.functional.pad(out_wav, (start_frame * self.vocoder.vocoder_hop_size, 0))
+
+    @torch.no_grad()
+    def infer(
+        self,
+        feats: torch.Tensor,
+        pitch: torch.Tensor,
+        volume: torch.Tensor,
+        mask: torch.Tensor,
+        sid: torch.Tensor,
+        k_step: int,
+        infer_speedup: int,
+        silence_front: float,
+    ) -> torch.Tensor:
+        gt_spec = self.naive_model_call(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, spk_emb=None)
+        out_mel = self.__call__(feats, pitch, volume, spk_id=sid, spk_mix_dict=None, aug_shift=0, gt_spec=gt_spec, infer_speedup=infer_speedup, method='dpm-solver', k_step=k_step, use_tqdm=False, spk_emb=None)
+        start_frame = int(silence_front * self.vocoder.vocoder_sample_rate / self.vocoder.vocoder_hop_size)
+        out_wav = self.mel2wav(out_mel, pitch, start_frame=start_frame)
+
+        out_wav *= mask
+        return out_wav.squeeze()
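The shallow-diffusion wrapper above runs the naive model to get a rough mel, then refines it with k_step diffusion steps sampled at a stride of infer_speedup, so the dpm-solver loop visits roughly k_step // infer_speedup timesteps. A back-of-the-envelope sketch (numbers are illustrative; the real step schedule lives in the Diffusion-SVC sampler):

```python
# Rough cost of the shallow-diffusion loop: the sampler visits about
# k_step // infer_speedup denoising timesteps.
def denoise_steps(k_step: int, infer_speedup: int) -> int:
    assert k_step % infer_speedup == 0, "speedUp should divide kStep (see the UI change)"
    return k_step // infer_speedup

for k, s in [(20, 10), (20, 4), (100, 10)]:
    print(f"k_step={k:3d} speedup={s:2d} -> {denoise_steps(k, s)} steps")
```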
diff --git a/server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py b/server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py
new file mode 100644
index 00000000..421550d8
--- /dev/null
+++ b/server/voice_changer/DiffusionSVC/onnxExporter/export2onnx.py
@@ -0,0 +1,125 @@
+import os
+import json
+import torch
+from onnxsim import simplify
+import onnx
+from const import TMP_DIR, EnumInferenceTypes
+from data.ModelSlot import DiffusionSVCModelSlot
+from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
+
+
+def export2onnx(gpu: int, modelSlot: DiffusionSVCModelSlot):
+    modelFile = modelSlot.modelFile
+
+    output_file = os.path.splitext(os.path.basename(modelFile))[0] + ".onnx"
+    output_file_simple = os.path.splitext(os.path.basename(modelFile))[0] + "_simple.onnx"
+    output_path = os.path.join(TMP_DIR, output_file)
+    output_path_simple = os.path.join(TMP_DIR, output_file_simple)
+    metadata = {
+        "application": "VC_CLIENT",
+        "version": "3",
+        "voiceChangerType": modelSlot.voiceChangerType,
+        "modelType": modelSlot.modelType,
+        "samplingRate": modelSlot.samplingRate,
+        "embChannels": modelSlot.embChannels,
+        "embedder": modelSlot.embedder,
+    }
+    gpuMemory = DeviceManager.get_instance().getDeviceMemory(gpu)
+    print(f"[Voice Changer] exporting onnx... gpu_id:{gpu} gpu_mem:{gpuMemory}")
+
+    if gpuMemory > 0:
+        _export2onnx(modelFile, output_path, output_path_simple, True, metadata)
+    else:
+        print("[Voice Changer] Warning!!! onnx export with float32. Size may be doubled.")
+        _export2onnx(modelFile, output_path, output_path_simple, False, metadata)
+    return output_file_simple
+
+
+def _export2onnx(input_model, output_model, output_model_simple, is_half, metadata):
+    cpt = torch.load(input_model, map_location="cpu")
+    if is_half:
+        dev = torch.device("cuda", index=0)
+    else:
+        dev = torch.device("cpu")
+
+    # NOTE: the branches below are leftovers from the RVC exporter: the
+    # SynthesizerTrn*_ONNX classes are never imported in this file, the else
+    # branch leaves net_g_onnx unbound, and metadata["f0"] is never set in
+    # the dict built above.
+    # EnumInferenceTypes cannot be serialized as-is, so compare by its text value
+    if metadata["modelType"] == EnumInferenceTypes.pyTorchRVC.value:
+        net_g_onnx = SynthesizerTrnMs256NSFsid_ONNX(*cpt["config"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchWebUI.value:
+        net_g_onnx = SynthesizerTrnMsNSFsid_webui_ONNX(**cpt["params"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCNono.value:
+        net_g_onnx = SynthesizerTrnMs256NSFsid_nono_ONNX(*cpt["config"])
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchWebUINono.value:
+        net_g_onnx = SynthesizerTrnMsNSFsidNono_webui_ONNX(**cpt["params"])
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCv2.value:
+        net_g_onnx = SynthesizerTrnMs768NSFsid_ONNX(*cpt["config"], is_half=is_half)
+    elif metadata["modelType"] == EnumInferenceTypes.pyTorchRVCv2Nono.value:
+        net_g_onnx = SynthesizerTrnMs768NSFsid_nono_ONNX(*cpt["config"])
+    else:
+        print(
+            "unknown::::: ",
+            metadata["modelType"],
+            EnumInferenceTypes.pyTorchRVCv2.value,
+        )
+
+    net_g_onnx.eval().to(dev)
+    net_g_onnx.load_state_dict(cpt["weight"], strict=False)
+    if is_half:
+        net_g_onnx = net_g_onnx.half()
+
+    if is_half:
+        feats = torch.HalfTensor(1, 2192, metadata["embChannels"]).to(dev)
+    else:
+        feats = torch.FloatTensor(1, 2192, metadata["embChannels"]).to(dev)
+    p_len = torch.LongTensor([2192]).to(dev)
+    sid = torch.LongTensor([0]).to(dev)
+
+    if metadata["f0"] is True:
+        pitch = torch.zeros(1, 2192, dtype=torch.int64).to(dev)
+        pitchf = torch.FloatTensor(1, 2192).to(dev)
+        input_names = ["feats", "p_len", "pitch", "pitchf", "sid"]
+        inputs = (
+            feats,
+            p_len,
+            pitch,
+            pitchf,
+            sid,
+        )
+    else:
+        input_names = ["feats", "p_len", "sid"]
+        inputs = (
+            feats,
+            p_len,
+            sid,
+        )
+
+    output_names = [
+        "audio",
+    ]
+
+    torch.onnx.export(
+        net_g_onnx,
+        inputs,
+        output_model,
+        dynamic_axes={
+            "feats": [1],
+            "pitch": [1],
+            "pitchf": [1],
+        },
+        do_constant_folding=False,
+        opset_version=17,
+        verbose=False,
+        input_names=input_names,
+        output_names=output_names,
+    )
+
+    model_onnx2 = onnx.load(output_model)
+    model_simp, check = simplify(model_onnx2)
+    meta = model_simp.metadata_props.add()
+    meta.key = "metadata"
+    meta.value = json.dumps(metadata)
+    onnx.save(model_simp, output_path_simple := output_model_simple)
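The exporter embeds the slot metadata into the simplified model via metadata_props, which lets the loader identify the model without a sidecar file. A sketch of reading that metadata back (assumes an ONNX file produced by an exporter like the one above; the path is hypothetical):

```python
# Read back the metadata that _export2onnx stores under the "metadata" key.
import json
import onnx

def read_vc_metadata(onnx_path: str) -> dict:
    model = onnx.load(onnx_path)
    for prop in model.metadata_props:
        if prop.key == "metadata":
            return json.loads(prop.value)
    return {}

# meta = read_vc_metadata("model_simple.onnx")
# print(meta.get("voiceChangerType"), meta.get("samplingRate"))
```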
diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
index 53f236e7..d3962531 100644
--- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
+++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py
@@ -16,6 +16,8 @@ from voice_changer.RVC.embedder.Embedder import Embedder
 from voice_changer.common.VolumeExtractor import VolumeExtractor
 from torchaudio.transforms import Resample
 
+from voice_changer.utils.Timer import Timer
+
 
 class Pipeline(object):
     embedder: Embedder
@@ -112,83 +114,95 @@ class Pipeline(object):
         useFinalProj,
         protect=0.5
     ):
-        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-        audio16k = self.resamplerIn(audio_t)
-        volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
-        sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
-        n_frames = int(audio16k.size(-1) // self.hop_size + 1)
+        # print("---------- pipe line --------------------")
+        with Timer("pre-process") as t:
+            audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
+            audio16k = self.resamplerIn(audio_t)
+            volume, mask = self.extract_volume_and_mask(audio16k, threshold=-60.0)
+            sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
+            n_frames = int(audio16k.size(-1) // self.hop_size + 1)
+        # print("[Timer::1: ]", t.secs)
 
-        # pitch detection
-        try:
-            pitch = self.pitchExtractor.extract(
-                audio16k.squeeze(),
-                pitchf,
-                f0_up_key,
-                int(self.hop_size),  # window size for processing (512 at 44100)
-                silence_front=silence_front,
-            )
-
-            pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
-        except IndexError as e:  # NOQA
-            raise NotEnoughDataExtimateF0()
-
-        # adjust tensor shapes
-        feats = audio16k.squeeze()
-        if feats.dim() == 2:  # double channels
-            feats = feats.mean(-1)
-        feats = feats.view(1, -1)
-
-        # embedding
-        with autocast(enabled=self.isHalf):
+        with Timer("pitch-extract") as t:
+            # pitch detection
             try:
-                feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
-                if torch.isnan(feats).all():
-                    raise DeviceCannotSupportHalfPrecisionException()
+                pitch = self.pitchExtractor.extract(
+                    audio16k.squeeze(),
+                    pitchf,
+                    f0_up_key,
+                    int(self.hop_size),  # window size for processing (512 at 44100)
+                    silence_front=silence_front,
+                )
+
+                pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
+            except IndexError as e:  # NOQA
+                raise NotEnoughDataExtimateF0()
+
+            # adjust tensor shapes
+            feats = audio16k.squeeze()
+            if feats.dim() == 2:  # double channels
+                feats = feats.mean(-1)
+            feats = feats.view(1, -1)
+        # print("[Timer::2: ]", t.secs)
+
+        with Timer("embedding") as t:
+            with autocast(enabled=self.isHalf):
+                try:
+                    feats = self.embedder.extractFeatures(feats, embOutputLayer, useFinalProj)
+                    if torch.isnan(feats).all():
+                        raise DeviceCannotSupportHalfPrecisionException()
+                except RuntimeError as e:
+                    if "HALF" in e.__str__().upper():
+                        raise HalfPrecisionChangingException()
+                    elif "same device" in e.__str__():
+                        raise DeviceChangingException()
+                    else:
+                        raise e
+            feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+        # print("[Timer::3: ]", t.secs)
+
+        with Timer("inference") as t:
+            try:
+                with torch.no_grad():
+                    with autocast(enabled=self.isHalf):
+                        audio1 = (
+                            torch.clip(
+                                self.inferencer.infer(
+                                    audio16k,
+                                    feats,
+                                    pitch.unsqueeze(-1),
+                                    volume,
+                                    mask,
+                                    sid,
+                                    k_step,
+                                    infer_speedup,
+                                    silence_front=silence_front
+                                ).to(dtype=torch.float32),
+                                -1.0,
+                                1.0,
+                            )
+                            * 32767.5
+                        ).data.to(dtype=torch.int16)
             except RuntimeError as e:
                 if "HALF" in e.__str__().upper():
+                    print("11", e)
                     raise HalfPrecisionChangingException()
-                elif "same device" in e.__str__():
-                    raise DeviceChangingException()
                 else:
                     raise e
-            feats = F.interpolate(feats.permute(0, 2, 1), size=int(n_frames), mode='nearest').permute(0, 2, 1)
+        # print("[Timer::4: ]", t.secs)
 
-        # run inference
-        try:
-            with torch.no_grad():
-                with autocast(enabled=self.isHalf):
-                    print("[EMBEDDER EXTRACT:::]", feats.shape, pitch.unsqueeze(-1).shape, volume.shape, mask.shape)
-                    audio1 = (
-                        torch.clip(
-                            self.inferencer.infer(
-                                feats,
-                                pitch.unsqueeze(-1),
-                                volume,
-                                mask,
-                                sid,
-                                k_step,
-                                infer_speedup,
-                                silence_front=silence_front
-                            ).to(dtype=torch.float32),
-                            -1.0,
-                            1.0,
-                        )
-                        * 32767.5
-                    ).data.to(dtype=torch.int16)
-        except RuntimeError as e:
-            if "HALF" in e.__str__().upper():
-                raise HalfPrecisionChangingException()
-            else:
-                raise e
+        with Timer("post-process") as t:  # NOQA
+            feats_buffer = feats.squeeze(0).detach().cpu()
+            if pitch is not None:
+                pitch_buffer = pitch.squeeze(0).detach().cpu()
+            else:
+                pitch_buffer = None
 
-        feats_buffer = feats.squeeze(0).detach().cpu()
-        if pitch is not None:
-            pitch_buffer = pitch.squeeze(0).detach().cpu()
-        else:
-            pitch_buffer = None
-
-        del pitch, pitchf, feats, sid
-        torch.cuda.empty_cache()
-        audio1 = self.resamplerOut(audio1.float())
+            del pitch, pitchf, feats, sid
+            torch.cuda.empty_cache()
+            audio1 = self.resamplerOut(audio1.float())
+        # print("[Timer::5: ]", t.secs)
 
         return audio1, pitch_buffer, feats_buffer
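The inference stage converts the model's float waveform to int16 by clipping to [-1, 1] and scaling by 32767.5, as in the hunk above. A small NumPy sketch of the same conversion (illustrative; the pipeline does this with torch tensors):

```python
# Float-to-int16 conversion used by the pipeline: clip, scale, cast.
import numpy as np

def float_to_int16(x: np.ndarray) -> np.ndarray:
    return (np.clip(x, -1.0, 1.0) * 32767.5).astype(np.int16)

print(float_to_int16(np.array([-1.5, -1.0, 0.0, 0.5, 1.0])))
# -> [-32767 -32767      0  16383  32767]  (the cast truncates toward zero)
```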
diff --git a/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py b/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py
index 5c7724f7..8223c44f 100644
--- a/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py
+++ b/server/voice_changer/DiffusionSVC/pitchExtractor/DioPitchExtractor.py
@@ -18,6 +18,7 @@ class DioPitchExtractor(PitchExtractor):
 
     def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
         audio = audio.detach().cpu().numpy()
+        silence_front = 0  # TODO: workaround for pitch not being detected when the chunk size is small
         start_frame = int(silence_front * self.sapmle_rate / window)
         real_silence_front = start_frame * window / self.sapmle_rate
         audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
diff --git a/server/voice_changer/utils/VoiceChangerIF.py b/server/voice_changer/utils/VoiceChangerIF.py
index 7bd77074..8063528e 100644
--- a/server/voice_changer/utils/VoiceChangerIF.py
+++ b/server/voice_changer/utils/VoiceChangerIF.py
@@ -10,7 +10,7 @@ class VoiceChangerIF(Protocol):
     def get_info(self) -> dict[str, Any]:
         ...
-    
+
     def get_performance(self) -> list[int]:
         ...
 
@@ -25,4 +25,3 @@ class VoiceChangerIF(Protocol):
     def export2onnx() -> Any:
         ...
-
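The DioPitchExtractor hunk disables silence_front skipping by forcing it to 0, so the extractor always analyzes the whole chunk. The frame arithmetic the workaround bypasses is shown below (illustrative values; it mirrors the start_frame computation in extract, with window as the hop size in samples):

```python
# Frame arithmetic behind silence_front skipping, disabled by the workaround.
sample_rate = 16000
window = 160            # hop size in samples (assumed for illustration)
silence_front = 0.10    # seconds of leading audio to skip

start_frame = int(silence_front * sample_rate / window)   # 10 frames
real_silence_front = start_frame * window / sample_rate   # 0.1 s actually skipped
print(start_frame, real_silence_front)
```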