support sola

wataru 2023-04-14 09:18:34 +09:00
parent 58b3aaa5fe
commit 2c63803f83
12 changed files with 207 additions and 18 deletions

View File

@@ -168,6 +168,10 @@
{
"name": "rvcQuality",
"options": {}
},
{
"name": "solaEnable",
"options": {}
}
]
},

File diff suppressed because one or more lines are too long

View File

@@ -168,6 +168,10 @@
{
"name": "rvcQuality",
"options": {}
},
{
"name": "solaEnable",
"options": {}
}
]
},

View File

@@ -34,9 +34,10 @@ import { CrossFadeEndRateRow, CrossFadeEndRateRowProps } from "./components/807_
import { DownSamplingModeRow, DownSamplingModeRowProps } from "./components/808_DownSamplingModeRow"
import { TrancateNumTresholdRow, TrancateNumTresholdRowProps } from "./components/809_TrancateNumTresholdRow"
import { IndexRatioRow, IndexRatioRowProps } from "./components/609_IndexRatioRow"
import { RVCQualityRow, RVCQualityRowProps } from "./components/810_RVCQuality"
import { RVCQualityRow, RVCQualityRowProps } from "./components/810_RVCQualityRow"
import { ModelSamplingRateRow, ModelSamplingRateRowProps } from "./components/303_ModelSamplingRateRow"
import { OnnxExportRow, OnnxExportRowProps } from "./components/304_OnnxExportRow"
import { SolaEnableRow, SolaEnableRowProps } from "./components/811_SolaEnableRow"
export const catalog: { [key: string]: (props: any) => JSX.Element } = {}
@@ -101,6 +102,7 @@ const initialize = () => {
addToCatalog("downSamplingMode", (props: DownSamplingModeRowProps) => { return <DownSamplingModeRow {...props} /> })
addToCatalog("trancateNumThreshold", (props: TrancateNumTresholdRowProps) => { return <TrancateNumTresholdRow {...props} /> })
addToCatalog("rvcQuality", (props: RVCQualityRowProps) => { return <RVCQualityRow {...props} /> })
addToCatalog("solaEnable", (props: SolaEnableRowProps) => { return <SolaEnableRow {...props} /> })

View File

@@ -15,7 +15,7 @@ export const InputChunkNumRow = (_props: InputChunkNumRowProps) => {
appState.workletNodeSetting.trancateBuffer()
}}>
{
[16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048].map(x => {
[8, 16, 24, 32, 40, 48, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048].map(x => {
return <option key={x} value={x}>{x}</option>
})
}

View File

@@ -0,0 +1,31 @@
import React, { useMemo } from "react"
import { useAppState } from "../../../001_provider/001_AppStateProvider"
export type SolaEnableRowProps = {
}
export const SolaEnableRow = (_props: SolaEnableRowProps) => {
const appState = useAppState()
const solaEnableRow = useMemo(() => {
const onSolaEnableChanged = (val: number) => {
appState.serverSetting.updateServerSettings({
...appState.serverSetting.serverSetting,
solaEnabled: val
})
}
return (
<div className="body-row split-3-7 left-padding-1 guided">
<div className="body-item-title left-padding-1">Sola enable</div>
<div className="body-input-container">
<select value={appState.serverSetting.serverSetting.solaEnabled} onChange={(e) => { onSolaEnableChanged(Number(e.target.value)) }}>
<option value="0" >disable</option>
<option value="1" >enable</option>
</select>
</div>
</div>
)
}, [appState.serverSetting.serverSetting, appState.serverSetting.updateServerSettings])
return solaEnableRow
}
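
The new row encodes the SOLA toggle as a number (0 = disable, 1 = enable) rather than a boolean. That matches the server side of this commit: solaEnabled is added to the intData list of VoiceChangerSettings further down, so it travels through the same int-coded settings-update path as the other numeric options. A minimal sketch of that pattern, with simplified names that are not the repository's exact API:

from dataclasses import dataclass, field

@dataclass
class Settings:
    # int-coded toggle, mirroring solaEnabled in the diff: 0 = off, 1 = on
    solaEnabled: int = 1
    crossFadeOverlapSize: int = 4096
    crossFadeOffsetRate: float = 0.1
    # only keys listed here may be changed at runtime
    intData: list = field(default_factory=lambda: ["solaEnabled", "crossFadeOverlapSize"])
    floatData: list = field(default_factory=lambda: ["crossFadeOffsetRate"])

def update_setting(settings: Settings, key: str, val: str) -> bool:
    # cast the incoming string according to the key's declared type
    if key in settings.intData:
        setattr(settings, key, int(val))
    elif key in settings.floatData:
        setattr(settings, key, float(val))
    else:
        return False  # unknown or immutable key
    return True

# the GUI's <select> effectively submits "1" when SOLA is enabled
update_setting(Settings(), "solaEnabled", "1")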

View File

@@ -71,6 +71,7 @@ export const ServerSettingKey = {
"crossFadeOffsetRate": "crossFadeOffsetRate",
"crossFadeEndRate": "crossFadeEndRate",
"crossFadeOverlapSize": "crossFadeOverlapSize",
"solaEnabled": "solaEnabled",
"framework": "framework",
"onnxExecutionProvider": "onnxExecutionProvider",
@@ -104,6 +105,7 @@ export type VoiceChangerServerSetting = {
crossFadeOffsetRate: number,
crossFadeEndRate: number,
crossFadeOverlapSize: CrossFadeOverlapSize,
solaEnabled: number,
framework: Framework
onnxExecutionProvider: OnnxExecutionProvider,
@@ -145,6 +147,7 @@ export const DefaultServerSetting_MMVCv15: ServerInfo = {
crossFadeOffsetRate: 0.0,
crossFadeEndRate: 1.0,
crossFadeOverlapSize: CrossFadeOverlapSize[1024],
solaEnabled: 0,
framework: Framework.PyTorch,
f0Factor: 1.0,
@@ -181,6 +184,7 @@ export const DefaultServerSetting_MMVCv13: ServerInfo = {
crossFadeOffsetRate: 0.0,
crossFadeEndRate: 1.0,
crossFadeOverlapSize: CrossFadeOverlapSize[1024],
solaEnabled: 0,
framework: Framework.ONNX,
f0Factor: 1.0,
@@ -218,6 +222,7 @@ export const DefaultServerSetting_so_vits_svc_40: ServerInfo = {
crossFadeOffsetRate: 0.0,
crossFadeEndRate: 1.0,
crossFadeOverlapSize: CrossFadeOverlapSize[1024],
solaEnabled: 0,
framework: Framework.PyTorch,
f0Factor: 1.0,
@@ -259,6 +264,7 @@ export const DefaultServerSetting_so_vits_svc_40_c: ServerInfo = {
crossFadeOffsetRate: 0.0,
crossFadeEndRate: 1.0,
crossFadeOverlapSize: CrossFadeOverlapSize[1024],
solaEnabled: 0,
framework: Framework.ONNX,
f0Factor: 1.0,
@@ -299,6 +305,7 @@ export const DefaultServerSetting_so_vits_svc_40v2: ServerInfo = {
crossFadeOffsetRate: 0.0,
crossFadeEndRate: 1.0,
crossFadeOverlapSize: CrossFadeOverlapSize[1024],
solaEnabled: 0,
framework: Framework.PyTorch,
f0Factor: 1.0,
@@ -341,6 +348,7 @@ export const DefaultServerSetting_RVC: ServerInfo = {
crossFadeOffsetRate: 0.1,
crossFadeEndRate: 0.8,
crossFadeOverlapSize: CrossFadeOverlapSize[1024],
solaEnabled: 1,
framework: Framework.PyTorch,
f0Factor: 1.0,

View File

@@ -34,8 +34,10 @@ class MMVC_Rest_Fileuploader:
json_compatible_item_data = jsonable_encoder(res)
return JSONResponse(content=json_compatible_item_data)
# def post_concat_uploaded_file(self, slot: int = Form(...), filename: str = Form(...), filenameChunkNum: int = Form(...)):
def post_concat_uploaded_file(self, filename: str = Form(...), filenameChunkNum: int = Form(...)):
res = concat_file_chunks(UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR)
slot = 0
res = concat_file_chunks(slot, UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR)
json_compatible_item_data = jsonable_encoder(res)
return JSONResponse(content=json_compatible_item_data)
@@ -52,6 +54,7 @@ class MMVC_Rest_Fileuploader:
def post_load_model(
self,
# slot: int = Form(...),
pyTorchModelFilename: str = Form(...),
onnxModelFilename: str = Form(...),
configFilename: str = Form(...),

View File

@@ -16,8 +16,11 @@ def upload_file(upload_dirname: str, file: UploadFile, filename: str):
return {"status": "ERROR", "msg": "uploaded file is not found."}
def concat_file_chunks(upload_dirname: str, filename: str, chunkNum: int, dest_dirname: str):
target_file_name = os.path.join(dest_dirname, filename)
def concat_file_chunks(slot: int, upload_dirname: str, filename: str, chunkNum: int, dest_dirname: str):
# target_dir = os.path.join(dest_dirname, f"{slot}")
target_dir = os.path.join(dest_dirname)
os.makedirs(target_dir, exist_ok=True)
target_file_name = os.path.join(target_dir, filename)
if os.path.exists(target_file_name):
os.remove(target_file_name)
with open(target_file_name, "ab") as target_file:
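
concat_file_chunks now takes a slot argument (currently unused, since target_dir ignores it) and rebuilds the uploaded file from its parts. The joining loop itself falls outside the visible hunk; a minimal sketch of the pattern, assuming the chunks are stored as "{filename}_{i}" (that naming is an assumption, not shown in this diff):

import os

def concat_chunks(upload_dir: str, filename: str, chunk_num: int, dest_dir: str) -> str:
    # rebuild a single file from chunk_num uploaded parts and return its path
    os.makedirs(dest_dir, exist_ok=True)
    target = os.path.join(dest_dir, filename)
    if os.path.exists(target):
        os.remove(target)  # start clean, as the diff does
    with open(target, "ab") as out:
        for i in range(chunk_num):
            part = os.path.join(upload_dir, f"{filename}_{i}")  # assumed chunk naming
            with open(part, "rb") as chunk:
                out.write(chunk.read())
    return target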

View File

@@ -165,7 +165,32 @@ class RVC:
def get_processing_sampling_rate(self):
return self.settings.modelSamplingRate
def generate_input(self, newData: any, inputSize: int, crossfadeSize: int):
def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
newData = newData.astype(np.float32) / 32768.0
if hasattr(self, "audio_buffer"):
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) # concatenate with the previously buffered data
else:
self.audio_buffer = newData
if solaEnabled:
convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
else:
convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
if convertSize % 128 != 0: # pad up to a multiple of the model's output hop size so nothing is truncated
convertSize = convertSize + (128 - (convertSize % 128))
self.audio_buffer = self.audio_buffer[-1 * convertSize:] # keep only the portion to be converted
crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)] # crop the output portion to check its volume (interaction with SOLA not yet considered)
rms = np.sqrt(np.square(crop).mean(axis=0))
vol = max(rms, self.prevVol * 0.0)
self.prevVol = vol
return (self.audio_buffer, convertSize, vol, solaEnabled)
def generate_input_old(self, newData: any, inputSize: int, crossfadeSize: int):
newData = newData.astype(np.float32) / 32768.0
if hasattr(self, "audio_buffer"):
@@ -276,7 +301,13 @@ class RVC:
audio = self._onnx_inference(data)
else:
audio = self._pyTorch_inference(data)
return audio
sola_enabled = data[3]
if sola_enabled:
return audio
# return audio[self.settings.extraConvertSize:]
else:
return audio
def __del__(self):
del self.net_g
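
The SOLA-aware generate_input enlarges the conversion window by solaSearchFrame and then rounds it up to a multiple of 128, since the model's output hop size would otherwise truncate the tail. A small worked sketch of that sizing and of the RMS volume check (128 is the hop size used in the diff; the other numbers are only illustrative):

import numpy as np

HOP_SIZE = 128  # hop size assumed by the rounding step in the diff

def conversion_size(input_size: int, crossfade_size: int, sola_search_frame: int,
                    extra_convert_size: int, sola_enabled: bool) -> int:
    size = input_size + crossfade_size + extra_convert_size
    if sola_enabled:
        size += sola_search_frame  # headroom for the SOLA offset search
    if size % HOP_SIZE != 0:
        size += HOP_SIZE - (size % HOP_SIZE)  # round up to a hop-size multiple
    return size

# illustrative values: 24576-sample block, 4096-sample crossfade,
# a 12 ms search window at 40 kHz (480 samples), 16384 samples of extra context
print(conversion_size(24576, 4096, 480, 16384, sola_enabled=True))  # -> 45568

# the volume check mirrors the diff: RMS over the freshly generated portion
crop = np.random.randn(4096).astype(np.float32) * 0.1
vol = float(np.sqrt(np.square(crop).mean()))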

View File

@@ -13,7 +13,6 @@ from voice_changer.IORecorder import IORecorder
from voice_changer.utils.Timer import Timer
from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut
import time
@@ -32,12 +31,13 @@ class VoiceChangerSettings():
crossFadeOffsetRate: float = 0.1
crossFadeEndRate: float = 0.9
crossFadeOverlapSize: int = 4096
solaEnabled: int = 1 # 0:off, 1:on
recordIO: int = 0 # 0:off, 1:on
# only the mutable settings are listed below
intData: list[str] = field(
default_factory=lambda: ["inputSampleRate", "crossFadeOverlapSize", "recordIO"]
default_factory=lambda: ["inputSampleRate", "crossFadeOverlapSize", "recordIO", "solaEnabled"]
)
floatData: list[str] = field(
default_factory=lambda: ["crossFadeOffsetRate", "crossFadeEndRate"]
@@ -199,10 +199,102 @@ class VoiceChanger():
if hasattr(self, 'np_prev_audio1') == True:
delattr(self, "np_prev_audio1")
del self.sola_buffer
# receivedData: tuple of short
def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
if self.settings.solaEnabled and self.modelType == "RVC":
return self.on_request_sola(receivedData)
else:
return self.on_request_legacy(receivedData)
def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
# print("processing with sola")
processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
# pre-processing
with Timer("pre-process") as t:
if self.settings.inputSampleRate != processing_sampling_rate:
newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate))
else:
newData = receivedData
sola_search_frame = int(0.012 * processing_sampling_rate)
# sola_search_frame = 0
block_frame = newData.shape[0]
crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
self._generate_strength(crossfade_frame)
data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, True, sola_search_frame)
preprocess_time = t.secs
# conversion
with Timer("main-process") as t:
try:
# Inference
audio = self.voiceChanger.inference(data)
if hasattr(self, 'sola_buffer') == True:
np.set_printoptions(threshold=10000)
audio = audio[-sola_search_frame - crossfade_frame - block_frame:]
# SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
cor_nom = np.convolve(audio[: crossfade_frame + sola_search_frame], np.flip(self.sola_buffer), 'valid')
cor_den = np.sqrt(np.convolve(audio[: crossfade_frame + sola_search_frame] ** 2, np.ones(crossfade_frame), 'valid') + 1e-3)
sola_offset = np.argmax(cor_nom / cor_den)
print("sola_offset", sola_offset, sola_search_frame)
output_wav = audio[sola_offset: sola_offset + block_frame].astype(np.float64)
output_wav[:crossfade_frame] *= self.np_cur_strength
output_wav[:crossfade_frame] += self.sola_buffer[:]
result = output_wav
else:
print("no sola buffer")
result = np.zeros(4096).astype(np.int16)
if hasattr(self, 'sola_buffer') == True and sola_offset < sola_search_frame:
sola_buf_org = audio[- sola_search_frame - crossfade_frame + sola_offset: -sola_search_frame + sola_offset]
self.sola_buffer = sola_buf_org * self.np_prev_strength
else:
self.sola_buffer = audio[- crossfade_frame:] * self.np_prev_strength
# self.sola_buffer = audio[- crossfade_frame:]
except Exception as e:
print("VC PROCESSING!!!! EXCEPTION!!!", e)
print(traceback.format_exc())
return np.zeros(1).astype(np.int16), [0, 0, 0]
mainprocess_time = t.secs
# post-processing
with Timer("post-process") as t:
result = result.astype(np.int16)
if self.settings.inputSampleRate != processing_sampling_rate:
outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16))
else:
outputData = result
print_convert_processing(
f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
if self.settings.recordIO == 1:
self.ioRecorder.writeInput(receivedData)
self.ioRecorder.writeOutput(outputData.tobytes())
# if receivedData.shape[0] != outputData.shape[0]:
# print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}")
# outputData = pad_array(outputData, receivedData.shape[0])
# # print_convert_processing(
# # f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
postprocess_time = t.secs
print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
perf = [preprocess_time, mainprocess_time, postprocess_time]
return outputData, perf
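
The core of on_request_sola is the offset search credited to DDSP-SVC and Retrieval-based-Voice-Conversion-WebUI: the head of the newly converted block is slid across the stored sola_buffer (the faded-out tail of the previous block), and the splice point is the offset with the highest normalized cross-correlation. A self-contained NumPy sketch of that search and crossfade; frame sizes and array names are illustrative, and fade_in/fade_out stand in for the np_cur_strength/np_prev_strength curves built by _generate_strength:

import numpy as np

def sola_splice(audio: np.ndarray, sola_buffer: np.ndarray, block_frame: int,
                crossfade_frame: int, sola_search_frame: int,
                fade_in: np.ndarray, fade_out: np.ndarray):
    # audio: newly converted block, length block_frame + crossfade_frame + sola_search_frame
    # sola_buffer: previous tail of length crossfade_frame, already multiplied by fade_out
    head = audio[: crossfade_frame + sola_search_frame]
    # normalized cross-correlation, as in the diff (np.convolve with a flipped kernel)
    cor_nom = np.convolve(head, np.flip(sola_buffer), "valid")
    cor_den = np.sqrt(np.convolve(head ** 2, np.ones(crossfade_frame), "valid") + 1e-3)
    sola_offset = int(np.argmax(cor_nom / cor_den))

    out = audio[sola_offset: sola_offset + block_frame].astype(np.float64)
    out[:crossfade_frame] *= fade_in      # fade the new block in...
    out[:crossfade_frame] += sola_buffer  # ...over the previous faded-out tail

    # keep the region right after the output block, faded out, for the next call
    tail = audio[sola_offset + block_frame: sola_offset + block_frame + crossfade_frame]
    return out, tail.astype(np.float64) * fade_out

When sola_offset reaches the end of the search window, the kept tail reduces to the last crossfade_frame samples of the block, which is the else branch in the diff.
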
def on_request_legacy(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
# print("processing with legacy")
processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
print_convert_processing(f"------------ Convert processing.... ------------")
# pre-processing
with Timer("pre-process") as t: