mirror of https://github.com/w-okada/voice-changer.git

support sola

parent 58b3aaa5fe
commit 2c63803f83
@@ -168,6 +168,10 @@
         {
             "name": "rvcQuality",
             "options": {}
+        },
+        {
+            "name": "solaEnable",
+            "options": {}
         }
     ]
 },
29  client/demo/dist/index.js (vendored)
File diff suppressed because one or more lines are too long
@@ -168,6 +168,10 @@
         {
             "name": "rvcQuality",
             "options": {}
+        },
+        {
+            "name": "solaEnable",
+            "options": {}
         }
     ]
 },
@@ -34,9 +34,10 @@ import { CrossFadeEndRateRow, CrossFadeEndRateRowProps } from "./components/807_
 import { DownSamplingModeRow, DownSamplingModeRowProps } from "./components/808_DownSamplingModeRow"
 import { TrancateNumTresholdRow, TrancateNumTresholdRowProps } from "./components/809_TrancateNumTresholdRow"
 import { IndexRatioRow, IndexRatioRowProps } from "./components/609_IndexRatioRow"
-import { RVCQualityRow, RVCQualityRowProps } from "./components/810_RVCQuality"
+import { RVCQualityRow, RVCQualityRowProps } from "./components/810_RVCQualityRow"
 import { ModelSamplingRateRow, ModelSamplingRateRowProps } from "./components/303_ModelSamplingRateRow"
 import { OnnxExportRow, OnnxExportRowProps } from "./components/304_OnnxExportRow"
+import { SolaEnableRow, SolaEnableRowProps } from "./components/811_SolaEnableRow"

 export const catalog: { [key: string]: (props: any) => JSX.Element } = {}

@@ -101,6 +102,7 @@ const initialize = () => {
     addToCatalog("downSamplingMode", (props: DownSamplingModeRowProps) => { return <DownSamplingModeRow {...props} /> })
     addToCatalog("trancateNumThreshold", (props: TrancateNumTresholdRowProps) => { return <TrancateNumTresholdRow {...props} /> })
     addToCatalog("rvcQuality", (props: RVCQualityRowProps) => { return <RVCQualityRow {...props} /> })
+    addToCatalog("solaEnable", (props: SolaEnableRowProps) => { return <SolaEnableRow {...props} /> })
@@ -15,7 +15,7 @@ export const InputChunkNumRow = (_props: InputChunkNumRowProps) => {
                 appState.workletNodeSetting.trancateBuffer()
             }}>
                 {
-                    [16, 32, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048].map(x => {
+                    [8, 16, 24, 32, 40, 48, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 2048].map(x => {
                         return <option key={x} value={x}>{x}</option>
                     })
                 }
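For context on why smaller chunk options (8–48) were added: a rough latency estimate, under the assumption that one chunk corresponds to one 128-sample AudioWorklet render quantum at a 48 kHz input rate (an assumption about this client, not stated in the diff):

```python
# Rough input-buffering latency per chunk-count option.
# Assumed: 1 chunk = one 128-sample AudioWorklet render quantum @ 48 kHz.
SAMPLES_PER_CHUNK = 128   # assumed render-quantum size
SAMPLE_RATE = 48000       # assumed input sample rate

for chunks in [8, 16, 24, 32, 48, 64, 128]:
    ms = chunks * SAMPLES_PER_CHUNK / SAMPLE_RATE * 1000
    print(f"{chunks:>4} chunks ~ {ms:6.1f} ms of audio")
# 8 chunks ~ 21.3 ms: the new low options trade conversion headroom for latency.
```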
@@ -0,0 +1,31 @@
+import React, { useMemo } from "react"
+import { useAppState } from "../../../001_provider/001_AppStateProvider"
+
+export type SolaEnableRowProps = {
+}
+
+export const SolaEnableRow = (_props: SolaEnableRowProps) => {
+    const appState = useAppState()
+
+    const trancateNumTresholdRow = useMemo(() => {
+        const onSolaEnableChanged = (val: number) => {
+            appState.serverSetting.updateServerSettings({
+                ...appState.serverSetting.serverSetting,
+                solaEnabled: val
+            })
+        }
+        return (
+            <div className="body-row split-3-7 left-padding-1 guided">
+                <div className="body-item-title left-padding-1">Sola enable</div>
+                <div className="body-input-container">
+                    <select value={appState.serverSetting.serverSetting.solaEnabled} onChange={(e) => { onSolaEnableChanged(Number(e.target.value)) }}>
+                        <option value="0" >disable</option>
+                        <option value="1" >enable</option>
+                    </select>
+                </div>
+            </div>
+        )
+    }, [appState.serverSetting.serverSetting, appState.serverSetting.updateServerSettings])
+
+    return trancateNumTresholdRow
+}
@@ -71,6 +71,7 @@ export const ServerSettingKey = {
     "crossFadeOffsetRate": "crossFadeOffsetRate",
     "crossFadeEndRate": "crossFadeEndRate",
     "crossFadeOverlapSize": "crossFadeOverlapSize",
+    "solaEnabled": "solaEnabled",

     "framework": "framework",
     "onnxExecutionProvider": "onnxExecutionProvider",

@@ -104,6 +105,7 @@ export type VoiceChangerServerSetting = {
     crossFadeOffsetRate: number,
     crossFadeEndRate: number,
     crossFadeOverlapSize: CrossFadeOverlapSize,
+    solaEnabled: number,

     framework: Framework
     onnxExecutionProvider: OnnxExecutionProvider,

@@ -145,6 +147,7 @@ export const DefaultServerSetting_MMVCv15: ServerInfo = {
     crossFadeOffsetRate: 0.0,
     crossFadeEndRate: 1.0,
     crossFadeOverlapSize: CrossFadeOverlapSize[1024],
+    solaEnabled: 0,

     framework: Framework.PyTorch,
     f0Factor: 1.0,

@@ -181,6 +184,7 @@ export const DefaultServerSetting_MMVCv13: ServerInfo = {
     crossFadeOffsetRate: 0.0,
     crossFadeEndRate: 1.0,
     crossFadeOverlapSize: CrossFadeOverlapSize[1024],
+    solaEnabled: 0,

     framework: Framework.ONNX,
     f0Factor: 1.0,

@@ -218,6 +222,7 @@ export const DefaultServerSetting_so_vits_svc_40: ServerInfo = {
     crossFadeOffsetRate: 0.0,
     crossFadeEndRate: 1.0,
     crossFadeOverlapSize: CrossFadeOverlapSize[1024],
+    solaEnabled: 0,

     framework: Framework.PyTorch,
     f0Factor: 1.0,

@@ -259,6 +264,7 @@ export const DefaultServerSetting_so_vits_svc_40_c: ServerInfo = {
     crossFadeOffsetRate: 0.0,
     crossFadeEndRate: 1.0,
     crossFadeOverlapSize: CrossFadeOverlapSize[1024],
+    solaEnabled: 0,

     framework: Framework.ONNX,
     f0Factor: 1.0,

@@ -299,6 +305,7 @@ export const DefaultServerSetting_so_vits_svc_40v2: ServerInfo = {
     crossFadeOffsetRate: 0.0,
     crossFadeEndRate: 1.0,
     crossFadeOverlapSize: CrossFadeOverlapSize[1024],
+    solaEnabled: 0,

     framework: Framework.PyTorch,
     f0Factor: 1.0,

@@ -341,6 +348,7 @@ export const DefaultServerSetting_RVC: ServerInfo = {
     crossFadeOffsetRate: 0.1,
     crossFadeEndRate: 0.8,
     crossFadeOverlapSize: CrossFadeOverlapSize[1024],
+    solaEnabled: 1,

     framework: Framework.PyTorch,
     f0Factor: 1.0,
@@ -34,8 +34,10 @@ class MMVC_Rest_Fileuploader:
         json_compatible_item_data = jsonable_encoder(res)
         return JSONResponse(content=json_compatible_item_data)

+    # def post_concat_uploaded_file(self, slot: int = Form(...), filename: str = Form(...), filenameChunkNum: int = Form(...)):
     def post_concat_uploaded_file(self, filename: str = Form(...), filenameChunkNum: int = Form(...)):
-        res = concat_file_chunks(UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR)
+        slot = 0
+        res = concat_file_chunks(slot, UPLOAD_DIR, filename, filenameChunkNum, UPLOAD_DIR)
         json_compatible_item_data = jsonable_encoder(res)
         return JSONResponse(content=json_compatible_item_data)

@@ -52,6 +54,7 @@ class MMVC_Rest_Fileuploader:

     def post_load_model(
         self,
+        # slot: int = Form(...),
         pyTorchModelFilename: str = Form(...),
         onnxModelFilename: str = Form(...),
         configFilename: str = Form(...),
@@ -16,8 +16,11 @@ def upload_file(upload_dirname: str, file: UploadFile, filename: str):
         return {"status": "ERROR", "msg": "uploaded file is not found."}


-def concat_file_chunks(upload_dirname: str, filename: str, chunkNum: int, dest_dirname: str):
-    target_file_name = os.path.join(dest_dirname, filename)
+def concat_file_chunks(slot: int, upload_dirname: str, filename: str, chunkNum: int, dest_dirname: str):
+    # target_dir = os.path.join(dest_dirname, f"{slot}")
+    target_dir = os.path.join(dest_dirname)
+    os.makedirs(target_dir, exist_ok=True)
+    target_file_name = os.path.join(target_dir, filename)
     if os.path.exists(target_file_name):
         os.remove(target_file_name)
     with open(target_file_name, "ab") as target_file:
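For orientation, here is a minimal sketch of how this kind of chunked reassembly typically works end to end. The chunk naming scheme (`{filename}_{i}`) is an assumption for illustration only; the actual loop body is outside this hunk:

```python
# Hypothetical sketch of chunk reassembly; chunk file naming is assumed,
# since the real loop body is not shown in this diff.
import os

def concat_chunks_sketch(upload_dir: str, filename: str, chunk_num: int, dest_dir: str) -> str:
    target = os.path.join(dest_dir, filename)
    if os.path.exists(target):
        os.remove(target)                       # start from a clean file
    with open(target, "ab") as out:             # append each chunk in order
        for i in range(chunk_num):
            chunk_path = os.path.join(upload_dir, f"{filename}_{i}")  # assumed naming
            with open(chunk_path, "rb") as chunk:
                out.write(chunk.read())
    return target
```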
@@ -165,7 +165,32 @@ class RVC:
     def get_processing_sampling_rate(self):
         return self.settings.modelSamplingRate

-    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int):
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int, solaEnabled: bool = False, solaSearchFrame: int = 0):
+        newData = newData.astype(np.float32) / 32768.0
+
+        if hasattr(self, "audio_buffer"):
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # append to the past data
+        else:
+            self.audio_buffer = newData
+
+        if solaEnabled:
+            convertSize = inputSize + crossfadeSize + solaSearchFrame + self.settings.extraConvertSize
+        else:
+            convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
+
+        if convertSize % 128 != 0:  # pad out, since truncation occurs at the model's output hop size
+            convertSize = convertSize + (128 - (convertSize % 128))
+
+        self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the portion to be converted
+
+        crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]  # cut out just the output portion to check its volume (its relation to SOLA is not considered yet)
+        rms = np.sqrt(np.square(crop).mean(axis=0))
+        vol = max(rms, self.prevVol * 0.0)
+        self.prevVol = vol
+
+        return (self.audio_buffer, convertSize, vol, solaEnabled)
+
+    def generate_input_old(self, newData: any, inputSize: int, crossfadeSize: int):
         newData = newData.astype(np.float32) / 32768.0

         if hasattr(self, "audio_buffer"):
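The hop-size check above simply rounds `convertSize` up to the next multiple of 128 so that the model's output frames cover the whole buffer. A quick worked example:

```python
# Worked example of the hop-size rounding in generate_input:
# round the conversion window up to the next multiple of the 128-sample hop.
def round_up_to_hop(convert_size: int, hop: int = 128) -> int:
    if convert_size % hop != 0:
        convert_size = convert_size + (hop - (convert_size % hop))
    return convert_size

print(round_up_to_hop(4000))  # 4000 % 128 == 32, so 4000 + 96 -> 4096
print(round_up_to_hop(4096))  # already aligned -> 4096
```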
@@ -276,7 +301,13 @@ class RVC:
             audio = self._onnx_inference(data)
         else:
             audio = self._pyTorch_inference(data)
-        return audio
+
+        sola_enabled = data[3]
+        if sola_enabled:
+            return audio
+            # return audio[self.settings.extraConvertSize:]
+        else:
+            return audio

     def __del__(self):
         del self.net_g
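When SOLA is enabled, the full converted buffer is returned and the caller slices out what it needs. The layout implied by `generate_input` is, from oldest to newest samples: extra context, crossfade region, SOLA search margin, current block. A small sanity check of that arithmetic, with illustrative values (a 40 kHz model rate and `extraConvertSize` of 4096 are assumptions, not values from the diff):

```python
# Sanity check of the slice sizes used by on_request_sola (illustrative values).
block_frame = 1024                       # newData.shape[0]
crossfade_frame = 512                    # min(crossFadeOverlapSize, block_frame)
sola_search_frame = int(0.012 * 40000)   # 480 samples at an assumed 40 kHz model rate
extra = 4096                             # assumed settings.extraConvertSize

convert_size = block_frame + crossfade_frame + sola_search_frame + extra
convert_size += (128 - convert_size % 128) % 128   # hop alignment, as above

kept = sola_search_frame + crossfade_frame + block_frame  # tail kept for SOLA
assert kept <= convert_size
print(convert_size, kept)  # -> 6144 2016
```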
@@ -13,7 +13,6 @@ from voice_changer.IORecorder import IORecorder

 from voice_changer.utils.Timer import Timer
 from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut
-
 import time


@@ -32,12 +31,13 @@ class VoiceChangerSettings():
     crossFadeOffsetRate: float = 0.1
     crossFadeEndRate: float = 0.9
     crossFadeOverlapSize: int = 4096
+    solaEnabled: int = 1  # 0:off, 1:on

     recordIO: int = 0  # 0:off, 1:on

     # list only the mutable fields below
     intData: list[str] = field(
-        default_factory=lambda: ["inputSampleRate", "crossFadeOverlapSize", "recordIO"]
+        default_factory=lambda: ["inputSampleRate", "crossFadeOverlapSize", "recordIO", "solaEnabled"]
     )
     floatData: list[str] = field(
         default_factory=lambda: ["crossFadeOffsetRate", "crossFadeEndRate"]
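The `intData` / `floatData` lists drive generic type coercion when settings arrive as strings over the API, so registering `"solaEnabled"` in `intData` is what lets the new GUI toggle take effect. A hedged sketch of that pattern; the project's actual update method is not part of this diff:

```python
# Minimal sketch of the coercion pattern implied by intData/floatData
# (assumed behavior; the real update method is not shown in this hunk).
from dataclasses import dataclass, field

@dataclass
class SettingsSketch:
    solaEnabled: int = 1
    crossFadeEndRate: float = 0.9
    intData: list[str] = field(default_factory=lambda: ["solaEnabled"])
    floatData: list[str] = field(default_factory=lambda: ["crossFadeEndRate"])

def update_setting(s: SettingsSketch, key: str, val: str):
    if key in s.intData:
        setattr(s, key, int(val))      # "1" -> 1
    elif key in s.floatData:
        setattr(s, key, float(val))    # "0.8" -> 0.8

s = SettingsSketch()
update_setting(s, "solaEnabled", "0")
print(s.solaEnabled)  # -> 0
```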
@@ -199,10 +199,102 @@ class VoiceChanger():
         if hasattr(self, 'np_prev_audio1') == True:
             delattr(self, "np_prev_audio1")
+            del self.sola_buffer

     # receivedData: tuple of short
     def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
+        if self.settings.solaEnabled and self.modelType == "RVC":
+            return self.on_request_sola(receivedData)
+        else:
+            return self.on_request_legacy(receivedData)
+
+    def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
+        # print("processing with sola")
+        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
+
+        # pre-processing
+        with Timer("pre-process") as t:
+            if self.settings.inputSampleRate != processing_sampling_rate:
+                newData = cast(AudioInOut, resampy.resample(receivedData, self.settings.inputSampleRate, processing_sampling_rate))
+            else:
+                newData = receivedData
+
+            sola_search_frame = int(0.012 * processing_sampling_rate)
+            # sola_search_frame = 0
+            block_frame = newData.shape[0]
+            crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
+            self._generate_strength(crossfade_frame)
+
+            data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, True, sola_search_frame)
+        preprocess_time = t.secs
+
+        # conversion
+        with Timer("main-process") as t:
+            try:
+                # Inference
+                audio = self.voiceChanger.inference(data)
+
+                if hasattr(self, 'sola_buffer') == True:
+                    np.set_printoptions(threshold=10000)
+                    audio = audio[-sola_search_frame - crossfade_frame - block_frame:]
+                    # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
+                    cor_nom = np.convolve(audio[: crossfade_frame + sola_search_frame], np.flip(self.sola_buffer), 'valid')
+                    cor_den = np.sqrt(np.convolve(audio[: crossfade_frame + sola_search_frame] ** 2, np.ones(crossfade_frame), 'valid') + 1e-3)
+                    sola_offset = np.argmax(cor_nom / cor_den)
+                    print("sola_offset", sola_offset, sola_search_frame)
+
+                    output_wav = audio[sola_offset: sola_offset + block_frame].astype(np.float64)
+                    output_wav[:crossfade_frame] *= self.np_cur_strength
+                    output_wav[:crossfade_frame] += self.sola_buffer[:]
+
+                    result = output_wav
+                else:
+                    print("no sola buffer")
+                    result = np.zeros(4096).astype(np.int16)
+
+                if hasattr(self, 'sola_buffer') == True and sola_offset < sola_search_frame:
+                    sola_buf_org = audio[- sola_search_frame - crossfade_frame + sola_offset: -sola_search_frame + sola_offset]
+                    self.sola_buffer = sola_buf_org * self.np_prev_strength
+                else:
+                    self.sola_buffer = audio[- crossfade_frame:] * self.np_prev_strength
+                    # self.sola_buffer = audio[- crossfade_frame:]
+
+            except Exception as e:
+                print("VC PROCESSING!!!! EXCEPTION!!!", e)
+                print(traceback.format_exc())
+                return np.zeros(1).astype(np.int16), [0, 0, 0]
+        mainprocess_time = t.secs
+
+        # post-processing
+        with Timer("post-process") as t:
+            result = result.astype(np.int16)
+            if self.settings.inputSampleRate != processing_sampling_rate:
+                outputData = cast(AudioInOut, resampy.resample(result, processing_sampling_rate, self.settings.inputSampleRate).astype(np.int16))
+            else:
+                outputData = result
+
+            print_convert_processing(
+                f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
+
+            if self.settings.recordIO == 1:
+                self.ioRecorder.writeInput(receivedData)
+                self.ioRecorder.writeOutput(outputData.tobytes())
+
+            # if receivedData.shape[0] != outputData.shape[0]:
+            #     print(f"Padding, in:{receivedData.shape[0]} out:{outputData.shape[0]}")
+            #     outputData = pad_array(outputData, receivedData.shape[0])
+            #     # print_convert_processing(
+            #     #     f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
+        postprocess_time = t.secs
+
+        print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
+        perf = [preprocess_time, mainprocess_time, postprocess_time]
+        return outputData, perf
+
+    def on_request_legacy(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
+        # print("processing with legacy")
+
         processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
         print_convert_processing(f"------------ Convert processing.... ------------")
         # pre-processing
         with Timer("pre-process") as t:
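To make the correlation step above concrete, here is a self-contained NumPy sketch of the SOLA offset search on synthetic data. It mirrors the `cor_nom` / `cor_den` lines from the hunk; the frame sizes, the random test signal, and the imposed 17-sample shift are invented for the demo:

```python
# Standalone demo of the SOLA offset search used in on_request_sola.
# Frame sizes and the test signal are illustrative, not the project's values.
import numpy as np

crossfade_frame = 128
sola_search_frame = 32

rng = np.random.default_rng(0)
signal = rng.standard_normal(4096).astype(np.float32)

# Tail of the previous output block (before the fade-out is applied).
sola_buffer = signal[256:256 + crossfade_frame]

# New converted audio: the same material, but arriving 17 samples "late".
true_shift = 17
audio = signal[256 - true_shift: 256 - true_shift + 2048]

# Normalized cross-correlation over the search window, as in the diff.
head = audio[: crossfade_frame + sola_search_frame]
cor_nom = np.convolve(head, np.flip(sola_buffer), 'valid')
cor_den = np.sqrt(np.convolve(head ** 2, np.ones(crossfade_frame), 'valid') + 1e-3)
sola_offset = int(np.argmax(cor_nom / cor_den))

print(sola_offset)  # -> 17: the lag that best aligns audio with sola_buffer
```

Once the offset is found, the first `crossfade_frame` samples of the aligned block are faded in against the stored `sola_buffer`, which removes the phase discontinuity that plain overlap-add would leave at block boundaries.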