WIP: so-vits-svc 40v2, alpha (refactoring2)

wataru 2023-03-13 02:06:39 +09:00
parent ddcae23f20
commit acc848fda4
5 changed files with 28 additions and 27 deletions

View File

@@ -70,7 +70,7 @@ export const ServerSettingKey = {
     "noiceScale": "noiceScale",
     "predictF0": "predictF0",
     "silentThreshold": "silentThreshold",
-    "processingLength": "processingLength",
+    "extraConvertSize": "extraConvertSize",
     "inputSampleRate": "inputSampleRate",
 } as const
@@ -97,7 +97,7 @@ export type VoiceChangerServerSetting = {
     noiceScale: number // so-vits-svc
     predictF0: number // so-vits-svc
     silentThreshold: number // so-vits-svc
-    processingLength: number// so-vits-svc
+    extraConvertSize: number// so-vits-svc
     inputSampleRate: InputSampleRate
 }
@@ -129,7 +129,7 @@ export const DefaultServerSetting_MMVCv15: ServerInfo = {
     noiceScale: 0,
     predictF0: 0,
     silentThreshold: 0,
-    processingLength: 0,
+    extraConvertSize: 0,
     inputSampleRate: 24000,
@@ -160,7 +160,7 @@ export const DefaultServerSetting_MMVCv13: ServerInfo = {
     noiceScale: 0,
     predictF0: 0,
     silentThreshold: 0,
-    processingLength: 0,
+    extraConvertSize: 0,
     inputSampleRate: 24000,
@@ -195,7 +195,7 @@ export const DefaultServerSetting_so_vits_svc_40v2: ServerInfo = {
     noiceScale: 0.3,
     predictF0: 0,
     silentThreshold: 0.00001,
-    processingLength: 1024 * 32,
+    extraConvertSize: 1024 * 32,
     inputSampleRate: 24000,
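
The renamed field is the amount of extra audio prepended to each conversion window; the server-side hunks below add it to inputSize + crossfadeSize. As a quick sanity check of the so-vits-svc 40v2 default, assuming the value is a sample count at the 24000 Hz inputSampleRate listed above (an assumption, not something this diff states):

extra_convert_size = 1024 * 32                 # 32768 samples, the default above
input_sample_rate = 24000                      # Hz, the default above
print(extra_convert_size / input_sample_rate)  # ~1.37 seconds of extra context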

File diff suppressed because one or more lines are too long

View File

@@ -47,10 +47,10 @@ export const useConvertSetting = (): ConvertSettingState => {
     const processingLengthRow = useMemo(() => {
         return (
             <div className="body-row split-3-2-1-4 left-padding-1 guided">
-                <div className="body-item-title left-padding-1">Processing Length</div>
+                <div className="body-item-title left-padding-1">Extra Data Length</div>
                 <div className="body-input-container">
-                    <select className="body-select" value={appState.serverSetting.serverSetting.processingLength} onChange={(e) => {
-                        appState.serverSetting.updateServerSettings({ ...appState.serverSetting.serverSetting, processingLength: Number(e.target.value) })
+                    <select className="body-select" value={appState.serverSetting.serverSetting.extraConvertSize} onChange={(e) => {
+                        appState.serverSetting.updateServerSettings({ ...appState.serverSetting.serverSetting, extraConvertSize: Number(e.target.value) })
                         appState.workletNodeSetting.trancateBuffer()
                     }}>
                         {

View File

@@ -36,7 +36,7 @@ class SoVitsSvc40v2Settings():
     noiceScale: float = 0.3
     predictF0: int = 0 # 0:False, 1:True
     silentThreshold: float = 0.00001
-    processingLength: int = 1024 * 32
+    extraConvertSize: int = 1024 * 32
     framework: str = "PyTorch" # PyTorch or ONNX
     pyTorchModelFile: str = ""
@@ -44,7 +44,7 @@ class SoVitsSvc40v2Settings():
     configFile: str = ""
     # ↓ list only the mutable fields
-    intData = ["gpu", "dstId", "tran", "predictF0", "processingLength"]
+    intData = ["gpu", "dstId", "tran", "predictF0", "extraConvertSize"]
     floatData = ["noiceScale", "silentThreshold"]
     strData = ["framework", "f0Detector"]
@@ -171,7 +171,7 @@ class SoVitsSvc40v2:
         else:
             self.audio_buffer = newData
-        convertSize = inputSize + crossfadeSize + self.settings.processingLength
+        convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize
         if convertSize % self.hps.data.hop_length != 0:  # the model's output hop size causes truncation, so pad to compensate
             convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
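
A minimal standalone sketch of the calculation in this hunk: the conversion window is the new input plus the crossfade region plus the configured extraConvertSize, padded up to the next multiple of the model's hop length so nothing is cut off at the output. The hop_length value and the example sizes below are illustrative assumptions, not values taken from this diff.

def compute_convert_size(input_size: int, crossfade_size: int,
                         extra_convert_size: int, hop_length: int) -> int:
    # mirrors: convertSize = inputSize + crossfadeSize + extraConvertSize
    convert_size = input_size + crossfade_size + extra_convert_size
    # round up to a multiple of hop_length, as the hunk above does
    if convert_size % hop_length != 0:
        convert_size += hop_length - (convert_size % hop_length)
    return convert_size

# e.g. compute_convert_size(4000, 4000, 1024 * 32, 320) == 40960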

View File

@@ -42,11 +42,11 @@ class VoiceChanger():
     def __init__(self):
         # initialization
         self.settings = VocieChangerSettings()
-        self.unpackedData_length = 0
         self.onnx_session = None
         self.currentCrossFadeOffsetRate = 0
         self.currentCrossFadeEndRate = 0
-        self.currentCrossFadeOverlapSize = 0
+        self.currentCrossFadeOverlapSize = 0 # setting
+        self.crossfadeSize = 0 # calculated
         modelType = getModelType()
         print("[VoiceChanger] activate model type:", modelType)
@@ -82,7 +82,7 @@ class VoiceChanger():
         if key in self.settings.intData:
             setattr(self.settings, key, int(val))
             if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
-                self.unpackedData_length = 0
+                self.crossfadeSize = 0
             if key == "recordIO" and val == 1:
                 if hasattr(self, "ioRecorder"):
                     self.ioRecorder.close()
@@ -114,31 +114,31 @@ class VoiceChanger():
         return self.get_info()
-    def _generate_strength(self, dataLength: int):
+    def _generate_strength(self, crossfadeSize: int):
-        if self.unpackedData_length != dataLength or \
+        if self.crossfadeSize != crossfadeSize or \
                 self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or \
                 self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or \
                 self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:
-            self.unpackedData_length = dataLength
+            self.crossfadeSize = crossfadeSize
             self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
             self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
             self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize
-            overlapSize = min(self.settings.crossFadeOverlapSize, self.unpackedData_length)
-            cf_offset = int(overlapSize * self.settings.crossFadeOffsetRate)
-            cf_end = int(overlapSize * self.settings.crossFadeEndRate)
+            cf_offset = int(crossfadeSize * self.settings.crossFadeOffsetRate)
+            cf_end = int(crossfadeSize * self.settings.crossFadeEndRate)
             cf_range = cf_end - cf_offset
             percent = np.arange(cf_range) / cf_range
             np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
             np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2
-            self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength, np.zeros(overlapSize - cf_offset - len(np_prev_strength))])
-            self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(overlapSize - cf_offset - len(np_cur_strength))])
+            self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength,
+                                                    np.zeros(crossfadeSize - cf_offset - len(np_prev_strength))])
+            self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(crossfadeSize - cf_offset - len(np_cur_strength))])
-            print("Generated Strengths")
+            print(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")
             # the result size differs from the previous one, so clear the stored record
             if hasattr(self, 'np_prev_audio1') == True:
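
For reference, a self-contained sketch of the strength curves _generate_strength builds. The crossfadeSize and offset/end rates below are illustrative assumptions, not values from this diff. The cos² fade-out and fade-in are complementary, so the previous and current chunks always sum to full amplitude across the overlap:

import numpy as np

crossfade_size = 4096                    # assumed overlap length in samples
cf_offset = int(crossfade_size * 0.1)    # assumed crossFadeOffsetRate = 0.1
cf_end = int(crossfade_size * 0.9)       # assumed crossFadeEndRate = 0.9
cf_range = cf_end - cf_offset
percent = np.arange(cf_range) / cf_range

prev_core = np.cos(percent * 0.5 * np.pi) ** 2         # fades the previous chunk out
cur_core = np.cos((1 - percent) * 0.5 * np.pi) ** 2    # fades the current chunk in

np_prev_strength = np.concatenate([np.ones(cf_offset), prev_core,
                                   np.zeros(crossfade_size - cf_offset - cf_range)])
np_cur_strength = np.concatenate([np.zeros(cf_offset), cur_core,
                                  np.ones(crossfade_size - cf_offset - cf_range)])

# cos^2 + sin^2 = 1, so the two weights sum to 1 at every sample of the overlap
assert np.allclose(np_prev_strength + np_cur_strength, 1.0)
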
@@ -179,7 +179,7 @@ class VoiceChanger():
         print_convert_processing(f" Convert data size of {inputSize + crossfadeSize} (+ extra size)")
         print_convert_processing(f" will be cropped:{-1 * (inputSize + crossfadeSize)}, {-1 * (crossfadeSize)}")
-        self._generate_strength(inputSize)
+        self._generate_strength(crossfadeSize)
         data = self.voiceChanger.generate_input(newData, inputSize, crossfadeSize)
         preprocess_time = t.secs
@@ -196,7 +196,8 @@ class VoiceChanger():
         cur_overlap_start = -1 * (inputSize + crossfadeSize)
         cur_overlap_end = -1 * inputSize
         cur_overlap = audio[cur_overlap_start:cur_overlap_end]
+        # cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
         print_convert_processing(
             f" audio:{audio.shape}, prev_overlap:{prev_overlap.shape}, self.np_prev_strength:{self.np_prev_strength.shape}")
         powered_prev = prev_overlap * self.np_prev_strength
         print_convert_processing(
             f" audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")