Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-03-11 09:58:49 +03:00)
WIP: so-vits-svc 40v2, alpha (refactoring2)

This commit renames the processingLength setting to extraConvertSize across the client and server, and reworks the crossfade bookkeeping in VoiceChanger to key on crossfadeSize instead of unpackedData_length.

commit acc848fda4
parent ddcae23f20
@@ -70,7 +70,7 @@ export const ServerSettingKey = {
     "noiceScale": "noiceScale",
     "predictF0": "predictF0",
     "silentThreshold": "silentThreshold",
-    "processingLength": "processingLength",
+    "extraConvertSize": "extraConvertSize",

     "inputSampleRate": "inputSampleRate",
 } as const
@@ -97,7 +97,7 @@ export type VoiceChangerServerSetting = {
     noiceScale: number // so-vits-svc
     predictF0: number // so-vits-svc
     silentThreshold: number // so-vits-svc
-    processingLength: number // so-vits-svc
+    extraConvertSize: number // so-vits-svc

     inputSampleRate: InputSampleRate
 }
@@ -129,7 +129,7 @@ export const DefaultServerSetting_MMVCv15: ServerInfo = {
     noiceScale: 0,
     predictF0: 0,
     silentThreshold: 0,
-    processingLength: 0,
+    extraConvertSize: 0,

     inputSampleRate: 24000,
@@ -160,7 +160,7 @@ export const DefaultServerSetting_MMVCv13: ServerInfo = {
     noiceScale: 0,
     predictF0: 0,
     silentThreshold: 0,
-    processingLength: 0,
+    extraConvertSize: 0,

     inputSampleRate: 24000,
@@ -195,7 +195,7 @@ export const DefaultServerSetting_so_vits_svc_40v2: ServerInfo = {
     noiceScale: 0.3,
     predictF0: 0,
     silentThreshold: 0.00001,
-    processingLength: 1024 * 32,
+    extraConvertSize: 1024 * 32,

     inputSampleRate: 24000,
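Note (not part of the commit): as a quick sanity check of the new default, at the inputSampleRate of 24000 Hz above, extraConvertSize = 1024 * 32 = 32768 samples is roughly 1.37 seconds of extra audio context per conversion:

    extra_convert_size = 1024 * 32  # samples, the default above
    input_sample_rate = 24000       # Hz, the default above
    print(extra_convert_size / input_sample_rate)  # -> ~1.37 seconds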
client/so-vits-svc_40v2/dist/index.js (vendored, 4 changes)
File diff suppressed because one or more lines are too long
@@ -47,10 +47,10 @@ export const useConvertSetting = (): ConvertSettingState => {
     const processingLengthRow = useMemo(() => {
         return (
             <div className="body-row split-3-2-1-4 left-padding-1 guided">
-                <div className="body-item-title left-padding-1">Processing Length</div>
+                <div className="body-item-title left-padding-1">Extra Data Length</div>
                 <div className="body-input-container">
-                    <select className="body-select" value={appState.serverSetting.serverSetting.processingLength} onChange={(e) => {
-                        appState.serverSetting.updateServerSettings({ ...appState.serverSetting.serverSetting, processingLength: Number(e.target.value) })
+                    <select className="body-select" value={appState.serverSetting.serverSetting.extraConvertSize} onChange={(e) => {
+                        appState.serverSetting.updateServerSettings({ ...appState.serverSetting.serverSetting, extraConvertSize: Number(e.target.value) })
+                        appState.workletNodeSetting.trancateBuffer()
                     }}>
                         {
@@ -36,7 +36,7 @@ class SoVitsSvc40v2Settings():
     noiceScale: float = 0.3
     predictF0: int = 0  # 0:False, 1:True
     silentThreshold: float = 0.00001
-    processingLength: int = 1024 * 32
+    extraConvertSize: int = 1024 * 32

     framework: str = "PyTorch"  # PyTorch or ONNX
     pyTorchModelFile: str = ""
@@ -44,7 +44,7 @@ class SoVitsSvc40v2Settings():
     configFile: str = ""

     # ↓ list only the mutable fields
-    intData = ["gpu", "dstId", "tran", "predictF0", "processingLength"]
+    intData = ["gpu", "dstId", "tran", "predictF0", "extraConvertSize"]
     floatData = ["noiceScale", "silentThreshold"]
     strData = ["framework", "f0Detector"]
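Note (illustrative sketch, not from the commit; names simplified): the intData/floatData/strData lists drive a generic update path. The VoiceChanger.update hunk further down coerces an incoming value with setattr() according to which list its key appears in:

    from dataclasses import dataclass

    @dataclass
    class Settings:
        extraConvertSize: int = 1024 * 32
        noiceScale: float = 0.3
        framework: str = "PyTorch"

        # plain class attributes (no annotation), so not dataclass fields
        intData = ["extraConvertSize"]
        floatData = ["noiceScale"]
        strData = ["framework"]

    def update(settings: Settings, key: str, val) -> None:
        # coerce by type list, mirroring the setattr() pattern below
        if key in settings.intData:
            setattr(settings, key, int(val))
        elif key in settings.floatData:
            setattr(settings, key, float(val))
        elif key in settings.strData:
            setattr(settings, key, str(val))

    s = Settings()
    update(s, "extraConvertSize", "65536")
    print(s.extraConvertSize)  # -> 65536, coerced to int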
@@ -171,7 +171,7 @@ class SoVitsSvc40v2:
         else:
             self.audio_buffer = newData

-        convertSize = inputSize + crossfadeSize + self.settings.processingLength
+        convertSize = inputSize + crossfadeSize + self.settings.extraConvertSize

         if convertSize % self.hps.data.hop_length != 0:  # pad, since the model output gets truncated at its hop size
             convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
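Note (illustrative numbers, not from a real config): the padding above rounds convertSize up to the next multiple of the model's hop length so nothing is lost to truncation at the output:

    hop_length = 320                      # hypothetical hop size
    convert_size = 24000 + 4096 + 32768   # inputSize + crossfadeSize + extraConvertSize

    # round up to the next multiple of hop_length
    if convert_size % hop_length != 0:
        convert_size += hop_length - (convert_size % hop_length)

    assert convert_size % hop_length == 0
    print(convert_size)  # 60864 rounded up to 61120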
@@ -42,11 +42,11 @@ class VoiceChanger():
     def __init__(self):
         # initialization
         self.settings = VocieChangerSettings()
-        self.unpackedData_length = 0
         self.onnx_session = None
         self.currentCrossFadeOffsetRate = 0
         self.currentCrossFadeEndRate = 0
-        self.currentCrossFadeOverlapSize = 0
+        self.currentCrossFadeOverlapSize = 0  # setting
+        self.crossfadeSize = 0  # calculated

         modelType = getModelType()
         print("[VoiceChanger] activate model type:", modelType)
@@ -82,7 +82,7 @@ class VoiceChanger():
         if key in self.settings.intData:
             setattr(self.settings, key, int(val))
             if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
-                self.unpackedData_length = 0
+                self.crossfadeSize = 0
             if key == "recordIO" and val == 1:
                 if hasattr(self, "ioRecorder"):
                     self.ioRecorder.close()
@@ -114,31 +114,31 @@ class VoiceChanger():

         return self.get_info()

-    def _generate_strength(self, dataLength: int):
+    def _generate_strength(self, crossfadeSize: int):

-        if self.unpackedData_length != dataLength or \
+        if self.crossfadeSize != crossfadeSize or \
                 self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or \
                 self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or \
                 self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize:

-            self.unpackedData_length = dataLength
+            self.crossfadeSize = crossfadeSize
             self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
             self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
             self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize

-            overlapSize = min(self.settings.crossFadeOverlapSize, self.unpackedData_length)
-            cf_offset = int(overlapSize * self.settings.crossFadeOffsetRate)
-            cf_end = int(overlapSize * self.settings.crossFadeEndRate)
+            cf_offset = int(crossfadeSize * self.settings.crossFadeOffsetRate)
+            cf_end = int(crossfadeSize * self.settings.crossFadeEndRate)
             cf_range = cf_end - cf_offset
             percent = np.arange(cf_range) / cf_range

             np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
             np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2

-            self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength, np.zeros(overlapSize - cf_offset - len(np_prev_strength))])
-            self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(overlapSize - cf_offset - len(np_cur_strength))])
+            self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength,
+                                                    np.zeros(crossfadeSize - cf_offset - len(np_prev_strength))])
+            self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(crossfadeSize - cf_offset - len(np_cur_strength))])

-            print("Generated Strengths")
+            print(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")

             # the size changes from the previous result, so clear the recorded audio
             if hasattr(self, 'np_prev_audio1') == True:
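Note (standalone sketch, not part of the commit): the two strength curves above are complementary raised-cosine fades, cos^2 for the previous chunk and cos^2 of (1 - p), i.e. sin^2, for the current one, so the two gains sum to exactly 1 at every sample. A quick check with illustrative values (crossfadeSize = 8, offset rate 0.0, end rate 1.0):

    import numpy as np

    crossfade_size = 8
    cf_offset = int(crossfade_size * 0.0)   # crossFadeOffsetRate = 0.0
    cf_end = int(crossfade_size * 1.0)      # crossFadeEndRate = 1.0
    cf_range = cf_end - cf_offset
    percent = np.arange(cf_range) / cf_range

    np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
    np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2

    prev = np.concatenate([np.ones(cf_offset), np_prev_strength,
                           np.zeros(crossfade_size - cf_offset - len(np_prev_strength))])
    cur = np.concatenate([np.zeros(cf_offset), np_cur_strength,
                          np.ones(crossfade_size - cf_offset - len(np_cur_strength))])

    # cos^2 + sin^2 == 1, so the crossfade preserves unity gain throughout
    assert np.allclose(prev + cur, 1.0)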
@@ -179,7 +179,7 @@ class VoiceChanger():
         print_convert_processing(f"  Convert data size of {inputSize + crossfadeSize} (+ extra size)")
         print_convert_processing(f"  will be cropped:{-1 * (inputSize + crossfadeSize)}, {-1 * (crossfadeSize)}")

-        self._generate_strength(inputSize)
+        self._generate_strength(crossfadeSize)
         data = self.voiceChanger.generate_input(newData, inputSize, crossfadeSize)
         preprocess_time = t.secs
@@ -196,7 +196,8 @@ class VoiceChanger():
             cur_overlap_start = -1 * (inputSize + crossfadeSize)
             cur_overlap_end = -1 * inputSize
             cur_overlap = audio[cur_overlap_start:cur_overlap_end]
+            # cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
             print_convert_processing(
                 f"  audio:{audio.shape}, prev_overlap:{prev_overlap.shape}, self.np_prev_strength:{self.np_prev_strength.shape}")
             powered_prev = prev_overlap * self.np_prev_strength
             print_convert_processing(
                 f"  audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
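Note (illustrative sizes, not from the commit): the negative-index crop above pulls the crossfadeSize samples sitting just before the final inputSize samples of the converted audio:

    import numpy as np

    input_size, crossfade_size = 6, 4
    audio = np.arange(20)

    # slice the overlap region: crossfade_size samples before the last input_size samples
    cur_overlap = audio[-(input_size + crossfade_size):-input_size]
    assert len(cur_overlap) == crossfade_size
    print(cur_overlap)  # -> [10 11 12 13]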