update information when upload model

wataru 2023-01-28 15:56:56 +09:00
parent eb846bedcc
commit 9a473a07be
8 changed files with 6728 additions and 3172 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -38,7 +38,7 @@
 "postcss-loader": "^7.0.2",
 "postcss-nested": "^6.0.0",
 "prettier": "^2.8.3",
-"rimraf": "^4.1.1",
+"rimraf": "^4.1.2",
 "style-loader": "^3.3.1",
 "ts-loader": "^9.4.2",
 "tsconfig-paths": "^4.1.2",
@@ -48,7 +48,7 @@
 "webpack-dev-server": "^4.11.1"
 },
 "dependencies": {
-"@dannadori/voice-changer-client-js": "^1.0.20",
+"@dannadori/voice-changer-client-js": "^1.0.28",
 "react": "^18.2.0",
 "react-dom": "^18.2.0"
 }

File diff suppressed because it is too large


@@ -1,6 +1,6 @@
 {
 "name": "@dannadori/voice-changer-client-js",
-"version": "1.0.20",
+"version": "1.0.28",
 "description": "",
 "main": "dist/index.js",
 "directories": {
@@ -26,17 +26,17 @@
 "devDependencies": {
 "@types/audioworklet": "^0.0.36",
 "@types/node": "^18.11.18",
-"@types/react": "18.0.26",
+"@types/react": "18.0.27",
 "@types/react-dom": "18.0.10",
 "eslint": "^8.32.0",
 "eslint-config-prettier": "^8.6.0",
 "eslint-plugin-prettier": "^4.2.1",
-"eslint-plugin-react": "^7.32.0",
+"eslint-plugin-react": "^7.32.1",
 "eslint-webpack-plugin": "^3.2.0",
 "npm-run-all": "^4.1.5",
 "prettier": "^2.8.3",
 "raw-loader": "^4.0.2",
-"rimraf": "^4.0.7",
+"rimraf": "^4.1.2",
 "ts-loader": "^9.4.2",
 "typescript": "^4.9.4",
 "webpack": "^5.75.0",


@@ -135,7 +135,7 @@ export const DefaultVoiceChangerServerSetting: VoiceChangerServerSetting = {
 crossFadeOffsetRate: 0.1,
 crossFadeEndRate: 0.9,
 crossFadeOverlapRate: 0.5,
-framework: "PyTorch",
+framework: "ONNX",
 onnxExecutionProvider: "CPUExecutionProvider"
 }


@@ -179,10 +179,10 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
 console.log(progress, end)
 })
-const serverInfo = await props.voiceChangerClient.loadModel(fileUploadSetting.configFile, fileUploadSetting.pyTorchModel, fileUploadSetting.onnxModel)
-console.log(serverInfo)
+await props.voiceChangerClient.loadModel(fileUploadSetting.configFile, fileUploadSetting.pyTorchModel, fileUploadSetting.onnxModel)
 setUploadProgress(0)
 setIsUploading(false)
+reloadServerInfo()
 }
 }, [fileUploadSetting, props.voiceChangerClient])
@@ -203,7 +203,6 @@ export const useServerSetting = (props: UseServerSettingProps): ServerSettingSta
framework: res.framework,
onnxExecutionProvider: (!!res.onnxExecutionProvider && res.onnxExecutionProvider.length > 0) ? res.onnxExecutionProvider[0] as OnnxExecutionProvider : DefaultVoiceChangerServerSetting.onnxExecutionProvider
})
}
}, [props.voiceChangerClient])


@@ -1,6 +1,7 @@
from const import ERROR_NO_ONNX_SESSION
import torch
import os, traceback
import os
import traceback
import numpy as np
from dataclasses import dataclass, asdict
@@ -11,39 +12,41 @@ from models import SynthesizerTrn
from voice_changer.TrainerFunctions import TextAudioSpeakerCollate, spectrogram_torch, load_checkpoint, get_hparams_from_file
providers = ['OpenVINOExecutionProvider',"CUDAExecutionProvider","DmlExecutionProvider","CPUExecutionProvider"]
providers = ['OpenVINOExecutionProvider', "CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"]
@dataclass
class VocieChangerSettings():
gpu:int = 0
srcId:int = 107
dstId:int = 100
crossFadeOffsetRate:float = 0.1
crossFadeEndRate:float = 0.9
crossFadeOverlapRate:float = 0.9
convertChunkNum:int = 32
minConvertSize:int = 0
framework:str = "PyTorch" # PyTorch or ONNX
pyTorchModelFile:str = ""
onnxModelFile:str = ""
configFile:str = ""
gpu: int = 0
srcId: int = 107
dstId: int = 100
crossFadeOffsetRate: float = 0.1
crossFadeEndRate: float = 0.9
crossFadeOverlapRate: float = 0.9
convertChunkNum: int = 32
minConvertSize: int = 0
framework: str = "ONNX" # PyTorch or ONNX
pyTorchModelFile: str = ""
onnxModelFile: str = ""
configFile: str = ""
# ↓ enumerate only the mutable fields
intData = ["gpu","srcId", "dstId", "convertChunkNum", "minConvertSize"]
floatData = [ "crossFadeOffsetRate", "crossFadeEndRate", "crossFadeOverlapRate"]
intData = ["gpu", "srcId", "dstId", "convertChunkNum", "minConvertSize"]
floatData = ["crossFadeOffsetRate", "crossFadeEndRate", "crossFadeOverlapRate"]
strData = ["framework"]
class VoiceChanger():
def __init__(self, config:str):
def __init__(self, config: str):
# initialization
self.settings = VocieChangerSettings(configFile=config)
self.unpackedData_length=0
self.unpackedData_length = 0
self.net_g = None
self.onnx_session = None
self.currentCrossFadeOffsetRate=0
self.currentCrossFadeEndRate=0
self.currentCrossFadeOverlapRate=0
self.currentCrossFadeOffsetRate = 0
self.currentCrossFadeEndRate = 0
self.currentCrossFadeOverlapRate = 0
# gather information used throughout the class
# self.hps = utils.get_hparams_from_file(config)
self.hps = get_hparams_from_file(config)
@@ -55,20 +58,20 @@ class VoiceChanger():
# print("text_norm2: ",text_norm)
# self.text_norm = torch.LongTensor(text_norm)
self.text_norm = torch.LongTensor([0, 6, 0])
self.text_norm = torch.LongTensor([0, 6, 0])
self.audio_buffer = torch.zeros(1, 0)
self.prev_audio = np.zeros(1)
self.mps_enabled = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
def loadModel(self, config:str, pyTorch_model_file:str=None, onnx_model_file:str=None):
def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
self.settings.configFile = config
if pyTorch_model_file != None:
self.settings.pyTorchModelFile = pyTorch_model_file
if onnx_model_file:
self.settings.onnxModelFile = onnx_model_file
# create the PyTorch model
if pyTorch_model_file != None:
self.net_g = SynthesizerTrn(
@@ -93,7 +96,7 @@ class VoiceChanger():
def destroy(self):
del self.net_g
del self.onnx_session
del self.onnx_session
def get_info(self):
data = asdict(self.settings)
@@ -101,19 +104,19 @@ class VoiceChanger():
data["onnxExecutionProvider"] = self.onnx_session.get_providers() if self.onnx_session != None else []
files = ["configFile", "pyTorchModelFile", "onnxModelFile"]
for f in files:
if data[f]!=None and os.path.exists(data[f]):
if data[f] != None and os.path.exists(data[f]):
data[f] = os.path.basename(data[f])
else:
data[f] = ""
return data
def update_setteings(self, key:str, val:any):
def update_setteings(self, key: str, val: any):
if key == "onnxExecutionProvider" and self.onnx_session != None:
if val == "CUDAExecutionProvider":
if self.settings.gpu < 0 or self.settings.gpu >= self.gpu_num:
self.settings.gpu = 0
provider_options=[{'device_id': self.settings.gpu}]
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=[val], provider_options=provider_options)
else:
self.onnx_session.set_providers(providers=[val])
@@ -123,7 +126,7 @@ class VoiceChanger():
providers = self.onnx_session.get_providers()
print("Providers:", providers)
if "CUDAExecutionProvider" in providers:
provider_options=[{'device_id': self.settings.gpu}]
provider_options = [{'device_id': self.settings.gpu}]
self.onnx_session.set_providers(providers=["CUDAExecutionProvider"], provider_options=provider_options)
if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
self.unpackedData_length = 0
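For orientation, the provider switch above uses the standard onnxruntime session API. A minimal sketch, assuming onnxruntime is installed; "model.onnx" and the device id 0 are placeholders, not values from this repo:

import onnxruntime

# Start the session on CPU; the model path is a placeholder.
session = onnxruntime.InferenceSession("model.onnx", providers=["CPUExecutionProvider"])

# Re-bind the session to CUDA and pin a device id, as update_setteings() does above.
session.set_providers(providers=["CUDAExecutionProvider"],
                      provider_options=[{"device_id": 0}])
print(session.get_providers())  # confirm which providers are now active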
@@ -136,7 +139,6 @@ class VoiceChanger():
return self.get_info()
def _generate_strength(self, unpackedData):
if self.unpackedData_length != unpackedData.shape[0] or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate or self.currentCrossFadeOverlapRate != self.settings.crossFadeOverlapRate:
@@ -148,12 +150,12 @@ class VoiceChanger():
overlapSize = int(unpackedData.shape[0] * self.settings.crossFadeOverlapRate)
cf_offset = int(overlapSize * self.settings.crossFadeOffsetRate)
cf_end = int(overlapSize * self.settings.crossFadeEndRate)
cf_end = int(overlapSize * self.settings.crossFadeEndRate)
cf_range = cf_end - cf_offset
percent = np.arange(cf_range) / cf_range
np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
np_cur_strength = np.cos((1-percent) * 0.5 * np.pi) ** 2
np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2
self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength, np.zeros(overlapSize - cf_offset - len(np_prev_strength))])
self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(overlapSize - cf_offset - len(np_cur_strength))])
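The strength curves computed above form an equal-power cross-fade: since cos((1 - p) * pi/2) equals sin(p * pi/2), the two squared weights sum to 1 across the fade region. A standalone sketch; overlapSize and the rates are illustrative values, not taken from a real session:

import numpy as np

overlapSize = 1024            # hypothetical overlap length
crossFadeOffsetRate = 0.1
crossFadeEndRate = 0.9

cf_offset = int(overlapSize * crossFadeOffsetRate)
cf_end = int(overlapSize * crossFadeEndRate)
cf_range = cf_end - cf_offset

percent = np.arange(cf_range) / cf_range
np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2        # fades out
np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2   # fades in

# The two weights sum to 1 everywhere inside the fade region.
assert np.allclose(np_prev_strength + np_cur_strength, 1.0)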
@@ -167,23 +169,23 @@ class VoiceChanger():
# print(f"target_len:{unpackedData.shape[0]}, prev_len:{len(self.prev_strength)} cur_len:{len(self.cur_strength)}")
# print("Prev", self.prev_strength)
# print("Cur", self.cur_strength)
# the size differs from the previous result, so clear the stored record.
if hasattr(self, 'prev_audio1') == True:
delattr(self,"prev_audio1")
delattr(self, "prev_audio1")
def _generate_input(self, unpackedData:any, convertSize:int):
def _generate_input(self, unpackedData: any, convertSize: int):
# shape the data to be converted this time into tensors
audio = torch.FloatTensor(unpackedData.astype(np.float32)) # create the tensor as float32
audio_norm = audio / self.hps.data.max_wav_value # normalize
audio_norm = audio_norm.unsqueeze(0) # unsqueeze
self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1) # concatenate with past data
audio_norm = self.audio_buffer[:, -convertSize:] # extract only the portion to be converted
audio = torch.FloatTensor(unpackedData.astype(np.float32)) # create the tensor as float32
audio_norm = audio / self.hps.data.max_wav_value # normalize
audio_norm = audio_norm.unsqueeze(0) # unsqueeze
self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1) # concatenate with past data
audio_norm = self.audio_buffer[:, -convertSize:] # extract only the portion to be converted
self.audio_buffer = audio_norm
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(self.settings.srcId)])
@@ -191,7 +193,6 @@ class VoiceChanger():
data = TextAudioSpeakerCollate()([data])
return data
def _onnx_inference(self, data, inputSize):
if hasattr(self, "onnx_session") == False or self.onnx_session == None:
print("[Voice Changer] No ONNX session.")
@@ -207,26 +208,26 @@ class VoiceChanger():
"lengths": spec_lengths.numpy(),
"sid_src": sid_src.numpy(),
"sid_tgt": sid_tgt1.numpy()
})[0][0,0] * self.hps.data.max_wav_value
})[0][0, 0] * self.hps.data.max_wav_value
if hasattr(self, 'np_prev_audio1') == True:
overlapSize = int(inputSize * self.settings.crossFadeOverlapRate)
prev_overlap = self.np_prev_audio1[-1*overlapSize:]
cur_overlap = audio1[-1*(inputSize + overlapSize) :-1*inputSize]
prev_overlap = self.np_prev_audio1[-1 * overlapSize:]
cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
# print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
# print(">>>>>>>>>>>", -1*(inputSize + overlapSize) , -1*inputSize)
powered_prev = prev_overlap * self.np_prev_strength
powered_cur = cur_overlap * self.np_cur_strength
powered_result = powered_prev + powered_cur
cur = audio1[-1*inputSize:-1*overlapSize]
result = np.concatenate([powered_result, cur],axis=0)
cur = audio1[-1 * inputSize:-1 * overlapSize]
result = np.concatenate([powered_result, cur], axis=0)
else:
result = np.zeros(1).astype(np.int16)
self.np_prev_audio1 = audio1
return result
def _pyTorch_inference(self, data, inputSize):
if hasattr(self, "net_g") == False or self.net_g ==None:
if hasattr(self, "net_g") == False or self.net_g == None:
print("[Voice Changer] No pyTorch session.")
return np.zeros(1).astype(np.int16)
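The overlap handling reformatted in the hunk above blends the tail of the previous chunk with the head of the current one using those strength curves. A hypothetical standalone helper for illustration; crossfade_chunk and its parameter names are not part of the repo:

import numpy as np

def crossfade_chunk(prev_audio, cur_audio, input_size, prev_strength, cur_strength, overlap_rate):
    # prev_strength / cur_strength correspond to np_prev_strength / np_cur_strength above,
    # each of length overlap_size.
    overlap_size = int(input_size * overlap_rate)
    prev_overlap = prev_audio[-overlap_size:]
    cur_overlap = cur_audio[-(input_size + overlap_size):-input_size]
    blended = prev_overlap * prev_strength + cur_overlap * cur_strength
    cur = cur_audio[-input_size:-overlap_size]  # raw part kept for this chunk
    return np.concatenate([blended, cur], axis=0)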
@@ -234,7 +235,8 @@ class VoiceChanger():
with torch.no_grad():
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([self.settings.dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value)
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src,
sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value)
if self.prev_strength.device != torch.device('cpu'):
print(f"prev_strength move from {self.prev_strength.device} to cpu")
@@ -243,19 +245,19 @@ class VoiceChanger():
print(f"cur_strength move from {self.cur_strength.device} to cpu")
self.cur_strength = self.cur_strength.cpu()
if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cpu'): # if prev_audio1 is not on the desired device, skip this round.
if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cpu'): # if prev_audio1 is not on the desired device, skip this round.
overlapSize = int(inputSize * self.settings.crossFadeOverlapRate)
prev_overlap = self.prev_audio1[-1*overlapSize:]
cur_overlap = audio1[-1*(inputSize + overlapSize) :-1*inputSize]
prev_overlap = self.prev_audio1[-1 * overlapSize:]
cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
powered_prev = prev_overlap * self.prev_strength
powered_cur = cur_overlap * self.cur_strength
powered_result = powered_prev + powered_cur
cur = audio1[-1*inputSize:-1*overlapSize] # raw part of this input (input minus the next crossfade portion).
result = torch.cat([powered_result, cur],axis=0) # join the crossfade with the raw part of this input
cur = audio1[-1 * inputSize:-1 * overlapSize] # raw part of this input (input minus the next crossfade portion).
result = torch.cat([powered_result, cur], axis=0) # join the crossfade with the raw part of this input
else:
cur = audio1[-2*inputSize:-1*inputSize]
cur = audio1[-2 * inputSize:-1 * inputSize]
result = cur
self.prev_audio1 = audio1
@@ -265,7 +267,8 @@ class VoiceChanger():
with torch.no_grad():
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(self.settings.gpu) for x in data]
sid_tgt1 = torch.LongTensor([self.settings.dstId]).cuda(self.settings.gpu)
audio1 = self.net_g.cuda(self.settings.gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value
audio1 = self.net_g.cuda(self.settings.gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src,
sid_tgt=sid_tgt1)[0, 0].data * self.hps.data.max_wav_value
if self.prev_strength.device != torch.device('cuda', self.settings.gpu):
print(f"prev_strength move from {self.prev_strength.device} to gpu{self.settings.gpu}")
@@ -274,33 +277,30 @@ class VoiceChanger():
print(f"cur_strength move from {self.cur_strength.device} to gpu{self.settings.gpu}")
self.cur_strength = self.cur_strength.cuda(self.settings.gpu)
if hasattr(self, 'prev_audio1') == True and self.prev_audio1.device == torch.device('cuda', self.settings.gpu):
overlapSize = int(inputSize * self.settings.crossFadeOverlapRate)
prev_overlap = self.prev_audio1[-1*overlapSize:]
cur_overlap = audio1[-1*(inputSize + overlapSize) :-1*inputSize]
prev_overlap = self.prev_audio1[-1 * overlapSize:]
cur_overlap = audio1[-1 * (inputSize + overlapSize):-1 * inputSize]
powered_prev = prev_overlap * self.prev_strength
powered_cur = cur_overlap * self.cur_strength
powered_result = powered_prev + powered_cur
cur = audio1[-1*inputSize:-1*overlapSize] # raw part of this input (input minus the next crossfade portion).
result = torch.cat([powered_result, cur],axis=0) # join the crossfade with the raw part of this input
cur = audio1[-1 * inputSize:-1 * overlapSize] # raw part of this input (input minus the next crossfade portion).
result = torch.cat([powered_result, cur], axis=0) # join the crossfade with the raw part of this input
else:
cur = audio1[-2*inputSize:-1*inputSize]
cur = audio1[-2 * inputSize:-1 * inputSize]
result = cur
self.prev_audio1 = audio1
result = result.cpu().float().numpy()
return result
def on_request(self, unpackedData:any):
convertSize = self.settings.convertChunkNum * 128 # 128sample/1chunk
def on_request(self, unpackedData: any):
convertSize = self.settings.convertChunkNum * 128 # 128sample/1chunk
if unpackedData.shape[0]*(1 + self.settings.crossFadeOverlapRate) + 1024 > convertSize:
convertSize = int(unpackedData.shape[0]*(1 + self.settings.crossFadeOverlapRate)) + 1024
if unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate) + 1024 > convertSize:
convertSize = int(unpackedData.shape[0] * (1 + self.settings.crossFadeOverlapRate)) + 1024
if convertSize < self.settings.minConvertSize:
convertSize = self.settings.minConvertSize
# print("convert Size", unpackedData.shape[0], unpackedData.shape[0]*(1 + self.settings.crossFadeOverlapRate), convertSize, self.settings.minConvertSize)
@@ -308,16 +308,14 @@ class VoiceChanger():
self._generate_strength(unpackedData)
data = self._generate_input(unpackedData, convertSize)
try:
if self.settings.framework == "ONNX":
result = self._onnx_inference(data, unpackedData.shape[0])
else:
result = self._pyTorch_inference(data, unpackedData.shape[0])
except Exception as e:
print("VC PROCESSING!!!! EXCEPTION!!!", e)
print("VC PROCESSING!!!! EXCEPTION!!!", e)
print(traceback.format_exc())
if hasattr(self, "np_prev_audio1"):
del self.np_prev_audio1
@@ -328,4 +326,3 @@ class VoiceChanger():
result = result.astype(np.int16)
# print("on_request result size:",result.shape)
return result