voice-changer/server/voice_changer/VoiceChanger.py

224 lines
8.9 KiB
Python
Raw Normal View History

2023-03-08 03:48:50 +03:00
from const import TMP_DIR, getModelType
2022-12-31 10:08:14 +03:00
import torch
2023-01-28 09:56:56 +03:00
import os
import traceback
2022-12-31 10:08:14 +03:00
import numpy as np
2023-01-08 10:18:20 +03:00
from dataclasses import dataclass, asdict
2023-02-18 14:53:15 +03:00
import resampy
2023-02-10 18:59:44 +03:00
from voice_changer.IORecorder import IORecorder
from voice_changer.IOAnalyzer import IOAnalyzer
2023-02-20 22:07:43 +03:00
import time
2022-12-31 10:08:14 +03:00
2023-01-28 09:56:56 +03:00
# Execution provider names, in the order listed here.
# NOTE(review): these look like ONNX Runtime provider identifiers; nothing in
# this part of the file reads the list - confirm the consumer.
providers = [
    "OpenVINOExecutionProvider",
    "CUDAExecutionProvider",
    "DmlExecutionProvider",
    "CPUExecutionProvider",
]

# Files under TMP_DIR used when recording / analysing the audio stream.
STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
STREAM_ANALYZE_FILE_DIO = os.path.join(TMP_DIR, "analyze-dio.png")
STREAM_ANALYZE_FILE_HARVEST = os.path.join(TMP_DIR, "analyze-harvest.png")
2023-02-12 06:25:57 +03:00
2023-01-08 10:18:20 +03:00
@dataclass
class VocieChangerSettings():
    """Mutable runtime settings shared by every voice-changer model.

    The five annotated attributes are dataclass fields (they appear in
    ``asdict``). ``intData`` / ``floatData`` / ``strData`` are plain class
    attributes naming which fields may be updated at runtime and which type
    the incoming value is converted to.
    """

    inputSampleRate: int = 24000  # 48000 or 24000
    crossFadeOffsetRate: float = 0.1
    crossFadeEndRate: float = 0.9
    crossFadeOverlapSize: int = 4096
    # 0:off, 1:on (VoiceChanger.update_setteings additionally treats 2 as
    # "analyze the recorded input").
    recordIO: int = 0

    # List only the mutable items below.
    intData = ["inputSampleRate", "crossFadeOverlapSize", "recordIO"]
    floatData = ["crossFadeOffsetRate", "crossFadeEndRate"]
    strData = []
2023-01-08 10:18:20 +03:00
2023-01-28 09:56:56 +03:00
2022-12-31 10:08:14 +03:00
class VoiceChanger():
    """Front end that feeds streamed audio chunks through a model-specific changer.

    The model implementation (``self.voiceChanger``) is chosen in ``__init__``
    from ``getModelType()``. This class adds sample-rate conversion,
    cross-fading between consecutive chunks and optional I/O recording.
    """

    def __init__(self):
        # Initialisation of the settings and of the cross-fade cache keys.
        self.settings = VocieChangerSettings()
        self.unpackedData_length = 0
        self.onnx_session = None
        self.currentCrossFadeOffsetRate = 0
        self.currentCrossFadeEndRate = 0
        self.currentCrossFadeOverlapSize = 0

        modelType = getModelType()
        print("[VoiceChanger] activate model type:", modelType)
        # The model module is imported inside the branch, so only the
        # selected implementation is imported.
        if modelType == "MMVCv15":
            from voice_changer.MMVCv15.MMVCv15 import MMVCv15
            self.voiceChanger = MMVCv15()
        else:
            from voice_changer.MMVCv13.MMVCv13 import MMVCv13
            self.voiceChanger = MMVCv13()

        self.gpu_num = torch.cuda.device_count()
        self.prev_audio = np.zeros(1)
        self.mps_enabled = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()

        print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")

    def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
        """Forward the model files to the selected model implementation and return its result."""
        return self.voiceChanger.loadModel(config, pyTorch_model_file, onnx_model_file)

    def get_info(self):
        """Return the settings as a dict, updated with the model implementation's own info."""
        data = asdict(self.settings)
        data.update(self.voiceChanger.get_info())
        return data

    def update_setteings(self, key: str, val: any):
        """Update one setting and return the refreshed ``get_info()`` dict.

        Keys listed in ``settings.intData`` / ``floatData`` / ``strData`` are
        converted to that type and stored on ``self.settings``. Any other key
        is delegated to the model implementation's ``update_setteings``.
        """
        if key in self.settings.intData:
            intVal = int(val)
            setattr(self.settings, key, intVal)
            if key == "recordIO":
                # Dispatch on the converted value: comparing the raw ``val``
                # missed non-int inputs such as "1", leaving recordIO == 1
                # stored without a recorder having been created.
                self._update_record_io(intVal)
        elif key in self.settings.floatData:
            # No explicit cache reset is needed for the cross-fade rates:
            # _generate_strength compares them with the values it last used.
            setattr(self.settings, key, float(val))
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
        else:
            ret = self.voiceChanger.update_setteings(key, val)
            if ret == False:
                print(f"{key} is not mutalbe variable or unknown variable!")

        return self.get_info()

    def _update_record_io(self, mode: int):
        """Apply a recordIO change: 0 stops recording, 1 (re)starts it, 2 analyzes the recorded input."""
        if mode not in (0, 1, 2):
            return

        # Every known mode first closes the recorder that is currently held.
        if hasattr(self, "ioRecorder"):
            self.ioRecorder.close()

        if mode == 1:
            self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate)
        elif mode == 2:
            if not hasattr(self, "ioAnalyzer"):
                self.ioAnalyzer = IOAnalyzer()
            try:
                self.ioAnalyzer.analyze(STREAM_INPUT_FILE, STREAM_ANALYZE_FILE_DIO, STREAM_ANALYZE_FILE_HARVEST, self.settings.inputSampleRate)
            except Exception as e:
                # Best effort: a failed analysis is reported, not raised.
                print("recordIO exception", e)

    def _generate_strength(self, dataLength: int):
        """Rebuild the cross-fade weight arrays when the chunk size or a cross-fade setting changed.

        Sets ``self.np_prev_strength`` and ``self.np_cur_strength``, each of
        length ``min(crossFadeOverlapSize, dataLength)``, and drops the stored
        previous audio. Does nothing when no relevant value changed.
        """
        unchanged = (
            self.unpackedData_length == dataLength
            and self.currentCrossFadeOffsetRate == self.settings.crossFadeOffsetRate
            and self.currentCrossFadeEndRate == self.settings.crossFadeEndRate
            and self.currentCrossFadeOverlapSize == self.settings.crossFadeOverlapSize
        )
        if unchanged:
            return

        self.unpackedData_length = dataLength
        self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
        self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
        self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize

        overlapSize = min(self.settings.crossFadeOverlapSize, self.unpackedData_length)
        cf_offset = int(overlapSize * self.settings.crossFadeOffsetRate)
        cf_end = int(overlapSize * self.settings.crossFadeEndRate)
        cf_range = cf_end - cf_offset
        # When cf_range <= 0 np.arange yields an empty array, so the division
        # below has no elements to divide.
        percent = np.arange(cf_range) / cf_range

        # cos^2 ramps: the two weights sum to 1 at every position.
        np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
        np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2

        # Before the ramp only the previous chunk is used, after it only the current one.
        self.np_prev_strength = np.concatenate([np.ones(cf_offset), np_prev_strength, np.zeros(overlapSize - cf_offset - len(np_prev_strength))])
        self.np_cur_strength = np.concatenate([np.zeros(cf_offset), np_cur_strength, np.ones(overlapSize - cf_offset - len(np_cur_strength))])

        print("Generated Strengths")

        # The size differs from the previous result, so discard the stored record.
        if hasattr(self, 'np_prev_audio1'):
            delattr(self, "np_prev_audio1")

    def _cross_fade(self, audio, inputSize: int):
        """Blend the tail of the previously converted audio into the current conversion.

        Returns the overlap region (previous tail mixed with the current
        audio using the prepared strengths) followed by the non-overlapping
        part of the current audio.
        """
        overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
        prevLength = len(self.np_prev_audio1)
        audioLength = len(audio)

        # Clamped non-negative bounds instead of negative indices: slices such
        # as [-0:] or [a:-0] select the wrong span when overlapSize is 0.
        overlapStart = max(audioLength - inputSize - overlapSize, 0)
        curStart = max(audioLength - inputSize, 0)
        curEnd = max(audioLength - overlapSize, 0)

        prev_overlap = self.np_prev_audio1[max(prevLength - overlapSize, 0):]
        cur_overlap = audio[overlapStart:curStart]
        powered_prev = prev_overlap * self.np_prev_strength
        powered_cur = cur_overlap * self.np_cur_strength
        powered_result = powered_prev + powered_cur

        cur = audio[curStart:curEnd]
        return np.concatenate([powered_result, cur], axis=0)

    # receivedData: tuple of short
    # NOTE(review): the code reads ``.shape`` from the data, so an array-like
    # is expected in practice - confirm against the caller.
    def on_request(self, receivedData: any):
        """Convert one chunk of audio.

        Returns ``(result, perf)``: ``result`` is an int16 array and ``perf``
        is ``[pre, main, post]`` processing times in seconds. The first chunk
        after (re)generating the strengths yields a single zero sample, and a
        failure during conversion yields a single zero sample with
        ``[0, 0, 0]``.
        """
        # Pre-process
        with Timer("pre-process") as t:
            # The model implementation is fed 24000 Hz data.
            if self.settings.inputSampleRate != 24000:
                newData = resampy.resample(receivedData, self.settings.inputSampleRate, 24000)
            else:
                newData = receivedData

            inputSize = newData.shape[0]
            convertSize = inputSize + min(self.settings.crossFadeOverlapSize, inputSize)
            if convertSize < 8192:
                convertSize = 8192
            if convertSize % 128 != 0:
                # Pad up to a multiple of 128 because truncation occurs at
                # the hop size of the model output.
                convertSize = convertSize + (128 - (convertSize % 128))

            self._generate_strength(inputSize)
            data = self.voiceChanger.generate_input(newData, convertSize)
        preprocess_time = t.secs

        # Conversion
        with Timer("main-process") as t:
            try:
                # Inference
                audio = self.voiceChanger.inference(data)

                # CrossFade
                if hasattr(self, 'np_prev_audio1'):
                    result = self._cross_fade(audio, inputSize)
                else:
                    # No previous chunk to blend with yet.
                    result = np.zeros(1).astype(np.int16)
                self.np_prev_audio1 = audio
            except Exception as e:
                print("VC PROCESSING!!!! EXCEPTION!!!", e)
                print(traceback.format_exc())
                # Drop the stored chunk so the next request starts cleanly.
                if hasattr(self, "np_prev_audio1"):
                    del self.np_prev_audio1
                return np.zeros(1).astype(np.int16), [0, 0, 0]
        mainprocess_time = t.secs

        # Post-process
        with Timer("post-process") as t:
            result = result.astype(np.int16)
            if self.settings.inputSampleRate != 24000:
                result = resampy.resample(result, 24000, self.settings.inputSampleRate).astype(np.int16)

            # Guarded: recordIO may be 1 without a recorder having been created.
            if self.settings.recordIO == 1 and hasattr(self, "ioRecorder"):
                self.ioRecorder.writeInput(receivedData)
                self.ioRecorder.writeOutput(result.tobytes())
        postprocess_time = t.secs

        perf = [preprocess_time, mainprocess_time, postprocess_time]
        return result, perf
##############
class Timer(object):
    """Context manager that measures the wall-clock time spent in its ``with`` body.

    After the block exits, ``secs`` holds the elapsed seconds and ``msecs``
    the same duration in milliseconds. Exceptions raised in the body are not
    suppressed.
    """

    def __init__(self, title: str):
        self.title = title

    def __enter__(self):
        self.start = time.time()
        # Defined up front so the attributes can be read inside the block
        # without raising AttributeError.
        self.secs = 0.0
        self.msecs = 0.0
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start
        self.msecs = self.secs * 1000  # millisecs