406 lines
16 KiB
Python
Raw Normal View History

2023-04-28 06:39:51 +09:00
from typing import Any, Union, cast
2023-05-07 04:18:18 +09:00
import socketio
2023-06-16 00:56:18 +09:00
from const import TMP_DIR, VoiceChangerType
2022-12-31 16:08:14 +09:00
import torch
2023-01-28 15:56:56 +09:00
import os
import traceback
2022-12-31 16:08:14 +09:00
import numpy as np
2023-04-11 00:21:17 +09:00
from dataclasses import dataclass, asdict, field
2023-02-18 20:53:15 +09:00
import resampy
2023-06-16 00:56:18 +09:00
from data.ModelSlot import loadSlotInfo
2023-02-11 00:59:44 +09:00
from voice_changer.IORecorder import IORecorder
2023-05-07 04:18:18 +09:00
from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device
from voice_changer.utils.Timer import Timer
2023-06-16 00:56:18 +09:00
from voice_changer.utils.VoiceChangerModel import AudioInOut
2023-05-03 13:14:00 +09:00
from Exceptions import (
DeviceCannotSupportHalfPrecisionException,
2023-05-04 17:15:53 +09:00
DeviceChangingException,
2023-05-03 13:14:00 +09:00
HalfPrecisionChangingException,
NoModeLoadedException,
2023-05-04 17:15:53 +09:00
NotEnoughDataExtimateF0,
2023-05-03 13:14:00 +09:00
ONNXInputArgumentException,
VoiceChangerIsNotSelectedException,
2023-05-03 13:14:00 +09:00
)
2023-04-27 23:38:25 +09:00
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
2023-06-16 00:56:18 +09:00
# import threading
# import time
# import sounddevice as sd
# import librosa
import json
2023-04-10 09:28:00 +09:00
# WAV dump targets used while recordIO is enabled (written through IORecorder).
STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
2023-02-12 12:25:57 +09:00
2023-06-16 00:56:18 +09:00
@dataclass
class SlotInfo:
    """Minimal slot descriptor; ``voiceChangerType`` is None when the slot holds no model."""

    # NOTE(review): slot metadata is actually loaded via data.ModelSlot.loadSlotInfo —
    # confirm whether this local dataclass is still referenced anywhere.
    voiceChangerType: VoiceChangerType | None = None
2023-01-08 16:18:20 +09:00
@dataclass
class VoiceChangerSettings:
    """Mutable runtime settings for the conversion pipeline.

    Fields are updated through ``VoiceChanger.update_settings``; the
    ``intData``/``floatData``/``strData`` lists declare which keys are mutable
    and how the incoming value must be coerced.
    """

    inputSampleRate: int = 48000  # 48000 or 24000

    # Crossfade window shape: offset/end are fractions of the crossfade region,
    # overlap size is in samples.
    crossFadeOffsetRate: float = 0.1
    crossFadeEndRate: float = 0.9
    crossFadeOverlapSize: int = 4096

    recordIO: int = 0  # 0:off, 1:on

    # Last measured [pre, main, post, total?] timings — exact slot meaning set by callers.
    performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])

    # ↓ only the mutable fields are enumerated below (translated from: mutableな物だけ列挙)
    intData: list[str] = field(
        default_factory=lambda: [
            "inputSampleRate",
            "crossFadeOverlapSize",
            "recordIO",
        ]
    )
    floatData: list[str] = field(
        default_factory=lambda: [
            "crossFadeOffsetRate",
            "crossFadeEndRate",
        ]
    )
    strData: list[str] = field(default_factory=lambda: [])
2023-01-08 16:18:20 +09:00
2023-01-28 15:56:56 +09:00
2023-05-09 18:59:36 +09:00
class VoiceChanger:
    """Realtime voice-conversion pipeline bound to one model slot.

    Resamples incoming audio to the model's processing rate, runs inference,
    and stitches consecutive blocks with a SOLA (synchronous overlap-add)
    crossfade so block boundaries do not click.
    """

    def __init__(self, params: VoiceChangerParams, slotIndex: int):
        """Initialize pipeline state and instantiate the model stored in ``slotIndex``.

        If the slot is empty or holds an unknown model type,
        ``self.voiceChangerModel`` stays None and every entry point degrades to
        a logged no-op / dummy output instead of raising.
        """
        self.settings = VoiceChangerSettings()
        self.onnx_session = None

        # Crossfade parameters currently baked into the strength curves; compared
        # against self.settings inside _generate_strength() to detect changes.
        self.currentCrossFadeOffsetRate = 0.0
        self.currentCrossFadeEndRate = 0.0
        self.currentCrossFadeOverlapSize = 0  # setting
        self.crossfadeSize = 0  # calculated

        self.voiceChangerModel = None
        self.modelType: VoiceChangerType | None = None
        self.params = params
        self.prev_audio = np.zeros(4096)
        self.ioRecorder: IORecorder | None = None
        # Tail of the previous output block, already weighted by np_prev_strength.
        self.sola_buffer: AudioInOut | None = None

        self.slotIndex = slotIndex
        self.slotInfo = loadSlotInfo(params.model_dir, self.slotIndex)
        if self.slotInfo.voiceChangerType is None:
            print(f"[Voice Changer] Voice Changer Type is not set for slot {slotIndex}.")
            return
        elif self.slotInfo.voiceChangerType == "RVC":
            # Imported lazily so constructing an empty/non-RVC slot does not pay
            # the heavy RVC import cost.
            from voice_changer.RVC.RVC import RVC

            self.voiceChangerModel = RVC(self.slotIndex, self.params)
        else:
            print(f"[Voice Changer] unknown voice changer type. {self.slotInfo.voiceChangerType}")

    def prepareModel(self):
        """Ask the underlying model to (re)load its weights; no-op when no model is selected."""
        # Fixed: previously dereferenced self.voiceChangerModel unconditionally,
        # crashing for empty slots.
        if self.voiceChangerModel is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChangerModel.prepareModel()

    def get_info(self):
        """Return the settings dict, merged with the active model's own info (if any)."""
        data = asdict(self.settings)
        if self.voiceChangerModel is not None:
            data.update(self.voiceChangerModel.get_info())
        return data

    def get_performance(self):
        """Return the last recorded per-stage timing list."""
        return self.settings.performance

    def update_settings(self, key: str, val: Any):
        """Apply one mutable setting; keys not owned here are forwarded to the model.

        ``val`` arrives untyped from the transport layer and is coerced according
        to which of the settings' intData/floatData/strData lists contains ``key``.
        """
        if self.voiceChangerModel is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return

        if key in self.settings.intData:
            newVal = int(val)  # compare the coerced value, not the raw payload
            setattr(self.settings, key, newVal)
            if key == "recordIO":
                # Any change of recordIO tears down the current recorder;
                # value 1 ("on") starts a fresh one at the current input rate.
                if self.ioRecorder is not None:
                    self.ioRecorder.close()
                    self.ioRecorder = None
                if newVal == 1:
                    self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate)
        elif key in self.settings.floatData:
            setattr(self.settings, key, float(val))
            if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
                # Force _generate_strength() to rebuild the crossfade curves.
                # (This reset previously sat in the intData branch where it could
                # never fire, since both keys live in floatData.)
                self.crossfadeSize = 0
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
        else:
            ret = self.voiceChangerModel.update_settings(key, val)
            if ret is False:
                # Unknown or immutable key; intentionally silent.
                pass

    def _generate_strength(self, crossfadeSize: int):
        """(Re)build the fade-out/fade-in envelopes when any crossfade parameter changed.

        ``np_prev_strength`` weights the previous block's tail, ``np_cur_strength``
        the current block's head; across the fade region the two cos²/sin² curves
        sum to 1, so overlap-adding them preserves amplitude.
        """
        if (
            self.crossfadeSize != crossfadeSize
            or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate
            or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate
            or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize
        ):
            self.crossfadeSize = crossfadeSize
            self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
            self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
            self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize

            cf_offset = int(crossfadeSize * self.settings.crossFadeOffsetRate)
            cf_end = int(crossfadeSize * self.settings.crossFadeEndRate)
            cf_range = cf_end - cf_offset
            percent = np.arange(cf_range) / cf_range

            np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
            np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2

            # Pad with flat 1/0 regions so each envelope spans the full crossfade.
            self.np_prev_strength = np.concatenate(
                [
                    np.ones(cf_offset),
                    np_prev_strength,
                    np.zeros(crossfadeSize - cf_offset - len(np_prev_strength)),
                ]
            )
            self.np_cur_strength = np.concatenate(
                [
                    np.zeros(cf_offset),
                    np_cur_strength,
                    np.ones(crossfadeSize - cf_offset - len(np_cur_strength)),
                ]
            )

            print(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")

            # The envelope length changed, so history recorded at the old size
            # can no longer be blended — discard it.
            if hasattr(self, "np_prev_audio1"):
                delattr(self, "np_prev_audio1")
            self.sola_buffer = None

    # receivedData: tuple of short
    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
        """Entry point for one audio block; delegates to the SOLA implementation."""
        return self.on_request_sola(receivedData)

    def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
        """Convert one block of audio and return ``(outputData, [pre, main, post] seconds)``.

        On any known failure a 1-sample int16 zero buffer and zeroed timings are
        returned so the realtime stream keeps flowing.
        """
        try:
            if self.voiceChangerModel is None:
                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")

            processing_sampling_rate = self.voiceChangerModel.get_processing_sampling_rate()

            # Pre-process: resample to the model rate and build the model input.
            with Timer("pre-process") as t:
                if self.settings.inputSampleRate != processing_sampling_rate:
                    newData = cast(
                        AudioInOut,
                        resampy.resample(
                            receivedData,
                            self.settings.inputSampleRate,
                            processing_sampling_rate,
                        ),
                    )
                else:
                    newData = receivedData

                # 12 ms search window for the SOLA alignment.
                sola_search_frame = int(0.012 * processing_sampling_rate)
                block_frame = newData.shape[0]
                crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
                self._generate_strength(crossfade_frame)

                data = self.voiceChangerModel.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
            preprocess_time = t.secs

            # Main process: inference plus SOLA stitching against the previous tail.
            with Timer("main-process") as t:
                audio = self.voiceChangerModel.inference(data)

                sola_offset = 0  # explicit default; was unbound on the warm-up pass
                if self.sola_buffer is not None:
                    np.set_printoptions(threshold=10000)  # NOTE(review): debug leftover
                    audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
                    audio = audio[audio_offset:]

                    # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
                    cor_nom = np.convolve(
                        audio[: crossfade_frame + sola_search_frame],
                        np.flip(self.sola_buffer),
                        "valid",
                    )
                    cor_den = np.sqrt(
                        np.convolve(
                            audio[: crossfade_frame + sola_search_frame] ** 2,
                            np.ones(crossfade_frame),
                            "valid",
                        )
                        + 1e-3
                    )
                    # Best alignment = maximum normalized cross-correlation.
                    sola_offset = int(np.argmax(cor_nom / cor_den))
                    sola_end = sola_offset + block_frame
                    output_wav = audio[sola_offset:sola_end].astype(np.float64)
                    output_wav[:crossfade_frame] *= self.np_cur_strength
                    output_wav[:crossfade_frame] += self.sola_buffer[:]
                    result = output_wav
                else:
                    print("[Voice Changer] warming up... generating sola buffer.")
                    result = np.zeros(4096).astype(np.int16)

                # Stash the (pre-weighted) tail for the next block's crossfade.
                if self.sola_buffer is not None and sola_offset < sola_search_frame:
                    offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
                    end = -1 * (sola_search_frame - sola_offset)
                    sola_buf_org = audio[offset:end]
                    self.sola_buffer = sola_buf_org * self.np_prev_strength
                else:
                    self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
            mainprocess_time = t.secs

            # Post-process: back to the client sample rate, size-match, optional recording.
            with Timer("post-process") as t:
                result = result.astype(np.int16)
                if self.settings.inputSampleRate != processing_sampling_rate:
                    outputData = cast(
                        AudioInOut,
                        resampy.resample(
                            result,
                            processing_sampling_rate,
                            self.settings.inputSampleRate,
                        ).astype(np.int16),
                    )
                else:
                    outputData = result

                print_convert_processing(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

                if receivedData.shape[0] != outputData.shape[0]:
                    # Resampling rounding can change the length; pad back to the input size.
                    outputData = pad_array(outputData, receivedData.shape[0])

                # Fixed: guard against a missing recorder (recordIO flag can be set
                # while ioRecorder failed to start or was torn down).
                if self.settings.recordIO == 1 and self.ioRecorder is not None:
                    self.ioRecorder.writeInput(receivedData)
                    self.ioRecorder.writeOutput(outputData.tobytes())
            postprocess_time = t.secs

            print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
            perf = [preprocess_time, mainprocess_time, postprocess_time]
            return outputData, perf
        except NoModeLoadedException as e:
            print("[Voice Changer] [Exception]", e)
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except ONNXInputArgumentException as e:
            print("[Voice Changer] [Exception] onnx are waiting valid input.", e)
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except HalfPrecisionChangingException:
            print("[Voice Changer] Switching model configuration....")
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except NotEnoughDataExtimateF0:
            print("[Voice Changer] warming up... waiting more data.")
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except DeviceChangingException as e:
            print("[Voice Changer] embedder:", e)
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except VoiceChangerIsNotSelectedException:
            print("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except DeviceCannotSupportHalfPrecisionException:
            # The fallback is handled inside RVC.py; just return dummy data here.
            return np.zeros(1).astype(np.int16), [0, 0, 0]
        except Exception as e:
            print("[Voice Changer] VC PROCESSING EXCEPTION!!!", e)
            print(traceback.format_exc())
            return np.zeros(1).astype(np.int16), [0, 0, 0]

    def export2onnx(self):
        """Export the active model to ONNX and return the exporter's result."""
        # Fixed: previously referenced the non-existent attribute ``self.voiceChanger``.
        if self.voiceChangerModel is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        return self.voiceChangerModel.export2onnx()

    ##############

    def merge_models(self, request: str):
        """Merge models per the JSON ``request`` and return the refreshed info dict."""
        # Fixed: previously referenced the non-existent attribute ``self.voiceChanger``.
        if self.voiceChangerModel is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChangerModel.merge_models(request)
        return self.get_info()

    def update_model_default(self):
        """Persist the current parameters as the model's defaults; return new info."""
        # Fixed: previously referenced the non-existent attribute ``self.voiceChanger``.
        if self.voiceChangerModel is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChangerModel.update_model_default()
        return self.get_info()

    def update_model_info(self, newData: str):
        """Update model metadata from ``newData`` and return the refreshed info dict."""
        # Fixed: previously referenced the non-existent attribute ``self.voiceChanger``.
        if self.voiceChangerModel is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChangerModel.update_model_info(newData)
        return self.get_info()

    def upload_model_assets(self, params: str):
        """Attach uploaded assets described by ``params`` and return the refreshed info dict."""
        # Fixed: previously referenced the non-existent attribute ``self.voiceChanger``.
        if self.voiceChangerModel is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChangerModel.upload_model_assets(params)
        return self.get_info()
2023-04-27 23:38:25 +09:00
2023-04-10 09:28:00 +09:00
# Debug switch for verbose per-block conversion tracing.
PRINT_CONVERT_PROCESSING: bool = False
# PRINT_CONVERT_PROCESSING = True


def print_convert_processing(mess: str):
    """Print a conversion-pipeline trace line when PRINT_CONVERT_PROCESSING is enabled."""
    if PRINT_CONVERT_PROCESSING:  # idiomatic truthiness check instead of `is True`
        print(mess)
def pad_array(arr: AudioInOut, target_length: int):
    """Center-pad ``arr`` to ``target_length`` samples by edge replication.

    Returns ``arr`` unchanged when it is already at least ``target_length`` long;
    otherwise the deficit is split between both ends (extra sample on the right).
    """
    deficit = target_length - arr.shape[0]
    if deficit <= 0:
        return arr
    left = deficit // 2
    # "edge" mode repeats the boundary samples, avoiding clicks that zero-padding causes.
    return np.pad(arr, (left, deficit - left), "edge")