2023-04-28 00:39:51 +03:00
|
|
|
from typing import Any, Union, cast
|
2023-05-06 22:18:18 +03:00
|
|
|
|
|
|
|
import socketio
|
2023-04-10 18:21:17 +03:00
|
|
|
from const import TMP_DIR, ModelType
|
2022-12-31 10:08:14 +03:00
|
|
|
import torch
|
2023-01-28 09:56:56 +03:00
|
|
|
import os
|
|
|
|
import traceback
|
2022-12-31 10:08:14 +03:00
|
|
|
import numpy as np
|
2023-04-10 18:21:17 +03:00
|
|
|
from dataclasses import dataclass, asdict, field
|
2023-02-18 14:53:15 +03:00
|
|
|
import resampy
|
2023-01-14 00:44:30 +03:00
|
|
|
|
2023-02-10 18:59:44 +03:00
|
|
|
|
2023-03-07 16:30:48 +03:00
|
|
|
from voice_changer.IORecorder import IORecorder
|
2023-05-06 22:18:18 +03:00
|
|
|
from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device
|
2023-04-28 00:39:51 +03:00
|
|
|
from voice_changer.utils.LoadModelParams import LoadModelParams
|
2023-03-07 16:30:48 +03:00
|
|
|
|
2023-04-12 19:13:25 +03:00
|
|
|
from voice_changer.utils.Timer import Timer
|
|
|
|
from voice_changer.utils.VoiceChangerModel import VoiceChangerModel, AudioInOut
|
2023-05-03 07:14:00 +03:00
|
|
|
from Exceptions import (
|
2023-05-04 11:15:53 +03:00
|
|
|
DeviceChangingException,
|
2023-05-03 07:14:00 +03:00
|
|
|
HalfPrecisionChangingException,
|
|
|
|
NoModeLoadedException,
|
2023-05-04 11:15:53 +03:00
|
|
|
NotEnoughDataExtimateF0,
|
2023-05-03 07:14:00 +03:00
|
|
|
ONNXInputArgumentException,
|
|
|
|
)
|
2023-04-27 17:38:25 +03:00
|
|
|
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
|
2023-05-06 22:18:18 +03:00
|
|
|
import threading
|
|
|
|
import time
|
2023-05-09 12:59:36 +03:00
|
|
|
import sounddevice as sd
|
|
|
|
import librosa
|
2023-04-10 03:28:00 +03:00
|
|
|
|
2023-03-07 16:30:48 +03:00
|
|
|
# Locations under TMP_DIR of the "in.wav" / "out.wav" files; both paths are
# handed to IORecorder when the "recordIO" setting is switched on.
STREAM_INPUT_FILE, STREAM_OUTPUT_FILE = (
    os.path.join(TMP_DIR, wav_name) for wav_name in ("in.wav", "out.wav")
)
|
2023-02-12 06:25:57 +03:00
|
|
|
|
|
|
|
|
2023-01-08 10:18:20 +03:00
|
|
|
@dataclass
class VoiceChangerSettings:
    """Settings shared by the voice changer and its server-side audio loop.

    ``intData``, ``floatData`` and ``strData`` hold the names of the fields
    that may be changed at runtime, grouped by the type the incoming value is
    converted to.
    """

    inputSampleRate: int = 48000  # 48000 or 24000

    # Cross-fade configuration.
    crossFadeOffsetRate: float = 0.1
    crossFadeEndRate: float = 0.9
    crossFadeOverlapSize: int = 4096

    recordIO: int = 0  # 0:off, 1:on

    # Server-side audio devices and stream parameters.
    serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=list)
    serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=list)

    enableServerAudio: int = 0  # 0:off, 1:on
    serverAudioStated: int = 0  # 0:off, 1:on
    serverInputAudioSampleRate: int = 44100
    serverOutputAudioSampleRate: int = 44100
    serverInputDeviceId: int = -1
    serverOutputDeviceId: int = -1
    serverReadChunkSize: int = 256
    serverInputAudioGain: float = 1.0
    serverOutputAudioGain: float = 1.0
    performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])

    # Only the mutable settings are listed below.
    intData: list[str] = field(
        default_factory=lambda: [
            "inputSampleRate",
            "crossFadeOverlapSize",
            "recordIO",
            "enableServerAudio",
            "serverAudioStated",
            "serverInputAudioSampleRate",
            "serverOutputAudioSampleRate",
            "serverInputDeviceId",
            "serverOutputDeviceId",
            "serverReadChunkSize",
        ]
    )
    floatData: list[str] = field(
        default_factory=lambda: [
            "crossFadeOffsetRate",
            "crossFadeEndRate",
            "serverInputAudioGain",
            "serverOutputAudioGain",
        ]
    )
    strData: list[str] = field(default_factory=list)
|
2023-01-08 10:18:20 +03:00
|
|
|
|
2023-01-28 09:56:56 +03:00
|
|
|
|
2023-05-09 12:59:36 +03:00
|
|
|
class VoiceChanger:
    # Class-level default; __init__ rebinds `settings` on every instance.
    settings: VoiceChangerSettings = VoiceChangerSettings()
    # Active model implementation; None until switchModelType() creates one.
    voiceChanger: VoiceChangerModel | None = None
    # Declared without a value: created by update_settings() when "recordIO"
    # is set to 1, so code must not assume the attribute exists.
    ioRecorder: IORecorder
    # Declared without a value: assigned by on_request_sola() and deleted by
    # _generate_strength() whenever the cross-fade parameters change.
    sola_buffer: AudioInOut
    namespace: socketio.AsyncNamespace | None = None

    localPerformanceShowTime = 0.0

    # Optional callable; audio_callback() calls it with the timing list of
    # each processed block when it is not None.
    emitTo = None
|
|
|
|
|
2023-05-09 12:59:36 +03:00
|
|
|
def audio_callback(
    self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
):
    """Convert one block of server-side audio; used as the ``sd.Stream`` callback.

    The input block is scaled by ``settings.serverInputAudioGain``, down-mixed
    to mono, rescaled by 32768 and passed to ``on_request``.  The converted
    mono signal is repeated on every output channel, divided by 32768 and
    scaled by ``settings.serverOutputAudioGain``.  The timing figures are
    forwarded to ``emitTo`` (when set) and stored in milliseconds in
    ``settings.performance``.

    Args:
        indata: Captured block; presumably (frames, channels), as it is
            transposed before the mono down-mix.
        outdata: Output buffer, written in place; its second axis is used as
            the channel count.
        frames: Unused.
        times: Unused.
        status: Unused.

    Exceptions never leave the callback: they are printed, and if the output
    buffer had not been written yet it is filled with silence.
    """
    output_written = False
    try:
        indata = indata * self.settings.serverInputAudioGain
        with Timer("all_inference_time") as t:
            unpackedData = librosa.to_mono(indata.T) * 32768.0
            out_wav, stage_times = self.on_request(unpackedData)
            output_channels = outdata.shape[1]
            outdata[:] = (
                np.repeat(out_wav, output_channels).reshape(-1, output_channels)
                / 32768.0
            )
            outdata[:] = outdata * self.settings.serverOutputAudioGain
            output_written = True
        all_inference_time = t.secs
        performance = [all_inference_time] + stage_times
        if self.emitTo is not None:
            self.emitTo(performance)
        self.settings.performance = [round(x * 1000) for x in performance]
    except Exception as e:
        if not output_written:
            # NOTE: per the sounddevice documentation the output buffer starts
            # out uninitialised, so a failed conversion must emit silence
            # instead of leaving whatever happens to be in the buffer.
            outdata.fill(0)
        print("[Voice Changer] ex:", e)
|
|
|
|
|
|
|
|
def getServerAudioDevice(
    self, audioDeviceList: list[ServerAudioDevice], index: int
):
    """Return the first device of ``audioDeviceList`` whose ``index`` attribute equals ``index``.

    Returns None when no device matches.
    """
    for device in audioDeviceList:
        if device.index == index:
            return device
    return None
|
2023-05-08 23:04:34 +03:00
|
|
|
|
2023-05-09 12:59:36 +03:00
|
|
|
def serverLocal(self, _vc):
    """Run the server-side audio loop; this method never returns.

    ``__init__`` starts it on a thread with the instance itself as ``_vc``.
    While server audio is stopped, no input device is chosen or no model is
    selected, the loop idles in 2 second steps.  Otherwise it opens a duplex
    ``sd.Stream`` driven by ``audio_callback`` and keeps it open until the
    start flag, one of the device ids, the model's processing sampling rate or
    the read chunk size changes; the stream is then closed and set up again.

    Args:
        _vc: The VoiceChanger whose settings and model are polled.
    """
    vc: VoiceChanger = _vc

    currentInputDeviceId = -1
    currentModelSamplingRate = -1
    currentOutputDeviceId = -1
    currentInputChunkNum = -1
    while True:
        if (
            vc.settings.serverAudioStated == 0
            or vc.settings.serverInputDeviceId == -1
            or vc.voiceChanger is None
        ):
            # Nothing to stream: put the input rate back to 48000 and poll again.
            vc.settings.inputSampleRate = 48000
            time.sleep(2)
        else:
            # Re-initialise the sounddevice backend before every (re)open.
            sd._terminate()
            sd._initialize()

            sd.default.device[0] = vc.settings.serverInputDeviceId
            currentInputDeviceId = vc.settings.serverInputDeviceId
            sd.default.device[1] = vc.settings.serverOutputDeviceId
            currentOutputDeviceId = vc.settings.serverOutputDeviceId

            serverInputAudioDevice = self.getServerAudioDevice(
                vc.settings.serverAudioInputDevices, currentInputDeviceId
            )
            serverOutputAudioDevice = self.getServerAudioDevice(
                vc.settings.serverAudioOutputDevices, currentOutputDeviceId
            )
            print(serverInputAudioDevice, serverOutputAudioDevice)
            if serverInputAudioDevice is None or serverOutputAudioDevice is None:
                time.sleep(2)
                print("serverInputAudioDevice or serverOutputAudioDevice is None")
                continue

            currentInputChannelNum = serverInputAudioDevice.maxInputChannels
            currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels

            currentInputChunkNum = vc.settings.serverReadChunkSize
            block_frame = currentInputChunkNum * 128

            # sample rate precheck(alsa cannot use 40000?)
            try:
                currentModelSamplingRate = (
                    self.voiceChanger.get_processing_sampling_rate()
                )
            except Exception as e:
                print("[Voice Changer] ex: get_processing_sampling_rate", e)
                # Back off like the other failure paths; without the sleep this
                # branch retries immediately and spins on the audio backend.
                time.sleep(2)
                continue
            try:
                # Open and immediately close a stream at the model's rate to
                # find out whether the device accepts it.
                with sd.Stream(
                    callback=self.audio_callback,
                    blocksize=block_frame,
                    samplerate=currentModelSamplingRate,
                    dtype="float32",
                    channels=[currentInputChannelNum, currentOutputChannelNum],
                ):
                    pass
                vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
                vc.settings.inputSampleRate = currentModelSamplingRate
                print(
                    f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}"
                )
            except Exception as e:
                print(
                    "[Voice Changer] ex: fallback to device default samplerate",
                    e,
                )
                vc.settings.serverInputAudioSampleRate = (
                    serverInputAudioDevice.default_samplerate
                )
                vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate

            # main loop
            try:
                with sd.Stream(
                    callback=self.audio_callback,
                    blocksize=block_frame,
                    samplerate=vc.settings.serverInputAudioSampleRate,
                    dtype="float32",
                    channels=[currentInputChannelNum, currentOutputChannelNum],
                ):
                    # Keep the stream open for as long as nothing it was
                    # configured from has changed.
                    while (
                        vc.settings.serverAudioStated == 1
                        and currentInputDeviceId == vc.settings.serverInputDeviceId
                        and currentOutputDeviceId
                        == vc.settings.serverOutputDeviceId
                        and currentModelSamplingRate
                        == self.voiceChanger.get_processing_sampling_rate()
                        and currentInputChunkNum == vc.settings.serverReadChunkSize
                    ):
                        time.sleep(2)
                        print(
                            "[Voice Changer] server audio",
                            self.settings.performance,
                        )
                        print(
                            "[Voice Changer] info:",
                            vc.settings.serverAudioStated,
                            currentInputDeviceId,
                            currentOutputDeviceId,
                            vc.settings.serverInputAudioSampleRate,
                            currentInputChunkNum,
                        )

            except Exception as e:
                print("[Voice Changer] ex:", e)
                time.sleep(2)
|
2023-01-08 10:18:20 +03:00
|
|
|
|
2023-04-27 17:38:25 +03:00
|
|
|
def __init__(self, params: VoiceChangerParams):
    """Set up default state, list the audio devices and start the server audio thread.

    Args:
        params: Kept as ``self.params``; ``switchModelType`` passes it to the
            model classes that take parameters.
    """
    # Initial state
    self.settings = VoiceChangerSettings()
    self.onnx_session = None

    # Cross-fade parameters the current strength curves were generated for.
    self.currentCrossFadeOffsetRate = 0.0
    self.currentCrossFadeEndRate = 0.0
    self.currentCrossFadeOverlapSize = 0  # setting
    self.crossfadeSize = 0  # calculated

    self.voiceChanger = None
    self.modelType: ModelType | None = None
    self.params = params
    self.gpu_num = torch.cuda.device_count()
    self.prev_audio = np.zeros(4096)

    # torch.backends.mps is looked up defensively before asking for availability.
    mps_backend = getattr(torch.backends, "mps", None)
    self.mps_enabled: bool = mps_backend is not None and mps_backend.is_available()

    (
        self.settings.serverAudioInputDevices,
        self.settings.serverAudioOutputDevices,
    ) = list_audio_device()

    # The server audio loop runs on its own thread for the life of the process.
    threading.Thread(target=self.serverLocal, args=(self,)).start()
    print(
        f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})"
    )
|
2023-04-10 18:21:17 +03:00
|
|
|
|
|
|
|
def switchModelType(self, modelType: ModelType):
    """Replace the active model implementation with the one for ``modelType``.

    The implementation module is imported lazily, so only the selected model's
    dependencies are loaded.  Unrecognised types fall back to MMVCv13.

    Args:
        modelType: Name of the model family to activate.

    Returns:
        ``{"status": "OK", "msg": ...}`` when the implementation was created,
        ``{"status": "NG", "msg": ...}`` when importing or constructing it
        failed; in that case ``self.voiceChanger`` is left as None.
    """
    # Drop the previous implementation first so a failed switch never leaves
    # a stale model active.
    self.voiceChanger = None
    self.modelType = modelType
    try:
        if self.modelType == "MMVCv15":
            from voice_changer.MMVCv15.MMVCv15 import MMVCv15

            self.voiceChanger = MMVCv15()  # type: ignore
        elif self.modelType == "MMVCv13":
            from voice_changer.MMVCv13.MMVCv13 import MMVCv13

            self.voiceChanger = MMVCv13()
        elif self.modelType == "so-vits-svc-40v2":
            from voice_changer.SoVitsSvc40v2.SoVitsSvc40v2 import SoVitsSvc40v2

            self.voiceChanger = SoVitsSvc40v2(self.params)
        elif (
            self.modelType == "so-vits-svc-40"
            or self.modelType == "so-vits-svc-40_c"
        ):
            from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40

            self.voiceChanger = SoVitsSvc40(self.params)
        elif self.modelType == "DDSP-SVC":
            from voice_changer.DDSP_SVC.DDSP_SVC import DDSP_SVC

            self.voiceChanger = DDSP_SVC(self.params)
        elif self.modelType == "RVC":
            from voice_changer.RVC.RVC import RVC

            self.voiceChanger = RVC(self.params)
        else:
            from voice_changer.MMVCv13.MMVCv13 import MMVCv13

            self.voiceChanger = MMVCv13()
    except Exception as e:
        print(e)
        print(traceback.format_exc())
        # Report the failure instead of claiming success with no model loaded.
        return {"status": "NG", "msg": f"vc is not switched. {e}"}
    return {"status": "OK", "msg": "vc is switched."}
|
2022-12-31 10:08:14 +03:00
|
|
|
|
2023-04-10 18:21:17 +03:00
|
|
|
def getModelType(self):
    """Report the selected model type, using "none" when nothing is selected."""
    selected = "none" if self.modelType is None else self.modelType
    return {"status": "OK", "vc": selected}
|
2023-01-04 20:28:36 +03:00
|
|
|
|
2023-04-28 00:39:51 +03:00
|
|
|
def loadModel(self, props: LoadModelParams):
    """Load a model through the selected implementation.

    Args:
        props: Passed unchanged to the implementation's ``loadModel``.

    Returns:
        The implementation's result, or ``{"status": "NG"}`` when no
        implementation is selected or loading raised.
    """
    try:
        if self.voiceChanger is None:
            raise RuntimeError("Voice Changer is not selected.")
        loaded = self.voiceChanger.loadModel(props)
    except Exception as e:
        print(traceback.format_exc())
        print("[Voice Changer] Model Load Error! Check your model is valid.", e)
        return {"status": "NG"}
    else:
        return loaded
|
2022-12-31 10:08:14 +03:00
|
|
|
|
2023-01-07 18:25:21 +03:00
|
|
|
def get_info(self):
    """Return the settings as a dict, merged with the active model's info when one is selected."""
    info = asdict(self.settings)
    if self.voiceChanger is None:
        return info
    info.update(self.voiceChanger.get_info())
    return info
|
|
|
|
|
2023-05-06 22:18:18 +03:00
|
|
|
def get_performance(self):
    """Return the timing list currently stored in ``settings.performance``."""
    current_settings = self.settings
    return current_settings.performance
|
|
|
|
|
2023-04-10 03:28:00 +03:00
|
|
|
def update_settings(self, key: str, val: Any):
    """Apply one setting and return the refreshed info dict.

    Keys listed in ``settings.intData`` / ``floatData`` / ``strData`` are
    converted to that type and stored on ``self.settings``; any other key is
    forwarded to the selected model's ``update_settings``.  Nothing is changed
    while no model is selected.

    Side effects:
        * ``recordIO``: 0, 1 and 2 all close the recorder currently in use;
          1 additionally opens a new ``IORecorder``.
        * ``crossFadeOffsetRate`` / ``crossFadeEndRate``: ``crossfadeSize`` is
          reset to 0 so the cross-fade curves are regenerated.

    Args:
        key: Name of the setting.
        val: New value; converted with ``int`` / ``float`` / ``str`` for the
            settings handled here.

    Returns:
        The result of ``get_info()``.
    """
    if self.voiceChanger is None:
        print("[Voice Changer] Voice Changer is not selected.")
        return self.get_info()

    if key in self.settings.intData:
        int_val = int(val)
        setattr(self.settings, key, int_val)
        if key == "recordIO":
            # Decide on the converted value that was just stored, so "1" and 1
            # behave the same.
            if int_val in (0, 1, 2) and hasattr(self, "ioRecorder"):
                self.ioRecorder.close()
            if int_val == 1:
                self.ioRecorder = IORecorder(
                    STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate
                )
    elif key in self.settings.floatData:
        setattr(self.settings, key, float(val))
        # The cross-fade rates are float settings, so the reset belongs here;
        # under the int branch it could never run.
        if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
            self.crossfadeSize = 0
    elif key in self.settings.strData:
        setattr(self.settings, key, str(val))
    else:
        ret = self.voiceChanger.update_settings(key, val)
        if ret is False:
            print(f"({key} is not mutable variable or unknown variable)")
    return self.get_info()
|
2023-01-08 10:18:20 +03:00
|
|
|
|
2023-03-12 20:06:39 +03:00
|
|
|
def _generate_strength(self, crossfadeSize: int):
|
2023-04-27 17:38:25 +03:00
|
|
|
if (
|
|
|
|
self.crossfadeSize != crossfadeSize
|
|
|
|
or self.currentCrossFadeOffsetRate != self.settings.crossFadeOffsetRate
|
|
|
|
or self.currentCrossFadeEndRate != self.settings.crossFadeEndRate
|
|
|
|
or self.currentCrossFadeOverlapSize != self.settings.crossFadeOverlapSize
|
|
|
|
):
|
2023-03-12 20:06:39 +03:00
|
|
|
self.crossfadeSize = crossfadeSize
|
2023-01-10 18:59:09 +03:00
|
|
|
self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
|
|
|
|
self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
|
2023-02-19 00:25:22 +03:00
|
|
|
self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize
|
2023-01-11 19:05:38 +03:00
|
|
|
|
2023-03-12 20:06:39 +03:00
|
|
|
cf_offset = int(crossfadeSize * self.settings.crossFadeOffsetRate)
|
|
|
|
cf_end = int(crossfadeSize * self.settings.crossFadeEndRate)
|
2023-01-04 20:28:36 +03:00
|
|
|
cf_range = cf_end - cf_offset
|
|
|
|
percent = np.arange(cf_range) / cf_range
|
|
|
|
|
2023-01-28 09:56:56 +03:00
|
|
|
np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
|
|
|
|
np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2
|
2023-01-04 20:28:36 +03:00
|
|
|
|
2023-04-27 17:38:25 +03:00
|
|
|
self.np_prev_strength = np.concatenate(
|
|
|
|
[
|
|
|
|
np.ones(cf_offset),
|
|
|
|
np_prev_strength,
|
|
|
|
np.zeros(crossfadeSize - cf_offset - len(np_prev_strength)),
|
|
|
|
]
|
|
|
|
)
|
|
|
|
self.np_cur_strength = np.concatenate(
|
|
|
|
[
|
|
|
|
np.zeros(cf_offset),
|
|
|
|
np_cur_strength,
|
|
|
|
np.ones(crossfadeSize - cf_offset - len(np_cur_strength)),
|
|
|
|
]
|
|
|
|
)
|
|
|
|
|
|
|
|
print(
|
|
|
|
f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}"
|
|
|
|
)
|
2023-01-28 09:56:56 +03:00
|
|
|
|
2023-01-04 20:28:36 +03:00
|
|
|
# ひとつ前の結果とサイズが変わるため、記録は消去する。
|
2023-04-28 00:39:51 +03:00
|
|
|
if hasattr(self, "np_prev_audio1") is True:
|
2023-03-07 15:46:43 +03:00
|
|
|
delattr(self, "np_prev_audio1")
|
2023-04-28 00:39:51 +03:00
|
|
|
if hasattr(self, "sola_buffer") is True:
|
2023-04-14 05:03:52 +03:00
|
|
|
del self.sola_buffer
|
2023-04-14 03:18:34 +03:00
|
|
|
|
2023-03-07 17:14:14 +03:00
|
|
|
# receivedData: tuple of short
|
2023-04-27 17:38:25 +03:00
|
|
|
def on_request(
    self, receivedData: AudioInOut
) -> tuple[AudioInOut, list[Union[int, float]]]:
    """Convert one block of audio by delegating to ``on_request_sola``.

    Args:
        receivedData: Input samples, passed through unchanged.

    Returns:
        The ``(audio, timings)`` pair produced by ``on_request_sola``.
    """
    outcome = self.on_request_sola(receivedData)
    return outcome
|
2023-04-14 03:18:34 +03:00
|
|
|
|
2023-04-27 17:38:25 +03:00
|
|
|
def on_request_sola(
    self, receivedData: AudioInOut
) -> tuple[AudioInOut, list[Union[int, float]]]:
    """Convert one audio block and join it to the previous one with a SOLA cross-fade.

    Steps: resample the input to the model's processing rate when the rates
    differ, run the model, align the new audio against the tail kept from the
    previous call (``self.sola_buffer``), cross-fade, convert to int16,
    resample back, and pass the result through ``pad_array`` when its length
    differs from the input's.

    Args:
        receivedData: Input samples at ``settings.inputSampleRate``.

    Returns:
        ``(outputData, [preprocess_time, mainprocess_time, postprocess_time])``.
        When any exception is raised, a one-sample int16 zero array and
        ``[0, 0, 0]`` are returned instead and the error is only printed.
    """
    try:
        if self.voiceChanger is None:
            raise RuntimeError("Voice Changer is not selected.")

        processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()

        # Pre-processing
        with Timer("pre-process") as t:
            if self.settings.inputSampleRate != processing_sampling_rate:
                newData = cast(
                    AudioInOut,
                    resampy.resample(
                        receivedData,
                        self.settings.inputSampleRate,
                        processing_sampling_rate,
                    ),
                )
            else:
                newData = receivedData

            # The alignment search window is 12 ms of audio at the processing rate.
            sola_search_frame = int(0.012 * processing_sampling_rate)
            block_frame = newData.shape[0]
            # The cross-fade can never be longer than the block itself.
            crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
            # May delete self.sola_buffer when the cross-fade parameters changed.
            self._generate_strength(crossfade_frame)

            data = self.voiceChanger.generate_input(
                newData, block_frame, crossfade_frame, sola_search_frame
            )
        preprocess_time = t.secs

        # Conversion
        with Timer("main-process") as t:
            # Inference
            audio = self.voiceChanger.inference(data)

            if hasattr(self, "sola_buffer") is True:
                np.set_printoptions(threshold=10000)
                # Keep only the newest search + cross-fade + block samples.
                audio_offset = -1 * (
                    sola_search_frame + crossfade_frame + block_frame
                )
                audio = audio[audio_offset:]
                a = 0
                audio = audio[a:]
                # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
                # Correlate the head of the new audio with the stored tail ...
                cor_nom = np.convolve(
                    audio[: crossfade_frame + sola_search_frame],
                    np.flip(self.sola_buffer),
                    "valid",
                )
                # ... normalise by the windowed energy (1e-3 avoids division by zero) ...
                cor_den = np.sqrt(
                    np.convolve(
                        audio[: crossfade_frame + sola_search_frame] ** 2,
                        np.ones(crossfade_frame),
                        "valid",
                    )
                    + 1e-3
                )
                # ... and start the output at the best-matching offset.
                sola_offset = int(np.argmax(cor_nom / cor_den))
                sola_end = sola_offset + block_frame
                output_wav = audio[sola_offset:sola_end].astype(np.float64)
                # Fade the new audio in and add the already faded-out old tail.
                output_wav[:crossfade_frame] *= self.np_cur_strength
                output_wav[:crossfade_frame] += self.sola_buffer[:]

                result = output_wav
            else:
                # No tail from a previous call yet: emit silence for this block.
                print("[Voice Changer] warming up... generating sola buffer.")
                result = np.zeros(4096).astype(np.int16)

            # sola_offset is only bound in the branch above; the hasattr test
            # comes first so it is never read when that branch was skipped.
            if (
                hasattr(self, "sola_buffer") is True
                and sola_offset < sola_search_frame
            ):
                # Store the samples that directly follow the emitted block.
                offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
                end = -1 * (sola_search_frame - sola_offset)
                sola_buf_org = audio[offset:end]
                self.sola_buffer = sola_buf_org * self.np_prev_strength
            else:
                # Otherwise store the last crossfade_frame samples.
                self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
        mainprocess_time = t.secs

        # Post-processing
        with Timer("post-process") as t:
            result = result.astype(np.int16)
            if self.settings.inputSampleRate != processing_sampling_rate:
                outputData = cast(
                    AudioInOut,
                    resampy.resample(
                        result,
                        processing_sampling_rate,
                        self.settings.inputSampleRate,
                    ).astype(np.int16),
                )
            else:
                outputData = result

            print_convert_processing(
                f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz"
            )

            if receivedData.shape[0] != outputData.shape[0]:
                # pad_array only lengthens; a longer output is left as it is.
                outputData = pad_array(outputData, receivedData.shape[0])
                pass

            if self.settings.recordIO == 1:
                self.ioRecorder.writeInput(receivedData)
                self.ioRecorder.writeOutput(outputData.tobytes())

        postprocess_time = t.secs

        print_convert_processing(
            f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}"
        )
        perf = [preprocess_time, mainprocess_time, postprocess_time]
        return outputData, perf

    # Every handler below answers with one zero sample and zeroed timings;
    # they differ only in the message that is printed.
    except NoModeLoadedException as e:
        print("[Voice Changer] [Exception]", e)
        return np.zeros(1).astype(np.int16), [0, 0, 0]
    except ONNXInputArgumentException as e:
        print("[Voice Changer] [Exception] onnx are waiting valid input.", e)
        return np.zeros(1).astype(np.int16), [0, 0, 0]
    except HalfPrecisionChangingException:
        print("[Voice Changer] Switching model configuration....")
        return np.zeros(1).astype(np.int16), [0, 0, 0]
    except NotEnoughDataExtimateF0:
        print("[Voice Changer] warming up... waiting more data.")
        return np.zeros(1).astype(np.int16), [0, 0, 0]
    except DeviceChangingException as e:
        print("[Voice Changer] embedder:", e)
        return np.zeros(1).astype(np.int16), [0, 0, 0]
    except Exception as e:
        print("VC PROCESSING!!!! EXCEPTION!!!", e)
        print(traceback.format_exc())
        return np.zeros(1).astype(np.int16), [0, 0, 0]
|
2023-04-14 03:18:34 +03:00
|
|
|
|
2023-04-13 02:00:28 +03:00
|
|
|
def export2onnx(self):
    """Export the selected model to ONNX through the model implementation.

    Returns:
        Whatever the implementation's ``export2onnx`` returns, or None (after
        printing a notice) when no model is selected, matching
        ``merge_models`` and ``update_model_default``.
    """
    if self.voiceChanger is None:
        print("[Voice Changer] Voice Changer is not selected.")
        return
    return self.voiceChanger.export2onnx()
|
2023-02-20 22:07:43 +03:00
|
|
|
|
2023-04-13 02:00:28 +03:00
|
|
|
##############
|
2023-04-27 17:38:25 +03:00
|
|
|
|
2023-04-30 20:34:01 +03:00
|
|
|
def merge_models(self, request: str):
    """Forward a merge request to the selected model and return the refreshed info.

    Args:
        request: Passed unchanged to the model's ``merge_models``.

    Returns:
        The result of ``get_info()``, or None when no model is selected.
    """
    if self.voiceChanger is not None:
        self.voiceChanger.merge_models(request)
        return self.get_info()
    print("[Voice Changer] Voice Changer is not selected.")
    return
|
|
|
|
|
2023-05-20 22:21:54 +03:00
|
|
|
def update_model_default(self):
    """Ask the selected model to update its defaults and return the refreshed info.

    Returns:
        The result of ``get_info()``, or None when no model is selected.
    """
    if self.voiceChanger is not None:
        self.voiceChanger.update_model_default()
        return self.get_info()
    print("[Voice Changer] Voice Changer is not selected.")
    return
|
|
|
|
|
2023-04-27 17:38:25 +03:00
|
|
|
|
2023-04-10 03:28:00 +03:00
|
|
|
# Debug switch: set to True to make print_convert_processing() emit its messages.
PRINT_CONVERT_PROCESSING: bool = False


def print_convert_processing(mess: str):
    """Print ``mess`` only while ``PRINT_CONVERT_PROCESSING`` is exactly ``True``."""
    if PRINT_CONVERT_PROCESSING is not True:
        return
    print(mess)
|
|
|
|
|
|
|
|
|
2023-04-12 19:13:25 +03:00
|
|
|
def pad_array(arr: np.ndarray, target_length: int) -> np.ndarray:
    """Lengthen ``arr`` to ``target_length`` by repeating its edge values on both sides.

    The missing samples are split between the two ends, with the extra sample
    going to the right when the count is odd.  An array that is already at
    least ``target_length`` long is returned unchanged (it is never
    truncated).  An empty array has no edge value to repeat and is padded with
    zeros instead.

    Args:
        arr: Array to pad; its first axis is taken as the length.
        target_length: Minimum length of the result.

    Returns:
        ``arr`` itself when no padding is needed, otherwise a new padded array
        of the same dtype.
    """
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr

    pad_width = target_length - current_length
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left
    if current_length == 0:
        # np.pad(mode="edge") raises ValueError on an empty axis.
        return np.pad(arr, (pad_left, pad_right), "constant")
    return np.pad(arr, (pad_left, pad_right), "edge")
|