WIP:separate server device

wataru 2023-06-16 15:06:35 +09:00
parent 3c1d87751e
commit 8a8386640d
4 changed files with 228 additions and 154 deletions

View File: voice_changer/Local/AudioDeviceList.py

@@ -25,8 +25,8 @@ def list_audio_device():
    # print("output:", outputDeviceList)
    # print("hostapis", hostapis)
    serverAudioInputDevices = []
    serverAudioOutputDevices = []
    serverAudioInputDevices: list[ServerAudioDevice] = []
    serverAudioOutputDevices: list[ServerAudioDevice] = []
    for d in inputAudioDeviceList:
        serverInputAudioDevice: ServerAudioDevice = ServerAudioDevice(
            kind=ServerAudioDeviceTypes.audioinput,
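
For orientation (not part of this commit): the new annotation refers to the ServerAudioDevice dataclass defined in AudioDeviceList.py. A minimal sketch of the shape this diff relies on, inferred only from the fields read elsewhere in the commit (kind, index, maxInputChannels, maxOutputChannels, default_samplerate); the name field and the defaults are assumptions.

# Illustrative sketch only: the real dataclass lives in AudioDeviceList.py and may differ.
from dataclasses import dataclass


@dataclass
class ServerAudioDevice:
    kind: str = "audioinput"           # ServerAudioDeviceTypes.audioinput / audiooutput in the real code
    index: int = -1                    # sounddevice device index, matched by getServer*AudioDevice
    name: str = ""                     # assumed display name
    maxInputChannels: int = 0          # copied into sd.default.channels[0]
    maxOutputChannels: int = 0         # copied into sd.default.channels[1]
    default_samplerate: float = 44100  # fallback rate when the model rate is rejected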

View File: voice_changer/Local/ServerDevice.py (new file)

@@ -0,0 +1,190 @@
from dataclasses import dataclass, asdict
import numpy as np
from voice_changer.Local.AudioDeviceList import list_audio_device
import time
import sounddevice as sd
from voice_changer.utils.Timer import Timer
import librosa
from voice_changer.utils.VoiceChangerModel import AudioInOut
from typing import Protocol


@dataclass
class ServerDeviceSettings:
    enableServerAudio: int = 0 # 0:off, 1:on
    serverAudioStated: int = 0 # 0:off, 1:on
    serverInputAudioSampleRate: int = 44100
    serverOutputAudioSampleRate: int = 44100
    serverInputDeviceId: int = -1
    serverOutputDeviceId: int = -1
    serverReadChunkSize: int = 256
    serverInputAudioGain: float = 1.0
    serverOutputAudioGain: float = 1.0


EditableServerDeviceSettings = {
    "intData": [
        "enableServerAudio",
        "serverAudioStated",
        "serverInputAudioSampleRate",
        "serverOutputAudioSampleRate",
        "serverInputDeviceId",
        "serverOutputDeviceId",
        "serverReadChunkSize",
    ],
    "floatData": [
        "serverInputAudioGain",
        "serverOutputAudioGain",
    ],
}


class ServerDeviceCallbacks(Protocol):
    def on_request(self, unpackedData: AudioInOut):
        ...

    def emitTo(self, performance: list[float]):
        ...

    def get_processing_sampling_rate(self):
        ...

    def setSamplingRate(self, sr: int):
        ...


class ServerDevice:
    def __init__(self, serverDeviceCallbacks: ServerDeviceCallbacks):
        self.settings = ServerDeviceSettings()
        self.serverDeviceCallbacks = serverDeviceCallbacks

    def getServerInputAudioDevice(self, index: int):
        audioinput, _audiooutput = list_audio_device()
        serverAudioDevice = [x for x in audioinput if x.index == index]
        if len(serverAudioDevice) > 0:
            return serverAudioDevice[0]
        else:
            return None

    def getServerOutputAudioDevice(self, index: int):
        _audioinput, audiooutput = list_audio_device()
        serverAudioDevice = [x for x in audiooutput if x.index == index]
        if len(serverAudioDevice) > 0:
            return serverAudioDevice[0]
        else:
            return None

    def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
        try:
            indata = indata * self.settings.serverInputAudioGain
            with Timer("all_inference_time") as t:
                unpackedData = librosa.to_mono(indata.T) * 32768.0
                out_wav, times = self.serverDeviceCallbacks.on_request(unpackedData)
                outputChunnels = outdata.shape[1]
                outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
                outdata[:] = outdata * self.settings.serverOutputAudioGain
            all_inference_time = t.secs
            self.performance = [all_inference_time] + times
            self.serverDeviceCallbacks.emitTo(self.performance)
            self.performance = [round(x * 1000) for x in self.performance]
        except Exception as e:
            print("[Voice Changer] ex:", e)

    def start(self):
        # currentInputDeviceId = -1
        # currentOutputDeviceId = -1
        # currentInputChunkNum = -1
        currentModelSamplingRate = -1
        while True:
            if self.settings.serverAudioStated == 0 or self.settings.serverInputDeviceId == -1:
                # self.settings.inputSampleRate = 48000
                time.sleep(2)
            else:
                sd._terminate()
                sd._initialize()
                sd.default.device[0] = self.settings.serverInputDeviceId
                # currentInputDeviceId = self.settings.serverInputDeviceId
                sd.default.device[1] = self.settings.serverOutputDeviceId
                # currentOutputDeviceId = self.settings.serverOutputDeviceId
                serverInputAudioDevice = self.getServerInputAudioDevice(sd.default.device[0])
                serverOutputAudioDevice = self.getServerOutputAudioDevice(sd.default.device[1])
                print(serverInputAudioDevice, serverOutputAudioDevice)
                if serverInputAudioDevice is None or serverOutputAudioDevice is None:
                    time.sleep(2)
                    print("serverInputAudioDevice or serverOutputAudioDevice is None")
                    continue

                # currentInputChannelNum = serverInputAudioDevice.maxInputChannels
                # currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels
                sd.default.channels[0] = serverInputAudioDevice.maxInputChannels
                sd.default.channels[1] = serverOutputAudioDevice.maxOutputChannels

                currentInputChunkNum = self.settings.serverReadChunkSize
                block_frame = currentInputChunkNum * 128

                # sample rate precheck(alsa cannot use 40000?)
                try:
                    currentModelSamplingRate = self.serverDeviceCallbacks.get_processing_sampling_rate()
                except Exception as e:
                    print("[Voice Changer] ex: get_processing_sampling_rate", e)
                    continue
                try:
                    with sd.Stream(
                        callback=self.audio_callback,
                        blocksize=block_frame,
                        # samplerate=currentModelSamplingRate,
                        dtype="float32",
                        # channels=[currentInputChannelNum, currentOutputChannelNum],
                    ):
                        pass
                    self.settings.serverInputAudioSampleRate = currentModelSamplingRate
                    self.serverDeviceCallbacks.setSamplingRate(currentModelSamplingRate)
                    print(f"[Voice Changer] sample rate {self.settings.serverInputAudioSampleRate}")
                except Exception as e:
                    print("[Voice Changer] ex: fallback to device default samplerate", e)
                    print("[Voice Changer] device default samplerate", serverInputAudioDevice.default_samplerate)
                    self.settings.serverInputAudioSampleRate = round(serverInputAudioDevice.default_samplerate)
                    self.serverDeviceCallbacks.setSamplingRate(round(serverInputAudioDevice.default_samplerate))

                sd.default.samplerate = self.settings.serverInputAudioSampleRate
                sd.default.blocksize = block_frame

                # main loop
                try:
                    with sd.Stream(
                        callback=self.audio_callback,
                        # blocksize=block_frame,
                        # samplerate=vc.settings.serverInputAudioSampleRate,
                        dtype="float32",
                        # channels=[currentInputChannelNum, currentOutputChannelNum],
                    ):
                        while self.settings.serverAudioStated == 1 and sd.default.device[0] == self.settings.serverInputDeviceId and sd.default.device[1] == self.settings.serverOutputDeviceId and currentModelSamplingRate == self.serverDeviceCallbacks.get_processing_sampling_rate() and currentInputChunkNum == self.settings.serverReadChunkSize:
                            time.sleep(2)
                            print(
                                "[Voice Changer] server audio",
                                self.performance,
                            )
                            print(f"[Voice Changer] started:{self.settings.serverAudioStated}, input:{sd.default.device[0]}, output:{sd.default.device[1]}, mic_sr:{self.settings.serverInputAudioSampleRate}, model_sr:{currentModelSamplingRate}, chunk:{currentInputChunkNum}, ch:[{sd.default.channels}]")
                except Exception as e:
                    print("[Voice Changer] ex:", e)
                    time.sleep(2)

    def get_info(self):
        data = asdict(self.settings)
        audioinput, audiooutput = list_audio_device()
        data["serverAudioInputDevices"] = audioinput
        data["serverAudioOutputDevices"] = audiooutput
        return data

    def update_settings(self, key: str, val: str | int | float):
        if key in EditableServerDeviceSettings["intData"]:
            setattr(self.settings, key, int(val))
        elif key in EditableServerDeviceSettings["floatData"]:
            setattr(self.settings, key, float(val))
        return self.get_info()
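
Usage sketch (not part of this commit): ServerDevice expects its owner to implement the ServerDeviceCallbacks protocol and to run start() on its own thread, as VoiceChangerManager does later in this diff. A minimal pass-through wiring might look like the following; the device indices and the 48000 Hz model rate are hypothetical values for the example.

import threading

from voice_changer.Local.ServerDevice import ServerDevice, ServerDeviceCallbacks


class PassThroughCallbacks(ServerDeviceCallbacks):
    def on_request(self, unpackedData):
        # a real implementation runs voice conversion here; echo the input and report no timings
        return unpackedData, []

    def emitTo(self, performance: list[float]):
        print("performance:", performance)

    def get_processing_sampling_rate(self):
        return 48000  # assumed model rate for the sketch

    def setSamplingRate(self, sr: int):
        print("input sample rate set to", sr)


serverDevice = ServerDevice(PassThroughCallbacks())
threading.Thread(target=serverDevice.start, daemon=True).start()

# the start() loop idles until the settings below are applied
serverDevice.update_settings("serverInputDeviceId", 1)   # hypothetical device indices
serverDevice.update_settings("serverOutputDeviceId", 3)
serverDevice.update_settings("serverAudioStated", 1)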

View File: voice_changer/VoiceChanger.py

@@ -11,7 +11,6 @@ import resampy
from voice_changer.IORecorder import IORecorder
from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device
from voice_changer.utils.LoadModelParams import LoadModelParams
from voice_changer.utils.Timer import Timer
@@ -26,10 +25,6 @@ from Exceptions import (
    VoiceChangerIsNotSelectedException,
)
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
import threading
import time
import sounddevice as sd
import librosa

STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
@@ -44,22 +39,7 @@ class VoiceChangerSettings:
    crossFadeOverlapSize: int = 4096
    recordIO: int = 0 # 0:off, 1:on
    serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=lambda: [])
    serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=lambda: [])
    enableServerAudio: int = 0 # 0:off, 1:on
    serverAudioStated: int = 0 # 0:off, 1:on
    # serverInputAudioSampleRate: int = 48000
    # serverOutputAudioSampleRate: int = 48000
    serverInputAudioSampleRate: int = 44100
    serverOutputAudioSampleRate: int = 44100
    # serverInputAudioBufferSize: int = 1024 * 24
    # serverOutputAudioBufferSize: int = 1024 * 24
    serverInputDeviceId: int = -1
    serverOutputDeviceId: int = -1
    serverReadChunkSize: int = 256
    serverInputAudioGain: float = 1.0
    serverOutputAudioGain: float = 1.0
    performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])

    # ↓ list only the mutable fields
@@ -68,23 +48,12 @@ class VoiceChangerSettings:
            "inputSampleRate",
            "crossFadeOverlapSize",
            "recordIO",
            "enableServerAudio",
            "serverAudioStated",
            "serverInputAudioSampleRate",
            "serverOutputAudioSampleRate",
            # "serverInputAudioBufferSize",
            # "serverOutputAudioBufferSize",
            "serverInputDeviceId",
            "serverOutputDeviceId",
            "serverReadChunkSize",
        ]
    )
    floatData: list[str] = field(
        default_factory=lambda: [
            "crossFadeOffsetRate",
            "crossFadeEndRate",
            "serverInputAudioGain",
            "serverOutputAudioGain",
        ]
    )
    strData: list[str] = field(default_factory=lambda: [])
@@ -101,120 +70,6 @@ class VoiceChanger:
    emitTo = None

    def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
        try:
            indata = indata * self.settings.serverInputAudioGain
            with Timer("all_inference_time") as t:
                unpackedData = librosa.to_mono(indata.T) * 32768.0
                out_wav, times = self.on_request(unpackedData)
                outputChunnels = outdata.shape[1]
                outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
                outdata[:] = outdata * self.settings.serverOutputAudioGain
            all_inference_time = t.secs
            performance = [all_inference_time] + times
            if self.emitTo is not None:
                self.emitTo(performance)
            self.settings.performance = [round(x * 1000) for x in performance]
        except Exception as e:
            print("[Voice Changer] ex:", e)

    def getServerAudioDevice(self, audioDeviceList: list[ServerAudioDevice], index: int):
        serverAudioDevice = [x for x in audioDeviceList if x.index == index]
        if len(serverAudioDevice) > 0:
            return serverAudioDevice[0]
        else:
            return None

    def serverLocal(self, _vc):
        vc: VoiceChanger = _vc
        currentInputDeviceId = -1
        currentModelSamplingRate = -1
        currentOutputDeviceId = -1
        currentInputChunkNum = -1
        while True:
            if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc.voiceChanger is None:
                vc.settings.inputSampleRate = 48000
                time.sleep(2)
            else:
                sd._terminate()
                sd._initialize()
                sd.default.device[0] = vc.settings.serverInputDeviceId
                currentInputDeviceId = vc.settings.serverInputDeviceId
                sd.default.device[1] = vc.settings.serverOutputDeviceId
                currentOutputDeviceId = vc.settings.serverOutputDeviceId
                currentInputChannelNum = vc.settings.serverAudioInputDevices
                serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId)
                serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId)
                print(serverInputAudioDevice, serverOutputAudioDevice)
                if serverInputAudioDevice is None or serverOutputAudioDevice is None:
                    time.sleep(2)
                    print("serverInputAudioDevice or serverOutputAudioDevice is None")
                    continue

                currentInputChannelNum = serverInputAudioDevice.maxInputChannels
                currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels

                currentInputChunkNum = vc.settings.serverReadChunkSize
                block_frame = currentInputChunkNum * 128

                # sample rate precheck(alsa cannot use 40000?)
                try:
                    currentModelSamplingRate = self.voiceChanger.get_processing_sampling_rate()
                except Exception as e:
                    print("[Voice Changer] ex: get_processing_sampling_rate", e)
                    continue
                try:
                    with sd.Stream(
                        callback=self.audio_callback,
                        blocksize=block_frame,
                        samplerate=currentModelSamplingRate,
                        dtype="float32",
                        channels=[currentInputChannelNum, currentOutputChannelNum],
                    ):
                        pass
                    vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
                    vc.settings.inputSampleRate = currentModelSamplingRate
                    print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}")
                except Exception as e:
                    print(
                        "[Voice Changer] ex: fallback to device default samplerate",
                        e,
                    )
                    vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
                    vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate

                # main loop
                try:
                    with sd.Stream(
                        callback=self.audio_callback,
                        blocksize=block_frame,
                        samplerate=vc.settings.serverInputAudioSampleRate,
                        dtype="float32",
                        channels=[currentInputChannelNum, currentOutputChannelNum],
                    ):
                        while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize:
                            time.sleep(2)
                            print(
                                "[Voice Changer] server audio",
                                self.settings.performance,
                            )
                            print(
                                "[Voice Changer] info:",
                                vc.settings.serverAudioStated,
                                currentInputDeviceId,
                                currentOutputDeviceId,
                                vc.settings.serverInputAudioSampleRate,
                                currentInputChunkNum,
                            )
                except Exception as e:
                    print("[Voice Changer] ex:", e)
                    time.sleep(2)

    def __init__(self, params: VoiceChangerParams):
        # initialization
        self.settings = VoiceChangerSettings()
@@ -231,12 +86,6 @@ class VoiceChanger:
        self.prev_audio = np.zeros(4096)
        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()

        audioinput, audiooutput = list_audio_device()
        self.settings.serverAudioInputDevices = audioinput
        self.settings.serverAudioOutputDevices = audiooutput

        thread = threading.Thread(target=self.serverLocal, args=(self,))
        thread.start()
        print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")

    def switchModelType(self, modelType: ModelType):
@@ -375,6 +224,12 @@ class VoiceChanger:
        if hasattr(self, "sola_buffer") is True:
            del self.sola_buffer

    def get_processing_sampling_rate(self):
        if self.voiceChanger is None:
            return 0
        else:
            return self.voiceChanger.get_processing_sampling_rate()

    # receivedData: tuple of short
    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
        return self.on_request_sola(receivedData)
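
Side note (not part of this commit): the audio_callback/serverLocal logic removed here and its replacement in ServerDevice share the same buffer conventions, which are easy to miss when reading the diff. A small worked sketch of that arithmetic, using 48000 Hz purely as an example rate:

import numpy as np
import librosa

serverReadChunkSize = 256
block_frame = serverReadChunkSize * 128      # 32768 frames handed to each callback
latency_sec = block_frame / 48000            # ~0.68 s per block at the example 48 kHz

indata = np.zeros((block_frame, 2), dtype=np.float32)   # stereo float32 from sounddevice
unpackedData = librosa.to_mono(indata.T) * 32768.0      # mono, scaled to the int16 range fed to on_request
out_wav = unpackedData                                   # stand-in for the converted audio
outputChannels = 2
outdata = np.repeat(out_wav, outputChannels).reshape(-1, outputChannels) / 32768.0  # back to float32 stereo
print(block_frame, round(latency_sec, 3), outdata.shape)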

View File: voice_changer/VoiceChangerManager.py

@@ -1,4 +1,5 @@
import numpy as np
from voice_changer.Local.ServerDevice import ServerDevice, ServerDeviceCallbacks
from voice_changer.VoiceChanger import VoiceChanger
from const import ModelType
from voice_changer.utils.LoadModelParams import LoadModelParams
@@ -6,6 +7,7 @@ from voice_changer.utils.VoiceChangerModel import AudioInOut
from voice_changer.utils.VoiceChangerParams import VoiceChangerParams
from dataclasses import dataclass, asdict
import torch
import threading
@dataclass()
@@ -22,15 +24,38 @@ class VoiceChangerManagerSettings:
    # intData: list[str] = field(default_factory=lambda: ["slotIndex"])


class VoiceChangerManager(object):
class VoiceChangerManager(ServerDeviceCallbacks):
    _instance = None

    ############################
    # ServerDeviceCallbacks
    ############################
    def on_request(self, unpackedData: AudioInOut):
        return self.changeVoice(unpackedData)

    def emitTo(self, performance: list[float]):
        print("emit ", performance)

    def get_processing_sampling_rate(self):
        return self.voiceChanger.get_processing_sampling_rate()

    def setSamplingRate(self, sr: int):
        self.voiceChanger.settings.inputSampleRate = sr

    ############################
    # VoiceChangerManager
    ############################
    def __init__(self, params: VoiceChangerParams):
        self.voiceChanger: VoiceChanger = None
        self.settings: VoiceChangerManagerSettings = VoiceChangerManagerSettings(dummy=0)
        # collect static information
        self.gpus: list[GPUInfo] = self._get_gpuInfos()

        self.serverDevice = ServerDevice(self)
        thread = threading.Thread(target=self.serverDevice.start, args=())
        thread.start()

    def _get_gpuInfos(self):
        devCount = torch.cuda.device_count()
        gpus = []
@@ -62,6 +87,9 @@ class VoiceChangerManager(object):
        data["status"] = "OK"

        info = self.serverDevice.get_info()
        data.update(info)

        if hasattr(self, "voiceChanger"):
            info = self.voiceChanger.get_info()
            data.update(info)
@@ -77,6 +105,7 @@ class VoiceChangerManager(object):
            return {"status": "ERROR", "msg": "no model loaded"}

    def update_settings(self, key: str, val: str | int | float):
        self.serverDevice.update_settings(key, val)
        if hasattr(self, "voiceChanger"):
            self.voiceChanger.update_settings(key, val)
        else: