WIP:refactoring

wataru 2023-06-16 01:49:49 +09:00
parent be42bb682d
commit 9806ce2f3d
4 changed files with 84 additions and 183 deletions

View File

@@ -15,7 +15,11 @@ class ServerAudioDevice:
 def list_audio_device():
-    audioDeviceList = sd.query_devices()
+    try:
+        audioDeviceList = sd.query_devices()
+    except Exception as e:
+        print("[Voice Changer] ex: query_devices", e)
+        return [], []
     inputAudioDeviceList = [d for d in audioDeviceList if d["max_input_channels"] > 0]
     outputAudioDeviceList = [d for d in audioDeviceList if d["max_output_channels"] > 0]

View File

@@ -4,34 +4,66 @@ import numpy as np
 import librosa
 import sounddevice as sd
+from dataclasses import dataclass, asdict, field
 from voice_changer.Local.AudioDeviceList import ServerAudioDevice
-from voice_changer.VoiceChanger import VoiceChanger
+from voice_changer.VoiceChangerManager import VoiceChangerManager
 from voice_changer.utils.Timer import Timer


+@dataclass()
+class ServerDeviceSettings:
+    enableServerAudio: int = 0  # 0:off, 1:on
+    serverAudioStated: int = 0  # 0:off, 1:on
+    serverInputAudioSampleRate: int = 44100
+    serverOutputAudioSampleRate: int = 44100
+    serverInputDeviceId: int = -1
+    serverOutputDeviceId: int = -1
+    serverReadChunkSize: int = 256
+    serverInputAudioGain: float = 1.0
+    serverOutputAudioGain: float = 1.0
+
+
+EditableServerDeviceSettings = {
+    "intData": [
+        "enableServerAudio",
+        "serverAudioStated",
+        "serverInputAudioSampleRate",
+        "serverOutputAudioSampleRate",
+        "serverInputDeviceId",
+        "serverOutputDeviceId",
+        "serverReadChunkSize",
+    ],
+    "floatData": [
+        "serverInputAudioGain",
+        "serverOutputAudioGain",
+    ],
+}
+
+
 class ServerDevice:
-    def __init__(self):
-        self.voiceChanger: VoiceChanger | None = None
-        pass
+    def __init__(self, voiceChangerManager: VoiceChangerManager):
+        self.settings = ServerDeviceSettings()
+        self.voiceChangerManager: VoiceChangerManager = voiceChangerManager

     def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
-        if self.voiceChanger is None:
+        if self.voiceChangerManager.voiceChanger is None:
             print("[Voice Changer] voiceChanger is None")
             return

         try:
-            indata = indata * self.voiceChanger.settings.serverInputAudioGain
+            indata = indata * self.settings.serverInputAudioGain
             with Timer("all_inference_time") as t:
                 unpackedData = librosa.to_mono(indata.T) * 32768.0
-                out_wav, times = self.voiceChanger.on_request(unpackedData)
+                out_wav, times = self.voiceChangerManager.voiceChanger.on_request(unpackedData)
                 outputChunnels = outdata.shape[1]
                 outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
-                outdata[:] = outdata * self.voiceChanger.settings.serverOutputAudioGain
+                outdata[:] = outdata * self.settings.serverOutputAudioGain

             all_inference_time = t.secs
             performance = [all_inference_time] + times
-            if self.voiceChanger.emitTo is not None:
-                self.voiceChanger.emitTo(performance)
-            self.voiceChanger.settings.performance = [round(x * 1000) for x in performance]
+            if self.voiceChangerManager.voiceChanger.emitTo is not None:
+                self.voiceChangerManager.voiceChanger.emitTo(performance)
+            self.voiceChangerManager.voiceChanger.settings.performance = [round(x * 1000) for x in performance]
         except Exception as e:
             print("[Voice Changer] ex:", e)
@@ -42,31 +74,26 @@ class ServerDevice:
         else:
             return None

-    def serverLocal(self, _vc: VoiceChanger):
-        self.voiceChanger = _vc
-        vc = self.voiceChanger
+    def serverLocal(self):
         currentInputDeviceId = -1
         currentModelSamplingRate = -1
         currentOutputDeviceId = -1
         currentInputChunkNum = -1
         while True:
-            if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc is None:
-                vc.settings.inputSampleRate = 48000
+            if self.settings.serverAudioStated == 0 or self.settings.serverInputDeviceId == -1 or self.voiceChangerManager is None:
+                self.voiceChangerManager.voiceChanger.settings.inputSampleRate = 48000
                 time.sleep(2)
             else:
                 sd._terminate()
                 sd._initialize()
-                sd.default.device[0] = vc.settings.serverInputDeviceId
-                currentInputDeviceId = vc.settings.serverInputDeviceId
-                sd.default.device[1] = vc.settings.serverOutputDeviceId
-                currentOutputDeviceId = vc.settings.serverOutputDeviceId
-                currentInputChannelNum = vc.settings.serverAudioInputDevices
-                serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId)
-                serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId)
+                sd.default.device[0] = self.settings.serverInputDeviceId
+                currentInputDeviceId = self.settings.serverInputDeviceId
+                sd.default.device[1] = self.settings.serverOutputDeviceId
+                currentOutputDeviceId = self.settings.serverOutputDeviceId
+                serverInputAudioDevice = self.getServerAudioDevice(self.voiceChangerManager.serverAudioInputDevices, currentInputDeviceId)
+                serverOutputAudioDevice = self.getServerAudioDevice(self.voiceChangerManager.serverAudioOutputDevices, currentOutputDeviceId)
                 print(serverInputAudioDevice, serverOutputAudioDevice)
                 if serverInputAudioDevice is None or serverOutputAudioDevice is None:
                     time.sleep(2)
@@ -76,12 +103,12 @@ class ServerDevice:
                 currentInputChannelNum = serverInputAudioDevice.maxInputChannels
                 currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels

-                currentInputChunkNum = vc.settings.serverReadChunkSize
+                currentInputChunkNum = self.settings.serverReadChunkSize
                 block_frame = currentInputChunkNum * 128

                 # sample rate precheck(alsa cannot use 40000?)
                 try:
-                    currentModelSamplingRate = self.voiceChanger.voiceChangerModel.get_processing_sampling_rate()
+                    currentModelSamplingRate = self.voiceChangerManager.voiceChanger.voiceChangerModel.get_processing_sampling_rate()
                 except Exception as e:
                     print("[Voice Changer] ex: get_processing_sampling_rate", e)
                     continue
@@ -94,38 +121,38 @@ class ServerDevice:
                         channels=[currentInputChannelNum, currentOutputChannelNum],
                     ):
                         pass
-                    vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
-                    vc.settings.inputSampleRate = currentModelSamplingRate
-                    print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}")
+                    self.settings.serverInputAudioSampleRate = currentModelSamplingRate
+                    self.voiceChangerManager.voiceChanger.settings.inputSampleRate = currentModelSamplingRate
+                    print(f"[Voice Changer] sample rate {self.settings.serverInputAudioSampleRate}")
                 except Exception as e:
                     print(
                         "[Voice Changer] ex: fallback to device default samplerate",
                         e,
                     )
-                    vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
-                    vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate
+                    self.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
+                    self.voiceChangerManager.voiceChanger.settings.inputSampleRate = self.settings.serverInputAudioSampleRate

                 # main loop
                 try:
                     with sd.Stream(
                         callback=self.audio_callback,
                         blocksize=block_frame,
-                        samplerate=vc.settings.serverInputAudioSampleRate,
+                        samplerate=self.settings.serverInputAudioSampleRate,
                         dtype="float32",
                         channels=[currentInputChannelNum, currentOutputChannelNum],
                     ):
-                        while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.voiceChangerModel.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize:
+                        while self.settings.serverAudioStated == 1 and currentInputDeviceId == self.settings.serverInputDeviceId and currentOutputDeviceId == self.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChangerManager.voiceChanger.voiceChangerModel.get_processing_sampling_rate() and currentInputChunkNum == self.settings.serverReadChunkSize:
                             time.sleep(2)
                             print(
                                 "[Voice Changer] server audio",
-                                vc.settings.performance,
+                                self.voiceChangerManager.settings.performance,
                             )
                             print(
                                 "[Voice Changer] info:",
-                                vc.settings.serverAudioStated,
+                                self.settings.serverAudioStated,
                                 currentInputDeviceId,
                                 currentOutputDeviceId,
-                                vc.settings.serverInputAudioSampleRate,
+                                self.settings.serverInputAudioSampleRate,
                                 currentInputChunkNum,
                             )

View File

@@ -51,22 +51,7 @@ class VoiceChangerSettings:
     crossFadeOverlapSize: int = 4096
     recordIO: int = 0  # 0:off, 1:on

-    serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=lambda: [])
-    serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=lambda: [])
-    enableServerAudio: int = 0  # 0:off, 1:on
-    serverAudioStated: int = 0  # 0:off, 1:on
-    # serverInputAudioSampleRate: int = 48000
-    # serverOutputAudioSampleRate: int = 48000
-    serverInputAudioSampleRate: int = 44100
-    serverOutputAudioSampleRate: int = 44100
-    # serverInputAudioBufferSize: int = 1024 * 24
-    # serverOutputAudioBufferSize: int = 1024 * 24
-    serverInputDeviceId: int = -1
-    serverOutputDeviceId: int = -1
-    serverReadChunkSize: int = 256
-    serverInputAudioGain: float = 1.0
-    serverOutputAudioGain: float = 1.0
     performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])

     # ↓ enumerate only the mutable fields
@@ -75,23 +60,12 @@ class VoiceChangerSettings:
             "inputSampleRate",
             "crossFadeOverlapSize",
             "recordIO",
-            "enableServerAudio",
-            "serverAudioStated",
-            "serverInputAudioSampleRate",
-            "serverOutputAudioSampleRate",
-            # "serverInputAudioBufferSize",
-            # "serverOutputAudioBufferSize",
-            "serverInputDeviceId",
-            "serverOutputDeviceId",
-            "serverReadChunkSize",
         ]
     )
     floatData: list[str] = field(
         default_factory=lambda: [
             "crossFadeOffsetRate",
             "crossFadeEndRate",
-            "serverInputAudioGain",
-            "serverOutputAudioGain",
         ]
     )
     strData: list[str] = field(default_factory=lambda: [])
@@ -108,120 +82,6 @@ class VoiceChanger:
     # emitTo = None

-    # def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
-    #     try:
-    #         indata = indata * self.settings.serverInputAudioGain
-    #         with Timer("all_inference_time") as t:
-    #             unpackedData = librosa.to_mono(indata.T) * 32768.0
-    #             out_wav, times = self.on_request(unpackedData)
-    #             outputChunnels = outdata.shape[1]
-    #             outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
-    #             outdata[:] = outdata * self.settings.serverOutputAudioGain
-    #         all_inference_time = t.secs
-    #         performance = [all_inference_time] + times
-    #         if self.emitTo is not None:
-    #             self.emitTo(performance)
-    #         self.settings.performance = [round(x * 1000) for x in performance]
-    #     except Exception as e:
-    #         print("[Voice Changer] ex:", e)
-
-    # def getServerAudioDevice(self, audioDeviceList: list[ServerAudioDevice], index: int):
-    #     serverAudioDevice = [x for x in audioDeviceList if x.index == index]
-    #     if len(serverAudioDevice) > 0:
-    #         return serverAudioDevice[0]
-    #     else:
-    #         return None
-
-    # def serverLocal(self, _vc):
-    #     vc: VoiceChanger = _vc
-    #     currentInputDeviceId = -1
-    #     currentModelSamplingRate = -1
-    #     currentOutputDeviceId = -1
-    #     currentInputChunkNum = -1
-    #     while True:
-    #         if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc.voiceChanger is None:
-    #             vc.settings.inputSampleRate = 48000
-    #             time.sleep(2)
-    #         else:
-    #             sd._terminate()
-    #             sd._initialize()
-    #             sd.default.device[0] = vc.settings.serverInputDeviceId
-    #             currentInputDeviceId = vc.settings.serverInputDeviceId
-    #             sd.default.device[1] = vc.settings.serverOutputDeviceId
-    #             currentOutputDeviceId = vc.settings.serverOutputDeviceId
-    #             currentInputChannelNum = vc.settings.serverAudioInputDevices
-    #             serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId)
-    #             serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId)
-    #             print(serverInputAudioDevice, serverOutputAudioDevice)
-    #             if serverInputAudioDevice is None or serverOutputAudioDevice is None:
-    #                 time.sleep(2)
-    #                 print("serverInputAudioDevice or serverOutputAudioDevice is None")
-    #                 continue
-    #             currentInputChannelNum = serverInputAudioDevice.maxInputChannels
-    #             currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels
-    #             currentInputChunkNum = vc.settings.serverReadChunkSize
-    #             block_frame = currentInputChunkNum * 128
-    #             # sample rate precheck(alsa cannot use 40000?)
-    #             try:
-    #                 currentModelSamplingRate = self.voiceChanger.get_processing_sampling_rate()
-    #             except Exception as e:
-    #                 print("[Voice Changer] ex: get_processing_sampling_rate", e)
-    #                 continue
-    #             try:
-    #                 with sd.Stream(
-    #                     callback=self.audio_callback,
-    #                     blocksize=block_frame,
-    #                     samplerate=currentModelSamplingRate,
-    #                     dtype="float32",
-    #                     channels=[currentInputChannelNum, currentOutputChannelNum],
-    #                 ):
-    #                     pass
-    #                 vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
-    #                 vc.settings.inputSampleRate = currentModelSamplingRate
-    #                 print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}")
-    #             except Exception as e:
-    #                 print(
-    #                     "[Voice Changer] ex: fallback to device default samplerate",
-    #                     e,
-    #                 )
-    #                 vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
-    #                 vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate
-    #             # main loop
-    #             try:
-    #                 with sd.Stream(
-    #                     callback=self.audio_callback,
-    #                     blocksize=block_frame,
-    #                     samplerate=vc.settings.serverInputAudioSampleRate,
-    #                     dtype="float32",
-    #                     channels=[currentInputChannelNum, currentOutputChannelNum],
-    #                 ):
-    #                     while vc.settings.serverAudioStated == 1 and currentInputDeviceId == vc.settings.serverInputDeviceId and currentOutputDeviceId == vc.settings.serverOutputDeviceId and currentModelSamplingRate == self.voiceChanger.get_processing_sampling_rate() and currentInputChunkNum == vc.settings.serverReadChunkSize:
-    #                         time.sleep(2)
-    #                         print(
-    #                             "[Voice Changer] server audio",
-    #                             self.settings.performance,
-    #                         )
-    #                         print(
-    #                             "[Voice Changer] info:",
-    #                             vc.settings.serverAudioStated,
-    #                             currentInputDeviceId,
-    #                             currentOutputDeviceId,
-    #                             vc.settings.serverInputAudioSampleRate,
-    #                             currentInputChunkNum,
-    #                         )
-    #             except Exception as e:
-    #                 print("[Voice Changer] ex:", e)
-    #                 time.sleep(2)

     def __init__(self, params: VoiceChangerParams, slotIndex: int):
         # Initialization
         self.settings = VoiceChangerSettings()
@@ -238,9 +98,9 @@ class VoiceChanger:
         self.ioRecorder: IORecorder | None = None
         self.sola_buffer: AudioInOut | None = None

-        audioinput, audiooutput = list_audio_device()
-        self.settings.serverAudioInputDevices = audioinput
-        self.settings.serverAudioOutputDevices = audiooutput
+        # audioinput, audiooutput = list_audio_device()
+        # self.settings.serverAudioInputDevices = audioinput
+        # self.settings.serverAudioOutputDevices = audiooutput

         self.slotIndex = slotIndex
         self.slotInfo = loadSlotInfo(params.model_dir, self.slotIndex)

View File

@@ -3,6 +3,7 @@ import threading
 from data.ModelSample import ModelSamples
 from data.ModelSlot import ModelSlots, loadSlotInfo
 from utils.downloader.SampleDownloader import downloadSample, getSampleInfos
+from voice_changer.Local.AudioDeviceList import ServerAudioDevice, list_audio_device
 from voice_changer.Local.ServerDevice import ServerDevice
 from voice_changer.RVC.ModelSlotGenerator import setSlotAsRVC
@@ -26,6 +27,7 @@ class GPUInfo:
 @dataclass()
 class VoiceChangerManagerSettings:
     slotIndex: int
     intData: list[str] = field(default_factory=lambda: ["slotIndex"])
@@ -43,6 +45,14 @@ class VoiceChangerManager(object):
         self.sampleModels: list[ModelSamples] = getSampleInfos(self.params.sample_mode)
         self.gpus: list[GPUInfo] = self._get_gpuInfos()

+        audioinput, audiooutput = list_audio_device()
+        self.serverAudioInputDevices: list[ServerAudioDevice] = audioinput
+        self.serverAudioOutputDevices: list[ServerAudioDevice] = audiooutput
+
+        # ServerDevice
+        thread = threading.Thread(target=self.serverDevice.serverLocal, args=(self,))
+        thread.start()
+
     def _get_gpuInfos(self):
         devCount = torch.cuda.device_count()
         gpus = []
@@ -63,8 +73,6 @@ class VoiceChangerManager(object):
             print(f"VoiceChanger Initialized (GPU_NUM:{gpu_num}, mps_enabled:{mps_enabled})")
             cls._instance.voiceChanger = VoiceChanger(params, cls._instance.settings.slotIndex)

-            thread = threading.Thread(target=cls._instance.serverDevice.serverLocal, args=(cls._instance.voiceChanger,))
-            thread.start()
             cls._instance.voiceChanger.prepareModel()
             return cls._instance
@@ -94,6 +102,8 @@ class VoiceChangerManager(object):
         data["slotInfos"] = slotInfos
         data["gpus"] = self.gpus
         data["sampleModels"] = self.sampleModels
+        data["serverAudioInputDevices"] = self.serverAudioInputDevices
+        data["serverAudioOutputDevices"] = self.serverAudioOutputDevices
         data["status"] = "OK"

         if hasattr(self, "voiceChanger"):