2023-04-28 00:39:51 +03:00
from typing import Any , Union , cast
2023-05-06 22:18:18 +03:00
import socketio
2023-04-10 18:21:17 +03:00
from const import TMP_DIR , ModelType
2022-12-31 10:08:14 +03:00
import torch
2023-01-28 09:56:56 +03:00
import os
import traceback
2022-12-31 10:08:14 +03:00
import numpy as np
2023-04-10 18:21:17 +03:00
from dataclasses import dataclass , asdict , field
2023-02-18 14:53:15 +03:00
import resampy
2023-01-14 00:44:30 +03:00
2023-02-10 18:59:44 +03:00
2023-03-07 16:30:48 +03:00
from voice_changer . IORecorder import IORecorder
2023-05-06 22:18:18 +03:00
from voice_changer . Local . AudioDeviceList import ServerAudioDevice , list_audio_device
2023-04-28 00:39:51 +03:00
from voice_changer . utils . LoadModelParams import LoadModelParams
2023-03-07 16:30:48 +03:00
2023-04-12 19:13:25 +03:00
from voice_changer . utils . Timer import Timer
from voice_changer . utils . VoiceChangerModel import VoiceChangerModel , AudioInOut
2023-05-03 07:14:00 +03:00
from Exceptions import (
2023-06-03 12:05:10 +03:00
DeviceCannotSupportHalfPrecisionException ,
2023-05-04 11:15:53 +03:00
DeviceChangingException ,
2023-05-03 07:14:00 +03:00
HalfPrecisionChangingException ,
NoModeLoadedException ,
2023-05-04 11:15:53 +03:00
NotEnoughDataExtimateF0 ,
2023-05-03 07:14:00 +03:00
ONNXInputArgumentException ,
2023-06-03 11:13:36 +03:00
VoiceChangerIsNotSelectedException ,
2023-05-03 07:14:00 +03:00
)
2023-04-27 17:38:25 +03:00
from voice_changer . utils . VoiceChangerParams import VoiceChangerParams
2023-05-06 22:18:18 +03:00
import threading
import time
2023-05-09 12:59:36 +03:00
import sounddevice as sd
import librosa
2023-04-10 03:28:00 +03:00
2023-03-07 16:30:48 +03:00
# WAV files handed to IORecorder when the "recordIO" setting is switched on.
# The file names must not carry surrounding whitespace, otherwise the files are
# created under names that are awkward to locate.
STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
2023-02-12 06:25:57 +03:00
2023-01-08 10:18:20 +03:00
@dataclass
class VoiceChangerSettings:
    """Settings shared by the voice changer front-end and the server audio loop.

    ``intData``, ``floatData`` and ``strData`` list the names of the fields that
    ``VoiceChanger.update_settings`` is allowed to modify, grouped by the type
    the incoming value is converted to. The names must match the field names
    exactly.
    """

    inputSampleRate: int = 48000  # 48000 or 24000

    crossFadeOffsetRate: float = 0.1
    crossFadeEndRate: float = 0.9
    crossFadeOverlapSize: int = 4096

    recordIO: int = 0  # 0:off, 1:on
    serverAudioInputDevices: list[ServerAudioDevice] = field(default_factory=list)
    serverAudioOutputDevices: list[ServerAudioDevice] = field(default_factory=list)

    enableServerAudio: int = 0  # 0:off, 1:on
    serverAudioStated: int = 0  # 0:off, 1:on
    serverInputAudioSampleRate: int = 44100
    serverOutputAudioSampleRate: int = 44100
    serverInputDeviceId: int = -1
    serverOutputDeviceId: int = -1
    serverReadChunkSize: int = 256
    serverInputAudioGain: float = 1.0
    serverOutputAudioGain: float = 1.0

    performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])

    # Only the mutable settings are listed below.
    intData: list[str] = field(
        default_factory=lambda: [
            "inputSampleRate",
            "crossFadeOverlapSize",
            "recordIO",
            "enableServerAudio",
            "serverAudioStated",
            "serverInputAudioSampleRate",
            "serverOutputAudioSampleRate",
            "serverInputDeviceId",
            "serverOutputDeviceId",
            "serverReadChunkSize",
        ]
    )
    floatData: list[str] = field(
        default_factory=lambda: [
            "crossFadeOffsetRate",
            "crossFadeEndRate",
            "serverInputAudioGain",
            "serverOutputAudioGain",
        ]
    )
    strData: list[str] = field(default_factory=list)
2023-01-08 10:18:20 +03:00
2023-01-28 09:56:56 +03:00
2023-05-09 12:59:36 +03:00
class VoiceChanger:
    """Front-end for the currently selected voice-conversion model.

    The class owns the shared settings, forwards model management calls to the
    selected ``VoiceChangerModel`` and performs the per-chunk conversion
    (resampling, inference, SOLA cross-fade) in :meth:`on_request_sola`.
    ``__init__`` also starts a thread running :meth:`serverLocal`, which drives
    a ``sounddevice`` stream while server audio is enabled.
    """

    settings: VoiceChangerSettings = VoiceChangerSettings()
    voiceChanger: VoiceChangerModel | None = None
    ioRecorder: IORecorder
    sola_buffer: AudioInOut
    namespace: socketio.AsyncNamespace | None = None

    localPerformanceShowTime = 0.0

    # Optional callable that receives the performance list after each callback.
    emitTo = None

    def audio_callback(self, indata: np.ndarray, outdata: np.ndarray, frames, times, status):
        """Stream callback handed to ``sd.Stream`` by :meth:`serverLocal`.

        The input is scaled by ``serverInputAudioGain``, mixed down with
        ``librosa.to_mono``, multiplied by 32768 and converted through
        :meth:`on_request`. The converted signal is copied to every output
        channel, divided by 32768 and scaled by ``serverOutputAudioGain``.
        Timing figures are forwarded to ``emitTo`` (when set) and stored in
        ``settings.performance`` after multiplying by 1000 and rounding.

        Exceptions are reported on stdout and never propagate to the stream.
        When a failure happens before the output was written, ``outdata`` is
        zero-filled so the device does not play uninitialized memory.
        """
        output_written = False
        try:
            indata = indata * self.settings.serverInputAudioGain
            with Timer("all_inference_time") as t:
                unpackedData = librosa.to_mono(indata.T) * 32768.0
                # Named stage_times so the callback's ``times`` argument is not shadowed.
                out_wav, stage_times = self.on_request(unpackedData)
                outputChunnels = outdata.shape[1]
                outdata[:] = np.repeat(out_wav, outputChunnels).reshape(-1, outputChunnels) / 32768.0
                outdata[:] = outdata * self.settings.serverOutputAudioGain
                output_written = True
            all_inference_time = t.secs
            performance = [all_inference_time] + stage_times
            if self.emitTo is not None:
                self.emitTo(performance)
            self.settings.performance = [round(x * 1000) for x in performance]
        except Exception as e:
            if not output_written:
                # sounddevice hands over an uninitialized output buffer.
                outdata.fill(0)
            print("[Voice Changer] ex:", e)

    def getServerAudioDevice(self, audioDeviceList: list[ServerAudioDevice], index: int):
        """Return the first device whose ``index`` equals *index*, or ``None``."""
        return next((x for x in audioDeviceList if x.index == index), None)

    def serverLocal(self, _vc):
        """Endless worker loop that runs the server-side audio stream.

        While server audio is stopped, no input device is chosen or no model is
        selected, the loop resets ``inputSampleRate`` to 48000 and polls every
        two seconds. Otherwise it re-initializes ``sounddevice``, test-opens a
        stream at the model's processing sampling rate (falling back to the
        input device's default sample rate when that fails) and then keeps the
        real stream open until one of the monitored settings changes.

        Args:
            _vc: the ``VoiceChanger`` whose settings are monitored; ``__init__``
                passes ``self``.
        """
        vc: VoiceChanger = _vc

        currentInputDeviceId = -1
        currentModelSamplingRate = -1
        currentOutputDeviceId = -1
        currentInputChunkNum = -1
        while True:
            if vc.settings.serverAudioStated == 0 or vc.settings.serverInputDeviceId == -1 or vc.voiceChanger is None:
                vc.settings.inputSampleRate = 48000
                time.sleep(2)
            else:
                sd._terminate()
                sd._initialize()

                sd.default.device[0] = vc.settings.serverInputDeviceId
                currentInputDeviceId = vc.settings.serverInputDeviceId
                sd.default.device[1] = vc.settings.serverOutputDeviceId
                currentOutputDeviceId = vc.settings.serverOutputDeviceId

                serverInputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioInputDevices, currentInputDeviceId)
                serverOutputAudioDevice = self.getServerAudioDevice(vc.settings.serverAudioOutputDevices, currentOutputDeviceId)
                print(serverInputAudioDevice, serverOutputAudioDevice)
                if serverInputAudioDevice is None or serverOutputAudioDevice is None:
                    time.sleep(2)
                    print("serverInputAudioDevice or serverOutputAudioDevice is None")
                    continue

                currentInputChannelNum = serverInputAudioDevice.maxInputChannels
                currentOutputChannelNum = serverOutputAudioDevice.maxOutputChannels

                currentInputChunkNum = vc.settings.serverReadChunkSize
                block_frame = currentInputChunkNum * 128

                # sample rate precheck(alsa cannot use 40000?)
                try:
                    currentModelSamplingRate = self.voiceChanger.get_processing_sampling_rate()
                except Exception as e:
                    print("[Voice Changer] ex: get_processing_sampling_rate", e)
                    # Back off before retrying; otherwise the loop spins while
                    # re-initializing the audio backend on every iteration.
                    time.sleep(2)
                    continue
                try:
                    # Open and immediately close a stream just to validate the rate.
                    with sd.Stream(
                        callback=self.audio_callback,
                        blocksize=block_frame,
                        samplerate=currentModelSamplingRate,
                        dtype="float32",
                        channels=[currentInputChannelNum, currentOutputChannelNum],
                    ):
                        pass

                    vc.settings.serverInputAudioSampleRate = currentModelSamplingRate
                    vc.settings.inputSampleRate = currentModelSamplingRate
                    print(f"[Voice Changer] sample rate {vc.settings.serverInputAudioSampleRate}")
                except Exception as e:
                    print(
                        "[Voice Changer] ex: fallback to device default samplerate",
                        e,
                    )
                    vc.settings.serverInputAudioSampleRate = serverInputAudioDevice.default_samplerate
                    vc.settings.inputSampleRate = vc.settings.serverInputAudioSampleRate

                # main loop
                try:
                    with sd.Stream(
                        callback=self.audio_callback,
                        blocksize=block_frame,
                        samplerate=vc.settings.serverInputAudioSampleRate,
                        dtype="float32",
                        channels=[currentInputChannelNum, currentOutputChannelNum],
                    ):
                        # Keep the stream alive until any monitored value changes.
                        while (
                            vc.settings.serverAudioStated == 1
                            and currentInputDeviceId == vc.settings.serverInputDeviceId
                            and currentOutputDeviceId == vc.settings.serverOutputDeviceId
                            and currentModelSamplingRate == self.voiceChanger.get_processing_sampling_rate()
                            and currentInputChunkNum == vc.settings.serverReadChunkSize
                        ):
                            time.sleep(2)
                            print(
                                "[Voice Changer] server audio",
                                self.settings.performance,
                            )
                            print(
                                "[Voice Changer] info:",
                                vc.settings.serverAudioStated,
                                currentInputDeviceId,
                                currentOutputDeviceId,
                                vc.settings.serverInputAudioSampleRate,
                                currentInputChunkNum,
                            )
                except Exception as e:
                    print("[Voice Changer] ex:", e)
                    time.sleep(2)

    def __init__(self, params: VoiceChangerParams):
        """Create the settings, enumerate audio devices and start the audio thread.

        Args:
            params: parameters stored on the instance and passed to the model
                classes that accept them in :meth:`switchModelType`.
        """
        # Initialization
        self.settings = VoiceChangerSettings()
        self.onnx_session = None
        self.currentCrossFadeOffsetRate = 0.0
        self.currentCrossFadeEndRate = 0.0
        self.currentCrossFadeOverlapSize = 0  # setting
        self.crossfadeSize = 0  # calculated

        self.voiceChanger = None
        self.modelType: ModelType | None = None
        self.params = params
        self.gpu_num = torch.cuda.device_count()
        self.prev_audio = np.zeros(4096)
        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()

        audioinput, audiooutput = list_audio_device()
        self.settings.serverAudioInputDevices = audioinput
        self.settings.serverAudioOutputDevices = audiooutput

        # NOTE(review): the thread is not a daemon and serverLocal never returns,
        # so it presumably keeps the process alive on shutdown - confirm intent.
        thread = threading.Thread(target=self.serverLocal, args=(self,))
        thread.start()
        print(f"VoiceChanger Initialized (GPU_NUM: {self.gpu_num}, mps_enabled: {self.mps_enabled})")

    def switchModelType(self, modelType: ModelType):
        """Replace the active model with a fresh instance for *modelType*.

        The model module is imported lazily. Unknown model types fall back to
        ``MMVCv13``. Errors are printed together with a traceback.

        Returns:
            ``{"status": "OK", "msg": "vc is switched."}``.
        """
        # NOTE(review): the OK status is returned even when the switch failed;
        # kept as-is because callers may rely on it.
        try:
            if self.voiceChanger is not None:
                # return {"status": "ERROR", "msg": "vc is already selected. currently re-select is not implemented"}
                del self.voiceChanger
                self.voiceChanger = None

            self.modelType = modelType
            if self.modelType == "MMVCv15":
                from voice_changer.MMVCv15.MMVCv15 import MMVCv15

                self.voiceChanger = MMVCv15()  # type: ignore
            elif self.modelType == "MMVCv13":
                from voice_changer.MMVCv13.MMVCv13 import MMVCv13

                self.voiceChanger = MMVCv13()
            elif self.modelType == "so-vits-svc-40v2":
                from voice_changer.SoVitsSvc40v2.SoVitsSvc40v2 import SoVitsSvc40v2

                self.voiceChanger = SoVitsSvc40v2(self.params)
            elif self.modelType == "so-vits-svc-40" or self.modelType == "so-vits-svc-40_c":
                from voice_changer.SoVitsSvc40.SoVitsSvc40 import SoVitsSvc40

                self.voiceChanger = SoVitsSvc40(self.params)
            elif self.modelType == "DDSP-SVC":
                from voice_changer.DDSP_SVC.DDSP_SVC import DDSP_SVC

                self.voiceChanger = DDSP_SVC(self.params)
            elif self.modelType == "RVC":
                from voice_changer.RVC.RVC import RVC

                self.voiceChanger = RVC(self.params)
            else:
                from voice_changer.MMVCv13.MMVCv13 import MMVCv13

                self.voiceChanger = MMVCv13()
        except Exception as e:
            print(e)
            print(traceback.format_exc())
        return {"status": "OK", "msg": "vc is switched."}

    def getModelType(self):
        """Return the selected model type, or ``"none"`` when nothing is selected."""
        if self.modelType is not None:
            return {"status": "OK", "vc": self.modelType}
        else:
            return {"status": "OK", "vc": "none"}

    def loadModel(self, props: LoadModelParams):
        """Forward *props* to the selected model's ``loadModel``.

        Returns:
            The model's result, or ``{"status": "NG"}`` when no model is
            selected or loading raised.
        """
        try:
            if self.voiceChanger is None:
                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
            return self.voiceChanger.loadModel(props)
        except Exception as e:
            print(traceback.format_exc())
            print("[Voice Changer] Model Load Error! Check your model is valid.", e)
            return {"status": "NG"}

    def get_info(self):
        """Return the settings as a dict, merged with the model's ``get_info()``."""
        data = asdict(self.settings)
        if self.voiceChanger is not None:
            data.update(self.voiceChanger.get_info())
        return data

    def get_performance(self):
        """Return the most recently stored performance figures."""
        return self.settings.performance

    def update_settings(self, key: str, val: Any):
        """Update one setting and return the refreshed info dict.

        Keys listed in ``intData`` / ``floatData`` / ``strData`` are converted
        and stored on ``self.settings``; any other key is forwarded to the
        selected model. Nothing is changed while no model is selected.

        For ``recordIO`` the value 1 (re)creates the ``IORecorder`` and the
        values 0 and 2 close an existing one. The comparison uses the converted
        integer so that textual values such as ``"1"`` behave like ``1``.
        """
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return self.get_info()

        if key in self.settings.intData:
            int_val = int(val)
            setattr(self.settings, key, int_val)
            # Cross-fade rate changes need no handling here: they are float
            # settings and _generate_strength compares them on every request.
            if key == "recordIO" and int_val in (0, 1, 2):
                if hasattr(self, "ioRecorder"):
                    self.ioRecorder.close()
                if int_val == 1:
                    self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate)
        elif key in self.settings.floatData:
            setattr(self.settings, key, float(val))
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
        else:
            # The model returns False for immutable or unknown keys; that
            # outcome is deliberately ignored.
            self.voiceChanger.update_settings(key, val)
        return self.get_info()

    def _generate_strength(self, crossfadeSize: int):
        """Rebuild the cross-fade weights when the size or cross-fade settings changed.

        ``np_prev_strength`` is 1 for the first ``cf_offset`` samples, follows
        ``cos(p * pi / 2) ** 2`` over the fade range and is 0 afterwards;
        ``np_cur_strength`` uses ``cos((1 - p) * pi / 2) ** 2`` with 0 before and
        1 after. Rebuilding also drops ``np_prev_audio1`` and ``sola_buffer``.
        """
        unchanged = (
            self.crossfadeSize == crossfadeSize
            and self.currentCrossFadeOffsetRate == self.settings.crossFadeOffsetRate
            and self.currentCrossFadeEndRate == self.settings.crossFadeEndRate
            and self.currentCrossFadeOverlapSize == self.settings.crossFadeOverlapSize
        )
        if unchanged:
            return

        self.crossfadeSize = crossfadeSize
        self.currentCrossFadeOffsetRate = self.settings.crossFadeOffsetRate
        self.currentCrossFadeEndRate = self.settings.crossFadeEndRate
        self.currentCrossFadeOverlapSize = self.settings.crossFadeOverlapSize

        cf_offset = int(crossfadeSize * self.settings.crossFadeOffsetRate)
        cf_end = int(crossfadeSize * self.settings.crossFadeEndRate)
        cf_range = cf_end - cf_offset
        percent = np.arange(cf_range) / cf_range

        np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
        np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2

        self.np_prev_strength = np.concatenate(
            [
                np.ones(cf_offset),
                np_prev_strength,
                np.zeros(crossfadeSize - cf_offset - len(np_prev_strength)),
            ]
        )
        self.np_cur_strength = np.concatenate(
            [
                np.zeros(cf_offset),
                np_cur_strength,
                np.ones(crossfadeSize - cf_offset - len(np_cur_strength)),
            ]
        )

        print(f"Generated Strengths: for prev: {self.np_prev_strength.shape}, for cur: {self.np_cur_strength.shape}")

        # The size differs from the previous result, so discard the stored buffers.
        if hasattr(self, "np_prev_audio1") is True:
            delattr(self, "np_prev_audio1")
        if hasattr(self, "sola_buffer") is True:
            del self.sola_buffer

    @staticmethod
    def _dummy_result() -> tuple[AudioInOut, list[Union[int, float]]]:
        """Return the one-sample silent chunk and zero timings used on failure."""
        return np.zeros(1).astype(np.int16), [0, 0, 0]

    # receivedData: tuple of short
    def on_request(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
        """Convert one chunk; delegates to :meth:`on_request_sola`."""
        return self.on_request_sola(receivedData)

    def on_request_sola(self, receivedData: AudioInOut) -> tuple[AudioInOut, list[Union[int, float]]]:
        """Convert one audio chunk and join it to the previous one with SOLA.

        Steps: resample to the model's processing rate when it differs from
        ``inputSampleRate``, run inference, align the new audio against the
        stored ``sola_buffer`` by normalized correlation, cross-fade, resample
        back, pad to the input length and optionally record input/output.

        Returns:
            ``(outputData, [pre, main, post])`` where the list holds the
            ``Timer.secs`` value of each stage. On any handled exception a
            one-sample zero array and ``[0, 0, 0]`` are returned instead.
        """
        try:
            if self.voiceChanger is None:
                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")

            processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
            # Pre-processing
            with Timer("pre-process") as t:
                if self.settings.inputSampleRate != processing_sampling_rate:
                    newData = cast(
                        AudioInOut,
                        resampy.resample(
                            receivedData,
                            self.settings.inputSampleRate,
                            processing_sampling_rate,
                        ),
                    )
                else:
                    newData = receivedData

                sola_search_frame = int(0.012 * processing_sampling_rate)
                # sola_search_frame = 0
                block_frame = newData.shape[0]
                crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
                self._generate_strength(crossfade_frame)

                data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
            preprocess_time = t.secs

            # Conversion
            with Timer("main-process") as t:
                # Inference
                audio = self.voiceChanger.inference(data)

                if hasattr(self, "sola_buffer") is True:
                    audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
                    audio = audio[audio_offset:]

                    # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
                    cor_nom = np.convolve(
                        audio[: crossfade_frame + sola_search_frame],
                        np.flip(self.sola_buffer),
                        "valid",
                    )
                    cor_den = np.sqrt(
                        np.convolve(
                            audio[: crossfade_frame + sola_search_frame] ** 2,
                            np.ones(crossfade_frame),
                            "valid",
                        )
                        + 1e-3
                    )
                    sola_offset = int(np.argmax(cor_nom / cor_den))
                    sola_end = sola_offset + block_frame
                    output_wav = audio[sola_offset:sola_end].astype(np.float64)
                    output_wav[:crossfade_frame] *= self.np_cur_strength
                    output_wav[:crossfade_frame] += self.sola_buffer[:]

                    result = output_wav
                else:
                    print("[Voice Changer] warming up... generating sola buffer.")
                    result = np.zeros(4096).astype(np.int16)

                # sola_offset only exists when a buffer was present above; the
                # hasattr test short-circuits before it is read otherwise.
                if hasattr(self, "sola_buffer") is True and sola_offset < sola_search_frame:
                    offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
                    end = -1 * (sola_search_frame - sola_offset)
                    sola_buf_org = audio[offset:end]
                    self.sola_buffer = sola_buf_org * self.np_prev_strength
                else:
                    self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
                    # self.sola_buffer = audio[- crossfade_frame:]
            mainprocess_time = t.secs

            # Post-processing
            with Timer("post-process") as t:
                result = result.astype(np.int16)
                if self.settings.inputSampleRate != processing_sampling_rate:
                    outputData = cast(
                        AudioInOut,
                        resampy.resample(
                            result,
                            processing_sampling_rate,
                            self.settings.inputSampleRate,
                        ).astype(np.int16),
                    )
                else:
                    outputData = result

                print_convert_processing(f"Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

                if receivedData.shape[0] != outputData.shape[0]:
                    outputData = pad_array(outputData, receivedData.shape[0])

                if self.settings.recordIO == 1:
                    self.ioRecorder.writeInput(receivedData)
                    self.ioRecorder.writeOutput(outputData.tobytes())
            postprocess_time = t.secs

            print_convert_processing(f"[fin] Input/Output size: {receivedData.shape[0]}, {outputData.shape[0]}")
            perf = [preprocess_time, mainprocess_time, postprocess_time]
            return outputData, perf

        except NoModeLoadedException as e:
            print("[Voice Changer] [Exception]", e)
            return self._dummy_result()
        except ONNXInputArgumentException as e:
            print("[Voice Changer] [Exception] onnx are waiting valid input.", e)
            return self._dummy_result()
        except HalfPrecisionChangingException:
            print("[Voice Changer] Switching model configuration....")
            return self._dummy_result()
        except NotEnoughDataExtimateF0:
            print("[Voice Changer] warming up... waiting more data.")
            return self._dummy_result()
        except DeviceChangingException as e:
            print("[Voice Changer] embedder:", e)
            return self._dummy_result()
        except VoiceChangerIsNotSelectedException:
            print("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
            return self._dummy_result()
        except DeviceCannotSupportHalfPrecisionException:
            # RVC.py performs the fallback, so only dummy data is returned here.
            return self._dummy_result()
        except Exception as e:
            print("[Voice Changer] VC PROCESSING EXCEPTION!!!", e)
            print(traceback.format_exc())
            return self._dummy_result()

    def export2onnx(self):
        """Forward to the selected model's ``export2onnx`` and return its result."""
        # NOTE(review): unlike the methods below there is no guard for
        # ``self.voiceChanger is None``; calling this without a model raises
        # AttributeError. Left unchanged because callers may depend on it.
        return self.voiceChanger.export2onnx()

        ##############

    def merge_models(self, request: str):
        """Forward *request* to the model's ``merge_models``; return the info dict.

        Returns ``None`` (after printing a message) when no model is selected.
        """
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChanger.merge_models(request)
        return self.get_info()

    def update_model_default(self):
        """Call the model's ``update_model_default``; return the info dict.

        Returns ``None`` (after printing a message) when no model is selected.
        """
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChanger.update_model_default()
        return self.get_info()

    def update_model_info(self, newData: str):
        """Forward *newData* to the model's ``update_model_info``; return the info dict.

        Returns ``None`` (after printing a message) when no model is selected.
        """
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChanger.update_model_info(newData)
        return self.get_info()

    def upload_model_assets(self, params: str):
        """Forward *params* to the model's ``upload_model_assets``; return the info dict.

        Returns ``None`` (after printing a message) when no model is selected.
        """
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChanger.upload_model_assets(params)
        return self.get_info()
2023-04-27 17:38:25 +03:00
2023-04-10 03:28:00 +03:00
# Module-wide switch for the verbose per-chunk conversion trace.
PRINT_CONVERT_PROCESSING: bool = False
# PRINT_CONVERT_PROCESSING = True


def print_convert_processing(mess: str):
    """Print *mess* only while ``PRINT_CONVERT_PROCESSING`` is exactly ``True``."""
    if PRINT_CONVERT_PROCESSING is not True:
        return
    print(mess)
2023-04-12 19:13:25 +03:00
def pad_array(arr: AudioInOut, target_length: int):
    """Pad *arr* on both sides until its first dimension reaches *target_length*.

    Arrays that are already long enough are returned unchanged. The missing
    samples are split between the left and the right side (the right side gets
    the extra one when the amount is odd) and filled by repeating the edge
    values. An empty array has no edge value to repeat, so it is zero-padded
    instead of letting ``np.pad`` raise.
    """
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr

    pad_width = target_length - current_length
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left
    if current_length == 0:
        # mode="edge" cannot extend an empty axis.
        return np.pad(arr, (pad_left, pad_right), "constant", constant_values=(0, 0))
    return np.pad(arr, (pad_left, pad_right), "edge")