2023-04-28 00:39:51 +03:00
from typing import Any , Union , cast
2023-05-06 22:18:18 +03:00
2023-04-10 18:21:17 +03:00
from const import TMP_DIR , ModelType
2022-12-31 10:08:14 +03:00
import torch
2023-01-28 09:56:56 +03:00
import os
import traceback
2022-12-31 10:08:14 +03:00
import numpy as np
2023-04-10 18:21:17 +03:00
from dataclasses import dataclass , asdict , field
2023-02-18 14:53:15 +03:00
import resampy
2023-01-14 00:44:30 +03:00
2023-02-10 18:59:44 +03:00
2023-03-07 16:30:48 +03:00
from voice_changer . IORecorder import IORecorder
2023-04-28 00:39:51 +03:00
from voice_changer . utils . LoadModelParams import LoadModelParams
2023-03-07 16:30:48 +03:00
2023-04-12 19:13:25 +03:00
from voice_changer . utils . Timer import Timer
2023-06-16 11:12:03 +03:00
from voice_changer . utils . VoiceChangerModel import AudioInOut
2023-05-03 07:14:00 +03:00
from Exceptions import (
2023-06-03 12:05:10 +03:00
DeviceCannotSupportHalfPrecisionException ,
2023-05-04 11:15:53 +03:00
DeviceChangingException ,
2023-05-03 07:14:00 +03:00
HalfPrecisionChangingException ,
NoModeLoadedException ,
2023-05-04 11:15:53 +03:00
NotEnoughDataExtimateF0 ,
2023-05-03 07:14:00 +03:00
ONNXInputArgumentException ,
2023-06-03 11:13:36 +03:00
VoiceChangerIsNotSelectedException ,
2023-05-03 07:14:00 +03:00
)
2023-04-27 17:38:25 +03:00
from voice_changer . utils . VoiceChangerParams import VoiceChangerParams
2023-04-10 03:28:00 +03:00
2023-03-07 16:30:48 +03:00
# Files handed to IORecorder when the "recordIO" setting is switched on.
STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
2023-02-12 06:25:57 +03:00
2023-01-08 10:18:20 +03:00
@dataclass
class VoiceChangerSettings:
    """Settings owned by the VoiceChanger wrapper itself.

    ``intData``, ``floatData`` and ``strData`` list the names of the fields
    that ``VoiceChanger.update_settings`` is allowed to overwrite, grouped by
    the type the incoming value is coerced to. Every name in those lists must
    therefore match a field of this class exactly.
    """

    inputSampleRate: int = 48000  # 48000 or 24000

    crossFadeOffsetRate: float = 0.1
    crossFadeEndRate: float = 0.9
    crossFadeOverlapSize: int = 4096

    recordIO: int = 0  # 0:off, 1:on

    performance: list[int] = field(default_factory=lambda: [0, 0, 0, 0])

    # Only the mutable settings are enumerated below.
    intData: list[str] = field(
        default_factory=lambda: [
            "inputSampleRate",
            "crossFadeOverlapSize",
            "recordIO",
        ]
    )
    floatData: list[str] = field(
        default_factory=lambda: [
            "crossFadeOffsetRate",
            "crossFadeEndRate",
        ]
    )
    strData: list[str] = field(default_factory=list)
2023-01-08 10:18:20 +03:00
2023-01-28 09:56:56 +03:00
2023-05-09 12:59:36 +03:00
class VoiceChanger:
    """Front-end around a model-specific voice changer.

    The concrete model object is injected with ``setModel``. This class adds
    what is independent of the model: resampling between the client rate and
    the model's processing rate, SOLA-based cross-fading between consecutive
    blocks, optional recording of the input/output streams, and translation of
    known exceptions into a harmless dummy response.
    """

    ioRecorder: "IORecorder"
    sola_buffer: "AudioInOut"

    def __init__(self, params: "VoiceChangerParams"):
        """Create the wrapper with default settings and no model selected."""
        # Initialisation
        self.settings = VoiceChangerSettings()
        self.onnx_session = None
        self.currentCrossFadeOffsetRate = 0.0
        self.currentCrossFadeEndRate = 0.0
        self.currentCrossFadeOverlapSize = 0  # setting
        self.crossfadeSize = 0  # calculated

        self.voiceChanger = None
        self.modelType: "ModelType | None" = None
        self.params = params
        self.gpu_num = torch.cuda.device_count()
        self.prev_audio = np.zeros(4096)
        # torch.backends.mps does not exist on every torch build, hence the getattr probe.
        self.mps_enabled: bool = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()

        print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")

    def setModel(self, model: Any):
        """Select the model object that all requests are delegated to."""
        self.voiceChanger = model

    def getModelType(self):
        """Return ``{"status": "OK", "vc": <modelType or "none">}``."""
        if self.modelType is not None:
            return {"status": "OK", "vc": self.modelType}
        else:
            return {"status": "OK", "vc": "none"}

    def loadModel(self, props: "LoadModelParams"):
        """Ask the selected model to load *props*.

        Returns whatever the model's ``loadModel`` returns, or
        ``{"status": "NG"}`` if no model is selected or loading raised.
        """
        try:
            if self.voiceChanger is None:
                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")
            return self.voiceChanger.loadModel(props)
        except Exception as e:
            print(traceback.format_exc())
            print("[Voice Changer] Model Load Error! Check your model is valid.", e)
            return {"status": "NG"}

    def get_info(self):
        """Return the wrapper settings as a dict, merged with the model's own info if one is selected."""
        data = asdict(self.settings)
        if self.voiceChanger is not None:
            data.update(self.voiceChanger.get_info())
        return data

    def get_performance(self):
        """Return the ``performance`` list stored in the settings."""
        return self.settings.performance

    def update_settings(self, key: str, val: Any):
        """Apply one setting and return the refreshed ``get_info()`` dict.

        Keys listed in the settings' ``intData``/``floatData``/``strData`` are
        coerced and stored on this wrapper; every other key is forwarded to
        the selected model. Nothing is changed when no model is selected.
        """
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return self.get_info()

        if key == "serverAudioStated" and val == 0:
            self.settings.inputSampleRate = 48000

        if key in self.settings.intData:
            int_val = int(val)
            setattr(self.settings, key, int_val)
            if key == "recordIO":
                # Decide on the coerced value so "1" and 1 behave the same.
                self._switch_recording(int_val)
        elif key in self.settings.floatData:
            # No explicit cross-fade reset is needed here: _generate_strength
            # compares the stored rates with the settings on every request.
            setattr(self.settings, key, float(val))
        elif key in self.settings.strData:
            setattr(self.settings, key, str(val))
        else:
            ret = self.voiceChanger.update_settings(key, val)
            if ret is False:
                pass
                # print(f"({key} is not mutable variable or unknown variable)")
        return self.get_info()

    def _switch_recording(self, mode: int):
        """Close any open recorder and, when *mode* is 1, open a fresh one."""
        if mode not in (0, 1, 2):
            return
        if hasattr(self, "ioRecorder"):
            self.ioRecorder.close()
        if mode == 1:
            self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate)

    def _generate_strength(self, crossfadeSize: int):
        """(Re)build the cross-fade weight curves when size or rates changed.

        Produces ``np_prev_strength`` (fade-out for the previous block) and
        ``np_cur_strength`` (fade-in for the current block), both of length
        *crossfadeSize* and summing to 1 at every sample. Does nothing when
        the size and all cross-fade settings equal the values used last time.
        """
        settings = self.settings
        if (
            self.crossfadeSize == crossfadeSize
            and self.currentCrossFadeOffsetRate == settings.crossFadeOffsetRate
            and self.currentCrossFadeEndRate == settings.crossFadeEndRate
            and self.currentCrossFadeOverlapSize == settings.crossFadeOverlapSize
        ):
            return

        self.crossfadeSize = crossfadeSize
        self.currentCrossFadeOffsetRate = settings.crossFadeOffsetRate
        self.currentCrossFadeEndRate = settings.crossFadeEndRate
        self.currentCrossFadeOverlapSize = settings.crossFadeOverlapSize

        # Clamp to 0 <= cf_offset <= cf_end <= crossfadeSize so that rates
        # outside [0, 1] (or offset > end) cannot produce negative lengths.
        cf_offset = min(max(int(crossfadeSize * settings.crossFadeOffsetRate), 0), crossfadeSize)
        cf_end = min(max(int(crossfadeSize * settings.crossFadeEndRate), cf_offset), crossfadeSize)
        cf_range = cf_end - cf_offset
        percent = np.arange(cf_range) / cf_range if cf_range > 0 else np.zeros(0)
        # cos^2 / sin^2 pair: the two curves add up to exactly 1.
        np_prev_strength = np.cos(percent * 0.5 * np.pi) ** 2
        np_cur_strength = np.cos((1 - percent) * 0.5 * np.pi) ** 2

        tail = crossfadeSize - cf_offset - cf_range
        self.np_prev_strength = np.concatenate(
            [
                np.ones(cf_offset),
                np_prev_strength,
                np.zeros(tail),
            ]
        )
        self.np_cur_strength = np.concatenate(
            [
                np.zeros(cf_offset),
                np_cur_strength,
                np.ones(tail),
            ]
        )

        print(f"Generated Strengths: for prev:{self.np_prev_strength.shape}, for cur:{self.np_cur_strength.shape}")

        # The size differs from the previous result, so drop the stored buffers.
        if hasattr(self, "np_prev_audio1"):
            delattr(self, "np_prev_audio1")
        if hasattr(self, "sola_buffer"):
            del self.sola_buffer

    def get_processing_sampling_rate(self):
        """Return the model's processing sampling rate, or 0 when no model is selected."""
        if self.voiceChanger is None:
            return 0
        else:
            return self.voiceChanger.get_processing_sampling_rate()

    # receivedData: tuple of short
    def on_request(self, receivedData: "AudioInOut") -> "tuple[AudioInOut, list[Union[int, float]]]":
        """Convert one block of audio; thin alias for ``on_request_sola``."""
        return self.on_request_sola(receivedData)

    @staticmethod
    def _empty_result() -> "tuple[AudioInOut, list[Union[int, float]]]":
        """Return the dummy response used whenever conversion cannot be performed."""
        return np.zeros(1).astype(np.int16), [0, 0, 0]

    def on_request_sola(self, receivedData: "AudioInOut") -> "tuple[AudioInOut, list[Union[int, float]]]":
        """Convert one block of audio and cross-fade it with the previous block.

        Returns ``(audio, [pre, main, post])`` where ``audio`` is int16 and the
        list holds the ``secs`` value of the Timer around each stage. The very
        first block after a (re)configuration only primes the SOLA buffer and
        yields silence. Every exception is caught here; the method then
        returns a single zero sample and ``[0, 0, 0]``.
        """
        try:
            if self.voiceChanger is None:
                raise VoiceChangerIsNotSelectedException("Voice Changer is not selected.")

            processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
            # Pre-processing
            with Timer("pre-process") as t:
                if self.settings.inputSampleRate != processing_sampling_rate:
                    newData = cast(
                        AudioInOut,
                        resampy.resample(
                            receivedData,
                            self.settings.inputSampleRate,
                            processing_sampling_rate,
                        ),
                    )
                else:
                    newData = receivedData

                sola_search_frame = int(0.012 * processing_sampling_rate)
                # sola_search_frame = 0
                block_frame = newData.shape[0]
                crossfade_frame = min(self.settings.crossFadeOverlapSize, block_frame)
                # May delete self.sola_buffer, which sends this request down the warm-up path below.
                self._generate_strength(crossfade_frame)

                data = self.voiceChanger.generate_input(newData, block_frame, crossfade_frame, sola_search_frame)
            preprocess_time = t.secs

            # Conversion
            with Timer("main-process") as t:
                # Inference
                audio = self.voiceChanger.inference(data)

                if hasattr(self, "sola_buffer"):
                    audio_offset = -1 * (sola_search_frame + crossfade_frame + block_frame)
                    audio = audio[audio_offset:]

                    # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC, https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI
                    # Normalised correlation of the block head against the saved tail;
                    # its argmax is the offset where the two line up best.
                    cor_nom = np.convolve(
                        audio[: crossfade_frame + sola_search_frame],
                        np.flip(self.sola_buffer),
                        "valid",
                    )
                    cor_den = np.sqrt(
                        np.convolve(
                            audio[: crossfade_frame + sola_search_frame] ** 2,
                            np.ones(crossfade_frame),
                            "valid",
                        )
                        + 1e-3
                    )
                    sola_offset = int(np.argmax(cor_nom / cor_den))
                    sola_end = sola_offset + block_frame
                    output_wav = audio[sola_offset:sola_end].astype(np.float64)
                    output_wav[:crossfade_frame] *= self.np_cur_strength
                    output_wav[:crossfade_frame] += self.sola_buffer[:]

                    result = output_wav
                else:
                    print("[Voice Changer] warming up... generating sola buffer.")
                    result = np.zeros(4096).astype(np.int16)

                # sola_offset only exists when the branch above ran; the hasattr test short-circuits otherwise.
                if hasattr(self, "sola_buffer") and sola_offset < sola_search_frame:
                    offset = -1 * (sola_search_frame + crossfade_frame - sola_offset)
                    end = -1 * (sola_search_frame - sola_offset)
                    sola_buf_org = audio[offset:end]
                    self.sola_buffer = sola_buf_org * self.np_prev_strength
                else:
                    self.sola_buffer = audio[-crossfade_frame:] * self.np_prev_strength
                    # self.sola_buffer = audio[- crossfade_frame:]
            mainprocess_time = t.secs

            # Post-processing
            with Timer("post-process") as t:
                result = result.astype(np.int16)
                if self.settings.inputSampleRate != processing_sampling_rate:
                    outputData = cast(
                        AudioInOut,
                        resampy.resample(
                            result,
                            processing_sampling_rate,
                            self.settings.inputSampleRate,
                        ).astype(np.int16),
                    )
                else:
                    outputData = result

                print_convert_processing(f"Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

                if receivedData.shape[0] != outputData.shape[0]:
                    # pad_array only lengthens; a longer output is passed through unchanged.
                    outputData = pad_array(outputData, receivedData.shape[0])

                if self.settings.recordIO == 1:
                    self.ioRecorder.writeInput(receivedData)
                    self.ioRecorder.writeOutput(outputData.tobytes())
            postprocess_time = t.secs

            print_convert_processing(f"[fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
            perf = [preprocess_time, mainprocess_time, postprocess_time]
            return outputData, perf
        except NoModeLoadedException as e:
            print("[Voice Changer] [Exception]", e)
            return self._empty_result()
        except ONNXInputArgumentException as e:
            print("[Voice Changer] [Exception] onnx are waiting valid input.", e)
            return self._empty_result()
        except HalfPrecisionChangingException:
            print("[Voice Changer] Switching model configuration....")
            return self._empty_result()
        except NotEnoughDataExtimateF0:
            print("[Voice Changer] warming up... waiting more data.")
            return self._empty_result()
        except DeviceChangingException as e:
            print("[Voice Changer] embedder:", e)
            return self._empty_result()
        except VoiceChangerIsNotSelectedException:
            print("[Voice Changer] Voice Changer is not selected. Wait a bit and if there is no improvement, please re-select vc.")
            return self._empty_result()
        except DeviceCannotSupportHalfPrecisionException:
            # RVC.py performs the fallback itself, so only dummy data is returned here.
            return self._empty_result()
        except Exception as e:
            print("[Voice Changer] VC PROCESSING EXCEPTION!!!", e)
            print(traceback.format_exc())
            return self._empty_result()

    def export2onnx(self):
        """Delegate ONNX export to the selected model.

        NOTE(review): unlike the other entry points there is no guard here, so
        calling this with no model selected raises AttributeError.
        """
        return self.voiceChanger.export2onnx()

    ##############

    def merge_models(self, request: str):
        """Forward a merge request to the selected model and return ``get_info()``.

        Returns ``None`` (after printing a notice) when no model is selected.
        """
        if self.voiceChanger is None:
            print("[Voice Changer] Voice Changer is not selected.")
            return
        self.voiceChanger.merge_models(request)
        return self.get_info()
2023-04-27 17:38:25 +03:00
2023-04-10 03:28:00 +03:00
# Set to True to trace block sizes for every converted request.
PRINT_CONVERT_PROCESSING: bool = False


def print_convert_processing(mess: str):
    """Print *mess* only while the module flag PRINT_CONVERT_PROCESSING is enabled."""
    if PRINT_CONVERT_PROCESSING:
        print(mess)
2023-04-12 19:13:25 +03:00
def pad_array(arr: "AudioInOut", target_length: int) -> "AudioInOut":
    """Lengthen *arr* to *target_length* samples, splitting the padding between both ends.

    The array is returned unchanged when it is already at least
    *target_length* long. Padding repeats the nearest edge sample rather than
    inserting zeros; when the padding is odd the extra sample goes on the
    right. An empty array has no edge sample to repeat, so it is zero-filled.
    """
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr

    pad_width = target_length - current_length
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left
    # np.pad refuses mode "edge" on an empty axis, so fall back to zeros there.
    mode = "edge" if current_length > 0 else "constant"
    return np.pad(arr, (pad_left, pad_right), mode)