mirror of https://github.com/w-okada/voice-changer.git
synced 2025-03-11 09:58:49 +03:00

WIP: support so-vits-svc, message improvement

This commit is contained in:
parent 010eb395ef
commit d7cff0709f
@@ -141,7 +141,8 @@ class SoVitsSvc40v2:
         wav_44k = audio_buffer
         # f0 = utils.compute_f0_parselmouth(wav, sampling_rate=self.target_sample, hop_length=self.hop_size)
         f0 = utils.compute_f0_dio(wav_44k, sampling_rate=self.hps.data.sampling_rate, hop_length=self.hps.data.hop_length)
-        print(f"--- >>>>> ---- >>>> {wav_44k.shape[0] / self.hps.data.hop_length}")
+        if wav_44k.shape[0] % self.hps.data.hop_length != 0:
+            print(f" !!! !!! !!! wav size not multiple of hopsize: {wav_44k.shape[0] / self.hps.data.hop_length}")

         f0, uv = utils.interpolate_f0(f0)
         f0 = torch.FloatTensor(f0)
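For context, compute_f0_dio produces roughly one f0 value per hop, which is why the new check warns when the buffer length is not a multiple of hop_length. A minimal sketch of DIO-based f0 extraction, assuming the usual pyworld backend behind so-vits-svc's helper (the function name and exact refinement step here are illustrative, not the repo's code):

import numpy as np
import pyworld

def compute_f0_dio_sketch(wav: np.ndarray, sampling_rate: int, hop_length: int) -> np.ndarray:
    # DIO expects float64 audio; frame_period is the hop expressed in milliseconds.
    x = wav.astype(np.float64)
    frame_period = 1000 * hop_length / sampling_rate
    f0, t = pyworld.dio(x, sampling_rate, frame_period=frame_period)
    # StoneMask refines the coarse DIO estimate.
    f0 = pyworld.stonemask(x, f0, t, sampling_rate)
    return f0  # roughly one value per hop_length samples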
@@ -147,7 +147,7 @@ class VoiceChanger():
     # receivedData: tuple of short
     def on_request(self, receivedData: any):
         processing_sampling_rate = self.voiceChanger.get_processing_sampling_rate()
-        print(f"------------ Convert processing.... ------------")
+        print_convert_processing(f"------------ Convert processing.... ------------")
         # pre-processing
         with Timer("pre-process") as t:

@@ -158,7 +158,8 @@ class VoiceChanger():

             inputSize = newData.shape[0]
             convertSize = inputSize + min(self.settings.crossFadeOverlapSize, inputSize)
-            print(f" Input data size of {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
+            print_convert_processing(
+                f" Input data size of {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")

             if convertSize < 8192:
                 convertSize = 8192
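The sizing rule above widens the conversion window by the cross-fade overlap and then enforces a floor; a toy walk-through with made-up numbers:

inputSize = 4096                     # samples at the processing rate
crossFadeOverlapSize = 1024          # hypothetical setting value
convertSize = inputSize + min(crossFadeOverlapSize, inputSize)  # 5120
if convertSize < 8192:
    convertSize = 8192               # floor so the model sees enough context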
@@ -170,8 +171,9 @@ class VoiceChanger():
             overlapSize = min(self.settings.crossFadeOverlapSize, inputSize)
             cropRange = (-1 * (inputSize + overlapSize), -1 * overlapSize)

-            print(f" Convert input data size of {convertSize}")
-            print(f" overlap:{overlapSize}, cropRange:{cropRange}")
+            print_convert_processing(f" Convert input data size of {convertSize}")
+            print_convert_processing(f" overlap:{overlapSize}, cropRange:{cropRange}")

             self._generate_strength(inputSize)
             data = self.voiceChanger.generate_input(newData, convertSize, cropRange)
         preprocess_time = t.secs
@@ -190,16 +192,18 @@ class VoiceChanger():
                 cur_overlap = audio[cur_overlap_start:cur_overlap_end]
                 # cur_overlap = audio[-1 * (inputSize + overlapSize):-1 * inputSize]
                 powered_prev = prev_overlap * self.np_prev_strength
-                print(f" ---- ---- ---- audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
-                print(f" ---- ---- ---------------- {cur_overlap_start}, {cur_overlap_end}")
+                print_convert_processing(
+                    f" audio:{audio.shape}, cur_overlap:{cur_overlap.shape}, self.np_cur_strength:{self.np_cur_strength.shape}")
+                print_convert_processing(f" cur_overlap_strt:{cur_overlap_start}, cur_overlap_end{cur_overlap_end}")
                 powered_cur = cur_overlap * self.np_cur_strength
                 powered_result = powered_prev + powered_cur

                 cur = audio[-1 * inputSize:-1 * overlapSize]
                 result = np.concatenate([powered_result, cur], axis=0)
-                print(f" overlap:{overlapSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")
-                # print(prev_overlap.shape, self.np_prev_strength.shape, cur_overlap.shape, self.np_cur_strength.shape)
-                # print(">>>>>>>>>>>", -1 * (inputSize + overlapSize), -1 * inputSize, self.np_prev_audio1.shape, overlapSize)
+                print_convert_processing(
+                    f" overlap:{overlapSize}, current:{cur.shape[0]}, result:{result.shape[0]}... result should be same as input")
+                if cur.shape[0] != result.shape[0]:
+                    print_convert_processing(f" current and result should be same as input")

             else:
                 result = np.zeros(4096).astype(np.int16)
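The block above blends the tail of the previous converted chunk into the head of the current one; np_prev_strength and np_cur_strength are complementary fade curves built by _generate_strength. A standalone sketch of the idea with a simple linear ramp (the repo's actual curves may differ):

import numpy as np

overlapSize = 4
np_prev_strength = np.linspace(1.0, 0.0, overlapSize)  # fade the previous chunk out
np_cur_strength = 1.0 - np_prev_strength               # fade the current chunk in

prev_overlap = np.ones(overlapSize)       # tail of the previous converted audio
cur_overlap = np.full(overlapSize, 2.0)   # head of the current converted audio

powered_result = prev_overlap * np_prev_strength + cur_overlap * np_cur_strength
print(powered_result)  # [1.0, 1.33..., 1.66..., 2.0]: a smooth hand-off between chunks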
@@ -221,7 +225,8 @@ class VoiceChanger():
             else:
                 outputData = result

-            print(f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")
+            print_convert_processing(
+                f" Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

             if self.settings.recordIO == 1:
                 self.ioRecorder.writeInput(receivedData)
@@ -229,16 +234,26 @@ class VoiceChanger():

             if receivedData.shape[0] != outputData.shape[0]:
                 outputData = pad_array(outputData, receivedData.shape[0])
-                print(
+                print_convert_processing(
                     f" Padded!, Output data size of {result.shape[0]}/{processing_sampling_rate}hz {outputData.shape[0]}/{self.settings.inputSampleRate}hz")

         postprocess_time = t.secs

-        print(" [fin] Input/Output size:", receivedData.shape[0], outputData.shape[0])
+        print_convert_processing(f" [fin] Input/Output size:{receivedData.shape[0]},{outputData.shape[0]}")
         perf = [preprocess_time, mainprocess_time, postprocess_time]
         return outputData, perf


+##############
+PRINT_CONVERT_PROCESSING = False
+# PRINT_CONVERT_PROCESSING = True
+
+
+def print_convert_processing(mess: str):
+    if PRINT_CONVERT_PROCESSING == True:
+        print(mess)
+
+
 def pad_array(arr, target_length):
     current_length = arr.shape[0]
     if current_length >= target_length:
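PRINT_CONVERT_PROCESSING is a module-level switch: flipping the commented line re-enables all of the trace output at once. The same gating is often expressed with the standard logging module; a sketch of an equivalent, not what this commit does:

import logging

logger = logging.getLogger("voice_changer.convert")
logger.setLevel(logging.WARNING)  # set to logging.DEBUG to re-enable the trace

def print_convert_processing(mess: str):
    logger.debug(mess)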
@@ -250,8 +265,6 @@ def pad_array(arr, target_length):
         padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
         return padded_arr

-##############
-

 class Timer(object):
     def __init__(self, title: str):
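The hunks truncate both pad_array and Timer. From the visible lines and the `with Timer("pre-process") as t: ... t.secs` usage, a plausible reconstruction of both; the elided middle of pad_array is an assumption (center padding), not confirmed by the diff:

import time
import numpy as np

def pad_array(arr, target_length):
    current_length = arr.shape[0]
    if current_length >= target_length:
        return arr
    # Assumed: split the deficit across both ends (center padding).
    pad_width = target_length - current_length
    pad_left = pad_width // 2
    pad_right = pad_width - pad_left
    padded_arr = np.pad(arr, (pad_left, pad_right), 'constant', constant_values=(0, 0))
    return padded_arr

class Timer(object):
    def __init__(self, title: str):
        self.title = title

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, *args):
        self.end = time.time()
        self.secs = self.end - self.start  # read as t.secs after the with-block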