mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-01-23 13:35:12 +03:00
customize pipeline
This commit is contained in:
parent
e780af7fc2
commit
0d06fcd16b
@ -105,10 +105,6 @@
|
|||||||
"name": "indexRatio",
|
"name": "indexRatio",
|
||||||
"options": {}
|
"options": {}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"name": "noiseScale",
|
|
||||||
"options": {}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"name": "silentThreshold",
|
"name": "silentThreshold",
|
||||||
"options": {}
|
"options": {}
|
||||||
|
@ -180,14 +180,12 @@ class RVC:
|
|||||||
convertSize = convertSize + (128 - (convertSize % 128))
|
convertSize = convertSize + (128 - (convertSize % 128))
|
||||||
|
|
||||||
self.audio_buffer = self.audio_buffer[-1 * convertSize:] # 変換対象の部分だけ抽出
|
self.audio_buffer = self.audio_buffer[-1 * convertSize:] # 変換対象の部分だけ抽出
|
||||||
print("convert size", convertSize, self.audio_buffer.shape)
|
|
||||||
|
|
||||||
crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]
|
crop = self.audio_buffer[-1 * (inputSize + crossfadeSize):-1 * (crossfadeSize)]
|
||||||
rms = np.sqrt(np.square(crop).mean(axis=0))
|
rms = np.sqrt(np.square(crop).mean(axis=0))
|
||||||
vol = max(rms, self.prevVol * 0.0)
|
vol = max(rms, self.prevVol * 0.0)
|
||||||
self.prevVol = vol
|
self.prevVol = vol
|
||||||
|
|
||||||
print("audio len 01,", len(self.audio_buffer))
|
|
||||||
return (self.audio_buffer, convertSize, vol)
|
return (self.audio_buffer, convertSize, vol)
|
||||||
|
|
||||||
def _onnx_inference(self, data):
|
def _onnx_inference(self, data):
|
||||||
@ -212,7 +210,9 @@ class RVC:
|
|||||||
return np.zeros(convertSize).astype(np.int16)
|
return np.zeros(convertSize).astype(np.int16)
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
vc = VC(self.settings.modelSamplingRate, dev, self.is_half)
|
repeat = 3 if self.is_half else 1
|
||||||
|
repeat *= self.settings.rvcQuality # 0 or 3
|
||||||
|
vc = VC(self.settings.modelSamplingRate, dev, self.is_half, repeat)
|
||||||
sid = 0
|
sid = 0
|
||||||
times = [0, 0, 0]
|
times = [0, 0, 0]
|
||||||
f0_up_key = self.settings.tran
|
f0_up_key = self.settings.tran
|
||||||
@ -245,9 +245,7 @@ class RVC:
|
|||||||
audio = data[0]
|
audio = data[0]
|
||||||
convertSize = data[1]
|
convertSize = data[1]
|
||||||
vol = data[2]
|
vol = data[2]
|
||||||
print("audio len 02,", len(audio))
|
|
||||||
audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
|
audio = resampy.resample(audio, self.settings.modelSamplingRate, 16000)
|
||||||
print("audio len 03,", len(audio))
|
|
||||||
|
|
||||||
if vol < self.settings.silentThreshold:
|
if vol < self.settings.silentThreshold:
|
||||||
return np.zeros(convertSize).astype(np.int16)
|
return np.zeros(convertSize).astype(np.int16)
|
||||||
@ -266,7 +264,6 @@ class RVC:
|
|||||||
if_f0 = 1
|
if_f0 = 1
|
||||||
f0_file = None
|
f0_file = None
|
||||||
|
|
||||||
print("audio len 0,", len(audio))
|
|
||||||
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
|
audio_out = vc.pipeline(self.hubert_model, self.net_g, sid, audio, times, f0_up_key, f0_method,
|
||||||
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
|
file_index, file_big_npy, index_rate, if_f0, f0_file=f0_file)
|
||||||
result = audio_out * np.sqrt(vol)
|
result = audio_out * np.sqrt(vol)
|
||||||
|
@ -76,7 +76,6 @@ class VC(object):
|
|||||||
assert feats.dim() == 1, feats.dim()
|
assert feats.dim() == 1, feats.dim()
|
||||||
feats = feats.view(1, -1)
|
feats = feats.view(1, -1)
|
||||||
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
|
padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False)
|
||||||
print("padding_mask", padding_mask)
|
|
||||||
|
|
||||||
inputs = {
|
inputs = {
|
||||||
"source": feats.to(self.device),
|
"source": feats.to(self.device),
|
||||||
@ -98,9 +97,8 @@ class VC(object):
|
|||||||
npy = npy.astype("float16")
|
npy = npy.astype("float16")
|
||||||
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
|
feats = torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + (1 - index_rate) * feats
|
||||||
|
|
||||||
print("feats shape1", feats.shape)
|
|
||||||
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1)
|
||||||
print("feats shape2", feats.shape)
|
|
||||||
t1 = ttime()
|
t1 = ttime()
|
||||||
p_len = audio0.shape[0] // self.window
|
p_len = audio0.shape[0] // self.window
|
||||||
if (feats.shape[1] < p_len):
|
if (feats.shape[1] < p_len):
|
||||||
@ -109,23 +107,18 @@ class VC(object):
|
|||||||
pitch = pitch[:, :p_len]
|
pitch = pitch[:, :p_len]
|
||||||
pitchf = pitchf[:, :p_len]
|
pitchf = pitchf[:, :p_len]
|
||||||
p_len = torch.tensor([p_len], device=self.device).long()
|
p_len = torch.tensor([p_len], device=self.device).long()
|
||||||
|
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
print("vc audio len feat 1,", feats.shape)
|
|
||||||
if (pitch != None and pitchf != None):
|
|
||||||
print("vc audio len feat use pitch!!!!!!!,", feats.shape)
|
|
||||||
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
|
audio1 = (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
|
||||||
else:
|
|
||||||
audio1 = (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768).data.cpu().float().numpy().astype(np.int16)
|
|
||||||
del feats, p_len, padding_mask
|
del feats, p_len, padding_mask
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
t2 = ttime()
|
t2 = ttime()
|
||||||
times[0] += (t1 - t0)
|
times[0] += (t1 - t0)
|
||||||
times[2] += (t2 - t1)
|
times[2] += (t2 - t1)
|
||||||
print("vc audio return", len(audio1), audio1)
|
|
||||||
return audio1
|
return audio1
|
||||||
|
|
||||||
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None):
|
def pipeline(self, model, net_g, sid, audio, times, f0_up_key, f0_method, file_index, file_big_npy, index_rate, if_f0, f0_file=None):
|
||||||
print("audio len 1,", len(audio))
|
|
||||||
if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0):
|
if (file_big_npy != "" and file_index != "" and os.path.exists(file_big_npy) == True and os.path.exists(file_index) == True and index_rate != 0):
|
||||||
try:
|
try:
|
||||||
index = faiss.read_index(file_index)
|
index = faiss.read_index(file_index)
|
||||||
@ -135,13 +128,7 @@ class VC(object):
|
|||||||
index = big_npy = None
|
index = big_npy = None
|
||||||
else:
|
else:
|
||||||
index = big_npy = None
|
index = big_npy = None
|
||||||
audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode='reflect')
|
|
||||||
print("audio_pad len 1,", len(audio_pad))
|
|
||||||
opt_ts = []
|
|
||||||
|
|
||||||
print("audio_pad len 2,", len(audio_pad), opt_ts)
|
|
||||||
|
|
||||||
s = 0
|
|
||||||
audio_opt = []
|
audio_opt = []
|
||||||
t = None
|
t = None
|
||||||
t1 = ttime()
|
t1 = ttime()
|
||||||
@ -153,7 +140,6 @@ class VC(object):
|
|||||||
pitch, pitchf = None, None
|
pitch, pitchf = None, None
|
||||||
if (if_f0 == 1):
|
if (if_f0 == 1):
|
||||||
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
|
pitch, pitchf = self.get_f0(audio_pad, p_len, f0_up_key, f0_method, inp_f0)
|
||||||
print("pitch!", pitch)
|
|
||||||
pitch = pitch[:p_len]
|
pitch = pitch[:p_len]
|
||||||
pitchf = pitchf[:p_len]
|
pitchf = pitchf[:p_len]
|
||||||
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
|
||||||
@ -170,5 +156,4 @@ class VC(object):
|
|||||||
audio_opt = np.concatenate(audio_opt)
|
audio_opt = np.concatenate(audio_opt)
|
||||||
del pitch, pitchf, sid
|
del pitch, pitchf, sid
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
print("result", audio_opt)
|
|
||||||
return audio_opt
|
return audio_opt
|
||||||
|
Loading…
Reference in New Issue
Block a user