This commit is contained in:
wataru 2023-04-28 17:18:33 +09:00
parent 50d1977d50
commit 569d8d2c0c
2 changed files with 17 additions and 35 deletions

View File

@ -191,8 +191,8 @@ class DDSP_SVC:
setattr(self.settings, key, str(val)) setattr(self.settings, key, str(val))
if key == "f0Detector": if key == "f0Detector":
print("f0Detector update", val) print("f0Detector update", val)
if val == "dio": # if val == "dio":
val = "parselmouth" # val = "parselmouth"
if hasattr(self, "sampling_rate") is False: if hasattr(self, "sampling_rate") is False:
self.sampling_rate = 44100 self.sampling_rate = 44100

View File

@ -1,5 +1,6 @@
import numpy as np import numpy as np
import parselmouth
# import parselmouth
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from config import x_query, x_center, x_max # type:ignore from config import x_query, x_center, x_max # type:ignore
@ -27,28 +28,13 @@ class VC(object):
silence_front_offset = int(np.round(real_silence_front * self.sr)) silence_front_offset = int(np.round(real_silence_front * self.sr))
audio = audio[silence_front_offset:] audio = audio[silence_front_offset:]
time_step = self.window / self.sr * 1000 # time_step = self.window / self.sr * 1000
f0_min = 50 f0_min = 50
f0_max = 1100 f0_max = 1100
f0_mel_min = 1127 * np.log(1 + f0_min / 700) f0_mel_min = 1127 * np.log(1 + f0_min / 700)
f0_mel_max = 1127 * np.log(1 + f0_max / 700) f0_mel_max = 1127 * np.log(1 + f0_max / 700)
if f0_method == "pm": if f0_method == "pm":
f0 = ( print("not implemented. use harvest")
parselmouth.Sound(audio, self.sr)
.to_pitch_ac(
time_step=time_step / 1000,
voicing_threshold=0.6,
pitch_floor=f0_min,
pitch_ceiling=f0_max,
)
.selected_array["frequency"]
)
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
elif f0_method == "harvest":
f0, t = pyworld.harvest( f0, t = pyworld.harvest(
audio.astype(np.double), audio.astype(np.double),
fs=self.sr, fs=self.sr,
@ -62,22 +48,18 @@ class VC(object):
f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame) f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)
) )
else: else:
print("[Voice Changer] invalid f0 detector, use pm.", f0_method) f0, t = pyworld.harvest(
f0 = ( audio.astype(np.double),
parselmouth.Sound(audio, self.sr) fs=self.sr,
.to_pitch_ac( f0_ceil=f0_max,
time_step=time_step / 1000, frame_period=10,
voicing_threshold=0.6, )
pitch_floor=f0_min, f0 = pyworld.stonemask(audio.astype(np.double), f0, t, self.sr)
pitch_ceiling=f0_max, f0 = signal.medfilt(f0, 3)
)
.selected_array["frequency"] f0 = np.pad(
f0.astype("float"), (start_frame, n_frames - len(f0) - start_frame)
) )
pad_size = (p_len - len(f0) + 1) // 2
if pad_size > 0 or p_len - len(f0) - pad_size > 0:
f0 = np.pad(
f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant"
)
f0 *= pow(2, f0_up_key / 12) f0 *= pow(2, f0_up_key / 12)
f0bak = f0.copy() f0bak = f0.copy()