WIP: so-vits-svc 40v2, alpha (refactoring4 apply for v15)

wataru 2023-03-13 02:40:04 +09:00
parent 7a2fd74d6c
commit 5818292046
4 changed files with 18 additions and 9 deletions

File diff suppressed because one or more lines are too long


@@ -28,7 +28,7 @@ const App = () => {
     return (
         <div className="top-title">
             <span className="title">Voice Changer Setting</span>
-            <span className="top-title-version">for v.1.5.x</span>
+            <span className="top-title-version">for MMVC v.1.5.x</span>
             <span className="belongings">
                 <a className="link" href="https://github.com/w-okada/voice-changer" target="_blank" rel="noopener noreferrer">
                     <img src="./assets/icons/github.svg" />


@@ -132,6 +132,9 @@ class MMVCv15:
         return data
 
+    def get_processing_sampling_rate(self):
+        return self.hps.data.sampling_rate
+
     def _get_f0(self, detector: str, newData: any):
         audio_norm_np = newData.astype(np.float64)
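The new get_processing_sampling_rate() getter exposes the model's native rate to the caller. A minimal sketch of how a caller might use it to bring an incoming block to that rate before conversion (the helper name and the resampy dependency are assumptions for illustration, not part of this commit):

import numpy as np
import resampy  # assumed resampler; any equivalent would do

def to_model_rate(model, received_data: np.ndarray, input_sample_rate: int) -> np.ndarray:
    # Hypothetical helper: resample the incoming block to the model's processing rate.
    processing_sampling_rate = model.get_processing_sampling_rate()
    if input_sample_rate == processing_sampling_rate:
        return received_data
    return resampy.resample(received_data.astype(np.float32), input_sample_rate, processing_sampling_rate)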
@@ -146,7 +149,8 @@ class MMVCv15:
 
     def _get_spec(self, newData: any):
         audio = torch.FloatTensor(newData)
-        audio_norm = audio / self.hps.data.max_wav_value  # normalize
+        # audio_norm = audio / self.hps.data.max_wav_value  # normalize
+        audio_norm = audio
         audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
         spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
                                  self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
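Normalization is dropped from _get_spec() because generate_input() (next hunk) now divides by max_wav_value once when the audio enters the buffer. A small sketch of that arithmetic, assuming 16-bit PCM and hps.data.max_wav_value == 32768.0:

import numpy as np

max_wav_value = 32768.0                       # assumed value of hps.data.max_wav_value for 16-bit PCM
pcm = np.array([-32768, 0, 16384, 32767], dtype=np.int16)
normalized = pcm.astype(np.float32) / max_wav_value
print(normalized)                             # [-1.  0.  0.5  0.99996948]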
@@ -154,15 +158,21 @@ class MMVCv15:
         spec = torch.squeeze(spec, 0)
         return spec
 
-    def generate_input(self, newData: any, convertSize: int):
-        newData = newData.astype(np.float32)
+    def generate_input(self, newData: any, inputSize: int, crossfadeSize: int):
+        newData = newData.astype(np.float32) / self.hps.data.max_wav_value
 
         if hasattr(self, "audio_buffer"):
             self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
         else:
             self.audio_buffer = newData
 
-        self.audio_buffer = self.audio_buffer[-(convertSize):]  # extract only the part to be converted
+        convertSize = inputSize + crossfadeSize
+        if convertSize < 8192:
+            convertSize = 8192
+        if convertSize % self.hps.data.hop_length != 0:  # pad to the model's output hop size so nothing is truncated
+            convertSize = convertSize + (self.hps.data.hop_length - (convertSize % self.hps.data.hop_length))
+
+        self.audio_buffer = self.audio_buffer[-1 * convertSize:]  # extract only the part to be converted
 
         f0 = self._get_f0(self.settings.f0Detector, self.audio_buffer)  # generate f0
         spec = self._get_spec(self.audio_buffer)
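The convertSize logic above floors the chunk at 8192 samples and rounds it up to a multiple of hop_length so the hop-based model output is not truncated. A standalone sketch of the same arithmetic (the hop_length and size values below are illustrative; the real value comes from hps.data.hop_length):

def compute_convert_size(input_size: int, crossfade_size: int, hop_length: int) -> int:
    # Mirrors the logic added to generate_input above.
    convert_size = input_size + crossfade_size
    if convert_size < 8192:                   # floor: give the model enough context
        convert_size = 8192
    if convert_size % hop_length != 0:        # round up to the next hop-length multiple
        convert_size += hop_length - (convert_size % hop_length)
    return convert_size

print(compute_convert_size(4096, 1024, 128))  # 8192  (floored; already a multiple of 128)
print(compute_convert_size(9000, 1000, 128))  # 10112 (10000 rounded up to 79 * 128)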


@@ -158,9 +158,8 @@ class VoiceChanger():
         newData = receivedData
         inputSize = newData.shape[0]
-        crossfadeSize = self.settings.crossFadeOverlapSize if self.settings.crossFadeOverlapSize > 0 else inputSize
-
-        # convertSize = inputSize + min(self.settings.crossFadeOverlapSize, inputSize)
+        crossfadeSize = min(self.settings.crossFadeOverlapSize, inputSize)
 
         print_convert_processing(
             f"  Input data size: {receivedData.shape[0]}/{self.settings.inputSampleRate}hz {inputSize}/{processing_sampling_rate}hz")
         print_convert_processing(
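On the VoiceChanger side the crossfade length is now clamped to the input block with min(), and inputSize/crossfadeSize are handed to generate_input() separately instead of a precomputed convertSize. A small sketch of the clamping behaviour (the overlap value is an assumed example of settings.crossFadeOverlapSize):

crossFadeOverlapSize = 4096                   # assumed example setting
for inputSize in (1024, 4096, 16384):
    crossfadeSize = min(crossFadeOverlapSize, inputSize)
    print(inputSize, crossfadeSize)           # the crossfade never exceeds the input block
# 1024 1024
# 4096 4096
# 16384 4096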