WIP: refactor, break down generate input

wataru 2023-03-07 23:14:14 +09:00
parent f76fff5959
commit 6f97320c0d
3 changed files with 39 additions and 28 deletions


@@ -28,7 +28,9 @@ class MMVC_Namespace(socketio.AsyncNamespace):
             print(data)
             await self.emit('response', [timestamp, 0], to=sid)
         else:
-            unpackedData = np.array(struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data))
+            unpackedData = struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data)  # tuple of short
             # audio1, perf = self.voiceChangerManager.changeVoice(unpackedData)
             res = self.voiceChangerManager.changeVoice(unpackedData)
             audio1 = res[0]
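For reference, '<%sh' decodes the binary payload as little-endian 16-bit samples, so the handler now forwards a plain tuple of shorts and leaves the ndarray conversion to the voice changer. A minimal sketch of that decode step (the sample values are invented for illustration):

import struct

import numpy as np

# Fabricate a little-endian int16 PCM payload, as a client would send it.
samples = (0, 1200, -1200, 32767, -32768)
data = struct.pack('<%sh' % len(samples), *samples)

# Decode exactly as the handler does: '<h' is one little-endian short (2 bytes).
unpackedData = struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data)  # tuple of short
print(unpackedData)  # (0, 1200, -1200, 32767, -32768)

# The refactored _generate_input converts the tuple to float32 itself.
newData = np.array(unpackedData).astype(np.float32)
print(newData.dtype, newData.shape)  # float32 (5,)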


@@ -1,3 +1,4 @@
 import sys
 sys.path.append("MMVC_Client/python")
@@ -90,7 +91,6 @@ class VoiceChanger():
         self.gpu_num = torch.cuda.device_count()
         self.text_norm = torch.LongTensor([0, 6, 0])
-        self.audio_buffer = torch.zeros(1, 0)
         self.prev_audio = np.zeros(1)
         self.mps_enabled = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
@@ -234,18 +234,9 @@ class VoiceChanger():
         if hasattr(self, 'np_prev_audio1') == True:
             delattr(self, "np_prev_audio1")

-    def _generate_input(self, unpackedData: any, convertSize: int):
-        # shape the incoming data into a tensor for conversion
-        audio = torch.FloatTensor(unpackedData.astype(np.float32))  # create the tensor as float32
-        audio_norm = audio / self.hps.data.max_wav_value  # normalize
-        audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
-        self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1)  # concatenate with the past data
-        # audio_norm = self.audio_buffer[:, -(convertSize + 1280 * 2):]  # extract only the part to convert
-        audio_norm = self.audio_buffer[:, -(convertSize):]  # extract only the part to convert
-        self.audio_buffer = audio_norm
-
-        # TBD: the data goes back and forth between numpy and pytorch; getting it working comes first.
-        audio_norm_np = audio_norm.squeeze().numpy().astype(np.float64)
+    def _get_f0(self, newData: any):
+        audio_norm_np = newData.astype(np.float64)
         if self.settings.f0Detector == "dio":
             _f0, _time = pw.dio(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5)
             f0 = pw.stonemask(audio_norm_np, _f0, _time, self.hps.data.sampling_rate)
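The new _get_f0 leans entirely on pyworld: dio plus stonemask for the fast path, harvest for the more accurate one, both at a 5.5 ms frame period. A rough standalone sketch of the same branch on a synthetic tone (the 220 Hz test signal and the hard-coded 24 kHz rate are assumptions for illustration; the MMVC-specific convert_continuos_f0 step is omitted):

import numpy as np
import pyworld as pw

sampling_rate = 24000
frame_period = 5.5  # ms, matching the calls above

# A 220 Hz test tone standing in for the mic buffer; pyworld expects float64.
t = np.arange(0, 0.5, 1.0 / sampling_rate)
audio_norm_np = (0.5 * np.sin(2 * np.pi * 220 * t)).astype(np.float64)

f0Detector = "dio"  # or "harvest"
if f0Detector == "dio":
    _f0, _time = pw.dio(audio_norm_np, sampling_rate, frame_period=frame_period)
    f0 = pw.stonemask(audio_norm_np, _f0, _time, sampling_rate)  # refine the coarse dio estimate
else:
    f0, _time = pw.harvest(audio_norm_np, sampling_rate, frame_period=frame_period,
                           f0_floor=71.0, f0_ceil=1000.0)

print(f0.shape, float(np.median(f0[f0 > 0])))  # roughly 220 Hz for the test tone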
@@ -253,18 +244,32 @@ class VoiceChanger():
             f0, t = pw.harvest(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5, f0_floor=71.0, f0_ceil=1000.0)
         f0 = convert_continuos_f0(f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length))
         f0 = torch.from_numpy(f0.astype(np.float32))
+        return f0
+
+    def _get_spec(self, newData: any):
+        audio = torch.FloatTensor(newData)
         audio_norm = audio / self.hps.data.max_wav_value  # normalize
         audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
         spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
                                  self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
                                  center=False)
         # dispose_stft_specs = 2
         # spec = spec[:, dispose_stft_specs:-dispose_stft_specs]
         # f0 = f0[dispose_stft_specs:-dispose_stft_specs]
         spec = torch.squeeze(spec, 0)
+        return spec
+
+    def _generate_input(self, newData: any, convertSize: int):
+        newData = np.array(newData).astype(np.float32)
+        if hasattr(self, "audio_buffer"):
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with the past data
+        else:
+            self.audio_buffer = newData
+
+        self.audio_buffer = self.audio_buffer[-(convertSize):]  # extract only the part to convert
+
+        f0 = self._get_f0(self.audio_buffer)  # generate f0
+        spec = self._get_spec(self.audio_buffer)
         sid = torch.LongTensor([int(self.settings.srcId)])
-        # data = (self.text_norm, spec, audio_norm, sid)
-        # data = TextAudioSpeakerCollate()([data])
         data = TextAudioSpeakerCollate(
             sample_rate=self.hps.data.sampling_rate,
             hop_size=self.hps.data.hop_length,
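The refactored _generate_input now keeps the rolling buffer as a plain numpy array, created lazily on the first call (which is why the torch.zeros(1, 0) initialisation was dropped from __init__), and trims it to the conversion window before handing the same slice to _get_f0 and _get_spec. A toy sketch of just that buffering behaviour (BufferDemo is an invented stand-in, not the real class):

import numpy as np

class BufferDemo:
    """Toy stand-in showing only the buffering behaviour of the new _generate_input."""

    def feed(self, newData, convertSize: int):
        newData = np.array(newData).astype(np.float32)
        if hasattr(self, "audio_buffer"):
            # Append the fresh chunk to whatever past audio is still held.
            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
        else:
            # First call: the buffer is created lazily instead of in __init__.
            self.audio_buffer = newData
        # Keep only the tail that will actually be converted.
        self.audio_buffer = self.audio_buffer[-(convertSize):]
        return self.audio_buffer

demo = BufferDemo()
print(demo.feed(np.ones(3000), convertSize=8192).shape)  # (3000,) - shorter than the window
print(demo.feed(np.ones(3000), convertSize=8192).shape)  # (6000,)
print(demo.feed(np.ones(3000), convertSize=8192).shape)  # (8192,) - now trimmed to convertSize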
@@ -318,19 +323,23 @@ class VoiceChanger():
             result = audio1.float().cpu().numpy()
         return result

-    def on_request(self, unpackedData: any):
+    # receivedData: tuple of short
+    def on_request(self, receivedData: any):
         with Timer("pre-process") as t:
             if self.settings.inputSampleRate != 24000:
-                unpackedData = resampy.resample(unpackedData, 48000, 24000)
-            convertSize = unpackedData.shape[0] + min(self.settings.crossFadeOverlapSize, unpackedData.shape[0])
+                newData = resampy.resample(receivedData, self.settings.inputSampleRate, 24000)
+            else:
+                newData = receivedData
+
+            convertSize = len(newData) + min(self.settings.crossFadeOverlapSize, len(newData))
             # print(convertSize, unpackedData.shape[0])
             if convertSize < 8192:
                 convertSize = 8192
             if convertSize % 128 != 0:  # pad, because truncation occurs at the model's output hop size
                 convertSize = convertSize + (128 - (convertSize % 128))
-            self._generate_strength(unpackedData.shape[0])
-            data = self._generate_input(unpackedData, convertSize)
+            self._generate_strength(len(newData))
+            data = self._generate_input(newData, convertSize)

             preprocess_time = t.secs

         with Timer("main-process") as t:
@@ -342,7 +351,7 @@ class VoiceChanger():
                 audio = self._pyTorch_inference(data)
             # result = self.voiceChanger._pyTorch_inference(data, unpackedData.shape[0])
-            inputSize = unpackedData.shape[0]
+            inputSize = len(newData)

             if hasattr(self, 'np_prev_audio1') == True:
                 np.set_printoptions(threshold=10000)
@@ -376,11 +385,11 @@ class VoiceChanger():
             if self.settings.recordIO == 1:
                 # self.stream_in.write(unpackedData.astype(np.int16).tobytes())
                 # self.stream_out.write(result.tobytes())
-                self.ioRecorder.writeInput(unpackedData.astype(np.int16).tobytes())
+                self.ioRecorder.writeInput(receivedData.astype(np.int16).tobytes())
                 self.ioRecorder.writeOutput(result.tobytes())

             if self.settings.inputSampleRate != 24000:
-                result = resampy.resample(result, 24000, 48000).astype(np.int16)
+                result = resampy.resample(result, 24000, self.settings.inputSampleRate).astype(np.int16)
             postprocess_time = t.secs

         perf = [preprocess_time, mainprocess_time, postprocess_time]
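Post-processing mirrors the pre-process: whatever arrives at inputSampleRate is taken down to the model's 24 kHz before inference and brought back up afterwards, instead of assuming a fixed 48 kHz. A minimal round-trip sketch with resampy (the 48 kHz client rate and the test tone are assumptions):

import numpy as np
import resampy

inputSampleRate = 48000  # as reported by the client
modelSampleRate = 24000  # the MMVC model's rate

# One second of a 440 Hz tone standing in for received int16 audio.
t = np.arange(inputSampleRate) / inputSampleRate
receivedData = (np.sin(2 * np.pi * 440 * t) * 10000).astype(np.int16)

# Down to the model rate for inference...
newData = resampy.resample(receivedData.astype(np.float32), inputSampleRate, modelSampleRate)
# ...and back to the client rate afterwards, as the post-process does.
result = resampy.resample(newData, modelSampleRate, inputSampleRate).astype(np.int16)

print(len(receivedData), len(newData), len(result))  # 48000 24000 48000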


@@ -31,9 +31,9 @@ class VoiceChangerManager():
         else:
             return {"status": "ERROR", "msg": "no model loaded"}

-    def changeVoice(self, unpackedData: any):
+    def changeVoice(self, receivedData: any):
         if hasattr(self, 'voiceChanger') == True:
-            return self.voiceChanger.on_request(unpackedData)
+            return self.voiceChanger.on_request(receivedData)
         else:
             print("Voice Change is not loaded. Did you load a correct model?")
             return np.zeros(1).astype(np.int16), []
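Seen from MMVC_Namespace above, changeVoice is expected to return a tuple whose first element is the converted audio and whose second element is the performance timing list; without a loaded model it degrades to one int16 sample of silence and an empty list. A hedged sketch of that calling contract (NoModelManager is an illustrative stand-in, not the real manager):

import numpy as np

class NoModelManager:
    """Illustrative stand-in for VoiceChangerManager before a model is loaded."""

    def changeVoice(self, receivedData):
        if hasattr(self, 'voiceChanger'):
            return self.voiceChanger.on_request(receivedData)
        print("Voice Change is not loaded. Did you load a correct model?")
        return np.zeros(1).astype(np.int16), []

manager = NoModelManager()
res = manager.changeVoice((0, 1, 2))  # tuple of short, as the namespace now passes
audio1 = res[0]                       # converted (here: silent) audio
perf = res[1]                         # timing info, empty without a model
print(audio1.dtype, audio1.shape, perf)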