Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-01-23 21:45:00 +03:00)
WIP: refactor, break down generate input
This commit is contained in: parent f76fff5959, commit 6f97320c0d
@@ -28,7 +28,9 @@ class MMVC_Namespace(socketio.AsyncNamespace):
             print(data)
             await self.emit('response', [timestamp, 0], to=sid)
         else:
-            unpackedData = np.array(struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data))
+            # tuple of short
+            unpackedData = struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data)
+
             # audio1, perf = self.voiceChangerManager.changeVoice(unpackedData)
             res = self.voiceChangerManager.changeVoice(unpackedData)
             audio1 = res[0]
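For reference, a minimal standalone sketch of what this decode step does. The example payload is made up; in the handler, data is the raw little-endian 16-bit PCM bytes received over socket.io. The np.frombuffer call is roughly what the removed np.array(struct.unpack(...)) line produced, shown only for comparison.

    import struct
    import numpy as np

    # hypothetical payload: three 16-bit little-endian samples
    data = struct.pack('<3h', 100, -200, 300)

    # tuple of short, as in the new code path
    unpackedData = struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data)
    # -> (100, -200, 300)

    # roughly equivalent numpy view, similar to the old code path
    as_array = np.frombuffer(data, dtype='<i2')
    # -> array([ 100, -200,  300], dtype=int16)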
@@ -1,3 +1,4 @@
 import sys
 sys.path.append("MMVC_Client/python")
@@ -90,7 +91,6 @@ class VoiceChanger():
 
         self.gpu_num = torch.cuda.device_count()
         self.text_norm = torch.LongTensor([0, 6, 0])
-        self.audio_buffer = torch.zeros(1, 0)
         self.prev_audio = np.zeros(1)
         self.mps_enabled = getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available()
 
@@ -234,18 +234,9 @@ class VoiceChanger():
         if hasattr(self, 'np_prev_audio1') == True:
             delattr(self, "np_prev_audio1")
 
-    def _generate_input(self, unpackedData: any, convertSize: int):
-        # shape the data to convert this time into a tensor
-        audio = torch.FloatTensor(unpackedData.astype(np.float32))  # create a float32 tensor
-        audio_norm = audio / self.hps.data.max_wav_value  # normalize
-        audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
-        self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1)  # concatenate with past data
-        # audio_norm = self.audio_buffer[:, -(convertSize + 1280 * 2):]  # extract only the part to convert
-        audio_norm = self.audio_buffer[:, -(convertSize):]  # extract only the part to convert
-        self.audio_buffer = audio_norm
-
-        # TBD: the data bounces back and forth between numpy and pytorch; getting it working comes first.
-        audio_norm_np = audio_norm.squeeze().numpy().astype(np.float64)
+    def _get_f0(self, newData: any):
+        audio_norm_np = newData.astype(np.float64)
         if self.settings.f0Detector == "dio":
             _f0, _time = pw.dio(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5)
             f0 = pw.stonemask(audio_norm_np, _f0, _time, self.hps.data.sampling_rate)
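As background for the extracted _get_f0 helper, a minimal sketch of the pyworld dio + stonemask pipeline it wraps, run on a synthetic 24 kHz tone. This is purely illustrative; the real method also runs the project's convert_continuos_f0 and converts the result to a torch tensor afterwards.

    import numpy as np
    import pyworld as pw

    sampling_rate = 24000
    t = np.arange(0, 0.5, 1 / sampling_rate)
    audio = (0.5 * np.sin(2 * np.pi * 220 * t)).astype(np.float64)  # pyworld expects 1-D float64

    # coarse f0 estimate, then refinement, matching the "dio" branch above
    _f0, _time = pw.dio(audio, sampling_rate, frame_period=5.5)
    f0 = pw.stonemask(audio, _f0, _time, sampling_rate)
    print(f0.shape, float(np.median(f0[f0 > 0])))  # roughly 220 Hz on voiced frames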
@@ -253,18 +244,32 @@ class VoiceChanger():
             f0, t = pw.harvest(audio_norm_np, self.hps.data.sampling_rate, frame_period=5.5, f0_floor=71.0, f0_ceil=1000.0)
         f0 = convert_continuos_f0(f0, int(audio_norm_np.shape[0] / self.hps.data.hop_length))
         f0 = torch.from_numpy(f0.astype(np.float32))
+        return f0
 
+    def _get_spec(self, newData: any):
+        audio = torch.FloatTensor(newData)
+        audio_norm = audio / self.hps.data.max_wav_value  # normalize
+        audio_norm = audio_norm.unsqueeze(0)  # unsqueeze
         spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
                                  self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
                                  center=False)
-        # dispose_stft_specs = 2
-        # spec = spec[:, dispose_stft_specs:-dispose_stft_specs]
-        # f0 = f0[dispose_stft_specs:-dispose_stft_specs]
         spec = torch.squeeze(spec, 0)
+        return spec
+
+    def _generate_input(self, newData: any, convertSize: int):
+        newData = np.array(newData).astype(np.float32)
+
+        if hasattr(self, "audio_buffer"):
+            self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)  # concatenate with past data
+        else:
+            self.audio_buffer = newData
+
+        self.audio_buffer = self.audio_buffer[-(convertSize):]  # extract only the part to convert
+
+        f0 = self._get_f0(self.audio_buffer)  # generate f0
+        spec = self._get_spec(self.audio_buffer)
         sid = torch.LongTensor([int(self.settings.srcId)])
 
-        # data = (self.text_norm, spec, audio_norm, sid)
-        # data = TextAudioSpeakerCollate()([data])
         data = TextAudioSpeakerCollate(
             sample_rate=self.hps.data.sampling_rate,
             hop_size=self.hps.data.hop_length,
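A small standalone sketch of the new numpy buffering in _generate_input: append the incoming block to the lazily created buffer, then keep only the convertSize-sample tail. Block sizes here are made up and the audio is a placeholder.

    import numpy as np

    convertSize = 8192
    audio_buffer = None  # stand-in for the lazily created self.audio_buffer

    for chunk_len in (4096, 4096, 4096):                 # successive incoming blocks
        newData = np.zeros(chunk_len, dtype=np.float32)  # placeholder audio
        if audio_buffer is not None:
            audio_buffer = np.concatenate([audio_buffer, newData], 0)  # append to past data
        else:
            audio_buffer = newData
        audio_buffer = audio_buffer[-(convertSize):]     # keep only the tail to convert
        print(audio_buffer.shape)  # grows to (8192,) and then stays there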
@@ -318,19 +323,23 @@ class VoiceChanger():
         result = audio1.float().cpu().numpy()
         return result
 
-    def on_request(self, unpackedData: any):
+    # receivedData: tuple of short
+    def on_request(self, receivedData: any):
         with Timer("pre-process") as t:
             if self.settings.inputSampleRate != 24000:
-                unpackedData = resampy.resample(unpackedData, 48000, 24000)
-            convertSize = unpackedData.shape[0] + min(self.settings.crossFadeOverlapSize, unpackedData.shape[0])
+                newData = resampy.resample(receivedData, self.settings.inputSampleRate, 24000)
+            else:
+                newData = receivedData
+
+            convertSize = len(newData) + min(self.settings.crossFadeOverlapSize, len(newData))
             # print(convertSize, unpackedData.shape[0])
             if convertSize < 8192:
                 convertSize = 8192
             if convertSize % 128 != 0:  # the model's output hop size causes truncation, so compensate for it
                 convertSize = convertSize + (128 - (convertSize % 128))
-            self._generate_strength(unpackedData.shape[0])
-            data = self._generate_input(unpackedData, convertSize)
+            self._generate_strength(len(newData))
+            data = self._generate_input(newData, convertSize)
             preprocess_time = t.secs
 
         with Timer("main-process") as t:
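The convertSize arithmetic is unchanged apart from the len(newData) spelling; a quick standalone check of the floor-at-8192 and round-up-to-a-multiple-of-128 steps. The crossFadeOverlapSize value is assumed purely for illustration.

    crossFadeOverlapSize = 1024  # assumed setting, for illustration only

    def compute_convert_size(n_samples: int) -> int:
        convertSize = n_samples + min(crossFadeOverlapSize, n_samples)
        if convertSize < 8192:
            convertSize = 8192
        if convertSize % 128 != 0:  # pad up to a multiple of the model's output hop size
            convertSize = convertSize + (128 - (convertSize % 128))
        return convertSize

    print(compute_convert_size(4096))   # 8192: 5120 is below the floor
    print(compute_convert_size(12000))  # 13056: 12000 + 1024 = 13024, rounded up to the next multiple of 128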
@@ -342,7 +351,7 @@ class VoiceChanger():
                 audio = self._pyTorch_inference(data)
                 # result = self.voiceChanger._pyTorch_inference(data, unpackedData.shape[0])
 
-            inputSize = unpackedData.shape[0]
+            inputSize = len(newData)
 
             if hasattr(self, 'np_prev_audio1') == True:
                 np.set_printoptions(threshold=10000)
@@ -376,11 +385,11 @@ class VoiceChanger():
             if self.settings.recordIO == 1:
                 # self.stream_in.write(unpackedData.astype(np.int16).tobytes())
                 # self.stream_out.write(result.tobytes())
-                self.ioRecorder.writeInput(unpackedData.astype(np.int16).tobytes())
+                self.ioRecorder.writeInput(receivedData.astype(np.int16).tobytes())
                 self.ioRecorder.writeOutput(result.tobytes())
 
             if self.settings.inputSampleRate != 24000:
-                result = resampy.resample(result, 24000, 48000).astype(np.int16)
+                result = resampy.resample(result, 24000, self.settings.inputSampleRate).astype(np.int16)
             postprocess_time = t.secs
 
         perf = [preprocess_time, mainprocess_time, postprocess_time]
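For the sample-rate handling, a rough sketch of the resampy round trip that on_request now drives from self.settings.inputSampleRate instead of the hard-coded 48000/24000 pair. The input length and contents are made up, and the converted audio is stood in by the downsampled input.

    import numpy as np
    import resampy

    inputSampleRate = 48000
    receivedData = np.random.randn(4800).astype(np.float64)  # 100 ms of fake 48 kHz audio

    newData = resampy.resample(receivedData, inputSampleRate, 24000)  # down to the model rate
    result = newData                                                  # stand-in for the converted audio
    result = resampy.resample(result, 24000, inputSampleRate).astype(np.int16)  # back to the client rate
    print(newData.shape, result.shape)  # roughly (2400,) and (4800,)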
@@ -31,9 +31,9 @@ class VoiceChangerManager():
         else:
             return {"status": "ERROR", "msg": "no model loaded"}
 
-    def changeVoice(self, unpackedData: any):
+    def changeVoice(self, receivedData: any):
         if hasattr(self, 'voiceChanger') == True:
-            return self.voiceChanger.on_request(unpackedData)
+            return self.voiceChanger.on_request(receivedData)
         else:
             print("Voice Change is not loaded. Did you load a correct model?")
             return np.zeros(1).astype(np.int16), []