mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-02-03 00:33:57 +03:00
WIP: refactor, separate io analyzer, io recorder from main class
This commit is contained in:
parent
18a87d9d24
commit
f76fff5959
40
server/voice_changer/IOAnalyzer.py
Normal file
40
server/voice_changer/IOAnalyzer.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
import os
|
||||||
|
import numpy as np
|
||||||
|
import pylab
|
||||||
|
import librosa
|
||||||
|
import librosa.display
|
||||||
|
import pyworld as pw
|
||||||
|
|
||||||
|
|
||||||
|
class IOAnalyzer:
    """Render spectrogram images of an audio file with its F0 contour overlaid.

    One image is produced per F0 estimator: pyworld's ``dio`` and pyworld's
    ``harvest``. Both raw estimates are refined with ``stonemask`` before
    being plotted on top of a log-frequency spectrogram.
    """

    # Hop size in samples. The STFT and specshow must use the same value,
    # otherwise the spectrogram's time axis no longer matches the F0 curve.
    HOP_LENGTH = 128
    # FFT size and analysis window length (in samples) of the spectrogram.
    N_FFT = 2048
    # Frame period handed to the pyworld F0 estimators
    # (milliseconds according to the pyworld documentation).
    FRAME_PERIOD = 5

    def _refine_f0(self, y, sr, raw_f0, positions):
        """Refine a raw F0 estimate and build a matching time axis.

        Args:
            y: Audio samples as passed to the estimator.
            sr: Sampling rate of ``y``.
            raw_f0: F0 track returned by ``pw.dio`` / ``pw.harvest``.
            positions: Temporal positions returned alongside ``raw_f0``.

        Returns:
            Tuple ``(f0, times)``: the stonemask-refined F0 and a linearly
            spaced time axis from 0 to the duration of ``y`` with one entry
            per F0 frame.
        """
        f0 = pw.stonemask(y, raw_f0, positions, sr)
        times = np.linspace(0, y.shape[0] / sr, len(positions))
        return f0, times

    def _get_f0_dio(self, y, sr):
        """Estimate F0 with pyworld ``dio``; returns ``(f0, times)``."""
        raw_f0, positions = pw.dio(y, sr, frame_period=self.FRAME_PERIOD)
        return self._refine_f0(y, sr, raw_f0, positions)

    def _get_f0_harvest(self, y, sr):
        """Estimate F0 with pyworld ``harvest``; returns ``(f0, times)``."""
        raw_f0, positions = pw.harvest(y, sr, frame_period=self.FRAME_PERIOD)
        return self._refine_f0(y, sr, raw_f0, positions)

    def _save_f0_plot(self, spec, sr, times, f0, imageFile):
        """Draw ``spec`` with the ``f0`` curve on top and save it to ``imageFile``."""
        # Drop whatever figure is current so earlier drawings do not bleed
        # into this image.
        pylab.close()
        try:
            librosa.display.specshow(spec, sr=sr, hop_length=self.HOP_LENGTH, x_axis='time', y_axis='log')
            pylab.plot(times, f0, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
            pylab.savefig(imageFile)
        finally:
            # Release the figure even if drawing/saving failed, so repeated
            # analyses do not accumulate open figures.
            pylab.close()

    def analyze(self, inputDataFile: str, dioImageFile: str, harvestImageFile: str, samplingRate: int):
        """Analyze an audio file and write the two F0/spectrogram images.

        Args:
            inputDataFile: Path of the audio file to analyze.
            dioImageFile: Destination path of the image using the ``dio`` F0.
            harvestImageFile: Destination path of the image using the
                ``harvest`` F0.
            samplingRate: Rate the audio is resampled to when loaded.
        """
        # `sr` must be passed by keyword: it is keyword-only in librosa >= 0.10,
        # and the keyword form is accepted by older releases as well.
        y, sr = librosa.load(inputDataFile, sr=samplingRate)
        # The signal is converted to double precision before being handed to
        # pyworld (pyworld expects float64 input per its documentation).
        y = y.astype(np.float64)

        magnitude = np.abs(librosa.stft(y, n_fft=self.N_FFT, win_length=self.N_FFT, hop_length=self.HOP_LENGTH))
        spec = librosa.amplitude_to_db(magnitude, ref=np.max)

        # Use the rate librosa actually loaded the signal at for everything
        # downstream, and keep each estimator's own time axis.
        f0_dio, times_dio = self._get_f0_dio(y, sr=sr)
        f0_harvest, times_harvest = self._get_f0_harvest(y, sr=sr)

        self._save_f0_plot(spec, sr, times_dio, f0_dio, dioImageFile)
        self._save_f0_plot(spec, sr, times_harvest, f0_harvest, harvestImageFile)
|
37
server/voice_changer/IORecorder.py
Normal file
37
server/voice_changer/IORecorder.py
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
import wave
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
class IORecorder:
    """Record input and output audio into two WAV files.

    Both files are written as mono, 2 bytes per sample, at the sampling rate
    given to the constructor. Files left over from a previous recording are
    deleted first. The recorder can be used as a context manager, in which
    case both files are closed on exit.

    Attributes:
        fi: ``wave`` writer for the input recording.
        fo: ``wave`` writer for the output recording.
    """

    def __init__(self, inputFilename: str, outputFilename: str, samplingRate: int):
        """Remove stale recordings and open both WAV files for writing.

        Args:
            inputFilename: Path of the WAV file receiving the input audio.
            outputFilename: Path of the WAV file receiving the output audio.
            samplingRate: Frame rate written into both WAV headers.

        Raises:
            OSError: If a file cannot be removed or opened.
            wave.Error: If ``samplingRate`` is rejected by the ``wave`` module.
        """
        # Defined up front so close() is safe even when opening fails midway.
        self.fi = None
        self.fo = None

        self._clearFile(inputFilename)
        self._clearFile(outputFilename)

        try:
            self.fi = self._openWave(inputFilename, samplingRate)
            self.fo = self._openWave(outputFilename, samplingRate)
        except Exception:
            # Do not leak the first writer when the second one cannot be set up.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _openWave(self, filename: str, samplingRate: int):
        """Open ``filename`` as a mono, 16-bit WAV writer at ``samplingRate``."""
        writer = wave.open(filename, 'wb')
        try:
            writer.setnchannels(1)
            writer.setsampwidth(2)
            writer.setframerate(samplingRate)
        except Exception:
            try:
                writer.close()
            except wave.Error:
                # close() complains about the incomplete header but still
                # releases the underlying file; the original error is the
                # one worth reporting.
                pass
            raise
        return writer

    def _clearFile(self, filename: str):
        """Delete ``filename`` if it exists, reporting what happened."""
        # Attempt the removal directly instead of checking first, so a file
        # vanishing between check and removal cannot raise.
        try:
            os.remove(filename)
        except FileNotFoundError:
            print("[IORecorder] old analyze file not exist.", filename)
        else:
            print("[IORecorder] delete old analyze file.", filename)

    def writeInput(self, wav):
        """Append raw frame bytes ``wav`` to the input recording."""
        self.fi.writeframes(wav)

    def writeOutput(self, wav):
        """Append raw frame bytes ``wav`` to the output recording."""
        self.fo.writeframes(wav)

    def close(self):
        """Close both WAV files; safe to call more than once."""
        try:
            if self.fi is not None:
                self.fi.close()
        finally:
            # The output file is closed even if closing the input file raised.
            if self.fo is not None:
                self.fo.close()
|
@ -18,6 +18,9 @@ import pyworld as pw
|
|||||||
from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
|
from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
|
||||||
|
|
||||||
from voice_changer.MMVCv15 import MMVCv15
|
from voice_changer.MMVCv15 import MMVCv15
|
||||||
|
from voice_changer.IORecorder import IORecorder
|
||||||
|
from voice_changer.IOAnalyzer import IOAnalyzer
|
||||||
|
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
@ -34,53 +37,10 @@ import librosa.display
|
|||||||
SAMPLING_RATE = 24000
|
SAMPLING_RATE = 24000
|
||||||
|
|
||||||
|
|
||||||
class MockStream:
|
STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
|
||||||
"""gi
|
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
|
||||||
オーディオストリーミング入出力をファイル入出力にそのまま置き換えるためのモック
|
STREAM_ANALYZE_FILE_DIO = os.path.join(TMP_DIR, "analyze-dio.png")
|
||||||
"""
|
STREAM_ANALYZE_FILE_HARVEST = os.path.join(TMP_DIR, "analyze-harvest.png")
|
||||||
|
|
||||||
def __init__(self, sampling_rate):
|
|
||||||
self.sampling_rate = sampling_rate
|
|
||||||
self.start_count = 2
|
|
||||||
self.end_count = 2
|
|
||||||
self.fr = None
|
|
||||||
self.fw = None
|
|
||||||
|
|
||||||
def open_inputfile(self, input_filename):
|
|
||||||
self.fr = wave.open(input_filename, 'rb')
|
|
||||||
|
|
||||||
def open_outputfile(self, output_filename):
|
|
||||||
self.fw = wave.open(output_filename, 'wb')
|
|
||||||
self.fw.setnchannels(1)
|
|
||||||
self.fw.setsampwidth(2)
|
|
||||||
self.fw.setframerate(self.sampling_rate)
|
|
||||||
|
|
||||||
def read(self, length, exception_on_overflow=False):
|
|
||||||
if self.start_count > 0:
|
|
||||||
wav = bytes(length * 2)
|
|
||||||
self.start_count -= 1 # 最初の2回はダミーの空データ送る
|
|
||||||
else:
|
|
||||||
wav = self.fr.readframes(length)
|
|
||||||
if len(wav) <= 0: # データなくなってから最後の2回はダミーの空データを送る
|
|
||||||
wav = bytes(length * 2)
|
|
||||||
self.end_count -= 1
|
|
||||||
if self.end_count < 0:
|
|
||||||
Hyperparameters.VC_END_FLAG = True
|
|
||||||
return wav
|
|
||||||
|
|
||||||
def write(self, wav):
|
|
||||||
self.fw.writeframes(wav)
|
|
||||||
|
|
||||||
def stop_stream(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
if self.fr != None:
|
|
||||||
self.fr.close()
|
|
||||||
self.fr = None
|
|
||||||
if self.fw != None:
|
|
||||||
self.fw.close()
|
|
||||||
self.fw = None
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -136,33 +96,6 @@ class VoiceChanger():
|
|||||||
|
|
||||||
print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
|
print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
|
||||||
|
|
||||||
def _setupRecordIO(self):
|
|
||||||
# IO Recorder Setup
|
|
||||||
if hasattr(self, "stream_out"):
|
|
||||||
self.stream_out.close()
|
|
||||||
mock_stream_out = MockStream(24000)
|
|
||||||
stream_output_file = os.path.join(TMP_DIR, "out.wav")
|
|
||||||
if os.path.exists(stream_output_file):
|
|
||||||
print("delete old analyze file.", stream_output_file)
|
|
||||||
os.remove(stream_output_file)
|
|
||||||
else:
|
|
||||||
print("old analyze file not exist.", stream_output_file)
|
|
||||||
|
|
||||||
mock_stream_out.open_outputfile(stream_output_file)
|
|
||||||
self.stream_out = mock_stream_out
|
|
||||||
|
|
||||||
if hasattr(self, "stream_in"):
|
|
||||||
self.stream_in.close()
|
|
||||||
mock_stream_in = MockStream(24000)
|
|
||||||
stream_input_file = os.path.join(TMP_DIR, "in.wav")
|
|
||||||
if os.path.exists(stream_input_file):
|
|
||||||
print("delete old analyze file.", stream_input_file)
|
|
||||||
os.remove(stream_input_file)
|
|
||||||
else:
|
|
||||||
print("old analyze file not exist.", stream_output_file)
|
|
||||||
mock_stream_in.open_outputfile(stream_input_file)
|
|
||||||
self.stream_in = mock_stream_in
|
|
||||||
|
|
||||||
def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
|
def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
|
||||||
self.settings.configFile = config
|
self.settings.configFile = config
|
||||||
self.hps = get_hparams_from_file(config)
|
self.hps = get_hparams_from_file(config)
|
||||||
@ -222,18 +155,6 @@ class VoiceChanger():
|
|||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _get_f0_dio(self, y, sr=SAMPLING_RATE):
|
|
||||||
_f0, time = pw.dio(y, sr, frame_period=5)
|
|
||||||
f0 = pw.stonemask(y, _f0, time, sr)
|
|
||||||
time = np.linspace(0, y.shape[0] / sr, len(time))
|
|
||||||
return f0, time
|
|
||||||
|
|
||||||
def _get_f0_harvest(self, y, sr=SAMPLING_RATE):
|
|
||||||
_f0, time = pw.harvest(y, sr, frame_period=5)
|
|
||||||
f0 = pw.stonemask(y, _f0, time, sr)
|
|
||||||
time = np.linspace(0, y.shape[0] / sr, len(time))
|
|
||||||
return f0, time
|
|
||||||
|
|
||||||
def update_setteings(self, key: str, val: any):
|
def update_setteings(self, key: str, val: any):
|
||||||
if key == "onnxExecutionProvider" and self.onnx_session != None:
|
if key == "onnxExecutionProvider" and self.onnx_session != None:
|
||||||
if val == "CUDAExecutionProvider":
|
if val == "CUDAExecutionProvider":
|
||||||
@ -254,31 +175,22 @@ class VoiceChanger():
|
|||||||
if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
|
if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
|
||||||
self.unpackedData_length = 0
|
self.unpackedData_length = 0
|
||||||
if key == "recordIO" and val == 1:
|
if key == "recordIO" and val == 1:
|
||||||
self._setupRecordIO()
|
if hasattr(self, "ioRecorder"):
|
||||||
|
self.ioRecorder.close()
|
||||||
|
self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate)
|
||||||
if key == "recordIO" and val == 0:
|
if key == "recordIO" and val == 0:
|
||||||
|
if hasattr(self, "ioRecorder"):
|
||||||
|
self.ioRecorder.close()
|
||||||
pass
|
pass
|
||||||
if key == "recordIO" and val == 2:
|
if key == "recordIO" and val == 2:
|
||||||
|
if hasattr(self, "ioRecorder"):
|
||||||
|
self.ioRecorder.close()
|
||||||
|
|
||||||
|
if hasattr(self, "ioAnalyzer") == False:
|
||||||
|
self.ioAnalyzer = IOAnalyzer()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
stream_input_file = os.path.join(TMP_DIR, "in.wav")
|
self.ioAnalyzer.analyze(STREAM_INPUT_FILE, STREAM_ANALYZE_FILE_DIO, STREAM_ANALYZE_FILE_HARVEST, self.settings.inputSampleRate)
|
||||||
analyze_file_dio = os.path.join(TMP_DIR, "analyze-dio.png")
|
|
||||||
analyze_file_harvest = os.path.join(TMP_DIR, "analyze-harvest.png")
|
|
||||||
y, sr = librosa.load(stream_input_file, SAMPLING_RATE)
|
|
||||||
y = y.astype(np.float64)
|
|
||||||
spec = librosa.amplitude_to_db(np.abs(librosa.stft(y, n_fft=2048, win_length=2048, hop_length=128)), ref=np.max)
|
|
||||||
f0_dio, times = self._get_f0_dio(y)
|
|
||||||
f0_harvest, times = self._get_f0_harvest(y)
|
|
||||||
|
|
||||||
pylab.close()
|
|
||||||
HOP_LENGTH = 128
|
|
||||||
img = librosa.display.specshow(spec, sr=SAMPLING_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', )
|
|
||||||
pylab.plot(times, f0_dio, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
|
|
||||||
pylab.savefig(analyze_file_dio)
|
|
||||||
|
|
||||||
pylab.close()
|
|
||||||
HOP_LENGTH = 128
|
|
||||||
img = librosa.display.specshow(spec, sr=SAMPLING_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', )
|
|
||||||
pylab.plot(times, f0_harvest, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
|
|
||||||
pylab.savefig(analyze_file_harvest)
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("recordIO exception", e)
|
print("recordIO exception", e)
|
||||||
@ -462,8 +374,10 @@ class VoiceChanger():
|
|||||||
result = result.astype(np.int16)
|
result = result.astype(np.int16)
|
||||||
# print("on_request result size:",result.shape)
|
# print("on_request result size:",result.shape)
|
||||||
if self.settings.recordIO == 1:
|
if self.settings.recordIO == 1:
|
||||||
self.stream_in.write(unpackedData.astype(np.int16).tobytes())
|
# self.stream_in.write(unpackedData.astype(np.int16).tobytes())
|
||||||
self.stream_out.write(result.tobytes())
|
# self.stream_out.write(result.tobytes())
|
||||||
|
self.ioRecorder.writeInput(unpackedData.astype(np.int16).tobytes())
|
||||||
|
self.ioRecorder.writeOutput(result.tobytes())
|
||||||
|
|
||||||
if self.settings.inputSampleRate != 24000:
|
if self.settings.inputSampleRate != 24000:
|
||||||
result = resampy.resample(result, 24000, 48000).astype(np.int16)
|
result = resampy.resample(result, 24000, 48000).astype(np.int16)
|
||||||
|
Loading…
Reference in New Issue
Block a user