WIP: refactor, separate io analyzer, io recorder from main class

2025-02-03 00:33:57 +03:00 · 2023-03-07 22:30:48 +09:00 · 2023-03-07 22:30:48 +09:00 · f76fff5959
commit f76fff5959
parent 18a87d9d24
3 changed files with 100 additions and 109 deletions
--- a/server/voice_changer/IOAnalyzer.py
+++ b/server/voice_changer/IOAnalyzer.py
@ -0,0 +1,40 @@
 import os
 import numpy as np
 import pylab
 import librosa
 import librosa.display
 import pyworld as pw
 class IOAnalyzer:
     """Render log-frequency spectrogram images of an audio file with an F0 contour overlaid.
     Two images are written per call to `analyze`: one with the contour obtained through
     `pw.dio` and one with the contour obtained through `pw.harvest`.
     """
     # Hop size (in samples) shared by the STFT and by the spectrogram display so that the
     # time axis of the image matches the analysis frames.
     _HOP_LENGTH = 128
     def _get_f0_dio(self, y, sr):
         """Return `(f0, time)` computed with `pw.dio` followed by `pw.stonemask`.
         `time` is replaced by evenly spaced values from 0 to the clip duration
         (`y.shape[0] / sr`), one per analysis frame.
         """
         _f0, time = pw.dio(y, sr, frame_period=5)
         f0 = pw.stonemask(y, _f0, time, sr)
         time = np.linspace(0, y.shape[0] / sr, len(time))
         return f0, time
     def _get_f0_harvest(self, y, sr):
         """Return `(f0, time)` computed with `pw.harvest` followed by `pw.stonemask`.
         `time` is replaced by evenly spaced values from 0 to the clip duration
         (`y.shape[0] / sr`), one per analysis frame.
         """
         _f0, time = pw.harvest(y, sr, frame_period=5)
         f0 = pw.stonemask(y, _f0, time, sr)
         time = np.linspace(0, y.shape[0] / sr, len(time))
         return f0, time
     def _save_f0_plot(self, spec, times, f0, samplingRate: int, imageFile: str):
         """Draw `spec` with the `f0` contour on top and save the figure to `imageFile`."""
         # Start from a clean figure in case an earlier plot was left open.
         pylab.close()
         try:
             librosa.display.specshow(spec, sr=samplingRate, hop_length=self._HOP_LENGTH, x_axis='time', y_axis='log')
             pylab.plot(times, f0, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
             pylab.savefig(imageFile)
         finally:
             # Always release the figure, even when drawing or saving fails.
             pylab.close()
     def analyze(self, inputDataFile: str, dioImageFile: str, harvestImageFile: str, samplingRate: int):
         """Load `inputDataFile` and write the two analysis images.
         Args:
             inputDataFile: path of the audio file to analyze.
             dioImageFile: destination of the image carrying the `pw.dio` contour.
             harvestImageFile: destination of the image carrying the `pw.harvest` contour.
             samplingRate: rate passed to `librosa.load` and used for every later step.
         Exceptions raised by the audio, F0 or plotting libraries propagate to the caller.
         """
         # `sr` must be passed by keyword: newer librosa releases reject it positionally.
         y, sr = librosa.load(inputDataFile, sr=samplingRate)
         # The pyworld calls below are fed float64 samples.
         y = y.astype(np.float64)
         spec = librosa.amplitude_to_db(np.abs(librosa.stft(y, n_fft=2048, win_length=2048, hop_length=self._HOP_LENGTH)), ref=np.max)
         # Keep each estimator's time axis with its own contour instead of sharing one variable.
         f0_dio, times_dio = self._get_f0_dio(y, sr=samplingRate)
         f0_harvest, times_harvest = self._get_f0_harvest(y, sr=samplingRate)
         self._save_f0_plot(spec, times_dio, f0_dio, samplingRate, dioImageFile)
         self._save_f0_plot(spec, times_harvest, f0_harvest, samplingRate, harvestImageFile)
--- a/server/voice_changer/IORecorder.py
+++ b/server/voice_changer/IORecorder.py
@ -0,0 +1,37 @@
 import wave
 import os
 class IORecorder:
     """Record an input stream and an output stream into two mono, 2-byte-sample WAV files.
     Any existing file at either path is deleted before the writers are opened. The object
     can be used as a context manager; leaving the `with` block closes both files.
     """
     def __init__(self, inputFilename: str, outputFilename: str, samplingRate: int):
         """Delete stale recordings and open both WAV writers.
         Args:
             inputFilename: path of the WAV file receiving `writeInput` data.
             outputFilename: path of the WAV file receiving `writeOutput` data.
             samplingRate: frame rate written into both WAV headers.
         Raises:
             OSError: a file could not be removed or opened.
             wave.Error: `samplingRate` was rejected by the `wave` module.
         """
         self._clearFile(inputFilename)
         self._clearFile(outputFilename)
         self.fi = self._openWriter(inputFilename, samplingRate)
         try:
             self.fo = self._openWriter(outputFilename, samplingRate)
         except Exception:
             # Do not leak the already opened input writer when the output side fails.
             self.fi.close()
             raise
     def __enter__(self):
         return self
     def __exit__(self, exc_type, exc_value, traceback):
         self.close()
     @staticmethod
     def _openWriter(filename: str, samplingRate: int):
         """Open `filename` for writing as 1-channel, 2-byte-sample WAV at `samplingRate`."""
         writer = wave.open(filename, 'wb')
         try:
             writer.setnchannels(1)
             writer.setsampwidth(2)
             writer.setframerate(samplingRate)
         except Exception:
             try:
                 writer.close()
             except wave.Error:
                 # Closing a writer with an incomplete header raises; the configuration
                 # error re-raised below is the one the caller should see.
                 pass
             raise
         return writer
     def _clearFile(self, filename: str):
         """Remove `filename` if present, reporting on stdout which case occurred."""
         # Attempt the removal directly instead of checking first, so a file that vanishes
         # in between is reported as missing rather than raising.
         try:
             os.remove(filename)
         except FileNotFoundError:
             print("[IORecorder] old analyze file not exist.", filename)
         else:
             print("[IORecorder] delete old analyze file.", filename)
     def writeInput(self, wav):
         """Append the raw frame data `wav` to the input recording."""
         self.fi.writeframes(wav)
     def writeOutput(self, wav):
         """Append the raw frame data `wav` to the output recording."""
         self.fo.writeframes(wav)
     def close(self):
         """Close both recordings; the output file is closed even if the input close fails."""
         try:
             self.fi.close()
         finally:
             self.fo.close()
--- a/server/voice_changer/VoiceChanger.py
+++ b/server/voice_changer/VoiceChanger.py
@ -18,6 +18,9 @@ import pyworld as pw
 from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
 from voice_changer.MMVCv15 import MMVCv15
 from voice_changer.IORecorder import IORecorder
 from voice_changer.IOAnalyzer import IOAnalyzer
 import time
@ -34,53 +37,10 @@ import librosa.display
 SAMPLING_RATE = 24000
-class MockStream:
+STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
-    """gi
+STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
-    オーディオストリーミング入出力をファイル入出力にそのまま置き換えるためのモック
+STREAM_ANALYZE_FILE_DIO = os.path.join(TMP_DIR, "analyze-dio.png")
-    """
+STREAM_ANALYZE_FILE_HARVEST = os.path.join(TMP_DIR, "analyze-harvest.png")
    def __init__(self, sampling_rate):
        self.sampling_rate = sampling_rate
        self.start_count = 2
        self.end_count = 2
        self.fr = None
        self.fw = None
    def open_inputfile(self, input_filename):
        self.fr = wave.open(input_filename, 'rb')
    def open_outputfile(self, output_filename):
        self.fw = wave.open(output_filename, 'wb')
        self.fw.setnchannels(1)
        self.fw.setsampwidth(2)
        self.fw.setframerate(self.sampling_rate)
    def read(self, length, exception_on_overflow=False):
        if self.start_count > 0:
            wav = bytes(length * 2)
            self.start_count -= 1  # 最初の2回はダミーの空データ送る
        else:
            wav = self.fr.readframes(length)
        if len(wav) <= 0:  # データなくなってから最後の2回はダミーの空データを送る
            wav = bytes(length * 2)
            self.end_count -= 1
            if self.end_count < 0:
                Hyperparameters.VC_END_FLAG = True
        return wav
    def write(self, wav):
        self.fw.writeframes(wav)
    def stop_stream(self):
        pass
    def close(self):
        if self.fr != None:
            self.fr.close()
            self.fr = None
        if self.fw != None:
            self.fw.close()
            self.fw = None
@dataclass
@ -136,33 +96,6 @@ class VoiceChanger():
        print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
    def _setupRecordIO(self):
        """(Re)create the `stream_out` and `stream_in` recorders backed by WAV files in TMP_DIR.

        For each stream, any recorder already stored on the instance is closed, the old
        file is deleted if present, and a fresh `MockStream` opened for writing is stored
        in its place.
        """
        # IO Recorder Setup
        for attr_name, file_name in (("stream_out", "out.wav"), ("stream_in", "in.wav")):
            if hasattr(self, attr_name):
                getattr(self, attr_name).close()
            stream = MockStream(24000)
            stream_file = os.path.join(TMP_DIR, file_name)
            if os.path.exists(stream_file):
                print("delete old analyze file.", stream_file)
                os.remove(stream_file)
            else:
                # Report the path that was actually checked; the input branch used to
                # print the output file's path here.
                print("old analyze file not exist.", stream_file)
            # Both streams are opened as *output* files: they record what passes through.
            stream.open_outputfile(stream_file)
            setattr(self, attr_name, stream)
    def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
        self.settings.configFile = config
        self.hps = get_hparams_from_file(config)
@ -222,18 +155,6 @@ class VoiceChanger():
        return data
    def _get_f0_dio(self, y, sr=SAMPLING_RATE):
        """Return `(f0, time)` for `y` using `pw.dio` followed by `pw.stonemask`.

        The returned time axis is evenly spaced from 0 to `y.shape[0] / sr`, with one
        value per frame reported by `pw.dio`.
        """
        raw_f0, frame_positions = pw.dio(y, sr, frame_period=5)
        refined_f0 = pw.stonemask(y, raw_f0, frame_positions, sr)
        duration = y.shape[0] / sr
        frame_count = len(frame_positions)
        return refined_f0, np.linspace(0, duration, frame_count)
    def _get_f0_harvest(self, y, sr=SAMPLING_RATE):
        """Return `(f0, time)` for `y` using `pw.harvest` followed by `pw.stonemask`.

        The returned time axis is evenly spaced from 0 to `y.shape[0] / sr`, with one
        value per frame reported by `pw.harvest`.
        """
        raw_f0, frame_positions = pw.harvest(y, sr, frame_period=5)
        refined_f0 = pw.stonemask(y, raw_f0, frame_positions, sr)
        duration = y.shape[0] / sr
        frame_count = len(frame_positions)
        return refined_f0, np.linspace(0, duration, frame_count)
    def update_setteings(self, key: str, val: any):
        if key == "onnxExecutionProvider" and self.onnx_session != None:
            if val == "CUDAExecutionProvider":
@ -254,31 +175,22 @@ class VoiceChanger():
            if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
                self.unpackedData_length = 0
            if key == "recordIO" and val == 1:
-                self._setupRecordIO()
+                if hasattr(self, "ioRecorder"):
                    self.ioRecorder.close()
                self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate)
            if key == "recordIO" and val == 0:
                if hasattr(self, "ioRecorder"):
                    self.ioRecorder.close()
                pass
            if key == "recordIO" and val == 2:
                if hasattr(self, "ioRecorder"):
                    self.ioRecorder.close()
                if hasattr(self, "ioAnalyzer") == False:
                    self.ioAnalyzer = IOAnalyzer()
                try:
-                    stream_input_file = os.path.join(TMP_DIR, "in.wav")
+                    self.ioAnalyzer.analyze(STREAM_INPUT_FILE, STREAM_ANALYZE_FILE_DIO, STREAM_ANALYZE_FILE_HARVEST, self.settings.inputSampleRate)
                    analyze_file_dio = os.path.join(TMP_DIR, "analyze-dio.png")
                    analyze_file_harvest = os.path.join(TMP_DIR, "analyze-harvest.png")
                    y, sr = librosa.load(stream_input_file, SAMPLING_RATE)
                    y = y.astype(np.float64)
                    spec = librosa.amplitude_to_db(np.abs(librosa.stft(y, n_fft=2048, win_length=2048, hop_length=128)), ref=np.max)
                    f0_dio, times = self._get_f0_dio(y)
                    f0_harvest, times = self._get_f0_harvest(y)
                    pylab.close()
                    HOP_LENGTH = 128
                    img = librosa.display.specshow(spec, sr=SAMPLING_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', )
                    pylab.plot(times, f0_dio, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
                    pylab.savefig(analyze_file_dio)
                    pylab.close()
                    HOP_LENGTH = 128
                    img = librosa.display.specshow(spec, sr=SAMPLING_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', )
                    pylab.plot(times, f0_harvest, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
                    pylab.savefig(analyze_file_harvest)
                except Exception as e:
                    print("recordIO exception", e)
@ -462,8 +374,10 @@ class VoiceChanger():
            result = result.astype(np.int16)
            # print("on_request result size:",result.shape)
            if self.settings.recordIO == 1:
-                self.stream_in.write(unpackedData.astype(np.int16).tobytes())
+                # self.stream_in.write(unpackedData.astype(np.int16).tobytes())
-                self.stream_out.write(result.tobytes())
+                # self.stream_out.write(result.tobytes())
                self.ioRecorder.writeInput(unpackedData.astype(np.int16).tobytes())
                self.ioRecorder.writeOutput(result.tobytes())
            if self.settings.inputSampleRate != 24000:
                result = resampy.resample(result, 24000, 48000).astype(np.int16)