WIP: refactor, separate io analyzer, io recorder from main class

wataru 2023-03-07 22:30:48 +09:00
parent 18a87d9d24
commit f76fff5959
3 changed files with 100 additions and 109 deletions

View File: voice_changer/IOAnalyzer.py (new file)

@ -0,0 +1,40 @@
import os
import numpy as np
import pylab
import librosa
import librosa.display
import pyworld as pw


class IOAnalyzer:
    def _get_f0_dio(self, y, sr):
        # Coarse F0 estimation with DIO, refined with StoneMask.
        _f0, time = pw.dio(y, sr, frame_period=5)
        f0 = pw.stonemask(y, _f0, time, sr)
        time = np.linspace(0, y.shape[0] / sr, len(time))
        return f0, time

    def _get_f0_harvest(self, y, sr):
        # F0 estimation with Harvest, refined with StoneMask.
        _f0, time = pw.harvest(y, sr, frame_period=5)
        f0 = pw.stonemask(y, _f0, time, sr)
        time = np.linspace(0, y.shape[0] / sr, len(time))
        return f0, time

    def analyze(self, inputDataFile: str, dioImageFile: str, harvestImageFile: str, samplingRate: int):
        y, sr = librosa.load(inputDataFile, sr=samplingRate)
        y = y.astype(np.float64)
        spec = librosa.amplitude_to_db(np.abs(librosa.stft(y, n_fft=2048, win_length=2048, hop_length=128)), ref=np.max)
        f0_dio, times = self._get_f0_dio(y, sr=samplingRate)
        f0_harvest, times = self._get_f0_harvest(y, sr=samplingRate)

        HOP_LENGTH = 128

        # Spectrogram with the DIO F0 contour overlaid.
        pylab.close()
        librosa.display.specshow(spec, sr=samplingRate, hop_length=HOP_LENGTH, x_axis='time', y_axis='log')
        pylab.plot(times, f0_dio, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
        pylab.savefig(dioImageFile)

        # Spectrogram with the Harvest F0 contour overlaid.
        pylab.close()
        librosa.display.specshow(spec, sr=samplingRate, hop_length=HOP_LENGTH, x_axis='time', y_axis='log')
        pylab.plot(times, f0_harvest, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
        pylab.savefig(harvestImageFile)
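
For reference, a minimal usage sketch of the new IOAnalyzer class. The file names and the 48000 Hz rate below are hypothetical; in this commit the class is invoked from VoiceChanger when recordIO is set to 2, using the TMP_DIR-based paths.

from voice_changer.IOAnalyzer import IOAnalyzer

analyzer = IOAnalyzer()
# Hypothetical file names: analyze a previously recorded input WAV and
# write one spectrogram/F0 image per estimator (DIO and Harvest).
analyzer.analyze("in.wav", "analyze-dio.png", "analyze-harvest.png", 48000)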

View File: voice_changer/IORecorder.py (new file)

@ -0,0 +1,37 @@
import wave
import os


class IORecorder:
    def __init__(self, inputFilename: str, outputFilename: str, samplingRate: int):
        self._clearFile(inputFilename)
        self._clearFile(outputFilename)

        # 16-bit mono WAV writer for the raw input stream.
        self.fi = wave.open(inputFilename, 'wb')
        self.fi.setnchannels(1)
        self.fi.setsampwidth(2)
        self.fi.setframerate(samplingRate)

        # 16-bit mono WAV writer for the converted output stream.
        self.fo = wave.open(outputFilename, 'wb')
        self.fo.setnchannels(1)
        self.fo.setsampwidth(2)
        self.fo.setframerate(samplingRate)

    def _clearFile(self, filename: str):
        if os.path.exists(filename):
            print("[IORecorder] delete old analyze file.", filename)
            os.remove(filename)
        else:
            print("[IORecorder] old analyze file does not exist.", filename)

    def writeInput(self, wav):
        self.fi.writeframes(wav)

    def writeOutput(self, wav):
        self.fo.writeframes(wav)

    def close(self):
        self.fi.close()
        self.fo.close()
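
A minimal usage sketch of the new IORecorder class, assuming 24000 Hz mono int16 audio. The file names and the zero-filled chunk are hypothetical placeholders; in this commit the recorder is created in VoiceChanger when recordIO is set to 1 and fed from the conversion loop.

import numpy as np
from voice_changer.IORecorder import IORecorder

recorder = IORecorder("in.wav", "out.wav", 24000)
chunk = np.zeros(4096, dtype=np.int16)  # hypothetical placeholder for one audio block
recorder.writeInput(chunk.tobytes())    # microphone-side audio before conversion
recorder.writeOutput(chunk.tobytes())   # converted audio sent back to the client
recorder.close()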

View File: voice_changer/VoiceChanger.py

@ -18,6 +18,9 @@ import pyworld as pw
from voice_changer.client_modules import convert_continuos_f0, spectrogram_torch, TextAudioSpeakerCollate, get_hparams_from_file, load_checkpoint
from voice_changer.MMVCv15 import MMVCv15
from voice_changer.IORecorder import IORecorder
from voice_changer.IOAnalyzer import IOAnalyzer
import time
@ -34,53 +37,10 @@ import librosa.display
SAMPLING_RATE = 24000
class MockStream:
    """
    Mock that replaces the audio streaming input/output with plain file input/output.
    """
    def __init__(self, sampling_rate):
        self.sampling_rate = sampling_rate
        self.start_count = 2
        self.end_count = 2
        self.fr = None
        self.fw = None

    def open_inputfile(self, input_filename):
        self.fr = wave.open(input_filename, 'rb')

    def open_outputfile(self, output_filename):
        self.fw = wave.open(output_filename, 'wb')
        self.fw.setnchannels(1)
        self.fw.setsampwidth(2)
        self.fw.setframerate(self.sampling_rate)

    def read(self, length, exception_on_overflow=False):
        if self.start_count > 0:
            wav = bytes(length * 2)
            self.start_count -= 1  # send dummy empty data for the first two reads
        else:
            wav = self.fr.readframes(length)
        if len(wav) <= 0:  # after the data runs out, send dummy empty data for the last two reads
            wav = bytes(length * 2)
            self.end_count -= 1
            if self.end_count < 0:
                Hyperparameters.VC_END_FLAG = True
        return wav

    def write(self, wav):
        self.fw.writeframes(wav)

    def stop_stream(self):
        pass

    def close(self):
        if self.fr != None:
            self.fr.close()
            self.fr = None
        if self.fw != None:
            self.fw.close()
            self.fw = None
STREAM_INPUT_FILE = os.path.join(TMP_DIR, "in.wav")
STREAM_OUTPUT_FILE = os.path.join(TMP_DIR, "out.wav")
STREAM_ANALYZE_FILE_DIO = os.path.join(TMP_DIR, "analyze-dio.png")
STREAM_ANALYZE_FILE_HARVEST = os.path.join(TMP_DIR, "analyze-harvest.png")
@dataclass
@ -136,33 +96,6 @@ class VoiceChanger():
print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num}, mps_enabled:{self.mps_enabled})")
def _setupRecordIO(self):
# IO Recorder Setup
if hasattr(self, "stream_out"):
self.stream_out.close()
mock_stream_out = MockStream(24000)
stream_output_file = os.path.join(TMP_DIR, "out.wav")
if os.path.exists(stream_output_file):
print("delete old analyze file.", stream_output_file)
os.remove(stream_output_file)
else:
print("old analyze file not exist.", stream_output_file)
mock_stream_out.open_outputfile(stream_output_file)
self.stream_out = mock_stream_out
if hasattr(self, "stream_in"):
self.stream_in.close()
mock_stream_in = MockStream(24000)
stream_input_file = os.path.join(TMP_DIR, "in.wav")
if os.path.exists(stream_input_file):
print("delete old analyze file.", stream_input_file)
os.remove(stream_input_file)
else:
print("old analyze file not exist.", stream_output_file)
mock_stream_in.open_outputfile(stream_input_file)
self.stream_in = mock_stream_in
def loadModel(self, config: str, pyTorch_model_file: str = None, onnx_model_file: str = None):
self.settings.configFile = config
self.hps = get_hparams_from_file(config)
@ -222,18 +155,6 @@ class VoiceChanger():
        return data

    def _get_f0_dio(self, y, sr=SAMPLING_RATE):
        _f0, time = pw.dio(y, sr, frame_period=5)
        f0 = pw.stonemask(y, _f0, time, sr)
        time = np.linspace(0, y.shape[0] / sr, len(time))
        return f0, time

    def _get_f0_harvest(self, y, sr=SAMPLING_RATE):
        _f0, time = pw.harvest(y, sr, frame_period=5)
        f0 = pw.stonemask(y, _f0, time, sr)
        time = np.linspace(0, y.shape[0] / sr, len(time))
        return f0, time

    def update_setteings(self, key: str, val: any):
        if key == "onnxExecutionProvider" and self.onnx_session != None:
            if val == "CUDAExecutionProvider":
@ -254,31 +175,22 @@ class VoiceChanger():
if key == "crossFadeOffsetRate" or key == "crossFadeEndRate":
self.unpackedData_length = 0
if key == "recordIO" and val == 1:
self._setupRecordIO()
if hasattr(self, "ioRecorder"):
self.ioRecorder.close()
self.ioRecorder = IORecorder(STREAM_INPUT_FILE, STREAM_OUTPUT_FILE, self.settings.inputSampleRate)
if key == "recordIO" and val == 0:
if hasattr(self, "ioRecorder"):
self.ioRecorder.close()
pass
if key == "recordIO" and val == 2:
if hasattr(self, "ioRecorder"):
self.ioRecorder.close()
if hasattr(self, "ioAnalyzer") == False:
self.ioAnalyzer = IOAnalyzer()
try:
stream_input_file = os.path.join(TMP_DIR, "in.wav")
analyze_file_dio = os.path.join(TMP_DIR, "analyze-dio.png")
analyze_file_harvest = os.path.join(TMP_DIR, "analyze-harvest.png")
y, sr = librosa.load(stream_input_file, SAMPLING_RATE)
y = y.astype(np.float64)
spec = librosa.amplitude_to_db(np.abs(librosa.stft(y, n_fft=2048, win_length=2048, hop_length=128)), ref=np.max)
f0_dio, times = self._get_f0_dio(y)
f0_harvest, times = self._get_f0_harvest(y)
pylab.close()
HOP_LENGTH = 128
img = librosa.display.specshow(spec, sr=SAMPLING_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', )
pylab.plot(times, f0_dio, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
pylab.savefig(analyze_file_dio)
pylab.close()
HOP_LENGTH = 128
img = librosa.display.specshow(spec, sr=SAMPLING_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='log', )
pylab.plot(times, f0_harvest, label='f0', color=(0, 1, 1, 0.6), linewidth=3)
pylab.savefig(analyze_file_harvest)
self.ioAnalyzer.analyze(STREAM_INPUT_FILE, STREAM_ANALYZE_FILE_DIO, STREAM_ANALYZE_FILE_HARVEST, self.settings.inputSampleRate)
except Exception as e:
print("recordIO exception", e)
@ -462,8 +374,10 @@ class VoiceChanger():
        result = result.astype(np.int16)
        # print("on_request result size:", result.shape)
        if self.settings.recordIO == 1:
            self.stream_in.write(unpackedData.astype(np.int16).tobytes())
            self.stream_out.write(result.tobytes())
            # self.stream_in.write(unpackedData.astype(np.int16).tobytes())
            # self.stream_out.write(result.tobytes())
            self.ioRecorder.writeInput(unpackedData.astype(np.int16).tobytes())
            self.ioRecorder.writeOutput(result.tobytes())
        if self.settings.inputSampleRate != 24000:
            result = resampy.resample(result, 24000, 48000).astype(np.int16)