From b559582dc422eb47a2c48233d9543c8b44b72308 Mon Sep 17 00:00:00 2001 From: w-okada Date: Mon, 17 Jul 2023 22:21:58 +0900 Subject: [PATCH] bugfix WIP: server device mode --- server/voice_changer/DiffusionSVC/DiffusionSVC.py | 13 +++++++------ .../voice_changer/DiffusionSVC/pipeline/Pipeline.py | 2 -- server/voice_changer/RVC/RVC.py | 2 +- server/voice_changer/VoiceChanger.py | 6 ++++++ server/voice_changer/VoiceChangerManager.py | 4 ++-- server/voice_changer/VoiceChangerV2.py | 8 ++++++++ server/voice_changer/utils/VoiceChangerIF.py | 6 ++++++ 7 files changed, 30 insertions(+), 11 deletions(-) diff --git a/server/voice_changer/DiffusionSVC/DiffusionSVC.py b/server/voice_changer/DiffusionSVC/DiffusionSVC.py index 182a75c8..7f371eb2 100644 --- a/server/voice_changer/DiffusionSVC/DiffusionSVC.py +++ b/server/voice_changer/DiffusionSVC/DiffusionSVC.py @@ -42,7 +42,7 @@ class DiffusionSVC(VoiceChangerModel): # その他の設定 self.settings.tran = self.slotInfo.defaultTune self.settings.dstId = self.slotInfo.dstId - self.settings.kstep = self.slotInfo.defaultKstep + self.settings.kStep = self.slotInfo.defaultKstep print("[Voice Changer] [DiffusionSVC] Initializing... done") @@ -86,8 +86,8 @@ class DiffusionSVC(VoiceChangerModel): solaSearchFrame: int = 0, ): newData = newData.astype(np.float32) / 32768.0 # DiffusionSVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1) - - new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate # 100 は hubertのhosizeから (16000 / 160) + new_feature_length = int(((newData.shape[0] / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) # 100 は hubertのhosizeから (16000 / 160). 
+ # ↑ newData.shape[0] / inputSampleRate is the data length in seconds; multiplying by slotInfo.samplingRate gives the length in model samples, and dividing by 512 (presumably the hop size; confirm) gives the feature length. if self.audio_buffer is not None: # 過去のデータに連結 self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0) @@ -104,13 +104,14 @@ class DiffusionSVC(VoiceChangerModel): convertSize = convertSize + (128 - (convertSize % 128)) # バッファがたまっていない場合はzeroで補う + generateFeatureLength = int(((convertSize / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) + 1 if self.audio_buffer.shape[0] < convertSize: self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer]) - self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer]) - self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer]) + self.pitchf_buffer = np.concatenate([np.zeros(generateFeatureLength), self.pitchf_buffer]) + self.feature_buffer = np.concatenate([np.zeros([generateFeatureLength, self.slotInfo.embChannels]), self.feature_buffer]) convertOffset = -1 * convertSize - featureOffset = -convertSize * 100 // self.slotInfo.samplingRate + featureOffset = -1 * generateFeatureLength self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出 self.pitchf_buffer = self.pitchf_buffer[featureOffset:] self.feature_buffer = self.feature_buffer[featureOffset:] diff --git a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py index 1a266975..ba730e77 100644 --- a/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py +++ b/server/voice_changer/DiffusionSVC/pipeline/Pipeline.py @@ -143,8 +143,6 @@ class Pipeline(object): f0_up_key, silence_front=silence_front, ) -# def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0): - pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long() except IndexError as e: # 
NOQA raise NotEnoughDataExtimateF0() diff --git a/server/voice_changer/RVC/RVC.py b/server/voice_changer/RVC/RVC.py index e6adcfb7..87a578b6 100644 --- a/server/voice_changer/RVC/RVC.py +++ b/server/voice_changer/RVC/RVC.py @@ -101,7 +101,7 @@ class RVC(VoiceChangerModel): solaSearchFrame: int = 0, ): newData = newData.astype(np.float32) / 32768.0 # RVCのモデルのサンプリングレートで入ってきている。(extraDataLength, Crossfade等も同じSRで処理)(★1) - + # ↓ newData.shape[0] / samplingRate is the data length in seconds; multiplying by 16000 gives the length in HuBERT samples, and dividing by the hop size (160) gives the feats length, hence the factor 100. new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate if self.audio_buffer is not None: # 過去のデータに連結 diff --git a/server/voice_changer/VoiceChanger.py b/server/voice_changer/VoiceChanger.py index ad515609..d199f067 100755 --- a/server/voice_changer/VoiceChanger.py +++ b/server/voice_changer/VoiceChanger.py @@ -84,6 +84,12 @@ class VoiceChanger(VoiceChangerIF): def setModel(self, model: Any): self.voiceChanger = model + def setInputSampleRate(self, sr: int): + self.settings.inputSampleRate = sr + + def setOutputSampleRate(self, sr: int): + self.settings.outputSampleRate = sr + def get_info(self): data = asdict(self.settings) if self.voiceChanger is not None: diff --git a/server/voice_changer/VoiceChangerManager.py b/server/voice_changer/VoiceChangerManager.py index 91779539..23beeb2d 100644 --- a/server/voice_changer/VoiceChangerManager.py +++ b/server/voice_changer/VoiceChangerManager.py @@ -53,10 +53,10 @@ class VoiceChangerManager(ServerDeviceCallbacks): return self.voiceChanger.get_processing_sampling_rate() def setInputSamplingRate(self, sr: int): - self.voiceChanger.settings.inputSampleRate = sr + self.voiceChanger.setInputSampleRate(sr) def setOutputSamplingRate(self, sr: int): - self.voiceChanger.settings.outputSampleRate = sr + self.voiceChanger.setOutputSampleRate(sr) ############################ # VoiceChangerManager diff --git a/server/voice_changer/VoiceChangerV2.py b/server/voice_changer/VoiceChangerV2.py index 
b4c51802..1f3c7fe2 100644 --- a/server/voice_changer/VoiceChangerV2.py +++ b/server/voice_changer/VoiceChangerV2.py @@ -95,6 +95,14 @@ class VoiceChangerV2(VoiceChangerIF): self.voiceChanger = model self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + def setInputSampleRate(self, sr: int): + self.settings.inputSampleRate = sr + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + + def setOutputSampleRate(self, sr: int): + self.settings.outputSampleRate = sr + self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate) + def get_info(self): data = asdict(self.settings) if self.voiceChanger is not None: diff --git a/server/voice_changer/utils/VoiceChangerIF.py b/server/voice_changer/utils/VoiceChangerIF.py index 8063528e..2aa4c94e 100644 --- a/server/voice_changer/utils/VoiceChangerIF.py +++ b/server/voice_changer/utils/VoiceChangerIF.py @@ -25,3 +25,9 @@ class VoiceChangerIF(Protocol): def export2onnx() -> Any: ... + + def setInputSampleRate(self, sr: int): + ... + + def setOutputSampleRate(self, sr: int): + ...