bugfix WIP: server device mode

This commit is contained in:
w-okada 2023-07-17 22:21:58 +09:00
parent 371e1b8cac
commit b559582dc4
7 changed files with 30 additions and 11 deletions

View File

@ -42,7 +42,7 @@ class DiffusionSVC(VoiceChangerModel):
# その他の設定
self.settings.tran = self.slotInfo.defaultTune
self.settings.dstId = self.slotInfo.dstId
self.settings.kstep = self.slotInfo.defaultKstep
self.settings.kStep = self.slotInfo.defaultKstep
print("[Voice Changer] [DiffusionSVC] Initializing... done")
@ -86,8 +86,8 @@ class DiffusionSVC(VoiceChangerModel):
solaSearchFrame: int = 0,
):
newData = newData.astype(np.float32) / 32768.0 # DiffusionSVCのモデルのサンプリングレートで入ってきている。extraDataLength, Crossfade等も同じSRで処理(★1)
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate # 100 は hubertのhosizeから (16000 / 160)
new_feature_length = int(((newData.shape[0] / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) # 100 は hubertのhosizeから (16000 / 160).
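# ↑ newData.shape[0] / inputSampleRate is the chunk duration in seconds; multiplying by the model sampling rate gives the length in model samples, and dividing by 512 gives the number of feature frames (512 is presumably the model hop size - TODO confirm). The earlier formula used the HuBERT frame rate of 100 (= 16000 / 160).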
if self.audio_buffer is not None:
# 過去のデータに連結
self.audio_buffer = np.concatenate([self.audio_buffer, newData], 0)
@ -104,13 +104,14 @@ class DiffusionSVC(VoiceChangerModel):
convertSize = convertSize + (128 - (convertSize % 128))
# バッファがたまっていない場合はzeroで補う
generateFeatureLength = int(((convertSize / self.inputSampleRate) * self.slotInfo.samplingRate) / 512) + 1
if self.audio_buffer.shape[0] < convertSize:
self.audio_buffer = np.concatenate([np.zeros([convertSize]), self.audio_buffer])
self.pitchf_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate]), self.pitchf_buffer])
self.feature_buffer = np.concatenate([np.zeros([convertSize * 100 // self.slotInfo.samplingRate, self.slotInfo.embChannels]), self.feature_buffer])
self.pitchf_buffer = np.concatenate([np.zeros(generateFeatureLength), self.pitchf_buffer])
self.feature_buffer = np.concatenate([np.zeros([generateFeatureLength, self.slotInfo.embChannels]), self.feature_buffer])
convertOffset = -1 * convertSize
featureOffset = -convertSize * 100 // self.slotInfo.samplingRate
featureOffset = -1 * generateFeatureLength
self.audio_buffer = self.audio_buffer[convertOffset:] # 変換対象の部分だけ抽出
self.pitchf_buffer = self.pitchf_buffer[featureOffset:]
self.feature_buffer = self.feature_buffer[featureOffset:]

View File

@ -143,8 +143,6 @@ class Pipeline(object):
f0_up_key,
silence_front=silence_front,
)
# def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
except IndexError as e: # NOQA
raise NotEnoughDataExtimateF0()

View File

@ -101,7 +101,7 @@ class RVC(VoiceChangerModel):
solaSearchFrame: int = 0,
):
newData = newData.astype(np.float32) / 32768.0 # RVCのモデルのサンプリングレートで入ってきている。extraDataLength, Crossfade等も同じSRで処理(★1)
# ↑newData.shape[0]//sampleRate でデータ秒数。これに16000かけてhubertの世界でのデータ長。これにhop数(160)でわるとfeatsのデータサイズになる。
new_feature_length = newData.shape[0] * 100 // self.slotInfo.samplingRate
if self.audio_buffer is not None:
# 過去のデータに連結

View File

@ -84,6 +84,12 @@ class VoiceChanger(VoiceChangerIF):
def setModel(self, model: Any):
    """Attach ``model`` as the active voice-changer model.

    Args:
        model: The model object to store on ``self.voiceChanger``.
    """
    active_model = model
    self.voiceChanger = active_model
def setInputSampleRate(self, sr: int):
    """Record ``sr`` as the input sample rate in the settings object.

    Args:
        sr: Sample rate value to store in ``self.settings.inputSampleRate``.
    """
    settings = self.settings
    settings.inputSampleRate = sr
def setOutputSampleRate(self, sr: int):
    """Record ``sr`` as the output sample rate in the settings object.

    Args:
        sr: Sample rate value to store in ``self.settings.outputSampleRate``.
    """
    settings = self.settings
    settings.outputSampleRate = sr
def get_info(self):
data = asdict(self.settings)
if self.voiceChanger is not None:

View File

@ -53,10 +53,10 @@ class VoiceChangerManager(ServerDeviceCallbacks):
return self.voiceChanger.get_processing_sampling_rate()
def setInputSamplingRate(self, sr: int):
    """Forward the input sample rate to the active voice changer.

    The value goes through ``setInputSampleRate`` only, rather than also
    writing ``settings.inputSampleRate`` directly; the setters visible in
    this change already store the value in the settings object.

    Args:
        sr: Input sample rate to apply.
    """
    self.voiceChanger.setInputSampleRate(sr)
def setOutputSamplingRate(self, sr: int):
    """Forward the output sample rate to the active voice changer.

    The value goes through ``setOutputSampleRate`` only, rather than also
    writing ``settings.outputSampleRate`` directly; the setters visible in
    this change already store the value in the settings object.

    Args:
        sr: Output sample rate to apply.
    """
    self.voiceChanger.setOutputSampleRate(sr)
############################
# VoiceChangerManager

View File

@ -95,6 +95,14 @@ class VoiceChangerV2(VoiceChangerIF):
self.voiceChanger = model
self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
def setInputSampleRate(self, sr: int):
    """Store the input sample rate and pass both rates to the loaded model.

    Args:
        sr: Input sample rate to store in ``self.settings.inputSampleRate``.
    """
    self.settings.inputSampleRate = sr
    # A model may not be attached yet (other code in this class checks
    # ``self.voiceChanger`` against None); in that case only record the value.
    if self.voiceChanger is not None:
        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
def setOutputSampleRate(self, sr: int):
    """Store the output sample rate and pass both rates to the loaded model.

    Args:
        sr: Output sample rate to store in ``self.settings.outputSampleRate``.
    """
    self.settings.outputSampleRate = sr
    # A model may not be attached yet (other code in this class checks
    # ``self.voiceChanger`` against None); in that case only record the value.
    if self.voiceChanger is not None:
        self.voiceChanger.setSamplingRate(self.settings.inputSampleRate, self.settings.outputSampleRate)
def get_info(self):
data = asdict(self.settings)
if self.voiceChanger is not None:

View File

@ -25,3 +25,9 @@ class VoiceChangerIF(Protocol):
def export2onnx(self) -> Any:
    """Protocol stub for ONNX export; implementations supply the behaviour.

    Declared with ``self`` like the other methods of this protocol so that
    it describes an instance method.
    """
    ...
def setInputSampleRate(self, sr: int):
    """Protocol stub: accept the input sample rate ``sr``.

    The stub itself does nothing and returns ``None``.
    """
def setOutputSampleRate(self, sr: int):
    """Protocol stub: accept the output sample rate ``sr``.

    The stub itself does nothing and returns ``None``.
    """