Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-02-03 00:33:57 +03:00)
Diffusion SVC:
pitch extractor sr is changed from fixed (16k) to audio sample rate
This commit is contained in: parent e8aeb1eaa7, commit 371e1b8cac
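For orientation: the core of this commit is that each pitch extractor now derives its frame spacing (hop size) from the incoming audio's sample rate instead of assuming a fixed 16 kHz. A minimal sketch of that arithmetic, using the variable roles from the hunks below (the numeric values are illustrative assumptions, not taken from the code):

    # hop_size re-expresses the inferencer's frame spacing in input-rate samples,
    # so extracted f0 frames stay aligned with model frames at any input rate.
    block_size = 512    # assumed inferencer block size (the diff comments mention 512 at 44100)
    model_sr = 44100    # assumed inferencer sampling rate
    sr = 48000          # assumed input audio sample rate

    hop_size = block_size * sr / model_sr
    print(hop_size)     # ~557.28 input samples per f0 frame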
client/demo/dist/index.js (vendored): 2 lines changed. File diff suppressed because one or more lines are too long.
client/demo/package-lock.json (generated): 2712 lines changed. File diff suppressed because it is too large.
client/demo/package.json:
@@ -21,21 +21,21 @@
     "author": "wataru.okada@flect.co.jp",
     "license": "ISC",
     "devDependencies": {
-        "@babel/core": "^7.22.8",
+        "@babel/core": "^7.22.9",
-        "@babel/plugin-transform-runtime": "^7.22.7",
+        "@babel/plugin-transform-runtime": "^7.22.9",
-        "@babel/preset-env": "^7.22.7",
+        "@babel/preset-env": "^7.22.9",
         "@babel/preset-react": "^7.22.5",
         "@babel/preset-typescript": "^7.22.5",
-        "@types/node": "^20.4.1",
+        "@types/node": "^20.4.2",
-        "@types/react": "^18.2.14",
+        "@types/react": "^18.2.15",
-        "@types/react-dom": "^18.2.6",
+        "@types/react-dom": "^18.2.7",
         "autoprefixer": "^10.4.14",
         "babel-loader": "^9.1.3",
         "copy-webpack-plugin": "^11.0.0",
         "css-loader": "^6.8.1",
-        "eslint": "^8.44.0",
+        "eslint": "^8.45.0",
         "eslint-config-prettier": "^8.8.0",
-        "eslint-plugin-prettier": "^4.2.1",
+        "eslint-plugin-prettier": "^5.0.0",
         "eslint-plugin-react": "^7.32.2",
         "eslint-webpack-plugin": "^4.0.1",
         "html-loader": "^4.2.0",
@@ -54,12 +54,13 @@
         "webpack-dev-server": "^4.15.1"
     },
     "dependencies": {
-        "@dannadori/voice-changer-client-js": "^1.0.160",
+        "@dannadori/voice-changer-client-js": "^1.0.161",
         "@fortawesome/fontawesome-svg-core": "^6.4.0",
         "@fortawesome/free-brands-svg-icons": "^6.4.0",
         "@fortawesome/free-regular-svg-icons": "^6.4.0",
         "@fortawesome/free-solid-svg-icons": "^6.4.0",
         "@fortawesome/react-fontawesome": "^0.2.0",
+        "protobufjs": "^7.2.4",
         "react": "^18.2.0",
         "react-dom": "^18.2.0"
     }
client/lib/package-lock.json (generated): 2635 lines changed. File diff suppressed because it is too large.
client/lib/package.json:
@@ -1,6 +1,6 @@
 {
     "name": "@dannadori/voice-changer-client-js",
-    "version": "1.0.160",
+    "version": "1.0.161",
     "description": "",
     "main": "dist/index.js",
     "directories": {
@@ -27,15 +27,15 @@
     "license": "ISC",
     "devDependencies": {
         "@types/audioworklet": "^0.0.48",
-        "@types/node": "^20.4.1",
+        "@types/node": "^20.4.2",
-        "@types/react": "18.2.14",
+        "@types/react": "18.2.15",
-        "@types/react-dom": "18.2.6",
+        "@types/react-dom": "18.2.7",
-        "eslint": "^8.44.0",
+        "eslint": "^8.45.0",
         "eslint-config-prettier": "^8.8.0",
-        "eslint-plugin-prettier": "^4.2.1",
+        "eslint-plugin-prettier": "^5.0.0",
-        "eslint-plugin-react": "^7.25.3",
+        "eslint-plugin-react": "^7.32.2",
         "eslint-webpack-plugin": "^4.0.1",
-        "npm-run-all": "^4.1.2",
+        "npm-run-all": "^4.1.5",
         "prettier": "^3.0.0",
         "raw-loader": "^4.0.2",
         "rimraf": "^5.0.1",
@@ -47,9 +47,10 @@
     },
     "dependencies": {
         "@types/readable-stream": "^2.3.15",
-        "amazon-chime-sdk-js": "^2.7.0",
+        "amazon-chime-sdk-js": "^3.15.0",
         "buffer": "^6.0.3",
         "localforage": "^1.10.0",
+        "protobufjs": "^7.2.4",
         "react": "^18.2.0",
         "react-dom": "^18.2.0",
         "socket.io-client": "^4.7.1"
DiffusionSVC.py:
@@ -156,6 +156,7 @@ class DiffusionSVC(VoiceChangerModel):
         audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
             sid,
             audio,
+            self.inputSampleRate,
             pitchf,
             feature,
             f0_up_key,
Pipeline.py:
@@ -104,6 +104,7 @@ class Pipeline(object):
         self,
         sid,
         audio,  # torch.tensor [n]
+        sr,
         pitchf,  # np.array [m]
         feature,  # np.array [m, feat]
         f0_up_key,
@@ -126,13 +127,23 @@ class Pipeline(object):
         with Timer("pre-process") as t:
             # pitch detection
             try:
+                # pitch = self.pitchExtractor.extract(
+                #     audio16k.squeeze(),
+                #     pitchf,
+                #     f0_up_key,
+                #     int(self.hop_size),  # processing window size (512 at 44100)
+                #     silence_front=silence_front,
+                # )
                 pitch = self.pitchExtractor.extract(
-                    audio16k.squeeze(),
+                    audio,
+                    sr,
+                    self.inferencer_block_size,
+                    self.inferencer_sampling_rate,
                     pitchf,
                     f0_up_key,
-                    int(self.hop_size),  # processing window size (512 at 44100)
                     silence_front=silence_front,
                 )
+                # def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
 
                 pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
             except IndexError as e:  # NOQA
CrepeOnnxPitchExtractor.py:
@@ -3,9 +3,9 @@ from const import PitchExtractorType
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 import onnxruntime
-import torch
 
 from voice_changer.RVC.pitchExtractor import onnxcrepe
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class CrepeOnnxPitchExtractor(PitchExtractor):
@@ -26,18 +26,20 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
         self.sapmle_rate = 16000
         self.uv_interp = True
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
-
-        precision = (1000 * window / self.sapmle_rate)
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        hop_size = block_size * sr / model_sr
+
+        offset_frame_number = silence_front * sr
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / sr  # seconds
+        audio = audio[int(np.round(real_silence_front * sr)):].astype(np.float32)
+
+        precision = (1000 * hop_size / sr)
 
-        audio_num = audio.cpu()
         onnx_f0, onnx_pd = onnxcrepe.predict(
             self.onnx_session,
-            audio_num,
+            audio,
-            self.sapmle_rate,
+            sr,
             precision=precision,
             fmin=self.f0_min,
             fmax=self.f0_max,
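The silence-front handling above first rounds the skip region down to a whole number of f0 frames, then slices the audio accordingly. A worked example of that bookkeeping under assumed values (0.2 s of leading silence, 48 kHz input, the same hop_size as in the overview sketch):

    sr = 48000
    hop_size = 512 * sr / 44100                          # ~557.28 samples per frame
    silence_front = 0.2                                  # seconds to skip

    offset_frame_number = silence_front * sr             # 9600.0 samples
    start_frame = int(offset_frame_number / hop_size)    # 17 whole frames
    real_silence_front = start_frame * hop_size / sr     # ~0.1974 s actually skipped
    skip = int(round(real_silence_front * sr))           # 9474 samples sliced off the front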
CrepePitchExtractor.py:
@@ -3,6 +3,7 @@ import torch
 import numpy as np
 from const import PitchExtractorType
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class CrepePitchExtractor(PitchExtractor):
@@ -12,22 +13,25 @@ class CrepePitchExtractor(PitchExtractor):
         self.pitchExtractorType: PitchExtractorType = "crepe"
         self.f0_min = 50
         self.f0_max = 1100
-        self.sapmle_rate = 16000
         self.uv_interp = True
         if torch.cuda.is_available():
             self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
         else:
             self.device = torch.device("cpu")
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        hop_size = block_size * sr / model_sr
+        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
+
+        offset_frame_number = silence_front * 16000
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / 16000  # seconds
+        audio_t = audio_t[:, int(np.round(real_silence_front * 16000)):]
 
         f0, pd = torchcrepe.predict(
-            audio.unsqueeze(0),
+            audio_t,
-            self.sapmle_rate,
+            sr,
-            hop_length=window,
+            hop_length=hop_size,
             fmin=self.f0_min,
             fmax=self.f0_max,
             # model="tiny",
DioPitchExtractor.py:
@@ -1,9 +1,9 @@
 import pyworld
 import numpy as np
 from const import PitchExtractorType
-import torch
 
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class DioPitchExtractor(PitchExtractor):
@@ -13,25 +13,28 @@ class DioPitchExtractor(PitchExtractor):
         self.pitchExtractorType: PitchExtractorType = "dio"
         self.f0_min = 50
         self.f0_max = 1100
-        self.sapmle_rate = 16000
+        # self.sapmle_rate = 44100
+        # self.sapmle_rate = 16000
         self.uv_interp = True
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        audio = audio.detach().cpu().numpy()
-        silence_front = 0  # TODO: workaround for failing to pick up pitch when the chunk size is small
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        silence_front: int = 0.  # TODO: workaround for failing to pick up pitch when the chunk size is small
+        hop_size = block_size * sr / model_sr
+
+        offset_frame_number = silence_front * sr
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / sr  # seconds
+        audio = audio[int(np.round(real_silence_front * sr)):]
 
         _f0, t = pyworld.dio(
             audio.astype(np.double),
-            self.sapmle_rate,
+            sr,
             f0_floor=self.f0_min,
             f0_ceil=self.f0_max,
             channels_in_octave=2,
-            frame_period=(1000 * window / self.sapmle_rate)
+            frame_period=(1000 * hop_size / sr)
         )
-        f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, self.sapmle_rate)
+        f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
         pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
         f0 = pitch
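pyworld takes its frame spacing in milliseconds, hence frame_period = 1000 * hop_size / sr. That value depends only on block_size and model_sr, so the f0 frame rate stays pinned to the model regardless of the input rate. A quick check (values assumed):

    # frame_period is ~11.61 ms for every input rate, since sr cancels out:
    for sr in (16000, 44100, 48000):
        hop_size = 512 * sr / 44100
        print(sr, 1000 * hop_size / sr)   # 1000 * 512 / 44100 ≈ 11.61 in all cases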
HarvestPitchExtractor.py:
@@ -1,9 +1,9 @@
 import pyworld
 import numpy as np
 from const import PitchExtractorType
-import torch
 
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class HarvestPitchExtractor(PitchExtractor):
@@ -13,20 +13,22 @@ class HarvestPitchExtractor(PitchExtractor):
         self.pitchExtractorType: PitchExtractorType = "harvest"
         self.f0_min = 50
         self.f0_max = 1100
-        self.sapmle_rate = 16000
         self.uv_interp = True
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        audio = audio.detach().cpu().numpy()
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        hop_size = block_size * sr / model_sr
+
+        offset_frame_number = silence_front * sr
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / sr  # seconds
+        audio = audio[int(np.round(real_silence_front * sr)):]
 
         f0, _ = pyworld.harvest(
             audio.astype('double'),
-            self.sapmle_rate,
+            sr,
             f0_floor=self.f0_min,
             f0_ceil=self.f0_max,
-            frame_period=(1000 * window / self.sapmle_rate))
+            frame_period=(1000 * hop_size / sr))
         pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
         f0 = pitch
PitchExtractor.py:
@@ -1,9 +1,11 @@
 from typing import Protocol
 
+from voice_changer.utils.VoiceChangerModel import AudioInOut
+
 
 class PitchExtractor(Protocol):
 
-    def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
         ...
 
     def getPitchExtractorInfo(self):
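Because PitchExtractor is a typing.Protocol, conformance is structural: any class with a matching extract method satisfies it. A hypothetical toy extractor against the new signature (everything except the signature itself is made up for illustration):

    import numpy as np

    class ConstantPitchExtractor:
        """Toy conforming extractor: a flat 440 Hz contour shifted by f0_up_key semitones."""

        def extract(self, audio, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
            hop_size = block_size * sr / model_sr
            n_frames = int(len(audio) // hop_size) + 1
            f0 = np.full(n_frames, 440.0 * 2 ** (f0_up_key / 12))
            # same tail-aligned buffer-update pattern the real extractors use:
            pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
            return pitch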
RMVPEPitchExtractor.py:
@@ -1,3 +1,4 @@
+from torchaudio.transforms import Resample
 import torch
 import numpy as np
 from const import PitchExtractorType
@@ -5,6 +6,8 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
 from voice_changer.DiffusionSVC.pitchExtractor.rmvpe.rmvpe import RMVPE
 from scipy.ndimage import zoom
+
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class RMVPEPitchExtractor(PitchExtractor):
@@ -13,8 +16,8 @@ class RMVPEPitchExtractor(PitchExtractor):
         self.pitchExtractorType: PitchExtractorType = "rmvpe"
         self.f0_min = 50
         self.f0_max = 1100
-        self.sapmle_rate = 16000
         self.uv_interp = True
+        self.input_sr = -1
         if torch.cuda.is_available() and gpu >= 0:
             self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
         else:
|
|||||||
|
|
||||||
self.rmvpe = RMVPE(model_path=file, is_half=False, device=self.device)
|
self.rmvpe = RMVPE(model_path=file, is_half=False, device=self.device)
|
||||||
|
|
||||||
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
|
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
|
||||||
start_frame = int(silence_front * self.sapmle_rate / window)
|
if sr != self.input_sr:
|
||||||
real_silence_front = start_frame * window / self.sapmle_rate
|
self.resamle = Resample(sr, 16000, dtype=torch.int16).to(self.device)
|
||||||
|
self.input_sr = sr
|
||||||
|
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
|
||||||
|
audio_t = self.resamle(audio_t)
|
||||||
|
hop_size = 160 # RMVPE固定
|
||||||
|
|
||||||
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
|
offset_frame_number = silence_front * 16000
|
||||||
silented_frames = int(audio.size(0) // window) + 1
|
start_frame = int(offset_frame_number / hop_size) # frame
|
||||||
|
real_silence_front = start_frame * hop_size / 16000 # 秒
|
||||||
|
audio_t = audio_t[:, int(np.round(real_silence_front * 16000)):]
|
||||||
|
|
||||||
f0 = self.rmvpe.infer_from_audio_t(audio, thred=0.03)
|
f0 = self.rmvpe.infer_from_audio_t(audio_t.squeeze(), thred=0.03)
|
||||||
# f0, pd = torchcrepe.predict(
|
|
||||||
# audio.unsqueeze(0),
|
desired_hop_size = block_size * 16000 / model_sr
|
||||||
# self.sapmle_rate,
|
desired_f0_length = int(audio_t.shape[1] // desired_hop_size) + 1
|
||||||
# hop_length=window,
|
resize_factor = desired_f0_length / len(f0)
|
||||||
# fmin=self.f0_min,
|
|
||||||
# fmax=self.f0_max,
|
|
||||||
# # model="tiny",
|
|
||||||
# model="full",
|
|
||||||
# batch_size=256,
|
|
||||||
# decoder=torchcrepe.decode.weighted_argmax,
|
|
||||||
# device=self.device,
|
|
||||||
# return_periodicity=True,
|
|
||||||
# )
|
|
||||||
# f0 = torchcrepe.filter.median(f0, 3) # 本家だとmeanですが、harvestに合わせmedianフィルタ
|
|
||||||
# pd = torchcrepe.filter.median(pd, 3)
|
|
||||||
# f0[pd < 0.1] = 0
|
|
||||||
# f0 = f0.squeeze()
|
|
||||||
resize_factor = silented_frames / len(f0)
|
|
||||||
f0 = zoom(f0, resize_factor, order=0)
|
f0 = zoom(f0, resize_factor, order=0)
|
||||||
|
|
||||||
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
|
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
|
||||||
|
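RMVPE itself always runs on 16 kHz audio with a fixed hop of 160 samples, so this extractor resamples the input once (caching the Resample transform per input rate) and then stretches the raw f0 series to the frame count the pipeline expects. The resize arithmetic, with assumed values (1 s of audio, block_size=512, model_sr=44100):

    n_16k = 16000                                            # samples after resampling to 16 kHz
    f0_len = n_16k // 160 + 1                                # RMVPE output frames -> 101

    desired_hop_size = 512 * 16000 / 44100                   # ~185.76 samples at 16 kHz
    desired_f0_length = int(n_16k // desired_hop_size) + 1   # 87 frames expected
    resize_factor = desired_f0_length / f0_len               # ~0.861, passed to scipy.ndimage.zoom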
rmvpe/rmvpe.py:
@@ -240,7 +240,6 @@ class E2E(nn.Module):
         )
         self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
         if n_gru:
-            print("N_GRUE")
             self.fc = nn.Sequential(
                 BiGRU(3 * 128, 256, n_gru),
                 nn.Linear(512, 360),