mirror of
https://github.com/w-okada/voice-changer.git
synced 2025-01-23 13:35:12 +03:00
Diffusion SVC:
pitch extractor sr is changed from fixed (16k) to the audio sample rate
This commit is contained in:
parent
e8aeb1eaa7
commit
371e1b8cac
2
client/demo/dist/index.js
vendored
2
client/demo/dist/index.js
vendored
File diff suppressed because one or more lines are too long
2712
client/demo/package-lock.json
generated
2712
client/demo/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -21,21 +21,21 @@
|
||||
"author": "wataru.okada@flect.co.jp",
|
||||
"license": "ISC",
|
||||
"devDependencies": {
|
||||
"@babel/core": "^7.22.8",
|
||||
"@babel/plugin-transform-runtime": "^7.22.7",
|
||||
"@babel/preset-env": "^7.22.7",
|
||||
"@babel/core": "^7.22.9",
|
||||
"@babel/plugin-transform-runtime": "^7.22.9",
|
||||
"@babel/preset-env": "^7.22.9",
|
||||
"@babel/preset-react": "^7.22.5",
|
||||
"@babel/preset-typescript": "^7.22.5",
|
||||
"@types/node": "^20.4.1",
|
||||
"@types/react": "^18.2.14",
|
||||
"@types/react-dom": "^18.2.6",
|
||||
"@types/node": "^20.4.2",
|
||||
"@types/react": "^18.2.15",
|
||||
"@types/react-dom": "^18.2.7",
|
||||
"autoprefixer": "^10.4.14",
|
||||
"babel-loader": "^9.1.3",
|
||||
"copy-webpack-plugin": "^11.0.0",
|
||||
"css-loader": "^6.8.1",
|
||||
"eslint": "^8.44.0",
|
||||
"eslint": "^8.45.0",
|
||||
"eslint-config-prettier": "^8.8.0",
|
||||
"eslint-plugin-prettier": "^4.2.1",
|
||||
"eslint-plugin-prettier": "^5.0.0",
|
||||
"eslint-plugin-react": "^7.32.2",
|
||||
"eslint-webpack-plugin": "^4.0.1",
|
||||
"html-loader": "^4.2.0",
|
||||
@ -54,12 +54,13 @@
|
||||
"webpack-dev-server": "^4.15.1"
|
||||
},
|
||||
"dependencies": {
|
||||
"@dannadori/voice-changer-client-js": "^1.0.160",
|
||||
"@dannadori/voice-changer-client-js": "^1.0.161",
|
||||
"@fortawesome/fontawesome-svg-core": "^6.4.0",
|
||||
"@fortawesome/free-brands-svg-icons": "^6.4.0",
|
||||
"@fortawesome/free-regular-svg-icons": "^6.4.0",
|
||||
"@fortawesome/free-solid-svg-icons": "^6.4.0",
|
||||
"@fortawesome/react-fontawesome": "^0.2.0",
|
||||
"protobufjs": "^7.2.4",
|
||||
"react": "^18.2.0",
|
||||
"react-dom": "^18.2.0"
|
||||
}
|
||||
|
2635
client/lib/package-lock.json
generated
2635
client/lib/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@dannadori/voice-changer-client-js",
|
||||
"version": "1.0.160",
|
||||
"version": "1.0.161",
|
||||
"description": "",
|
||||
"main": "dist/index.js",
|
||||
"directories": {
|
||||
@ -27,15 +27,15 @@
|
||||
"license": "ISC",
|
||||
"devDependencies": {
|
||||
"@types/audioworklet": "^0.0.48",
|
||||
"@types/node": "^20.4.1",
|
||||
"@types/react": "18.2.14",
|
||||
"@types/react-dom": "18.2.6",
|
||||
"eslint": "^8.44.0",
|
||||
"@types/node": "^20.4.2",
|
||||
"@types/react": "18.2.15",
|
||||
"@types/react-dom": "18.2.7",
|
||||
"eslint": "^8.45.0",
|
||||
"eslint-config-prettier": "^8.8.0",
|
||||
"eslint-plugin-prettier": "^4.2.1",
|
||||
"eslint-plugin-react": "^7.25.3",
|
||||
"eslint-plugin-prettier": "^5.0.0",
|
||||
"eslint-plugin-react": "^7.32.2",
|
||||
"eslint-webpack-plugin": "^4.0.1",
|
||||
"npm-run-all": "^4.1.2",
|
||||
"npm-run-all": "^4.1.5",
|
||||
"prettier": "^3.0.0",
|
||||
"raw-loader": "^4.0.2",
|
||||
"rimraf": "^5.0.1",
|
||||
@ -47,9 +47,10 @@
|
||||
},
|
||||
"dependencies": {
|
||||
"@types/readable-stream": "^2.3.15",
|
||||
"amazon-chime-sdk-js": "^2.7.0",
|
||||
"amazon-chime-sdk-js": "^3.15.0",
|
||||
"buffer": "^6.0.3",
|
||||
"localforage": "^1.10.0",
|
||||
"protobufjs": "^7.2.4",
|
||||
"react": "^18.2.0",
|
||||
"react-dom": "^18.2.0",
|
||||
"socket.io-client": "^4.7.1"
|
||||
|
@ -156,6 +156,7 @@ class DiffusionSVC(VoiceChangerModel):
|
||||
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
|
||||
sid,
|
||||
audio,
|
||||
self.inputSampleRate,
|
||||
pitchf,
|
||||
feature,
|
||||
f0_up_key,
|
||||
|
@ -104,6 +104,7 @@ class Pipeline(object):
|
||||
self,
|
||||
sid,
|
||||
audio, # torch.tensor [n]
|
||||
sr,
|
||||
pitchf, # np.array [m]
|
||||
feature, # np.array [m, feat]
|
||||
f0_up_key,
|
||||
@ -126,13 +127,23 @@ class Pipeline(object):
|
||||
with Timer("pre-process") as t:
|
||||
# ピッチ検出
|
||||
try:
|
||||
# pitch = self.pitchExtractor.extract(
|
||||
# audio16k.squeeze(),
|
||||
# pitchf,
|
||||
# f0_up_key,
|
||||
# int(self.hop_size), # 処理のwindowサイズ (44100における512)
|
||||
# silence_front=silence_front,
|
||||
# )
|
||||
pitch = self.pitchExtractor.extract(
|
||||
audio16k.squeeze(),
|
||||
audio,
|
||||
sr,
|
||||
self.inferencer_block_size,
|
||||
self.inferencer_sampling_rate,
|
||||
pitchf,
|
||||
f0_up_key,
|
||||
int(self.hop_size), # 処理のwindowサイズ (44100における512)
|
||||
silence_front=silence_front,
|
||||
)
|
||||
# def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
|
||||
|
||||
pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
|
||||
except IndexError as e: # NOQA
|
||||
|
@ -3,9 +3,9 @@ from const import PitchExtractorType
|
||||
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
|
||||
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
|
||||
import onnxruntime
|
||||
import torch
|
||||
|
||||
from voice_changer.RVC.pitchExtractor import onnxcrepe
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
|
||||
|
||||
class CrepeOnnxPitchExtractor(PitchExtractor):
|
||||
@ -26,18 +26,20 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
|
||||
self.sapmle_rate = 16000
|
||||
self.uv_interp = True
|
||||
|
||||
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
|
||||
start_frame = int(silence_front * self.sapmle_rate / window)
|
||||
real_silence_front = start_frame * window / self.sapmle_rate
|
||||
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
|
||||
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
|
||||
hop_size = block_size * sr / model_sr
|
||||
|
||||
precision = (1000 * window / self.sapmle_rate)
|
||||
offset_frame_number = silence_front * sr
|
||||
start_frame = int(offset_frame_number / hop_size) # frame
|
||||
real_silence_front = start_frame * hop_size / sr # 秒
|
||||
audio = audio[int(np.round(real_silence_front * sr)):].astype(np.float32)
|
||||
|
||||
precision = (1000 * hop_size / sr)
|
||||
|
||||
audio_num = audio.cpu()
|
||||
onnx_f0, onnx_pd = onnxcrepe.predict(
|
||||
self.onnx_session,
|
||||
audio_num,
|
||||
self.sapmle_rate,
|
||||
audio,
|
||||
sr,
|
||||
precision=precision,
|
||||
fmin=self.f0_min,
|
||||
fmax=self.f0_max,
|
||||
|
@ -3,6 +3,7 @@ import torch
|
||||
import numpy as np
|
||||
from const import PitchExtractorType
|
||||
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
|
||||
|
||||
class CrepePitchExtractor(PitchExtractor):
|
||||
@ -12,22 +13,25 @@ class CrepePitchExtractor(PitchExtractor):
|
||||
self.pitchExtractorType: PitchExtractorType = "crepe"
|
||||
self.f0_min = 50
|
||||
self.f0_max = 1100
|
||||
self.sapmle_rate = 16000
|
||||
self.uv_interp = True
|
||||
if torch.cuda.is_available():
|
||||
self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
|
||||
else:
|
||||
self.device = torch.device("cpu")
|
||||
|
||||
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
|
||||
start_frame = int(silence_front * self.sapmle_rate / window)
|
||||
real_silence_front = start_frame * window / self.sapmle_rate
|
||||
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
|
||||
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
|
||||
hop_size = block_size * sr / model_sr
|
||||
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
|
||||
|
||||
offset_frame_number = silence_front * 16000
|
||||
start_frame = int(offset_frame_number / hop_size) # frame
|
||||
real_silence_front = start_frame * hop_size / 16000 # 秒
|
||||
audio_t = audio_t[:, int(np.round(real_silence_front * 16000)):]
|
||||
|
||||
f0, pd = torchcrepe.predict(
|
||||
audio.unsqueeze(0),
|
||||
self.sapmle_rate,
|
||||
hop_length=window,
|
||||
audio_t,
|
||||
sr,
|
||||
hop_length=hop_size,
|
||||
fmin=self.f0_min,
|
||||
fmax=self.f0_max,
|
||||
# model="tiny",
|
||||
|
@ -1,9 +1,9 @@
|
||||
import pyworld
|
||||
import numpy as np
|
||||
from const import PitchExtractorType
|
||||
import torch
|
||||
|
||||
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
|
||||
|
||||
class DioPitchExtractor(PitchExtractor):
|
||||
@ -13,25 +13,28 @@ class DioPitchExtractor(PitchExtractor):
|
||||
self.pitchExtractorType: PitchExtractorType = "dio"
|
||||
self.f0_min = 50
|
||||
self.f0_max = 1100
|
||||
self.sapmle_rate = 16000
|
||||
# self.sapmle_rate = 44100
|
||||
# self.sapmle_rate = 16000
|
||||
self.uv_interp = True
|
||||
|
||||
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
|
||||
audio = audio.detach().cpu().numpy()
|
||||
silence_front = 0 # TODO: chunkサイズが小さいときに音程を取れなくなる対策
|
||||
start_frame = int(silence_front * self.sapmle_rate / window)
|
||||
real_silence_front = start_frame * window / self.sapmle_rate
|
||||
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
|
||||
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
|
||||
silence_front: int = 0. # TODO: chunkサイズが小さいときに音程を取れなくなる対策
|
||||
hop_size = block_size * sr / model_sr
|
||||
|
||||
offset_frame_number = silence_front * sr
|
||||
start_frame = int(offset_frame_number / hop_size) # frame
|
||||
real_silence_front = start_frame * hop_size / sr # 秒
|
||||
audio = audio[int(np.round(real_silence_front * sr)):]
|
||||
|
||||
_f0, t = pyworld.dio(
|
||||
audio.astype(np.double),
|
||||
self.sapmle_rate,
|
||||
sr,
|
||||
f0_floor=self.f0_min,
|
||||
f0_ceil=self.f0_max,
|
||||
channels_in_octave=2,
|
||||
frame_period=(1000 * window / self.sapmle_rate)
|
||||
frame_period=(1000 * hop_size / sr)
|
||||
)
|
||||
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, self.sapmle_rate)
|
||||
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
|
||||
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
|
||||
f0 = pitch
|
||||
|
||||
|
@ -1,9 +1,9 @@
|
||||
import pyworld
|
||||
import numpy as np
|
||||
from const import PitchExtractorType
|
||||
import torch
|
||||
|
||||
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
|
||||
|
||||
class HarvestPitchExtractor(PitchExtractor):
|
||||
@ -13,20 +13,22 @@ class HarvestPitchExtractor(PitchExtractor):
|
||||
self.pitchExtractorType: PitchExtractorType = "harvest"
|
||||
self.f0_min = 50
|
||||
self.f0_max = 1100
|
||||
self.sapmle_rate = 16000
|
||||
self.uv_interp = True
|
||||
|
||||
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
|
||||
audio = audio.detach().cpu().numpy()
|
||||
start_frame = int(silence_front * self.sapmle_rate / window)
|
||||
real_silence_front = start_frame * window / self.sapmle_rate
|
||||
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
|
||||
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
|
||||
hop_size = block_size * sr / model_sr
|
||||
|
||||
offset_frame_number = silence_front * sr
|
||||
start_frame = int(offset_frame_number / hop_size) # frame
|
||||
real_silence_front = start_frame * hop_size / sr # 秒
|
||||
audio = audio[int(np.round(real_silence_front * sr)):]
|
||||
|
||||
f0, _ = pyworld.harvest(
|
||||
audio.astype('double'),
|
||||
self.sapmle_rate,
|
||||
sr,
|
||||
f0_floor=self.f0_min,
|
||||
f0_ceil=self.f0_max,
|
||||
frame_period=(1000 * window / self.sapmle_rate))
|
||||
frame_period=(1000 * hop_size / sr))
|
||||
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
|
||||
f0 = pitch
|
||||
|
||||
|
@ -1,9 +1,11 @@
|
||||
from typing import Protocol
|
||||
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
|
||||
|
||||
class PitchExtractor(Protocol):
|
||||
|
||||
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
|
||||
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
|
||||
...
|
||||
|
||||
def getPitchExtractorInfo(self):
|
||||
|
@ -1,3 +1,4 @@
|
||||
from torchaudio.transforms import Resample
|
||||
import torch
|
||||
import numpy as np
|
||||
from const import PitchExtractorType
|
||||
@ -5,6 +6,8 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtrac
|
||||
from voice_changer.DiffusionSVC.pitchExtractor.rmvpe.rmvpe import RMVPE
|
||||
from scipy.ndimage import zoom
|
||||
|
||||
from voice_changer.utils.VoiceChangerModel import AudioInOut
|
||||
|
||||
|
||||
class RMVPEPitchExtractor(PitchExtractor):
|
||||
|
||||
@ -13,8 +16,8 @@ class RMVPEPitchExtractor(PitchExtractor):
|
||||
self.pitchExtractorType: PitchExtractorType = "rmvpe"
|
||||
self.f0_min = 50
|
||||
self.f0_max = 1100
|
||||
self.sapmle_rate = 16000
|
||||
self.uv_interp = True
|
||||
self.input_sr = -1
|
||||
if torch.cuda.is_available() and gpu >= 0:
|
||||
self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
|
||||
else:
|
||||
@ -22,32 +25,24 @@ class RMVPEPitchExtractor(PitchExtractor):
|
||||
|
||||
self.rmvpe = RMVPE(model_path=file, is_half=False, device=self.device)
|
||||
|
||||
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
|
||||
start_frame = int(silence_front * self.sapmle_rate / window)
|
||||
real_silence_front = start_frame * window / self.sapmle_rate
|
||||
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
|
||||
if sr != self.input_sr:
|
||||
self.resamle = Resample(sr, 16000, dtype=torch.int16).to(self.device)
|
||||
self.input_sr = sr
|
||||
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
|
||||
audio_t = self.resamle(audio_t)
|
||||
hop_size = 160 # RMVPE固定
|
||||
|
||||
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
|
||||
silented_frames = int(audio.size(0) // window) + 1
|
||||
offset_frame_number = silence_front * 16000
|
||||
start_frame = int(offset_frame_number / hop_size) # frame
|
||||
real_silence_front = start_frame * hop_size / 16000 # 秒
|
||||
audio_t = audio_t[:, int(np.round(real_silence_front * 16000)):]
|
||||
|
||||
f0 = self.rmvpe.infer_from_audio_t(audio, thred=0.03)
|
||||
# f0, pd = torchcrepe.predict(
|
||||
# audio.unsqueeze(0),
|
||||
# self.sapmle_rate,
|
||||
# hop_length=window,
|
||||
# fmin=self.f0_min,
|
||||
# fmax=self.f0_max,
|
||||
# # model="tiny",
|
||||
# model="full",
|
||||
# batch_size=256,
|
||||
# decoder=torchcrepe.decode.weighted_argmax,
|
||||
# device=self.device,
|
||||
# return_periodicity=True,
|
||||
# )
|
||||
# f0 = torchcrepe.filter.median(f0, 3) # 本家だとmeanですが、harvestに合わせmedianフィルタ
|
||||
# pd = torchcrepe.filter.median(pd, 3)
|
||||
# f0[pd < 0.1] = 0
|
||||
# f0 = f0.squeeze()
|
||||
resize_factor = silented_frames / len(f0)
|
||||
f0 = self.rmvpe.infer_from_audio_t(audio_t.squeeze(), thred=0.03)
|
||||
|
||||
desired_hop_size = block_size * 16000 / model_sr
|
||||
desired_f0_length = int(audio_t.shape[1] // desired_hop_size) + 1
|
||||
resize_factor = desired_f0_length / len(f0)
|
||||
f0 = zoom(f0, resize_factor, order=0)
|
||||
|
||||
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
|
||||
|
@ -240,7 +240,6 @@ class E2E(nn.Module):
|
||||
)
|
||||
self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
|
||||
if n_gru:
|
||||
print("N_GRUE")
|
||||
self.fc = nn.Sequential(
|
||||
BiGRU(3 * 128, 256, n_gru),
|
||||
nn.Linear(512, 360),
|
||||
|
Loading…
Reference in New Issue
Block a user