Diffusion SVC:

pitch extractor sr is changed from fixed (16k) to the audio sample rate
w-okada 2023-07-17 21:03:53 +09:00
parent e8aeb1eaa7
commit 371e1b8cac
14 changed files with 4902 additions and 636 deletions
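
In short: the pitch extractors previously assumed a fixed 16 kHz input and a precomputed window size; after this commit they receive the device sample rate together with the inferencer's block size and sample rate, and derive the hop size per call. A minimal sketch of the signature change (parameter names follow the diff below; the class and body are illustrative only):

class _ExampleExtractor:
    # before (fixed 16 kHz): extract(self, audio, pitch, f0_up_key, window, silence_front=0)
    # after: the device sample rate is passed in and the hop size is derived per call
    def extract(self, audio, sr: int, block_size: int, model_sr: int,
                pitch, f0_up_key, silence_front=0):
        hop_size = block_size * sr / model_sr  # input samples per model frame
        ...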

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@ -21,21 +21,21 @@
"author": "wataru.okada@flect.co.jp",
"license": "ISC",
"devDependencies": {
"@babel/core": "^7.22.8",
"@babel/plugin-transform-runtime": "^7.22.7",
"@babel/preset-env": "^7.22.7",
"@babel/core": "^7.22.9",
"@babel/plugin-transform-runtime": "^7.22.9",
"@babel/preset-env": "^7.22.9",
"@babel/preset-react": "^7.22.5",
"@babel/preset-typescript": "^7.22.5",
"@types/node": "^20.4.1",
"@types/react": "^18.2.14",
"@types/react-dom": "^18.2.6",
"@types/node": "^20.4.2",
"@types/react": "^18.2.15",
"@types/react-dom": "^18.2.7",
"autoprefixer": "^10.4.14",
"babel-loader": "^9.1.3",
"copy-webpack-plugin": "^11.0.0",
"css-loader": "^6.8.1",
"eslint": "^8.44.0",
"eslint": "^8.45.0",
"eslint-config-prettier": "^8.8.0",
"eslint-plugin-prettier": "^4.2.1",
"eslint-plugin-prettier": "^5.0.0",
"eslint-plugin-react": "^7.32.2",
"eslint-webpack-plugin": "^4.0.1",
"html-loader": "^4.2.0",
@ -54,12 +54,13 @@
"webpack-dev-server": "^4.15.1"
},
"dependencies": {
"@dannadori/voice-changer-client-js": "^1.0.160",
"@dannadori/voice-changer-client-js": "^1.0.161",
"@fortawesome/fontawesome-svg-core": "^6.4.0",
"@fortawesome/free-brands-svg-icons": "^6.4.0",
"@fortawesome/free-regular-svg-icons": "^6.4.0",
"@fortawesome/free-solid-svg-icons": "^6.4.0",
"@fortawesome/react-fontawesome": "^0.2.0",
"protobufjs": "^7.2.4",
"react": "^18.2.0",
"react-dom": "^18.2.0"
}

File diff suppressed because it is too large


@ -1,6 +1,6 @@
{
"name": "@dannadori/voice-changer-client-js",
"version": "1.0.160",
"version": "1.0.161",
"description": "",
"main": "dist/index.js",
"directories": {
@ -27,15 +27,15 @@
"license": "ISC",
"devDependencies": {
"@types/audioworklet": "^0.0.48",
"@types/node": "^20.4.1",
"@types/react": "18.2.14",
"@types/react-dom": "18.2.6",
"eslint": "^8.44.0",
"@types/node": "^20.4.2",
"@types/react": "18.2.15",
"@types/react-dom": "18.2.7",
"eslint": "^8.45.0",
"eslint-config-prettier": "^8.8.0",
"eslint-plugin-prettier": "^4.2.1",
"eslint-plugin-react": "^7.25.3",
"eslint-plugin-prettier": "^5.0.0",
"eslint-plugin-react": "^7.32.2",
"eslint-webpack-plugin": "^4.0.1",
"npm-run-all": "^4.1.2",
"npm-run-all": "^4.1.5",
"prettier": "^3.0.0",
"raw-loader": "^4.0.2",
"rimraf": "^5.0.1",
@ -47,9 +47,10 @@
},
"dependencies": {
"@types/readable-stream": "^2.3.15",
"amazon-chime-sdk-js": "^2.7.0",
"amazon-chime-sdk-js": "^3.15.0",
"buffer": "^6.0.3",
"localforage": "^1.10.0",
"protobufjs": "^7.2.4",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"socket.io-client": "^4.7.1"


@ -156,6 +156,7 @@ class DiffusionSVC(VoiceChangerModel):
audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
sid,
audio,
self.inputSampleRate,
pitchf,
feature,
f0_up_key,


@ -104,6 +104,7 @@ class Pipeline(object):
self,
sid,
audio, # torch.tensor [n]
sr,
pitchf, # np.array [m]
feature, # np.array [m, feat]
f0_up_key,
@ -126,13 +127,23 @@ class Pipeline(object):
with Timer("pre-process") as t:
# pitch detection
try:
# pitch = self.pitchExtractor.extract(
# audio16k.squeeze(),
# pitchf,
# f0_up_key,
# int(self.hop_size), # processing window size (512 at 44100)
# silence_front=silence_front,
# )
pitch = self.pitchExtractor.extract(
audio16k.squeeze(),
audio,
sr,
self.inferencer_block_size,
self.inferencer_sampling_rate,
pitchf,
f0_up_key,
int(self.hop_size), # processing window size (512 at 44100)
silence_front=silence_front,
)
# def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
except IndexError as e: # NOQA
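
For concreteness, a small worked example of the hop-size and silence-front arithmetic the extractors now perform (the numbers are illustrative, not values taken from this commit):

block_size, model_sr = 512, 44100                 # example inferencer settings ("512 at 44100")
sr = 48000                                        # example device sample rate
silence_front = 0.5                               # seconds of leading audio to skip

hop_size = block_size * sr / model_sr             # ~557.28 input samples per frame
start_frame = int(silence_front * sr / hop_size)  # 43 whole frames of silence
real_silence_front = start_frame * hop_size / sr  # ~0.499 s actually trimmed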


@ -3,9 +3,9 @@ from const import PitchExtractorType
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
import onnxruntime
import torch
from voice_changer.RVC.pitchExtractor import onnxcrepe
from voice_changer.utils.VoiceChangerModel import AudioInOut
class CrepeOnnxPitchExtractor(PitchExtractor):
@ -26,18 +26,20 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
self.sapmle_rate = 16000
self.uv_interp = True
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
start_frame = int(silence_front * self.sapmle_rate / window)
real_silence_front = start_frame * window / self.sapmle_rate
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
hop_size = block_size * sr / model_sr
precision = (1000 * window / self.sapmle_rate)
offset_frame_number = silence_front * sr
start_frame = int(offset_frame_number / hop_size) # frame
real_silence_front = start_frame * hop_size / sr # seconds
audio = audio[int(np.round(real_silence_front * sr)):].astype(np.float32)
precision = (1000 * hop_size / sr)
audio_num = audio.cpu()
onnx_f0, onnx_pd = onnxcrepe.predict(
self.onnx_session,
audio_num,
self.sapmle_rate,
audio,
sr,
precision=precision,
fmin=self.f0_min,
fmax=self.f0_max,
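
Note that the precision handed to onnxcrepe.predict is the frame period in milliseconds, and by construction the device sample rate cancels out of it (512 and 44100 below are the illustrative "512 at 44100" settings from the pipeline comment):

# 1000 * hop_size / sr = 1000 * (block_size * sr / model_sr) / sr = 1000 * block_size / model_sr
precision = 1000 * 512 / 44100   # ~11.61 ms for a 512-sample hop at 44.1 kHz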


@ -3,6 +3,7 @@ import torch
import numpy as np
from const import PitchExtractorType
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.utils.VoiceChangerModel import AudioInOut
class CrepePitchExtractor(PitchExtractor):
@ -12,22 +13,25 @@ class CrepePitchExtractor(PitchExtractor):
self.pitchExtractorType: PitchExtractorType = "crepe"
self.f0_min = 50
self.f0_max = 1100
self.sapmle_rate = 16000
self.uv_interp = True
if torch.cuda.is_available():
self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
else:
self.device = torch.device("cpu")
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
start_frame = int(silence_front * self.sapmle_rate / window)
real_silence_front = start_frame * window / self.sapmle_rate
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
hop_size = block_size * sr / model_sr
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
offset_frame_number = silence_front * 16000
start_frame = int(offset_frame_number / hop_size) # frame
real_silence_front = start_frame * hop_size / 16000 # seconds
audio_t = audio_t[:, int(np.round(real_silence_front * 16000)):]
f0, pd = torchcrepe.predict(
audio.unsqueeze(0),
self.sapmle_rate,
hop_length=window,
audio_t,
sr,
hop_length=hop_size,
fmin=self.f0_min,
fmax=self.f0_max,
# model="tiny",


@ -1,9 +1,9 @@
import pyworld
import numpy as np
from const import PitchExtractorType
import torch
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.utils.VoiceChangerModel import AudioInOut
class DioPitchExtractor(PitchExtractor):
@ -13,25 +13,28 @@ class DioPitchExtractor(PitchExtractor):
self.pitchExtractorType: PitchExtractorType = "dio"
self.f0_min = 50
self.f0_max = 1100
self.sapmle_rate = 16000
# self.sapmle_rate = 44100
# self.sapmle_rate = 16000
self.uv_interp = True
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
audio = audio.detach().cpu().numpy()
silence_front = 0 # TODO: workaround for pitch becoming undetectable when the chunk size is small
start_frame = int(silence_front * self.sapmle_rate / window)
real_silence_front = start_frame * window / self.sapmle_rate
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
silence_front: int = 0. # TODO: workaround for pitch becoming undetectable when the chunk size is small
hop_size = block_size * sr / model_sr
offset_frame_number = silence_front * sr
start_frame = int(offset_frame_number / hop_size) # frame
real_silence_front = start_frame * hop_size / sr # seconds
audio = audio[int(np.round(real_silence_front * sr)):]
_f0, t = pyworld.dio(
audio.astype(np.double),
self.sapmle_rate,
sr,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
channels_in_octave=2,
frame_period=(1000 * window / self.sapmle_rate)
frame_period=(1000 * hop_size / sr)
)
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, self.sapmle_rate)
f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
f0 = pitch
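
A self-contained sketch of the same dio + stonemask flow at an arbitrary device sample rate (synthetic input; the 512/44100 block settings and the 48 kHz rate are illustrative assumptions):

import numpy as np
import pyworld

sr = 48000
t = np.arange(sr) / sr
audio = 0.5 * np.sin(2 * np.pi * 220 * t)        # one second of a 220 Hz tone

hop_size = 512 * sr / 44100                      # same formula as in the extractor
_f0, times = pyworld.dio(audio.astype(np.double), sr,
                         f0_floor=50, f0_ceil=1100,
                         channels_in_octave=2,
                         frame_period=1000 * hop_size / sr)
f0 = pyworld.stonemask(audio.astype(np.double), _f0, times, sr)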


@ -1,9 +1,9 @@
import pyworld
import numpy as np
from const import PitchExtractorType
import torch
from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
from voice_changer.utils.VoiceChangerModel import AudioInOut
class HarvestPitchExtractor(PitchExtractor):
@ -13,20 +13,22 @@ class HarvestPitchExtractor(PitchExtractor):
self.pitchExtractorType: PitchExtractorType = "harvest"
self.f0_min = 50
self.f0_max = 1100
self.sapmle_rate = 16000
self.uv_interp = True
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
audio = audio.detach().cpu().numpy()
start_frame = int(silence_front * self.sapmle_rate / window)
real_silence_front = start_frame * window / self.sapmle_rate
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
hop_size = block_size * sr / model_sr
offset_frame_number = silence_front * sr
start_frame = int(offset_frame_number / hop_size) # frame
real_silence_front = start_frame * hop_size / sr # seconds
audio = audio[int(np.round(real_silence_front * sr)):]
f0, _ = pyworld.harvest(
audio.astype('double'),
self.sapmle_rate,
sr,
f0_floor=self.f0_min,
f0_ceil=self.f0_max,
frame_period=(1000 * window / self.sapmle_rate))
frame_period=(1000 * hop_size / sr))
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
f0 = pitch


@ -1,9 +1,11 @@
from typing import Protocol
from voice_changer.utils.VoiceChangerModel import AudioInOut
class PitchExtractor(Protocol):
def extract(self, audio, f0_up_key, sr, window, silence_front=0):
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
...
def getPitchExtractorInfo(self):
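
Since PitchExtractor is a typing.Protocol, concrete extractors only need to match this signature structurally; a minimal, hypothetical stub that would satisfy it (not part of the repository):

import numpy as np

class ConstantPitchExtractor:   # illustrative example only
    def extract(self, audio, sr: int, block_size: int, model_sr: int,
                pitch, f0_up_key, silence_front=0):
        hop_size = block_size * sr / model_sr
        n_frames = int(len(audio) // hop_size) + 1
        return np.full(n_frames, 440.0)          # pretend everything is 440 Hz

    def getPitchExtractorInfo(self):
        return {"pitchExtractorType": "constant-example"}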


@ -1,3 +1,4 @@
from torchaudio.transforms import Resample
import torch
import numpy as np
from const import PitchExtractorType
@ -5,6 +6,8 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtrac
from voice_changer.DiffusionSVC.pitchExtractor.rmvpe.rmvpe import RMVPE
from scipy.ndimage import zoom
from voice_changer.utils.VoiceChangerModel import AudioInOut
class RMVPEPitchExtractor(PitchExtractor):
@ -13,8 +16,8 @@ class RMVPEPitchExtractor(PitchExtractor):
self.pitchExtractorType: PitchExtractorType = "rmvpe"
self.f0_min = 50
self.f0_max = 1100
self.sapmle_rate = 16000
self.uv_interp = True
self.input_sr = -1
if torch.cuda.is_available() and gpu >= 0:
self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
else:
@ -22,32 +25,24 @@ class RMVPEPitchExtractor(PitchExtractor):
self.rmvpe = RMVPE(model_path=file, is_half=False, device=self.device)
def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
start_frame = int(silence_front * self.sapmle_rate / window)
real_silence_front = start_frame * window / self.sapmle_rate
def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
if sr != self.input_sr:
self.resamle = Resample(sr, 16000, dtype=torch.int16).to(self.device)
self.input_sr = sr
audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
audio_t = self.resamle(audio_t)
hop_size = 160 # fixed by RMVPE
audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
silented_frames = int(audio.size(0) // window) + 1
offset_frame_number = silence_front * 16000
start_frame = int(offset_frame_number / hop_size) # frame
real_silence_front = start_frame * hop_size / 16000 # seconds
audio_t = audio_t[:, int(np.round(real_silence_front * 16000)):]
f0 = self.rmvpe.infer_from_audio_t(audio, thred=0.03)
# f0, pd = torchcrepe.predict(
# audio.unsqueeze(0),
# self.sapmle_rate,
# hop_length=window,
# fmin=self.f0_min,
# fmax=self.f0_max,
# # model="tiny",
# model="full",
# batch_size=256,
# decoder=torchcrepe.decode.weighted_argmax,
# device=self.device,
# return_periodicity=True,
# )
# f0 = torchcrepe.filter.median(f0, 3) # the original uses mean, but a median filter to match harvest
# pd = torchcrepe.filter.median(pd, 3)
# f0[pd < 0.1] = 0
# f0 = f0.squeeze()
resize_factor = silented_frames / len(f0)
f0 = self.rmvpe.infer_from_audio_t(audio_t.squeeze(), thred=0.03)
desired_hop_size = block_size * 16000 / model_sr
desired_f0_length = int(audio_t.shape[1] // desired_hop_size) + 1
resize_factor = desired_f0_length / len(f0)
f0 = zoom(f0, resize_factor, order=0)
pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
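
The length-matching step at the end can be read as: RMVPE emits one f0 value per 160 samples of the resampled 16 kHz signal, while the inferencer expects one value per block_size samples at model_sr; a nearest-neighbour zoom stretches the curve from one frame grid to the other. A sketch under those assumptions (the helper name and arguments are hypothetical):

import numpy as np
from scipy.ndimage import zoom

def match_f0_length(f0: np.ndarray, n_samples_16k: int,
                    block_size: int, model_sr: int) -> np.ndarray:
    # one inferencer frame corresponds to block_size * 16000 / model_sr samples at 16 kHz
    desired_hop = block_size * 16000 / model_sr
    desired_len = int(n_samples_16k // desired_hop) + 1
    return zoom(f0, desired_len / len(f0), order=0)  # nearest-neighbour resize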


@ -240,7 +240,6 @@ class E2E(nn.Module):
)
self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
if n_gru:
print("N_GRUE")
self.fc = nn.Sequential(
BiGRU(3 * 128, 256, n_gru),
nn.Linear(512, 360),