Diffusion SVC:

pitch extractor sr is changed from fixed (16k) to the audio sample rate
w-okada 2023-07-17 21:03:53 +09:00
parent e8aeb1eaa7
commit 371e1b8cac
14 changed files with 4902 additions and 636 deletions
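
The change in a nutshell: the DiffusionSVC pitch extractors previously assumed a fixed 16 kHz input and received a precomputed window (hop) size; after this commit the caller passes the actual input sample rate plus the inferencer's block size and sample rate, and each extractor derives its own hop size. A minimal sketch of that arithmetic, not part of the commit (the numbers are illustrative):

    # Hop-size arithmetic introduced by this commit (illustrative values only).
    block_size = 512   # inferencer hop size, defined at model_sr
    model_sr = 44100   # sample rate the model was trained at
    sr = 48000         # actual input sample rate

    # One model frame spans block_size / model_sr seconds; at the input rate
    # that is block_size * sr / model_sr samples.
    hop_size = block_size * sr / model_sr
    print(hop_size)    # ~557.28 samples per f0 frame at 48 kHz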

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large


@@ -21,21 +21,21 @@
     "author": "wataru.okada@flect.co.jp",
     "license": "ISC",
     "devDependencies": {
-        "@babel/core": "^7.22.8",
-        "@babel/plugin-transform-runtime": "^7.22.7",
-        "@babel/preset-env": "^7.22.7",
+        "@babel/core": "^7.22.9",
+        "@babel/plugin-transform-runtime": "^7.22.9",
+        "@babel/preset-env": "^7.22.9",
         "@babel/preset-react": "^7.22.5",
         "@babel/preset-typescript": "^7.22.5",
-        "@types/node": "^20.4.1",
-        "@types/react": "^18.2.14",
-        "@types/react-dom": "^18.2.6",
+        "@types/node": "^20.4.2",
+        "@types/react": "^18.2.15",
+        "@types/react-dom": "^18.2.7",
         "autoprefixer": "^10.4.14",
         "babel-loader": "^9.1.3",
         "copy-webpack-plugin": "^11.0.0",
         "css-loader": "^6.8.1",
-        "eslint": "^8.44.0",
+        "eslint": "^8.45.0",
         "eslint-config-prettier": "^8.8.0",
-        "eslint-plugin-prettier": "^4.2.1",
+        "eslint-plugin-prettier": "^5.0.0",
         "eslint-plugin-react": "^7.32.2",
         "eslint-webpack-plugin": "^4.0.1",
         "html-loader": "^4.2.0",
@@ -54,12 +54,13 @@
         "webpack-dev-server": "^4.15.1"
     },
     "dependencies": {
-        "@dannadori/voice-changer-client-js": "^1.0.160",
+        "@dannadori/voice-changer-client-js": "^1.0.161",
         "@fortawesome/fontawesome-svg-core": "^6.4.0",
         "@fortawesome/free-brands-svg-icons": "^6.4.0",
         "@fortawesome/free-regular-svg-icons": "^6.4.0",
         "@fortawesome/free-solid-svg-icons": "^6.4.0",
         "@fortawesome/react-fontawesome": "^0.2.0",
+        "protobufjs": "^7.2.4",
         "react": "^18.2.0",
         "react-dom": "^18.2.0"
     }

File diff suppressed because it is too large


@@ -1,6 +1,6 @@
 {
     "name": "@dannadori/voice-changer-client-js",
-    "version": "1.0.160",
+    "version": "1.0.161",
     "description": "",
     "main": "dist/index.js",
     "directories": {
@@ -27,15 +27,15 @@
     "license": "ISC",
     "devDependencies": {
         "@types/audioworklet": "^0.0.48",
-        "@types/node": "^20.4.1",
-        "@types/react": "18.2.14",
-        "@types/react-dom": "18.2.6",
-        "eslint": "^8.44.0",
+        "@types/node": "^20.4.2",
+        "@types/react": "18.2.15",
+        "@types/react-dom": "18.2.7",
+        "eslint": "^8.45.0",
         "eslint-config-prettier": "^8.8.0",
-        "eslint-plugin-prettier": "^4.2.1",
-        "eslint-plugin-react": "^7.25.3",
+        "eslint-plugin-prettier": "^5.0.0",
+        "eslint-plugin-react": "^7.32.2",
         "eslint-webpack-plugin": "^4.0.1",
-        "npm-run-all": "^4.1.2",
+        "npm-run-all": "^4.1.5",
         "prettier": "^3.0.0",
         "raw-loader": "^4.0.2",
         "rimraf": "^5.0.1",
@@ -47,9 +47,10 @@
     },
     "dependencies": {
         "@types/readable-stream": "^2.3.15",
-        "amazon-chime-sdk-js": "^2.7.0",
+        "amazon-chime-sdk-js": "^3.15.0",
         "buffer": "^6.0.3",
         "localforage": "^1.10.0",
+        "protobufjs": "^7.2.4",
         "react": "^18.2.0",
         "react-dom": "^18.2.0",
         "socket.io-client": "^4.7.1"


@@ -156,6 +156,7 @@ class DiffusionSVC(VoiceChangerModel):
         audio_out, self.pitchf_buffer, self.feature_buffer = self.pipeline.exec(
             sid,
             audio,
+            self.inputSampleRate,
             pitchf,
             feature,
             f0_up_key,


@@ -104,6 +104,7 @@ class Pipeline(object):
         self,
         sid,
         audio,  # torch.tensor [n]
+        sr,
         pitchf,  # np.array [m]
         feature,  # np.array [m, feat]
         f0_up_key,
@@ -126,13 +127,23 @@
         with Timer("pre-process") as t:
             # pitch detection
             try:
+                # pitch = self.pitchExtractor.extract(
+                #     audio16k.squeeze(),
+                #     pitchf,
+                #     f0_up_key,
+                #     int(self.hop_size),  # processing window size (512 at 44100)
+                #     silence_front=silence_front,
+                # )
                 pitch = self.pitchExtractor.extract(
-                    audio16k.squeeze(),
+                    audio,
+                    sr,
+                    self.inferencer_block_size,
+                    self.inferencer_sampling_rate,
                     pitchf,
                     f0_up_key,
-                    int(self.hop_size),  # processing window size (512 at 44100)
                     silence_front=silence_front,
                 )
+                # def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
                 pitch = torch.tensor(pitch[-n_frames:], device=self.device).unsqueeze(0).long()
             except IndexError as e:  # NOQA
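
Every extractor below repeats the same leading-silence trim: skip a whole number of hops so the f0 frames stay aligned with the pipeline's frame grid. A sketch of that shared pattern, with a hypothetical helper name (not from the repo):

    import numpy as np

    def trim_silence_front(audio: np.ndarray, sr: int, hop_size: float, silence_front: float) -> np.ndarray:
        # silence_front is seconds of known leading silence; round the skip
        # down to whole hops so the resulting f0 frames stay grid-aligned.
        offset_frame_number = silence_front * sr
        start_frame = int(offset_frame_number / hop_size)  # whole frames to skip
        real_silence_front = start_frame * hop_size / sr   # seconds actually skipped
        return audio[int(np.round(real_silence_front * sr)):]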


@@ -3,9 +3,9 @@ from const import PitchExtractorType
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
 from voice_changer.RVC.deviceManager.DeviceManager import DeviceManager
 import onnxruntime
-import torch
 from voice_changer.RVC.pitchExtractor import onnxcrepe
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class CrepeOnnxPitchExtractor(PitchExtractor):
@@ -26,18 +26,20 @@ class CrepeOnnxPitchExtractor(PitchExtractor):
         self.sapmle_rate = 16000
         self.uv_interp = True
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
-        precision = (1000 * window / self.sapmle_rate)
-        audio_num = audio.cpu()
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        hop_size = block_size * sr / model_sr
+
+        offset_frame_number = silence_front * sr
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / sr  # seconds
+        audio = audio[int(np.round(real_silence_front * sr)):].astype(np.float32)
+        precision = (1000 * hop_size / sr)
+
         onnx_f0, onnx_pd = onnxcrepe.predict(
             self.onnx_session,
-            audio_num,
-            self.sapmle_rate,
+            audio,
+            sr,
             precision=precision,
             fmin=self.f0_min,
             fmax=self.f0_max,
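
onnxcrepe's precision argument expresses the hop in milliseconds, which is why the extractor converts hop_size back through the input rate. A quick check with the illustrative numbers from the sketch above (not from the repo):

    sr = 48000
    hop_size = 512 * sr / 44100        # ~557.28 samples at the input rate
    precision = 1000 * hop_size / sr   # ~11.61 ms per f0 frame
    # The frame spacing in seconds (hop_size / sr) does not depend on sr, so
    # the extractor yields the same frame count whatever the input rate.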


@@ -3,6 +3,7 @@ import torch
 import numpy as np
 from const import PitchExtractorType
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class CrepePitchExtractor(PitchExtractor):
@@ -12,22 +13,25 @@ class CrepePitchExtractor(PitchExtractor):
         self.pitchExtractorType: PitchExtractorType = "crepe"
         self.f0_min = 50
         self.f0_max = 1100
-        self.sapmle_rate = 16000
         self.uv_interp = True
         if torch.cuda.is_available():
             self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
         else:
             self.device = torch.device("cpu")
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        hop_size = block_size * sr / model_sr
+        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
+
+        offset_frame_number = silence_front * 16000
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / 16000  # seconds
+        audio_t = audio_t[:, int(np.round(real_silence_front * 16000)):]
 
         f0, pd = torchcrepe.predict(
-            audio.unsqueeze(0),
-            self.sapmle_rate,
-            hop_length=window,
+            audio_t,
+            sr,
+            hop_length=hop_size,
             fmin=self.f0_min,
             fmax=self.f0_max,
             # model="tiny",


@@ -1,9 +1,9 @@
 import pyworld
 import numpy as np
 from const import PitchExtractorType
-import torch
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class DioPitchExtractor(PitchExtractor):
@@ -13,25 +13,28 @@ class DioPitchExtractor(PitchExtractor):
         self.pitchExtractorType: PitchExtractorType = "dio"
         self.f0_min = 50
         self.f0_max = 1100
-        self.sapmle_rate = 16000
+        # self.sapmle_rate = 44100
+        # self.sapmle_rate = 16000
         self.uv_interp = True
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        audio = audio.detach().cpu().numpy()
-        silence_front = 0  # TODO: workaround for pitch tracking failing when the chunk size is small
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        silence_front: int = 0.  # TODO: workaround for pitch tracking failing when the chunk size is small
+        hop_size = block_size * sr / model_sr
+
+        offset_frame_number = silence_front * sr
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / sr  # seconds
+        audio = audio[int(np.round(real_silence_front * sr)):]
 
         _f0, t = pyworld.dio(
             audio.astype(np.double),
-            self.sapmle_rate,
+            sr,
             f0_floor=self.f0_min,
             f0_ceil=self.f0_max,
             channels_in_octave=2,
-            frame_period=(1000 * window / self.sapmle_rate)
+            frame_period=(1000 * hop_size / sr)
         )
-        f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, self.sapmle_rate)
+        f0 = pyworld.stonemask(audio.astype(np.double), _f0, t, sr)
         pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
         f0 = pitch
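
pyworld takes the hop as frame_period in milliseconds and wants float64 audio, which is all the sample-rate generalization needs here. A standalone sketch of the dio + stonemask pair under those assumptions (placeholder audio, illustrative numbers):

    import numpy as np
    import pyworld

    sr = 48000
    audio = np.random.randn(sr).astype(np.double)  # 1 s of placeholder audio
    hop_size = 512 * sr / 44100                    # hop in samples at the input rate
    frame_period = 1000 * hop_size / sr            # pyworld wants milliseconds

    _f0, t = pyworld.dio(audio, sr, f0_floor=50, f0_ceil=1100,
                         channels_in_octave=2, frame_period=frame_period)
    f0 = pyworld.stonemask(audio, _f0, t, sr)      # refine the coarse dio track
    print(len(f0))  # about 1000 ms / 11.61 ms, i.e. ~87 frames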


@@ -1,9 +1,9 @@
 import pyworld
 import numpy as np
 from const import PitchExtractorType
-import torch
 from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtractor
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class HarvestPitchExtractor(PitchExtractor):
@@ -13,20 +13,22 @@ class HarvestPitchExtractor(PitchExtractor):
         self.pitchExtractorType: PitchExtractorType = "harvest"
         self.f0_min = 50
         self.f0_max = 1100
-        self.sapmle_rate = 16000
         self.uv_interp = True
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        audio = audio.detach().cpu().numpy()
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        hop_size = block_size * sr / model_sr
+
+        offset_frame_number = silence_front * sr
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / sr  # seconds
+        audio = audio[int(np.round(real_silence_front * sr)):]
 
         f0, _ = pyworld.harvest(
             audio.astype('double'),
-            self.sapmle_rate,
+            sr,
             f0_floor=self.f0_min,
             f0_ceil=self.f0_max,
-            frame_period=(1000 * window / self.sapmle_rate))
+            frame_period=(1000 * hop_size / sr))
         pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
         f0 = pitch


@@ -1,9 +1,11 @@
 from typing import Protocol
+
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class PitchExtractor(Protocol):
 
-    def extract(self, audio, f0_up_key, sr, window, silence_front=0):
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
         ...
 
     def getPitchExtractorInfo(self):
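
Since PitchExtractor is a typing.Protocol, any class with a structurally matching extract satisfies it. A hypothetical minimal implementation against the new signature (the class name and the flat 220 Hz track are purely illustrative, not from the repo):

    import numpy as np

    class ConstantPitchExtractor:
        # Hypothetical extractor: emits a flat 220 Hz track shifted by f0_up_key semitones.
        def extract(self, audio, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
            hop_size = block_size * sr / model_sr
            n_frames = int(len(audio) / hop_size) + 1
            f0 = np.full(n_frames, 220.0) * 2 ** (f0_up_key / 12)  # semitone shift
            pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]  # same tail-write as the real extractors
            return pitch

        def getPitchExtractorInfo(self):
            return {"pitchExtractorType": "constant"}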


@@ -1,3 +1,4 @@
+from torchaudio.transforms import Resample
 import torch
 import numpy as np
 from const import PitchExtractorType
@@ -5,6 +6,8 @@ from voice_changer.DiffusionSVC.pitchExtractor.PitchExtractor import PitchExtrac
 from voice_changer.DiffusionSVC.pitchExtractor.rmvpe.rmvpe import RMVPE
 from scipy.ndimage import zoom
+
+from voice_changer.utils.VoiceChangerModel import AudioInOut
 
 
 class RMVPEPitchExtractor(PitchExtractor):
@@ -13,8 +16,8 @@ class RMVPEPitchExtractor(PitchExtractor):
         self.pitchExtractorType: PitchExtractorType = "rmvpe"
         self.f0_min = 50
         self.f0_max = 1100
-        self.sapmle_rate = 16000
         self.uv_interp = True
+        self.input_sr = -1
         if torch.cuda.is_available() and gpu >= 0:
             self.device = torch.device("cuda:" + str(torch.cuda.current_device()))
         else:
@@ -22,32 +25,24 @@ class RMVPEPitchExtractor(PitchExtractor):
         self.rmvpe = RMVPE(model_path=file, is_half=False, device=self.device)
 
-    def extract(self, audio: torch.Tensor, pitch, f0_up_key, window, silence_front=0):
-        start_frame = int(silence_front * self.sapmle_rate / window)
-        real_silence_front = start_frame * window / self.sapmle_rate
-        audio = audio[int(np.round(real_silence_front * self.sapmle_rate)):]
-        silented_frames = int(audio.size(0) // window) + 1
-        f0 = self.rmvpe.infer_from_audio_t(audio, thred=0.03)
-        # f0, pd = torchcrepe.predict(
-        #     audio.unsqueeze(0),
-        #     self.sapmle_rate,
-        #     hop_length=window,
-        #     fmin=self.f0_min,
-        #     fmax=self.f0_max,
-        #     # model="tiny",
-        #     model="full",
-        #     batch_size=256,
-        #     decoder=torchcrepe.decode.weighted_argmax,
-        #     device=self.device,
-        #     return_periodicity=True,
-        # )
-        # f0 = torchcrepe.filter.median(f0, 3)  # upstream uses mean, but a median filter to match harvest
-        # pd = torchcrepe.filter.median(pd, 3)
-        # f0[pd < 0.1] = 0
-        # f0 = f0.squeeze()
-        resize_factor = silented_frames / len(f0)
+    def extract(self, audio: AudioInOut, sr: int, block_size: int, model_sr: int, pitch, f0_up_key, silence_front=0):
+        if sr != self.input_sr:
+            self.resamle = Resample(sr, 16000, dtype=torch.int16).to(self.device)
+            self.input_sr = sr
+        audio_t = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
+        audio_t = self.resamle(audio_t)
+        hop_size = 160  # fixed for RMVPE
+
+        offset_frame_number = silence_front * 16000
+        start_frame = int(offset_frame_number / hop_size)  # frame
+        real_silence_front = start_frame * hop_size / 16000  # seconds
+        audio_t = audio_t[:, int(np.round(real_silence_front * 16000)):]
+
+        f0 = self.rmvpe.infer_from_audio_t(audio_t.squeeze(), thred=0.03)
+
+        desired_hop_size = block_size * 16000 / model_sr
+        desired_f0_length = int(audio_t.shape[1] // desired_hop_size) + 1
+        resize_factor = desired_f0_length / len(f0)
         f0 = zoom(f0, resize_factor, order=0)
 
         pitch[-f0.shape[0]:] = f0[:pitch.shape[0]]
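
RMVPE always runs at 16 kHz with a fixed 160-sample (10 ms) hop, so instead of re-deriving hop_size this extractor resamples the input and then stretches the f0 track onto the pipeline's frame grid with nearest-neighbour zoom. A standalone sketch of that resize step, with placeholder data (not from the repo):

    import numpy as np
    from scipy.ndimage import zoom

    f0_16k = np.random.rand(200)   # placeholder: 200 RMVPE frames (~2 s of audio)
    n_samples_16k = 200 * 160      # matching length of the 16 kHz audio

    block_size, model_sr = 512, 44100
    desired_hop_size = block_size * 16000 / model_sr           # pipeline hop at 16 kHz
    desired_f0_length = int(n_samples_16k // desired_hop_size) + 1

    # order=0 -> nearest-neighbour: repeats/drops frames rather than interpolating f0
    f0 = zoom(f0_16k, desired_f0_length / len(f0_16k), order=0)
    print(len(f0), desired_f0_length)  # lengths agree up to rounding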


@@ -240,7 +240,6 @@ class E2E(nn.Module):
         )
         self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
         if n_gru:
-            print("N_GRUE")
             self.fc = nn.Sequential(
                 BiGRU(3 * 128, 256, n_gru),
                 nn.Linear(512, 360),