refactoring

wataru 2023-01-06 01:37:29 +09:00
parent f85842d984
commit 32e21b1a7a
997 changed files with 3801 additions and 31571 deletions

2
.gitignore vendored

@@ -11,4 +11,4 @@ server/memo.txt
client/lib/dist
client/lib/worklet/dist
client/demo/dist/
# client/demo/dist/ # keep for the demo


0
frontend/dist/favicon.ico → client/demo/dist/favicon.ico vendored Executable file → Normal file

1
client/demo/dist/index.html vendored Normal file

@@ -0,0 +1 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>

2
client/demo/dist/index.js vendored Normal file

File diff suppressed because one or more lines are too long

31
client/demo/dist/index.js.LICENSE.txt vendored Normal file

@@ -0,0 +1,31 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */
/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

File diff suppressed because it is too large


@@ -20,15 +20,15 @@
"@babel/preset-env": "^7.20.2",
"@babel/preset-react": "^7.18.6",
"@babel/preset-typescript": "^7.18.6",
"@types/node": "^18.11.17",
"@types/node": "^18.11.18",
"@types/react": "^18.0.26",
"@types/react-dom": "^18.0.9",
"@types/react-dom": "^18.0.10",
"autoprefixer": "^10.4.13",
"babel-loader": "^9.1.0",
"babel-loader": "^9.1.2",
"copy-webpack-plugin": "^11.0.0",
"css-loader": "^6.7.3",
"eslint": "^8.30.0",
"eslint-config-prettier": "^8.5.0",
"eslint": "^8.31.0",
"eslint-config-prettier": "^8.6.0",
"eslint-plugin-prettier": "^4.2.1",
"eslint-plugin-react": "^7.31.11",
"eslint-webpack-plugin": "^3.2.0",
@@ -41,14 +41,14 @@
"rimraf": "^3.0.2",
"style-loader": "^3.3.1",
"ts-loader": "^9.4.2",
"tsconfig-paths": "^4.1.1",
"tsconfig-paths": "^4.1.2",
"typescript": "^4.9.4",
"webpack": "^5.75.0",
"webpack-cli": "^5.0.1",
"webpack-dev-server": "^4.11.1"
},
"dependencies": {
"@dannadori/voice-changer-client-js": "file:../lib",
"@dannadori/voice-changer-client-js": "^1.0.3",
"react": "^18.2.0",
"react-dom": "^18.2.0"
}


@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-file-text"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"></path><polyline points="14 2 14 8 20 8"></polyline><line x1="16" y1="13" x2="8" y2="13"></line><line x1="16" y1="17" x2="8" y2="17"></line><polyline points="10 9 9 9 8 9"></polyline></svg>




@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-github"><path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"></path></svg>



@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-help-circle"><circle cx="12" cy="12" r="10"></circle><path d="M9.09 9a3 3 0 0 1 5.83 1c0 2-3 3-3 3"></path><line x1="12" y1="17" x2="12.01" y2="17"></line></svg>



@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-home"><path d="M3 9l9-7 9 7v11a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2z"></path><polyline points="9 22 9 12 15 12 15 22"></polyline></svg>



@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-linkedin"><path d="M16 8a6 6 0 0 1 6 6v7h-4v-7a2 2 0 0 0-2-2 2 2 0 0 0-2 2v7h-4v-7a6 6 0 0 1 6-6z"></path><rect x="2" y="9" width="4" height="12"></rect><circle cx="4" cy="4" r="2"></circle></svg>



@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-twitter"><path d="M23 3a10.9 10.9 0 0 1-3.14 1.53 4.48 4.48 0 0 0-7.86 3v1A10.66 10.66 0 0 1 3 4s-4 9 5 13a11.64 11.64 0 0 1-7 2c9 5 20 0 20-11.5a4.5 4.5 0 0 0-.08-.83A7.72 7.72 0 0 0 23 3z"></path></svg>



BIN
client/demo/public/favicon.ico Executable file



@@ -41,5 +41,11 @@ module.exports = {
template: path.resolve(__dirname, "public/index.html"),
filename: "./index.html",
}),
new CopyPlugin({
patterns: [{ from: "public/assets", to: "asset" }],
}),
new CopyPlugin({
patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
}),
]
};

5
client/lib/.npmignore Normal file

@@ -0,0 +1,5 @@
src
worklet
node_modules
webpack.*
tsconfig.*


@@ -1,12 +1,12 @@
{
"name": "@dannadori/voice-changer-client-js",
"version": "1.0.0",
"version": "1.0.3",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "@dannadori/voice-changer-client-js",
"version": "1.0.0",
"version": "1.0.3",
"license": "ISC",
"dependencies": {
"@types/readable-stream": "^2.3.15",


@@ -1,6 +1,6 @@
{
"name": "@dannadori/voice-changer-client-js",
"version": "1.0.0",
"version": "1.0.3",
"description": "",
"main": "dist/index.js",
"directories": {
@@ -17,6 +17,7 @@
"webpack:prod": "webpack --config webpack.prod.js",
"build:dev": "npm-run-all build:worklet:dev clean webpack:dev",
"build:prod": "npm-run-all build:worklet:prod clean webpack:prod",
"release": "npm version patch && npm publish --access=public",
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],


@@ -1,19 +0,0 @@
Cython==0.29.32
fastapi==0.88.0
librosa==0.9.2
numpy==1.23.5
phonemizer==3.2.1
psutil==5.9.4
pyopenjtalk==0.3.0
pyOpenSSL==22.1.0
python-multipart==0.0.5
python-socketio==5.7.2
retry==0.9.2
scipy==1.9.3
tensorboard==2.11.0
torch==1.13.0
torchaudio==0.13.0
tqdm==4.64.1
Unidecode==1.3.6
uvicorn==0.20.0
websockets==10.4


@@ -1,516 +0,0 @@
import sys, os, struct, argparse, logging, shutil, base64, traceback
from dataclasses import dataclass
from datetime import datetime
from distutils.util import strtobool
import numpy as np
from scipy.io.wavfile import write, read
sys.path.append("MMVC_Trainer")
sys.path.append("MMVC_Trainer/text")
from fastapi.routing import APIRoute
from fastapi import HTTPException, Request, Response, FastAPI, UploadFile, File, Form
from fastapi.staticfiles import StaticFiles
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import socketio
from pydantic import BaseModel
from typing import Callable
from mods.Trainer_Speakers import mod_get_speakers
from mods.Trainer_Training import mod_post_pre_training, mod_post_start_training, mod_post_stop_training, mod_get_related_files, mod_get_tail_training_log
from mods.Trainer_Model import mod_get_model, mod_delete_model
from mods.Trainer_Models import mod_get_models
from mods.Trainer_MultiSpeakerSetting import mod_get_multi_speaker_setting, mod_post_multi_speaker_setting
from mods.Trainer_Speaker_Voice import mod_get_speaker_voice
from mods.Trainer_Speaker_Voices import mod_get_speaker_voices
from mods.Trainer_Speaker import mod_delete_speaker
from mods.FileUploader import upload_file, concat_file_chunks
from mods.VoiceChanger import VoiceChanger
from mods.ssl import create_self_signed_cert
# File Uploader
# Trainer Rest Internal
class UvicornSuppressFilter(logging.Filter):
def filter(self, record):
return False
logger = logging.getLogger("uvicorn.error")
logger.addFilter(UvicornSuppressFilter())
# logger.propagate = False
logger = logging.getLogger("multipart.multipart")
logger.propagate = False
@dataclass
class ExApplicationInfo():
external_tensorboard_port: int
exApplitionInfo = ExApplicationInfo(external_tensorboard_port=0)
class VoiceModel(BaseModel):
gpu: int
srcId: int
dstId: int
timestamp: int
prefixChunkSize: int
buffer: str
class MyCustomNamespace(socketio.AsyncNamespace):
def __init__(self, namespace):
super().__init__(namespace)
def loadModel(self, config, model):
if hasattr(self, 'voiceChanger') == True:
self.voiceChanger.destroy()
self.voiceChanger = VoiceChanger(config, model)
# def loadWhisperModel(self, model):
# self.whisper = Whisper()
# self.whisper.loadModel("tiny")
# print("load")
def changeVoice(self, gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData):
# if hasattr(self, 'whisper') == True:
# self.whisper.addData(unpackedData)
if hasattr(self, 'voiceChanger') == True:
return self.voiceChanger.on_request(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
else:
print("Voice Changer is not loaded. Did you load the correct model?")
return np.zeros(1).astype(np.int16)
# def transcribe(self):
# if hasattr(self, 'whisper') == True:
# self.whisper.transcribe(0)
# else:
# print("whisper not found")
def on_connect(self, sid, environ):
# print('[{}] connet sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S') , sid))
pass
async def on_request_message(self, sid, msg):
# print("on_request_message", torch.cuda.memory_allocated())
gpu = int(msg[0])
srcId = int(msg[1])
dstId = int(msg[2])
timestamp = int(msg[3])
prefixChunkSize = int(msg[4])
data = msg[5]
# print(srcId, dstId, timestamp)
unpackedData = np.array(struct.unpack(
'<%sh' % (len(data) // struct.calcsize('<h')), data))
audio1 = self.changeVoice(
gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
bin = struct.pack('<%sh' % len(audio1), *audio1)
await self.emit('response', [timestamp, bin])
def on_disconnect(self, sid):
# print('[{}] disconnect'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
pass
def setupArgParser():
parser = argparse.ArgumentParser()
parser.add_argument("-t", type=str, default="MMVC",
help="Server type. MMVC|TRAIN")
parser.add_argument("-p", type=int, default=8080, help="port")
parser.add_argument("-c", type=str, help="path for the config.json")
parser.add_argument("-m", type=str, help="path for the model file")
parser.add_argument("--https", type=strtobool,
default=False, help="use https")
parser.add_argument("--httpsKey", type=str,
default="ssl.key", help="path for the key of https")
parser.add_argument("--httpsCert", type=str,
default="ssl.cert", help="path for the cert of https")
parser.add_argument("--httpsSelfSigned", type=strtobool,
default=True, help="generate self-signed certificate")
parser.add_argument("--colab", type=strtobool,
default=False, help="run on colab")
return parser
def printMessage(message, level=0):
if level == 0:
print(f"\033[17m{message}\033[0m")
elif level == 1:
print(f"\033[34m {message}\033[0m")
elif level == 2:
print(f"\033[32m {message}\033[0m")
else:
print(f"\033[47m {message}\033[0m")
global app_socketio
global app_fastapi
parser = setupArgParser()
args = parser.parse_args()
printMessage(f"Phase name:{__name__}", level=2)
thisFilename = os.path.basename(__file__)[:-3]
class ValidationErrorLoggingRoute(APIRoute):
def get_route_handler(self) -> Callable:
original_route_handler = super().get_route_handler()
async def custom_route_handler(request: Request) -> Response:
try:
return await original_route_handler(request)
except Exception as exc:
print("Exception", request.url, str(exc))
body = await request.body()
detail = {"errors": exc.errors(), "body": body.decode()}
raise HTTPException(status_code=422, detail=detail)
return custom_route_handler
if __name__ == thisFilename or args.colab == True:
printMessage(f"PHASE3:{__name__}", level=2)
TYPE = args.t
PORT = args.p
CONFIG = args.c
MODEL = args.m
if os.getenv("EX_TB_PORT"):
EX_TB_PORT = os.environ["EX_TB_PORT"]
exApplitionInfo.external_tensorboard_port = int(EX_TB_PORT)
app_fastapi = FastAPI()
app_fastapi.router.route_class = ValidationErrorLoggingRoute
app_fastapi.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
app_fastapi.mount(
"/front", StaticFiles(directory="../frontend/dist", html=True), name="static")
app_fastapi.mount(
"/trainer", StaticFiles(directory="../frontend/dist", html=True), name="static")
app_fastapi.mount(
"/recorder", StaticFiles(directory="../frontend/dist", html=True), name="static")
sio = socketio.AsyncServer(
async_mode='asgi',
cors_allowed_origins='*'
)
namespace = MyCustomNamespace('/test')
sio.register_namespace(namespace)
if CONFIG and MODEL:
namespace.loadModel(CONFIG, MODEL)
# namespace.loadWhisperModel("base")
app_socketio = socketio.ASGIApp(
sio,
other_asgi_app=app_fastapi,
static_files={
'/assets/icons/github.svg': {
'filename': '../frontend/dist/assets/icons/github.svg',
'content_type': 'image/svg+xml'
},
'': '../frontend/dist',
'/': '../frontend/dist/index.html',
}
)
@app_fastapi.get("/api/hello")
async def index():
return {"result": "Index"}
############
# File Uploader
# ##########
UPLOAD_DIR = "upload_dir"
os.makedirs(UPLOAD_DIR, exist_ok=True)
MODEL_DIR = "MMVC_Trainer/logs"
os.makedirs(MODEL_DIR, exist_ok=True)
@app_fastapi.post("/upload_file")
async def post_upload_file(
file: UploadFile = File(...),
filename: str = Form(...)
):
return upload_file(UPLOAD_DIR, file, filename)
@app_fastapi.post("/load_model")
async def post_load_model(
modelFilename: str = Form(...),
modelFilenameChunkNum: int = Form(...),
configFilename: str = Form(...)
):
modelFilePath = concat_file_chunks(
UPLOAD_DIR, modelFilename, modelFilenameChunkNum, UPLOAD_DIR)
print(f'File saved to: {modelFilePath}')
configFilePath = os.path.join(UPLOAD_DIR, configFilename)
namespace.loadModel(configFilePath, modelFilePath)
return {"load": f"{modelFilePath}, {configFilePath}"}
@app_fastapi.post("/load_model_for_train")
async def post_load_model_for_train(
modelGFilename: str = Form(...),
modelGFilenameChunkNum: int = Form(...),
modelDFilename: str = Form(...),
modelDFilenameChunkNum: int = Form(...),
):
modelGFilePath = concat_file_chunks(
UPLOAD_DIR, modelGFilename, modelGFilenameChunkNum, MODEL_DIR)
modelDFilePath = concat_file_chunks(
UPLOAD_DIR, modelDFilename, modelDFilenameChunkNum, MODEL_DIR)
return {"File saved": f"{modelGFilePath}, {modelDFilePath}"}
@app_fastapi.post("/extract_voices")
async def post_load_model(
zipFilename: str = Form(...),
zipFileChunkNum: int = Form(...),
):
zipFilePath = concat_file_chunks(
UPLOAD_DIR, zipFilename, zipFileChunkNum, UPLOAD_DIR)
shutil.unpack_archive(zipFilePath, "MMVC_Trainer/dataset/textful/")
return {"Zip file unpacked": f"{zipFilePath}"}
############
# Voice Changer
# ##########
@app_fastapi.post("/test")
async def post_test(voice: VoiceModel):
try:
# print("POST REQUEST PROCESSING....")
gpu = voice.gpu
srcId = voice.srcId
dstId = voice.dstId
timestamp = voice.timestamp
prefixChunkSize = voice.prefixChunkSize
buffer = voice.buffer
wav = base64.b64decode(buffer)
if wav == 0:
samplerate, data = read("dummy.wav")
unpackedData = data
else:
unpackedData = np.array(struct.unpack(
'<%sh' % (len(wav) // struct.calcsize('<h')), wav))
write("logs/received_data.wav", 24000,
unpackedData.astype(np.int16))
changedVoice = namespace.changeVoice(
gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
changedVoiceBase64 = base64.b64encode(changedVoice).decode('utf-8')
data = {
"gpu": gpu,
"srcId": srcId,
"dstId": dstId,
"timestamp": timestamp,
"prefixChunkSize": prefixChunkSize,
"changedVoiceBase64": changedVoiceBase64
}
json_compatible_item_data = jsonable_encoder(data)
return JSONResponse(content=json_compatible_item_data)
except Exception as e:
print("REQUEST PROCESSING!!!! EXCEPTION!!!", e)
print(traceback.format_exc())
return str(e)
# Trainer REST API. Note: Colab can apparently only POST to paths directly under the root, so this is only "REST-style"
@app_fastapi.get("/get_speakers")
async def get_speakers():
return mod_get_speakers()
@app_fastapi.delete("/delete_speaker")
async def delete_speaker(speaker: str = Form(...)):
return mod_delete_speaker(speaker)
@app_fastapi.get("/get_speaker_voices")
async def get_speaker_voices(speaker: str):
return mod_get_speaker_voices(speaker)
@app_fastapi.get("/get_speaker_voice")
async def get_speaker_voices(speaker: str, voice: str):
return mod_get_speaker_voice(speaker, voice)
@app_fastapi.get("/get_multi_speaker_setting")
async def get_multi_speaker_setting():
return mod_get_multi_speaker_setting()
@app_fastapi.post("/post_multi_speaker_setting")
async def post_multi_speaker_setting(setting: str = Form(...)):
return mod_post_multi_speaker_setting(setting)
@app_fastapi.get("/get_models")
async def get_models():
return mod_get_models()
@app_fastapi.get("/get_model")
async def get_model(model: str):
return mod_get_model(model)
@app_fastapi.delete("/delete_model")
async def delete_model(model: str = Form(...)):
return mod_delete_model(model)
@app_fastapi.post("/post_pre_training")
async def post_pre_training(batch: int = Form(...)):
return mod_post_pre_training(batch)
@app_fastapi.post("/post_start_training")
async def post_start_training(enable_finetuning: bool = Form(...),GModel: str = Form(...),DModel: str = Form(...)):
print("POST START TRAINING..")
return mod_post_start_training(enable_finetuning, GModel, DModel)
@app_fastapi.post("/post_stop_training")
async def post_stop_training():
print("POST STOP TRAINING..")
return mod_post_stop_training()
@app_fastapi.get("/get_related_files")
async def get_related_files():
return mod_get_related_files()
@app_fastapi.get("/get_tail_training_log")
async def get_tail_training_log(num: int):
return mod_get_tail_training_log(num)
@app_fastapi.get("/get_ex_application_info")
async def get_ex_application_info():
json_compatible_item_data = jsonable_encoder(exApplitionInfo)
return JSONResponse(content=json_compatible_item_data)
if __name__ == '__mp_main__':
printMessage(f"PHASE2:{__name__}", level=2)
if __name__ == '__main__':
printMessage(f"PHASE1:{__name__}", level=2)
TYPE = args.t
PORT = args.p
CONFIG = args.c
MODEL = args.m
if TYPE != "MMVC" and TYPE != "TRAIN":
print("Type(-t) should be MMVC or TRAIN")
exit(1)
printMessage(f"Start MMVC SocketIO Server", level=0)
printMessage(f"CONFIG:{CONFIG}, MODEL:{MODEL}", level=1)
if args.colab == False:
if os.getenv("EX_PORT"):
EX_PORT = os.environ["EX_PORT"]
printMessage(
f"External_Port:{EX_PORT} Internal_Port:{PORT}", level=1)
else:
printMessage(f"Internal_Port:{PORT}", level=1)
if os.getenv("EX_TB_PORT"):
EX_TB_PORT = os.environ["EX_TB_PORT"]
printMessage(f"External_TensorBoard_Port:{EX_TB_PORT}", level=1)
if os.getenv("EX_IP"):
EX_IP = os.environ["EX_IP"]
printMessage(f"External_IP:{EX_IP}", level=1)
# Create the HTTPS key/cert
if args.https and args.httpsSelfSigned == 1:
# HTTPS (generate a self-signed certificate)
os.makedirs("./key", exist_ok=True)
key_base_name = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}"
keyname = f"{key_base_name}.key"
certname = f"{key_base_name}.cert"
create_self_signed_cert(certname, keyname, certargs={"Country": "JP",
"State": "Tokyo",
"City": "Chuo-ku",
"Organization": "F",
"Org. Unit": "F"}, cert_dir="./key")
key_path = os.path.join("./key", keyname)
cert_path = os.path.join("./key", certname)
printMessage(
f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1)
elif args.https and args.httpsSelfSigned == 0:
# HTTPS
key_path = args.httpsKey
cert_path = args.httpsCert
printMessage(
f"protocol: HTTPS, key:{key_path}, cert:{cert_path}", level=1)
else:
# HTTP
printMessage(f"protocol: HTTP", level=1)
# Show access addresses
if args.https == 1:
printMessage(
f"open https://<IP>:<PORT>/ with your browser.", level=0)
else:
printMessage(
f"open http://<IP>:<PORT>/ with your browser.", level=0)
if TYPE == "MMVC":
path = ""
else:
path = "trainer"
if "EX_PORT" in locals() and "EX_IP" in locals() and args.https == 1:
printMessage(f"In many cases it is one of the following", level=1)
printMessage(f"https://localhost:{EX_PORT}/{path}", level=1)
for ip in EX_IP.strip().split(" "):
printMessage(f"https://{ip}:{EX_PORT}/{path}", level=1)
elif "EX_PORT" in locals() and "EX_IP" in locals() and args.https == 0:
printMessage(f"In many cases it is one of the following", level=1)
printMessage(f"http://localhost:{EX_PORT}/{path}", level=1)
# Start the server
if args.https:
# Start the HTTPS server
uvicorn.run(
f"{os.path.basename(__file__)[:-3]}:app_socketio",
host="0.0.0.0",
port=int(PORT),
reload=True,
ssl_keyfile=key_path,
ssl_certfile=cert_path,
log_level="critical"
)
else:
# Start the HTTP server
if args.colab == True:
uvicorn.run(
f"{os.path.basename(__file__)[:-3]}:app_fastapi",
host="0.0.0.0",
port=int(PORT),
log_level="critical"
)
else:
uvicorn.run(
f"{os.path.basename(__file__)[:-3]}:app_socketio",
host="0.0.0.0",
port=int(PORT),
reload=True,
log_level="critical"
)


@@ -1,43 +0,0 @@
MIT License
Copyright (c) 2022 Isle Tennos
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
MIT License
Copyright (c) 2021 Jaehyeon Kim
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


@@ -1,220 +0,0 @@
MMVC_Trainer
====
A tool for training models for an AI-powered real-time voice changer
## Description
This repository is for training the models used by "MMVC (RealTime-Many to Many Voice Conversion)",
an AI-powered real-time voice changer.
By using Google Colaboratory, the training phase can be run easily and without depending on your own environment.
## MMVC_Client
The client software that actually runs MMVC:
https://github.com/isletennos/MMVC_Client
## concept
"Simple", "for anyone", "into the voice you like", "in real time"
## Demo
In production (v1.3.0.0)
https://www.nicovideo.jp/watch/sm40386035 (v1.2.0.0)
## MMVC terms of use and distribution sites for MMVC voice data (2022/08/10)
The terms of use of this software basically follow the MIT license.
1. Anyone may use this software freely: copying, distribution, adding changes, redistributing modified versions,
commercial use, paid sales, and so on.
2. When using it on a platform where a license notice can be included, please use one of the credits below.
**If a license notice cannot be included, e.g. when used in VRC, it is not required.**
(If possible, the author would be happy if you used pattern 2.)
3. The author makes no warranty whatsoever for this software,
and accepts no responsibility for any problems that occur through its use.
4. Voice data used with this software must either be used with the permission of the owner of the original data,
or within the terms of use of the party distributing the data.
### Terms of use and download locations for the voice data officially distributed for MMVC
Separately from the MMVC terms of use, using the voice data below requires agreeing to the terms of use of each voice library provider.
Note: the companies and organizations below have kindly given special permission for this software to modify and redistribute their voice data.
#### SSS LLC.
[[Terms of use](https://zunko.jp/guideline.html)][[ずんだもん voice data](https://drive.google.com/file/d/1h8Ajyvoig7Hl3LSSt2vYX0sUHX3JDF3R/view?usp=sharing)] (the same voice data that is bundled with this software)
[[Terms of use](https://zunko.jp/guideline.html)][[九州そら voice data](https://drive.google.com/file/d/1MXfMRG_sjbsaLihm7wEASG2PwuCponZF/view?usp=sharing)]
[[Terms of use](https://zunko.jp/guideline.html)][[四国めたん voice data](https://drive.google.com/file/d/1iCrpzhqXm-0YdktOPM8M1pMtgQIDF3r4/view?usp=sharing)]
#### 春日部つむぎ project
[[Terms of use](https://tsumugi-official.studio.site/rule)][[春日部つむぎ voice data](https://drive.google.com/file/d/14zE0F_5ZCQWXf6m6SUPF5Y3gpL6yb7zk/view?usp=sharing)]
### About the license notice
Only when using the characters ずんだもん / 四国めたん / 九州そら / 春日部つむぎ,
please make it clear which tool the voice was made with, for example
```
MMVC:ずんだもん
MMVC:ずんだもん/四国めたん
```
and include such a note together with one of the license patterns below.
Here too, **if a license notice cannot be included, e.g. when used in VRC, it is not required.**
License pattern 1
```
Copyright (c) 2022 Isle.Tennos 
Released under the MIT license 
https://opensource.org/licenses/mit-license.php
```
License pattern 2
```
MMVCv1.x.x.x (the version used)
Copyright (c) 2022 Isle.Tennos 
Released under the MIT license 
https://opensource.org/licenses/mit-license.php
git:https://github.com/isletennos/MMVC_Trainer
community(discord):https://discord.gg/PgspuDSTEc
```
## Requirement
・A Google account
## Install
Download this repository, extract it, and upload the extracted directory to Google Drive.
## Usage
### Tutorial: becoming ずんだもん
This tutorial uses the ずんだもん voice data (SSS LLC.).
You therefore need to comply with the [[ずんだもん terms of use](https://zunko.jp/guideline.html)] in addition to the MMVC terms of use.
#### Ph1. Recording your own voice and placing the audio data
1. Record audio data of your own voice.
Use the JVS corpus, the ITA corpus, or similar as a script and read out roughly 100 sentences.
The recordings must be **24000 Hz, 16 bit, 1 ch** (a small format-check sketch follows the directory tree below).
Note: here is the ITA corpus with the text split up for MMVC; feel free to use it:
https://drive.google.com/file/d/14oXoQqLxRkP8NJK8qMYGee1_q2uEED1z/view?usp=sharing
2. Place the audio and text data in dataset/textful/000_myvoice.
The final directory layout is as follows.
```
dataset
├── textful
│   ├── 000_myvoice
│   │   ├── text
│   │   │   ├── s_voice_001.txt
│   │   │   ├── s_voice_002.txt
│   │   │   ├── ...
│   │   └── wav
│   │   ├── s_voice_001.wav
│   │   ├── s_voice_002.wav
│   │      ├── ...
│   │── 001_target
│   │   ├── text
│   │   └── wav
│   │
│   └── 1205_zundamon
│      ├── text
│      │   ├── t_voice_001.txt
│      │   ├── t_voice_002.txt
│      │   ├── ...
│      └── wav
│      ├── t_voice_001.wav
│      ├── t_voice_002.wav
│         ├── ...
│     
└── textless
```
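
Since out-of-spec recordings are a common cause of failed runs, here is a minimal check sketch (not part of the original README; the path assumes the tutorial layout above) that flags any WAV file that is not 24000 Hz / 16 bit / mono:
```
import glob
import wave

# Check every recording placed for the tutorial; the path follows the layout shown above.
for path in sorted(glob.glob("dataset/textful/000_myvoice/wav/*.wav")):
    with wave.open(path, "rb") as w:
        ok = (w.getframerate() == 24000      # 24000 Hz sampling rate
              and w.getsampwidth() == 2      # 16 bit = 2 bytes per sample
              and w.getnchannels() == 1)     # 1 ch (mono)
        if not ok:
            print(f"NG {path}: {w.getframerate()} Hz, "
                  f"{w.getsampwidth() * 8} bit, {w.getnchannels()} ch")
```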
#### Ph2. How to train the model
1. Download "G_180000.pth" and "D_180000.pth" from the link below.
https://drive.google.com/drive/folders/1vXdL1zSrgsuyACMkiTUtVbHgpMSA1Y5I?usp=sharing
2. Place "G_180000.pth" and "D_180000.pth" in fine_model. **(This is an easy step to forget, so take care!)**
3. Run "Create_Configfile_zundamon.ipynb" from the notebook directory on Google Colab to create the config file needed for training.
4. In the train_config_zundamon.json generated under configs, adjust
- "eval_interval"
the interval at which the model is saved;
- "batch_size"
adjust it to the GPU assigned by Colab.
Tune these two items for your environment (see the sketch after this list); if unsure, the defaults are fine.
5. Run "Train_MMVC.ipynb" from the notebook directory on Google Colab.
The models are generated under logs/.
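
A minimal sketch of step 4 (not part of the original README): load the generated config, change the two fields, and write it back. The values shown are simply the defaults from the bundled config; pick ones that fit your Colab GPU.
```
import json

config_path = "configs/train_config_zundamon.json"
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

config["train"]["eval_interval"] = 2000  # how often (in steps) checkpoints are saved
config["train"]["batch_size"] = 10       # lower this if the assigned GPU runs out of memory

with open(config_path, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2, ensure_ascii=False)
```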
#### Ph3. Checking the performance of the trained model
1. Run "MMVC_Interface.ipynb" from the notebook directory on Google Colab.
### Becoming the voice of a character you like
#### Ph1. Recording your own voice, placing the audio data, and placing the target audio data
1. Prepare audio data of your own voice with matching text, and audio data of the voice you want to convert into with matching text.
For both of these (your own recordings and the target recordings), **24000 Hz, 16 bit, 1 ch** audio is strongly recommended.
2. Place the audio and text data in the directory layout below.
There are two directories directly under textful.
(It does not matter if the 1205_zundamon directory is absent.)
```
dataset
├── textful
│   ├── 000_myvoice
│   │   ├── text
│   │   │   ├── s_voice_001.txt
│   │   │   ├── s_voice_002.txt
│   │   │   ├── ...
│   │   └── wav
│   │   ├── s_voice_001.wav
│   │   ├── s_voice_002.wav
│   │      ├── ...
│   │── 001_target
│   │   ├── text
│   │   │   ├── t_voice_001.txt
│   │   │   ├── t_voice_002.txt
│   │   │   ├── ...
│   │   └── wav
│   │   ├── t_voice_001.wav
│   │   ├── t_voice_002.wav
│   │      ├── ...
│   └── 1205_zundamon
│      ├── text
│      │   ├── t_voice_001.txt
│      │   ├── t_voice_002.txt
│      │   ├── ...
│      └── wav
│      ├── t_voice_001.wav
│      ├── t_voice_002.wav
│         ├── ...
│     
└── textless
```
#### Ph2. How to train the model
Same as "Tutorial: becoming ずんだもん", Ph2, so omitted here.
#### Ph3. Checking the performance of the trained model
Same as "Tutorial: becoming ずんだもん", Ph3, so omitted here.
## Tutorial videos by volunteers
### v1.2.1.x
| Preparation, part 1 | [Niconico](https://www.nicovideo.jp/watch/sm40415108) | [YouTube](https://www.youtube.com/watch?v=gq1Hpn5CARw&ab_channel=popi) |
|:--------------|:------------|:------------|
| Audio that needs fixing | [Niconico](https://www.nicovideo.jp/watch/sm40420683) | [YouTube](https://youtu.be/NgzC7Nuk6gg) |
| Preparation, part 2 | [Niconico](https://www.nicovideo.jp/watch/sm40445164) | [YouTube](https://youtu.be/m4Jew7sTs9w) |
| Training, first half 1 | [Niconico](https://www.nicovideo.jp/watch/sm40467662) | [YouTube](https://youtu.be/HRSPEy2jUvg) |
| Training, first half 2 | [Niconico](https://www.nicovideo.jp/watch/sm40473168) | [YouTube](https://youtu.be/zQW59vrOSuA) |
| Training, second half | [Niconico](https://www.nicovideo.jp/watch/sm40490554) | [YouTube](https://www.youtube.com/watch?v=uB3YfdKzo-g&ab_channel=popi) |
| Real-time conversion | [Niconico](https://www.nicovideo.jp/watch/sm40415108) | [YouTube](https://youtu.be/Al5DFCvKLFA) |
| Questions | [Niconico](https://www.nicovideo.jp/watch/sm40599514) | [YouTube](https://youtu.be/aGBcqu5M6-c) |
| Advanced: 九州そら | [Niconico](https://www.nicovideo.jp/watch/sm40647601) | [YouTube](https://youtu.be/MEXKZoHVd-A) |
| Advanced: 音街ウナ | [Niconico](https://www.nicovideo.jp/watch/sm40714406) | [YouTube](https://youtu.be/JDMlRz-PkSE) |
## Q&A
Please see the site below.
https://mmvc.readthedocs.io/ja/latest/index.html
## MMVC community server (Discord)
A community server for everything MMVC: the latest development news, questions, tips on how to use MMVC, and so on.
https://discord.gg/PgspuDSTEc
## Special thanks
- JVS (Japanese versatile speech) corpus
contributors : 高道 慎之介様/三井 健太郎様/齋藤 佑樹様/郡山 知樹様/丹治 尚子様/猿渡 洋様
https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus
- ITA corpus multimodal database
contributors : 金井郁也様/千葉隆壱様/齊藤剛史様/森勢将雅様/小口純矢様/能勢隆様/尾上真惟子様/小田恭央様
CharacterVoice : 東北イタコ(木戸衣吹様)/ずんだもん(伊藤ゆいな様)/四国めたん(田中小雪様)/九州そら(西田望見)
https://zunko.jp/multimodal_dev/login.php
- つくよみちゃん corpus
contributor : 夢前黎様
CharacterVoice : つくよみちゃん(夢前黎様)
https://tyc.rei-yumesaki.net/material/corpus/
## Reference
https://arxiv.org/abs/2106.06103
https://github.com/jaywalnut310/vits
## Author
Isle Tennos
Twitter : https://twitter.com/IsleTennos


@@ -1,303 +0,0 @@
import copy
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import commons
import modules
from modules import LayerNorm
class Encoder(nn.Module):
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.window_size = window_size
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.attn_layers[i](x, x, attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class Decoder(nn.Module):
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.drop = nn.Dropout(p_dropout)
self.self_attn_layers = nn.ModuleList()
self.norm_layers_0 = nn.ModuleList()
self.encdec_attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
self.norm_layers_0.append(LayerNorm(hidden_channels))
self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask, h, h_mask):
"""
x: decoder input
h: encoder output
"""
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.self_attn_layers[i](x, x, self_attn_mask)
y = self.drop(y)
x = self.norm_layers_0[i](x + y)
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class MultiHeadAttention(nn.Module):
def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
super().__init__()
assert channels % n_heads == 0
self.channels = channels
self.out_channels = out_channels
self.n_heads = n_heads
self.p_dropout = p_dropout
self.window_size = window_size
self.heads_share = heads_share
self.block_length = block_length
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.attn = None
self.k_channels = channels // n_heads
self.conv_q = nn.Conv1d(channels, channels, 1)
self.conv_k = nn.Conv1d(channels, channels, 1)
self.conv_v = nn.Conv1d(channels, channels, 1)
self.conv_o = nn.Conv1d(channels, out_channels, 1)
self.drop = nn.Dropout(p_dropout)
if window_size is not None:
n_heads_rel = 1 if heads_share else n_heads
rel_stddev = self.k_channels**-0.5
self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
nn.init.xavier_uniform_(self.conv_q.weight)
nn.init.xavier_uniform_(self.conv_k.weight)
nn.init.xavier_uniform_(self.conv_v.weight)
if proximal_init:
with torch.no_grad():
self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias)
def forward(self, x, c, attn_mask=None):
q = self.conv_q(x)
k = self.conv_k(c)
v = self.conv_v(c)
x, self.attn = self.attention(q, k, v, mask=attn_mask)
x = self.conv_o(x)
return x
def attention(self, query, key, value, mask=None):
# reshape [b, d, t] -> [b, n_h, t, d_k]
b, d, t_s, t_t = (*key.size(), query.size(2))
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
if self.window_size is not None:
assert t_s == t_t, "Relative attention is only available for self-attention."
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
scores_local = self._relative_position_to_absolute_position(rel_logits)
scores = scores + scores_local
if self.proximal_bias:
assert t_s == t_t, "Proximal bias is only available for self-attention."
scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e4)
if self.block_length is not None:
assert t_s == t_t, "Local attention is only available for self-attention."
block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
scores = scores.masked_fill(block_mask == 0, -1e4)
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
p_attn = self.drop(p_attn)
output = torch.matmul(p_attn, value)
if self.window_size is not None:
relative_weights = self._absolute_position_to_relative_position(p_attn)
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
return output, p_attn
def _matmul_with_relative_values(self, x, y):
"""
x: [b, h, l, m]
y: [h or 1, m, d]
ret: [b, h, l, d]
"""
ret = torch.matmul(x, y.unsqueeze(0))
return ret
def _matmul_with_relative_keys(self, x, y):
"""
x: [b, h, l, d]
y: [h or 1, m, d]
ret: [b, h, l, m]
"""
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
return ret
def _get_relative_embeddings(self, relative_embeddings, length):
max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
pad_length = max(length - (self.window_size + 1), 0)
slice_start_position = max((self.window_size + 1) - length, 0)
slice_end_position = slice_start_position + 2 * length - 1
if pad_length > 0:
padded_relative_embeddings = F.pad(
relative_embeddings,
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
else:
padded_relative_embeddings = relative_embeddings
used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
return used_relative_embeddings
def _relative_position_to_absolute_position(self, x):
"""
x: [b, h, l, 2*l-1]
ret: [b, h, l, l]
"""
batch, heads, length, _ = x.size()
# Concat columns of pad to shift from relative to absolute indexing.
x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
# Concat extra elements so to add up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
# Reshape and slice out the padded elements.
x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
return x_final
def _absolute_position_to_relative_position(self, x):
"""
x: [b, h, l, l]
ret: [b, h, l, 2*l-1]
"""
batch, heads, length, _ = x.size()
# padd along column
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
x_flat = x.view([batch, heads, length**2 + length*(length -1)])
# add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
return x_final
def _attention_bias_proximal(self, length):
"""Bias for self-attention to encourage attention to close positions.
Args:
length: an integer scalar.
Returns:
a Tensor with shape [1, 1, length, length]
"""
r = torch.arange(length, dtype=torch.float32)
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
class FFN(nn.Module):
def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.activation = activation
self.causal = causal
if causal:
self.padding = self._causal_padding
else:
self.padding = self._same_padding
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
self.drop = nn.Dropout(p_dropout)
def forward(self, x, x_mask):
x = self.conv_1(self.padding(x * x_mask))
if self.activation == "gelu":
x = x * torch.sigmoid(1.702 * x)
else:
x = torch.relu(x)
x = self.drop(x)
x = self.conv_2(self.padding(x * x_mask))
return x * x_mask
def _causal_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = self.kernel_size - 1
pad_r = 0
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, commons.convert_pad_shape(padding))
return x
def _same_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = (self.kernel_size - 1) // 2
pad_r = self.kernel_size // 2
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, commons.convert_pad_shape(padding))
return x


@@ -1,161 +0,0 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
return int((kernel_size*dilation - dilation)/2)
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def intersperse(lst, item):
result = [item] * (len(lst) * 2 + 1)
result[1::2] = lst
return result
def kl_divergence(m_p, logs_p, m_q, logs_q):
"""KL(P||Q)"""
kl = (logs_q - logs_p) - 0.5
kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
return kl
def rand_gumbel(shape):
"""Sample from the Gumbel distribution, protect from overflows."""
uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
return -torch.log(-torch.log(uniform_samples))
def rand_gumbel_like(x):
g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
return g
def slice_segments(x, ids_str, segment_size=4):
ret = torch.zeros_like(x[:, :, :segment_size])
for i in range(x.size(0)):
idx_str = ids_str[i]
idx_end = idx_str + segment_size
ret[i] = x[i, :, idx_str:idx_end]
return ret
def rand_slice_segments(x, x_lengths=None, segment_size=4):
b, d, t = x.size()
if x_lengths is None:
x_lengths = t
ids_str_max = x_lengths - segment_size + 1
ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
ret = slice_segments(x, ids_str, segment_size)
return ret, ids_str
def get_timing_signal_1d(
length, channels, min_timescale=1.0, max_timescale=1.0e4):
position = torch.arange(length, dtype=torch.float)
num_timescales = channels // 2
log_timescale_increment = (
math.log(float(max_timescale) / float(min_timescale)) /
(num_timescales - 1))
inv_timescales = min_timescale * torch.exp(
torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
signal = F.pad(signal, [0, 0, 0, channels % 2])
signal = signal.view(1, channels, length)
return signal
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
b, channels, length = x.size()
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return x + signal.to(dtype=x.dtype, device=x.device)
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
b, channels, length = x.size()
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
def subsequent_mask(length):
mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
return mask
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
n_channels_int = n_channels[0]
in_act = input_a + input_b
t_act = torch.tanh(in_act[:, :n_channels_int, :])
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
acts = t_act * s_act
return acts
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def shift_1d(x):
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
return x
def sequence_mask(length, max_length=None):
if max_length is None:
max_length = length.max()
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
return x.unsqueeze(0) < length.unsqueeze(1)
def generate_path(duration, mask):
"""
duration: [b, 1, t_x]
mask: [b, 1, t_y, t_x]
"""
device = duration.device
b, _, t_y, t_x = mask.shape
cum_duration = torch.cumsum(duration, -1)
cum_duration_flat = cum_duration.view(b * t_x)
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
path = path.view(b, t_x, t_y)
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
path = path.unsqueeze(1).transpose(2,3) * mask
return path
def clip_grad_value_(parameters, clip_value, norm_type=2):
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = list(filter(lambda p: p.grad is not None, parameters))
norm_type = float(norm_type)
if clip_value is not None:
clip_value = float(clip_value)
total_norm = 0
for p in parameters:
param_norm = p.grad.data.norm(norm_type)
total_norm += param_norm.item() ** norm_type
if clip_value is not None:
p.grad.data.clamp_(min=-clip_value, max=clip_value)
total_norm = total_norm ** (1. / norm_type)
return total_norm


@@ -1,110 +0,0 @@
{
"train": {
"log_interval": 1000,
"eval_interval": 2000,
"seed": 1234,
"epochs": 10000,
"learning_rate": 0.0002,
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"batch_size": 10,
"fp16_run": true,
"lr_decay": 0.999875,
"segment_size": 8192,
"init_lr_ratio": 1,
"warmup_epochs": 0,
"c_mel": 45,
"c_kl": 1.0
},
"data": {
"training_files": "filelists/dec_not_propagation_label_and_change_melspec_textful.txt",
"validation_files": "filelists/dec_not_propagation_label_and_change_melspec_textful_val.txt",
"training_files_notext": "filelists/dec_not_propagation_label_and_change_melspec_textless.txt",
"validation_files_notext": "filelists/dec_not_propagation_label_and_change_melspec_val_textless.txt",
"text_cleaners": [
"japanese_cleaners"
],
"max_wav_value": 32768.0,
"sampling_rate": 24000,
"filter_length": 512,
"hop_length": 128,
"win_length": 512,
"n_mel_channels": 80,
"mel_fmin": 0.0,
"mel_fmax": null,
"add_blank": true,
"n_speakers": 110,
"cleaned_text": false
},
"model": {
"inter_channels": 192,
"hidden_channels": 192,
"filter_channels": 768,
"n_heads": 2,
"n_layers": 6,
"kernel_size": 3,
"p_dropout": 0.1,
"resblock": "1",
"resblock_kernel_sizes": [
3,
7,
11
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates": [
8,
4,
2,
2
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
8,
8
],
"n_layers_q": 3,
"use_spectral_norm": false,
"n_flow": 8,
"gin_channels": 256
},
"others": {
"os_type": "linux"
},
"augmentation": {
"enable" : true,
"gain_p" : 0.5,
"min_gain_in_db" : -10,
"max_gain_in_db" : 10,
"time_stretch_p" : 0.5,
"min_rate" : 0.75,
"max_rate" : 1.25,
"pitch_shift_p" : 0.0,
"min_semitones" : -4.0,
"max_semitones" : 4.0,
"add_gaussian_noise_p" : 0.0,
"min_amplitude" : 0.001,
"max_amplitude" : 0.04,
"frequency_mask_p" : 0.0
}
}


@@ -1,343 +0,0 @@
import glob
import sys
import os
import argparse
import pyopenjtalk
import json
def mozi2phone(mozi):
text = pyopenjtalk.g2p(mozi)
text = "sil " + text + " sil"
text = text.replace(' ', '-')
return text
def create_json(filename, num_speakers, sr, config_path):
if os.path.exists(config_path):
with open(config_path, "r", encoding="utf-8") as f:
data = json.load(f)
data['data']['training_files'] = 'filelists/' + filename + '_textful.txt'
data['data']['validation_files'] = 'filelists/' + filename + '_textful_val.txt'
data['data']['training_files_notext'] = 'filelists/' + filename + '_textless.txt'
data['data']['validation_files_notext'] = 'filelists/' + filename + '_val_textless.txt'
data['data']['sampling_rate'] = sr
data['data']['n_speakers'] = num_speakers
with open("./configs/" + filename + ".json", 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
def create_dataset(filename):
speaker_id = 107
textful_dir_list = glob.glob("dataset/textful/*")
textless_dir_list = glob.glob("dataset/textless/*")
textful_dir_list.sort()
textless_dir_list.sort()
Correspondence_list = list()
output_file_list = list()
output_file_list_val = list()
output_file_list_textless = list()
output_file_list_val_textless = list()
for d in textful_dir_list:
wav_file_list = glob.glob(d+"/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
continue
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = speaker_id + 1
if speaker_id > 108:
break
for d in textless_dir_list:
wav_file_list = glob.glob(d+"/*.wav")
wav_file_list.sort()
counter = 0
for wav in wav_file_list:
print(wav + "|"+ str(speaker_id) + "|a")
if counter % 10 != 0:
output_file_list_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
else:
output_file_list_val_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = speaker_id + 1
with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list)
with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val)
with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_textless)
with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val_textless)
with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(Correspondence_list)
return speaker_id
def create_dataset_zundamon(filename):
textful_dir_list = glob.glob("dataset/textful/*")
textless_dir_list = glob.glob("dataset/textless/*")
textful_dir_list.sort()
textless_dir_list.sort()
Correspondence_list = list()
output_file_list = list()
output_file_list_val = list()
output_file_list_textless = list()
output_file_list_val_textless = list()
#paths
my_path = "dataset/textful/00_myvoice"
zundamon_path = "dataset/textful/1205_zundamon"
#set list wav and text
#myvoice
speaker_id = 107
d = my_path
wav_file_list = glob.glob(d + "/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error: no audio data in " + d + "/wav")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = 100
d = zundamon_path
wav_file_list = glob.glob(d + "/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error: no audio data in " + d + "/wav")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
for d in textless_dir_list:
wav_file_list = glob.glob(d+"/*.wav")
wav_file_list.sort()
counter = 0
for wav in wav_file_list:
print(wav + "|"+ str(speaker_id) + "|a")
if counter % 10 != 0:
output_file_list_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
else:
output_file_list_val_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = speaker_id + 1
with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list)
with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val)
with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_textless)
with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val_textless)
with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(Correspondence_list)
return 110
def create_dataset_character(filename, tid):
textful_dir_list = glob.glob("dataset/textful/*")
textless_dir_list = glob.glob("dataset/textless/*")
textful_dir_list.sort()
textless_dir_list.sort()
Correspondence_list = list()
output_file_list = list()
output_file_list_val = list()
output_file_list_textless = list()
output_file_list_val_textless = list()
#paths
my_path = "dataset/textful/00_myvoice"
zundamon_path = "dataset/textful/01_target"
#set list wav and text
#myvoice
speaker_id = 107
d = my_path
wav_file_list = glob.glob(d + "/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error: no audio data in " + d + "/wav")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = tid
d = zundamon_path
wav_file_list = glob.glob(d + "/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error" + d + "/wav に音声データがありません")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
for d in textless_dir_list:
wav_file_list = glob.glob(d+"/*.wav")
wav_file_list.sort()
counter = 0
for wav in wav_file_list:
print(wav + "|"+ str(speaker_id) + "|a")
if counter % 10 != 0:
output_file_list_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
else:
output_file_list_val_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = speaker_id + 1
with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list)
with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val)
with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_textless)
with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val_textless)
with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(Correspondence_list)
return 110
def create_dataset_multi_character(filename, file_path):
Correspondence_list = list()
textless_dir_list = glob.glob("dataset/textless/*")
textless_dir_list.sort()
output_file_list = list()
output_file_list_val = list()
output_file_list_textless = list()
output_file_list_val_textless = list()
with open(file_path, "r") as f:
for line in f.readlines():
target_dir , sid = line.split("|")
sid = sid.rstrip('\n')
wav_file_list = glob.glob("dataset/textful/" + target_dir + "/wav/*.wav")
lab_file_list = glob.glob("dataset/textful/" + target_dir + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error" + target_dir + "/wav に音声データがありません")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f_text:
mozi = f_text.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(sid) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(sid) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(sid) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(sid)+"|"+ target_dir + "\n")
with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list)
with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val)
with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_textless)
with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val_textless)
with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(Correspondence_list)
return 110
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--filename', type=str, required=True,
help='filelist for configuration')
parser.add_argument('-s', '--sr', type=int, default=24000,
help='sampling rate (default = 24000)')
parser.add_argument('-t', '--target', type=int, default=9999,
help='pre-trained target speaker id (zundamon = 100, sora = 101, methane = 102, tsumugi = 103)')
parser.add_argument('-m', '--multi_target', type=str, default=None,
help='path to a "target_dir|speaker_id" list file for building a multi-speaker dataset')
parser.add_argument('-c', '--config', type=str, default="./configs/baseconfig.json",
help='JSON file for configuration')
args = parser.parse_args()
filename = args.filename
print(filename)
if args.multi_target is not None:
n_spk = create_dataset_multi_character(filename, args.multi_target)
elif args.target == 100:
n_spk = create_dataset_zundamon(filename)
elif args.target != 9999:
n_spk = create_dataset_character(filename, args.target)
else:
n_spk = create_dataset(filename)
create_json(filename, n_spk, args.sr, args.config)
if __name__ == '__main__':
main()
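# For reference (illustrative, not generated by the script itself), the filelists written above use these line formats:
#   <filename>_textful.txt        : <wav path>|<speaker id>|<phoneme string>
#   <filename>_textless.txt       : <wav path>|<speaker id>|a
#   <filename>_Correspondence.txt : <speaker id>|<directory name>
# e.g. dataset/textful/00_myvoice/wav/sample001.wav|107|sil-k-o-N-n-i-ch-i-w-a-sil  (the wav path is hypothetical)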

View File

@ -1,305 +0,0 @@
import glob
import sys
import os
import argparse
import pyopenjtalk
import json
def mozi2phone(mozi):
text = pyopenjtalk.g2p(mozi)
text = "sil " + text + " sil"
text = text.replace(' ', '-')
return text
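# Illustrative example, assuming pyopenjtalk.g2p's usual space-separated phoneme output:
#   mozi2phone("こんにちは") -> "sil-k-o-N-n-i-ch-i-w-a-sil"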
def create_json(filename, num_speakers, sr, config_path):
if os.path.exists(config_path):
with open(config_path, "r", encoding="utf-8") as f:
data = json.load(f)
data['data']['training_files'] = 'filelists/' + filename + '_textful.txt'
data['data']['validation_files'] = 'filelists/' + filename + '_textful_val.txt'
data['data']['training_files_notext'] = 'filelists/' + filename + '_textless.txt'
data['data']['validation_files_notext'] = 'filelists/' + filename + '_val_textless.txt'
data['data']['sampling_rate'] = sr
data['data']['n_speakers'] = num_speakers
with open("./configs/" + filename + ".json", 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
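# Assumed minimal shape of ./configs/baseconfig.json (only the keys touched above are shown):
# {
#   "data": {
#     "training_files": "...", "validation_files": "...",
#     "training_files_notext": "...", "validation_files_notext": "...",
#     "sampling_rate": 24000, "n_speakers": 110
#   }
# }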
def create_dataset(filename, my_sid):
speaker_id = my_sid
textful_dir_list = glob.glob("dataset/textful/*")
textless_dir_list = glob.glob("dataset/textless/*")
textful_dir_list.sort()
textless_dir_list.sort()
Correspondence_list = list()
output_file_list = list()
output_file_list_val = list()
output_file_list_textless = list()
output_file_list_val_textless = list()
for d in textful_dir_list:
wav_file_list = glob.glob(d+"/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
continue
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = speaker_id + 1
if speaker_id > 108:
break
for d in textless_dir_list:
wav_file_list = glob.glob(d+"/*.wav")
wav_file_list.sort()
counter = 0
for wav in wav_file_list:
print(wav + "|"+ str(speaker_id) + "|a")
if counter % 10 != 0:
output_file_list_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
else:
output_file_list_val_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = speaker_id + 1
with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list)
with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val)
with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_textless)
with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val_textless)
with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(Correspondence_list)
return speaker_id + 1
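# NOTE: the definition below reuses the name create_dataset and shadows the version above; only the second definition is effective.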
def create_dataset(filename, my_sid):
textful_dir_list = glob.glob("dataset/textful/*")
textless_dir_list = glob.glob("dataset/textless/*")
textful_dir_list.sort()
textless_dir_list.sort()
Correspondence_list = list()
output_file_list = list()
output_file_list_val = list()
output_file_list_textless = list()
output_file_list_val_textless = list()
#paths
my_path = "dataset/textful/00_myvoice"
target_path = "dataset/textful/01_target"
print("myvoice : {}".format(str(os.path.isdir(my_path))))
print("target_path : {}".format(str(os.path.isdir(target_path))))
#set list wav and text
#myvoice
speaker_id = my_sid
d = my_path
wav_file_list = glob.glob(d + "/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error" + d + "/wav に音声データがありません")
exit()
if len(lab_file_list) == 0:
print("Error : " + d + "/text にテキストデータがありません")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = 108
d = target_path
wav_file_list = glob.glob(d + "/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error" + d + "/wav に音声データがありません")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
for d in textless_dir_list:
wav_file_list = glob.glob(d+"/*.wav")
wav_file_list.sort()
counter = 0
for wav in wav_file_list:
print(wav + "|"+ str(speaker_id) + "|a")
if counter % 10 != 0:
output_file_list_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
else:
output_file_list_val_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = speaker_id + 1
with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list)
with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val)
with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_textless)
with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val_textless)
with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(Correspondence_list)
return 110
def create_dataset_zundamon(filename, my_sid):
textful_dir_list = glob.glob("dataset/textful/*")
textless_dir_list = glob.glob("dataset/textless/*")
textful_dir_list.sort()
textless_dir_list.sort()
Correspondence_list = list()
output_file_list = list()
output_file_list_val = list()
output_file_list_textless = list()
output_file_list_val_textless = list()
#paths
my_path = "dataset/textful/00_myvoice"
zundamon_path = "dataset/textful/1205_zundamon"
print("myvoice : {}".format(str(os.path.isdir(my_path))))
print("zundamon_path : {}".format(str(os.path.isdir(zundamon_path))))
#set list wav and text
#myvoice
speaker_id = my_sid
d = my_path
wav_file_list = glob.glob(d + "/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error" + d + "/wav に音声データがありません")
exit()
if len(lab_file_list) == 0:
print("Error : " + d + "/text にテキストデータがありません")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = 100
d = zundamon_path
wav_file_list = glob.glob(d + "/wav/*.wav")
lab_file_list = glob.glob(d + "/text/*.txt")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
print("Error" + d + "/wav に音声データがありません")
exit()
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
with open(lab, 'r', encoding="utf-8") as f:
mozi = f.read().split("\n")
print(str(mozi))
test = mozi2phone(str(mozi))
print(test)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
for d in textless_dir_list:
wav_file_list = glob.glob(d+"/*.wav")
wav_file_list.sort()
counter = 0
for wav in wav_file_list:
print(wav + "|"+ str(speaker_id) + "|a")
if counter % 10 != 0:
output_file_list_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
else:
output_file_list_val_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+os.path.basename(d) + "\n")
speaker_id = speaker_id + 1
with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list)
with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val)
with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_textless)
with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val_textless)
with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(Correspondence_list)
return 110
def main():
parser = argparse.ArgumentParser()
parser.add_argument('-f', '--filename', type=str, required=True,
help='filelist for configuration')
parser.add_argument('-s', '--sr', type=int, default=24000,
help='sampling rate (default = 24000)')
parser.add_argument('-m', '--mysid', type=int, default=107,
help='speaker id assigned to my voice (default = 107)')
parser.add_argument('-z', '--zundamon', action='store_true',
help='use the bundled zundamon corpus as the conversion target (default = False)')
parser.add_argument('-c', '--config', type=str, default="./configs/baseconfig.json",
help='JSON file for configuration')
args = parser.parse_args()
filename = args.filename
print(filename)
if args.zundamon:
n_spk = create_dataset_zundamon(filename, args.mysid)
else:
n_spk = create_dataset(filename, args.mysid)
create_json(filename, n_spk, args.sr, args.config)
if __name__ == '__main__':
main()

View File

@ -1,87 +0,0 @@
import glob
import sys
def read_lab(lab_f):
with open(lab_f, 'r') as f:
kw_list = f.read().split("\n")
out_phono = []
for i in range(len(kw_list)-1):
out_phono.append(kw_list[i].split()[2])
out_phono.append("-")
if out_phono[0] == 'silB' and out_phono[-2] == 'silE':
out_phono[0] = 'sil'
out_phono[-2] = 'sil'
out_phono = out_phono[0:-1]
out_phono_str = "".join(out_phono)
return out_phono_str
else:
print("Error: expected the lab file to start with silB and end with silE: " + lab_f)
exit()
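# Assumed Julius-style .lab line format: "<start> <end> <phoneme>", one segment per line, e.g.
#   0.000 0.325 silB
#   0.325 0.550 k
#   2.100 2.400 silE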
def create_dataset(filename):
speaker_id = 0
textful_dir_list = glob.glob("dataset/textful/*")
textless_dir_list = glob.glob("dataset/textless/*")
textful_dir_list.sort()
textless_dir_list.sort()
Correspondence_list = list()
output_file_list = list()
output_file_list_val = list()
output_file_list_textless = list()
output_file_list_val_textless = list()
for d in textful_dir_list:
wav_file_list = glob.glob(d+"/wav/*")
lab_file_list = glob.glob(d + "/text/*")
wav_file_list.sort()
lab_file_list.sort()
if len(wav_file_list) == 0:
continue
counter = 0
for lab, wav in zip(lab_file_list, wav_file_list):
test = read_lab(lab)
print(wav + "|"+ str(speaker_id) + "|"+ test)
if counter % 10 != 0:
output_file_list.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
else:
output_file_list_val.append(wav + "|"+ str(speaker_id) + "|"+ test + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+d + "\n")
speaker_id = speaker_id + 1
for d in textless_dir_list:
wav_file_list = glob.glob(d+"/*")
wav_file_list.sort()
counter = 0
for wav in wav_file_list:
print(wav + "|"+ str(speaker_id) + "|a")
if counter % 10 != 0:
output_file_list_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
else:
output_file_list_val_textless.append(wav + "|"+ str(speaker_id) + "|a" + "\n")
counter = counter +1
Correspondence_list.append(str(speaker_id)+"|"+d + "\n")
speaker_id = speaker_id + 1
with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list)
with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val)
with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_textless)
with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(output_file_list_val_textless)
with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
f.writelines(Correspondence_list)
return speaker_id -1
def main(argv):
filename = str(sys.argv[1])
print(filename)
n_spk = create_dataset(filename)
return filename, n_spk
if __name__ == '__main__':
sys.exit(main(sys.argv))

View File

@ -1,492 +0,0 @@
import time
import os
import random
import numpy as np
import torch
import torch.utils.data
import tqdm
import commons
from mel_processing import spectrogram_torch
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence, cleaned_text_to_sequence
#add
from retry import retry
import random
import torchaudio
class TextAudioLoader(torch.utils.data.Dataset):
"""
1) loads audio, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams, use_test = True):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.use_test = use_test
self.cleaned_text = getattr(hparams, "cleaned_text", False)
self.add_blank = hparams.add_blank
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 190)
random.seed(1234)
random.shuffle(self.audiopaths_and_text)
self._filter()
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
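# e.g. a 10-second, 24 kHz, 16-bit mono wav is roughly 480,000 bytes of samples;
# with a hop_length of, say, 128 that gives 480000 // (2 * 128) = 1875 spectrogram frames
# (the wav header makes this estimate slightly high)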
audiopaths_and_text_new = []
lengths = []
for audiopath, text in self.audiopaths_and_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_and_text_new.append([audiopath, text])
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
self.audiopaths_and_text = audiopaths_and_text_new
self.lengths = lengths
def get_audio_text_pair(self, audiopath_and_text):
# separate filename and text
audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
text = self.get_text(text)
if not self.use_test:
# replace the transcript with a dummy single-phoneme text when the real text is not used
text = self.get_text("a")
spec, wav = self.get_audio(audiopath)
return (text, spec, wav)
def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.sampling_rate:
raise ValueError("{} {} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate))
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename):
spec = torch.load(spec_filename)
else:
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename)
return spec, audio_norm
def get_text(self, text):
if self.cleaned_text:
text_norm = cleaned_text_to_sequence(text)
else:
text_norm = text_to_sequence(text, self.text_cleaners)
if self.add_blank:
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
return text_norm
def __getitem__(self, index):
return self.get_audio_text_pair(self.audiopaths_and_text[index])
def __len__(self):
return len(self.audiopaths_and_text)
class TextAudioCollate():
""" Zero-pads model inputs and targets
"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
def __call__(self, batch):
"""Collate's training batch from normalized text and aduio
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
max_text_len = max([len(x[0]) for x in batch])
max_spec_len = max([x[1].size(1) for x in batch])
max_wav_len = max([x[2].size(1) for x in batch])
text_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
wav_lengths = torch.LongTensor(len(batch))
text_padded = torch.LongTensor(len(batch), max_text_len)
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
text_padded.zero_()
spec_padded.zero_()
wav_padded.zero_()
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
text = row[0]
text_padded[i, :text.size(0)] = text
text_lengths[i] = text.size(0)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, :wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
if self.return_ids:
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths
"""Multi speaker version"""
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths_sid_text, hparams, no_text=False, augmentation=False, augmentation_params=None, no_use_textfile = False):
if no_use_textfile:
self.audiopaths_sid_text = list()
else:
self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.no_text = no_text
self.augmentation = augmentation
if augmentation :
self.gain_p = augmentation_params.gain_p
self.min_gain_in_db = augmentation_params.min_gain_in_db
self.max_gain_in_db = augmentation_params.max_gain_in_db
self.time_stretch_p = augmentation_params.time_stretch_p
self.min_rate = augmentation_params.min_rate
self.max_rate = augmentation_params.max_rate
self.pitch_shift_p = augmentation_params.pitch_shift_p
self.min_semitones = augmentation_params.min_semitones
self.max_semitones = augmentation_params.max_semitones
self.add_gaussian_noise_p = augmentation_params.add_gaussian_noise_p
self.min_amplitude = augmentation_params.min_amplitude
self.max_amplitude = augmentation_params.max_amplitude
self.frequency_mask_p = augmentation_params.frequency_mask_p
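# Illustrative shape of augmentation_params (the keys match the attributes read above; the values are examples only):
#   gain_p=0.5, min_gain_in_db=-6.0, max_gain_in_db=6.0,
#   time_stretch_p=0.5, min_rate=0.9, max_rate=1.1,
#   pitch_shift_p=0.5, min_semitones=-2.0, max_semitones=2.0,
#   add_gaussian_noise_p=0.5, min_amplitude=0.0, max_amplitude=0.01,
#   frequency_mask_p=0.5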
self.cleaned_text = getattr(hparams, "cleaned_text", False)
self.add_blank = hparams.add_blank
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 1000)
random.seed(1234)
random.shuffle(self.audiopaths_sid_text)
self._filter()
@retry(tries=30, delay=10)
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
audiopaths_sid_text_new = []
lengths = []
for audiopath, sid, text in tqdm.tqdm(self.audiopaths_sid_text):
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_sid_text_new.append([audiopath, sid, text])
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
self.audiopaths_sid_text = audiopaths_sid_text_new
self.lengths = lengths
def get_audio_text_speaker_pair(self, audiopath_sid_text):
# separate filename, speaker_id and text
audiopath, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
text = self.get_text(text)
if self.no_text:
text = self.get_text("a")
spec, wav = self.get_audio(audiopath)
sid = self.get_sid(sid)
return (text, spec, wav, sid)
@retry(exceptions=(PermissionError), tries=100, delay=10)
def get_audio(self, filename):
# Audio is handled as a torch tensor normalized to the ±1.0 range, wrapped in one extra outer dimension via unsqueeze(0)
audio, sampling_rate = load_wav_to_torch(filename)
try:
if sampling_rate != self.sampling_rate:
raise ValueError("[Error] Exception: source {} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate))
except ValueError as e:
print(e)
exit()
audio_norm = self.get_normalized_audio(audio, self.max_wav_value)
if self.augmentation:
audio_augmented = self.add_augmentation(audio_norm, sampling_rate)
audio_noised = self.add_noise(audio_augmented, sampling_rate)
# Clip samples that augmentation and the added noise pushed outside the valid range after normalization
audio_augmented = torch.clamp(audio_augmented, -1, 1)
audio_noised = torch.clamp(audio_noised, -1, 1)
# The audio waveform is the training target, so use the augmented signal without added noise
audio_norm = audio_augmented
# The spectrogram is the model input, so use the augmented signal with noise added on top
spec = spectrogram_torch(audio_noised, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec_noised = self.add_spectrogram_noise(spec)
spec = torch.squeeze(spec_noised, 0)
else:
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
return spec, audio_norm
def add_augmentation(self, audio, sampling_rate):
gain_in_db = 0.0
if random.random() <= self.gain_p:
gain_in_db = random.uniform(self.min_gain_in_db, self.max_gain_in_db)
time_stretch_rate = 1.0
if random.random() <= self.time_stretch_p:
time_stretch_rate = random.uniform(self.min_rate, self.max_rate)
pitch_shift_semitones = 0
if random.random() <= self.pitch_shift_p:
pitch_shift_semitones = random.uniform(self.min_semitones, self.max_semitones) * 100 # sox's pitch effect is specified in cents (1/100 of a semitone)
augmentation_effects = [
["gain", f"{gain_in_db}"],
["tempo", f"{time_stretch_rate}"],
["pitch", f"{pitch_shift_semitones}"],
["rate", f"{sampling_rate}"]
]
audio_augmented, _ = torchaudio.sox_effects.apply_effects_tensor(audio, sampling_rate, augmentation_effects)
return audio_augmented
def add_noise(self, audio, sampling_rate):
# AddGaussianNoise
audio = self.add_gaussian_noise(audio)
return audio
def add_gaussian_noise(self, audio):
assert self.min_amplitude >= 0.0
assert self.max_amplitude >= 0.0
assert self.max_amplitude >= self.min_amplitude
if random.random() > self.add_gaussian_noise_p:
return audio
amplitude = random.uniform(self.min_amplitude, self.max_amplitude)
noise = torch.randn(audio.size())
noised_audio = audio + amplitude * noise
return noised_audio
def add_spectrogram_noise(self, spec):
# FrequencyMask
masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
masked = masking(spec)
return masked
def get_normalized_audio(self, audio, max_wav_value):
audio_norm = audio / max_wav_value
audio_norm = audio_norm.unsqueeze(0)
return audio_norm
def get_text(self, text):
if self.cleaned_text:
text_norm = cleaned_text_to_sequence(text)
else:
text_norm = text_to_sequence(text, self.text_cleaners)
if self.add_blank:
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
return text_norm
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
def __getitem__(self, index):
return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
def __len__(self):
return len(self.audiopaths_sid_text)
class TextAudioSpeakerCollate():
""" Zero-pads model inputs and targets
"""
def __init__(self, return_ids=False, no_text = False):
self.return_ids = return_ids
self.no_text = no_text
def __call__(self, batch):
"""Collate's training batch from normalized text, audio and speaker identities
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
max_text_len = max([len(x[0]) for x in batch])
max_spec_len = max([x[1].size(1) for x in batch])
max_wav_len = max([x[2].size(1) for x in batch])
text_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
wav_lengths = torch.LongTensor(len(batch))
sid = torch.LongTensor(len(batch))
text_padded = torch.LongTensor(len(batch), max_text_len)
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
text_padded.zero_()
spec_padded.zero_()
wav_padded.zero_()
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
text = row[0]
text_padded[i, :text.size(0)] = text
text_lengths[i] = text.size(0)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, :wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
sid[i] = row[3]
if self.return_ids:
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
Length groups are specified by boundaries.
Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
It removes samples which are not included in the boundaries.
Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
"""
def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
self.lengths = dataset.lengths
self.batch_size = batch_size
self.boundaries = boundaries
self.buckets, self.num_samples_per_bucket = self._create_buckets()
self.total_size = sum(self.num_samples_per_bucket)
self.num_samples = self.total_size // self.num_replicas
def _create_buckets(self):
buckets = [[] for _ in range(len(self.boundaries) - 1)]
for i in range(len(self.lengths)):
length = self.lengths[i]
idx_bucket = self._bisect(length)
if idx_bucket != -1:
buckets[idx_bucket].append(i)
for i in range(len(buckets) - 1, 0, -1):
if len(buckets[i]) == 0:
buckets.pop(i)
self.boundaries.pop(i+1)
num_samples_per_bucket = []
for i in range(len(buckets)):
len_bucket = len(buckets[i])
total_batch_size = self.num_replicas * self.batch_size
rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
num_samples_per_bucket.append(len_bucket + rem)
return buckets, num_samples_per_bucket
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
if self.shuffle:
for bucket in self.buckets:
indices.append(torch.randperm(len(bucket), generator=g).tolist())
else:
for bucket in self.buckets:
indices.append(list(range(len(bucket))))
batches = []
for i in range(len(self.buckets)):
next_bucket = (i+1) % len(self.buckets)
bucket = self.buckets[i]
len_bucket = len(bucket)
ids_bucket = indices[i]
num_samples_bucket = self.num_samples_per_bucket[i]
if len_bucket == 0:
print("[Warn] Exception: length of buckets {} is 0. ID:{} Skip.".format(i,i))
continue
# add extra samples to make it evenly divisible
rem = num_samples_bucket - len_bucket
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
# subsample
ids_bucket = ids_bucket[self.rank::self.num_replicas]
# batching
for j in range(len(ids_bucket) // self.batch_size):
batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
batches.append(batch)
if self.shuffle:
batch_ids = torch.randperm(len(batches), generator=g).tolist()
batches = [batches[i] for i in batch_ids]
self.batches = batches
assert len(self.batches) * self.batch_size == self.num_samples
return iter(self.batches)
def _bisect(self, x, lo=0, hi=None):
if hi is None:
hi = len(self.boundaries) - 1
if hi > lo:
mid = (hi + lo) // 2
if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
return mid
elif x <= self.boundaries[mid]:
return self._bisect(x, lo, mid)
else:
return self._bisect(x, mid + 1, hi)
else:
return -1
def __len__(self):
return self.num_samples // self.batch_size
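A minimal usage sketch of how these classes are typically wired together, assuming this file is importable as data_utils; the config path, filelist path, batch size and bucket boundaries below are illustrative assumptions rather than values taken from this repository's training script:
import torch
import utils  # project module; get_hparams_from_file is assumed to load the JSON config into an HParams object
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate, DistributedBucketSampler
hps = utils.get_hparams_from_file("configs/baseconfig.json")  # assumed config path
train_dataset = TextAudioSpeakerLoader("filelists/train_textful.txt", hps.data)  # hypothetical filelist
sampler = DistributedBucketSampler(train_dataset, batch_size=16,
    boundaries=[32, 300, 400, 500, 600, 700, 800, 900, 1000],
    num_replicas=1, rank=0, shuffle=True)
collate_fn = TextAudioSpeakerCollate()
train_loader = torch.utils.data.DataLoader(train_dataset, num_workers=2,
    batch_sampler=sampler, collate_fn=collate_fn)
for text, text_len, spec, spec_len, wav, wav_len, sid in train_loader:
    pass  # feed the padded batch to the model here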

View File

@ -1,7 +0,0 @@
00_myvoice|107
01_target|108
02_target|109
03_target|0
04_target|1
05_target|2
1205_zundamon|100

View File

@ -1,2 +0,0 @@
*
!.gitignore

View File

@ -1,2 +0,0 @@
*
!.gitignore

View File

@ -1,2 +0,0 @@
*
!.gitignore

View File

@ -1,2 +0,0 @@
*
!.gitignore

View File

@ -1 +0,0 @@
オンナノコガキッキッウレシソー。

View File

@ -1 +0,0 @@
ツァツォニリョコーシタ。

View File

@ -1 +0,0 @@
ミンシュウガテュルリーキュウデンニシンニュウシタ。

View File

@ -1 +0,0 @@
ハイチキョーワコクデトゥーサンルーヴェルテュールガショーリヲオサメラレタノワ、ジッサイオーネツビョーノオカゲダッタ。

View File

@ -1 +0,0 @@
レジャンドルワミンシュウヲテュルリーキュウデンニマネータ。

View File

@ -1 +0,0 @@
ジョゲンワデキナイトデュパンワイッタ。

View File

@ -1 +0,0 @@
フランスジンシェフトニホンジンシェフワゼンゼンチガウ。

View File

@ -1 +0,0 @@
チュウゴクノガイコーダンニアタッシェトシテハケンサレタ。

View File

@ -1 +0,0 @@
ファシズムセーリョクトノソーリョクセンニノゾム。

View File

@ -1 +0,0 @@
カグショーニンノフィシェルワ、ニグルマトコウマヲカシテクレタ。

View File

@ -1 +0,0 @@
ローカルロセンニワファンモオオイ。

View File

@ -1 +0,0 @@
フェイントデアイテヲカワシテカラシュートデフィニッシュシタ。

View File

@ -1 +0,0 @@
センハッピャクナナジュウナナ、プフェファーニヨリシントーゲンショーガハッケンサレタ。

View File

@ -1 +0,0 @@
ユレルフェリーニノルノワワタシニトッテクギョーデス。

View File

@ -1 +0,0 @@
ホルロアラティタルッフォトユウトクベツナオリョーリモデマシタ。

View File

@ -1 +0,0 @@
フエノオトガナルトウサギノキッドガサッソクピョントハネタ。

View File

@ -1 +0,0 @@
アノリョキャクワウワサノキャフェニイクヨーデス。

View File

@ -1 +0,0 @@
モクヒョーワイットーショーデス。

View File

@ -1 +0,0 @@
ウサギノキッドワキブンヨクピョン、マタピョントトビツヅケタ。

View File

@ -1 +0,0 @@
アフタヌーンティーヲタノシミマショー。

View File

@ -1 +0,0 @@
カノジョワティピカルナフェミニストデス。

View File

@ -1 +0,0 @@
ジョシュタチトミッツィワサガシテイルショルイヲミツケラレナカッタ。

View File

@ -1 +0,0 @@
フィレンツェ、パドヴァ、ヴェネツィアワドレモイタリアノトシデス。

View File

@ -1 +0,0 @@
ガクフニツギノヨーニカイテアルノガ、エーフェリチェデス。

View File

@ -1 +0,0 @@
ショペンハウエルトニーチェノテツガクショヲホンダナカラトリダシタ。

View File

@ -1 +0,0 @@
サッソクメシツカイゼンインニシラセマショー。

View File

@ -1 +0,0 @@
オモイワタイレヲヌイデ、アワセニキガエル。

View File

@ -1 +0,0 @@
ボストンデ、トアルチョプスイヤエハイッテユウハンヲクッタ。

View File

@ -1 +0,0 @@
ロクスッポキュウケーヲトラズハタライタ。

View File

@ -1 +0,0 @@
カツテヒトリデコクフニシンニュウシタ。

View File

@ -1 +0,0 @@
ダガ、キョーオマエガココエゴジュライニナッタノワ、ドンナゴヨーナノカナ?

View File

@ -1 +0,0 @@
サブフランチャイザーヲフヤシテメザセヒャクテンポ。

View File

@ -1 +0,0 @@
シコクデオヘンロヲアンギャシヨー。

View File

@ -1 +0,0 @@
イツモノトオリギャンギャンナキダシマシタ。

View File

@ -1 +0,0 @@
センセーワ、タッタママニュースヲミテイマシタ。

View File

@ -1 +0,0 @@
ワタシワギョットメヲミヒライタ。

View File

@ -1 +0,0 @@
トモダチエニューイヤーカードヲオクロー。

View File

@ -1 +0,0 @@
カセーフワヤスミニオシャレナアウターウェアニミヲツツミヒトリデヤタイヲタノシミマシタ。

View File

@ -1 +0,0 @@
ウォッカノオトモニワシオヅケノキュウリガアイマス。

View File

@ -1 +0,0 @@
ヤマノムコーノミュンヒェンノヒトタチガコーゲキヲシカケタ。

View File

@ -1 +0,0 @@
ボスニアコッキョーカラノコーゲキニヨリ、ジュウイチガツニヴァリェヴォガセンリョーサレタ。

View File

@ -1 +0,0 @@
シルヴィウスワデュボアトヨバレテイタフランスノユグノーノイエニウマレタ。

View File

@ -1 +0,0 @@
ソノホカニワタシニデキルコトワナカッタノデス、ユリエワナミダゴエニナッタ。

View File

@ -1 +0,0 @@
ガルハカセヒャクタイチカク。

View File

@ -1 +0,0 @@
ニホンセーフカラノヒャクチョーエンヲコエルヨサンヨーキュウ。

View File

@ -1 +0,0 @@
シャキョーノウツクシサニワタシワギョーテンシテシマッタ。

View File

@ -1 +0,0 @@
ソプラノカシュポリランダチョワカゲキアイーダノトクベツメーカシュトヒョーバンデス。

View File

@ -1 +0,0 @@
アナタニワサイショヒャクポンドワタシマス。

View File

@ -1 +0,0 @@
シャチョーカラノシジデス。

View File

@ -1 +0,0 @@
ドーモキマグレトユウモノワタショーメフィスティックナモノデアルラシイ。

View File

@ -1 +0,0 @@
カエルガピョコピョコトビマワッテイマス。

View File

@ -1 +0,0 @@
マキョーニアシヲフミイレル。

View File

@ -1 +0,0 @@
ヴァンダーヴォットタイムチュウワ、イワユルパーティーノヨーデハレヤカデス。

View File

@ -1 +0,0 @@
スピリッツトワジョーリュウシュノコトデス。

View File

@ -1 +0,0 @@
ヌルシアノベネディクトゥスワアポロンシンデンヲコワシ、ベネディクトカイノシュウドーインヲタテタ。

View File

@ -1 +0,0 @@
チョードソノトキ、テストゥパーゴガコップヲモッテタチアガリマシタ。

Some files were not shown because too many files have changed in this diff.