refactoring

.gitignore  (vendored, 2 changes)
@@ -11,4 +11,4 @@ server/memo.txt
 client/lib/dist
 client/lib/worklet/dist
-client/demo/dist/
+# client/demo/dist/ # keep for the demo
frontend/dist/assets/icons/file-text.svg → client/demo/dist/asset/icons/file-text.svg  (vendored, executable → normal file, 473 B, content unchanged)
frontend/dist/assets/icons/flect.png → client/demo/dist/asset/icons/flect.png  (vendored, executable → normal file, 1.3 KiB, content unchanged)
frontend/dist/assets/icons/github.svg → client/demo/dist/asset/icons/github.svg  (vendored, executable → normal file, 522 B, content unchanged)
frontend/dist/assets/icons/help-circle.svg → client/demo/dist/asset/icons/help-circle.svg  (vendored, executable → normal file, 365 B, content unchanged)
frontend/dist/assets/icons/home.svg → client/demo/dist/asset/icons/home.svg  (vendored, executable → normal file, 327 B, content unchanged)
frontend/dist/assets/icons/linkedin.svg → client/demo/dist/asset/icons/linkedin.svg  (vendored, executable → normal file, 395 B, content unchanged)
frontend/dist/assets/icons/twitter.svg → client/demo/dist/asset/icons/twitter.svg  (vendored, executable → normal file, 403 B, content unchanged)
frontend/dist/assets/icons/zun.png → client/demo/dist/asset/icons/zun.png  (vendored, executable → normal file, 5.3 KiB, content unchanged)
frontend/dist/favicon.ico → client/demo/dist/favicon.ico  (vendored, executable → normal file, 17 KiB, content unchanged)
client/demo/dist/index.html  (vendored, new file, 1 addition)
@@ -0,0 +1 @@
<!doctype html><html style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>Voice Changer Client Demo</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div></body></html>
client/demo/dist/index.js  (vendored, new normal file, 2 additions; generated bundle, diff not rendered)
client/demo/dist/index.js.LICENSE.txt  (vendored, new normal file, 31 additions)
@@ -0,0 +1,31 @@
/*! regenerator-runtime -- Copyright (c) 2014-present, Facebook, Inc. -- license (MIT): https://github.com/facebook/regenerator/blob/main/LICENSE */

/**
 * @license React
 * react-dom.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @license React
 * react.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @license React
 * scheduler.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */
client/demo/package-lock.json  (generated, 3939 changes)
@@ -20,15 +20,15 @@
         "@babel/preset-env": "^7.20.2",
         "@babel/preset-react": "^7.18.6",
         "@babel/preset-typescript": "^7.18.6",
-        "@types/node": "^18.11.17",
+        "@types/node": "^18.11.18",
         "@types/react": "^18.0.26",
-        "@types/react-dom": "^18.0.9",
+        "@types/react-dom": "^18.0.10",
         "autoprefixer": "^10.4.13",
-        "babel-loader": "^9.1.0",
+        "babel-loader": "^9.1.2",
         "copy-webpack-plugin": "^11.0.0",
         "css-loader": "^6.7.3",
-        "eslint": "^8.30.0",
-        "eslint-config-prettier": "^8.5.0",
+        "eslint": "^8.31.0",
+        "eslint-config-prettier": "^8.6.0",
         "eslint-plugin-prettier": "^4.2.1",
         "eslint-plugin-react": "^7.31.11",
         "eslint-webpack-plugin": "^3.2.0",
@@ -41,14 +41,14 @@
         "rimraf": "^3.0.2",
         "style-loader": "^3.3.1",
         "ts-loader": "^9.4.2",
-        "tsconfig-paths": "^4.1.1",
+        "tsconfig-paths": "^4.1.2",
         "typescript": "^4.9.4",
         "webpack": "^5.75.0",
         "webpack-cli": "^5.0.1",
         "webpack-dev-server": "^4.11.1"
       },
       "dependencies": {
-        "@dannadori/voice-changer-client-js": "file:../lib",
+        "@dannadori/voice-changer-client-js": "^1.0.3",
         "react": "^18.2.0",
         "react-dom": "^18.2.0"
       }
client/demo/public/assets/icons/file-text.svg  (new executable file, 1 addition, 473 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-file-text"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"></path><polyline points="14 2 14 8 20 8"></polyline><line x1="16" y1="13" x2="8" y2="13"></line><line x1="16" y1="17" x2="8" y2="17"></line><polyline points="10 9 9 9 8 9"></polyline></svg>

client/demo/public/assets/icons/flect.png  (BIN, new executable file, 1.3 KiB)

client/demo/public/assets/icons/github.svg  (new executable file, 1 addition, 522 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-github"><path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"></path></svg>

client/demo/public/assets/icons/help-circle.svg  (new executable file, 1 addition, 365 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-help-circle"><circle cx="12" cy="12" r="10"></circle><path d="M9.09 9a3 3 0 0 1 5.83 1c0 2-3 3-3 3"></path><line x1="12" y1="17" x2="12.01" y2="17"></line></svg>

client/demo/public/assets/icons/home.svg  (new executable file, 1 addition, 327 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-home"><path d="M3 9l9-7 9 7v11a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2z"></path><polyline points="9 22 9 12 15 12 15 22"></polyline></svg>

client/demo/public/assets/icons/linkedin.svg  (new executable file, 1 addition, 395 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-linkedin"><path d="M16 8a6 6 0 0 1 6 6v7h-4v-7a2 2 0 0 0-2-2 2 2 0 0 0-2 2v7h-4v-7a6 6 0 0 1 6-6z"></path><rect x="2" y="9" width="4" height="12"></rect><circle cx="4" cy="4" r="2"></circle></svg>

client/demo/public/assets/icons/twitter.svg  (new executable file, 1 addition, 403 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-twitter"><path d="M23 3a10.9 10.9 0 0 1-3.14 1.53 4.48 4.48 0 0 0-7.86 3v1A10.66 10.66 0 0 1 3 4s-4 9 5 13a11.64 11.64 0 0 1-7 2c9 5 20 0 20-11.5a4.5 4.5 0 0 0-.08-.83A7.72 7.72 0 0 0 23 3z"></path></svg>

client/demo/public/assets/icons/zun.png  (BIN, new executable file, 5.3 KiB)

client/demo/public/favicon.ico  (BIN, new executable file, 17 KiB)
@@ -41,5 +41,11 @@ module.exports = {
             template: path.resolve(__dirname, "public/index.html"),
             filename: "./index.html",
         }),
+        new CopyPlugin({
+            patterns: [{ from: "public/assets", to: "asset" }],
+        }),
+        new CopyPlugin({
+            patterns: [{ from: "public/favicon.ico", to: "favicon.ico" }],
+        }),
     ]
 };
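These two CopyPlugin entries are what produce the client/demo/dist/asset/... files and client/demo/dist/favicon.ico tracked above: the webpack build copies public/assets into asset/ and places the favicon next to index.html, matching the renamed dist/asset/icons paths at the top of this diff.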
client/lib/.npmignore  (new normal file, 5 additions)
@@ -0,0 +1,5 @@
src
worklet
node_modules
webpack.*
tsconfig.*
client/lib/package-lock.json  (generated, 4 changes)
@@ -1,12 +1,12 @@
 {
   "name": "@dannadori/voice-changer-client-js",
-  "version": "1.0.0",
+  "version": "1.0.3",
   "lockfileVersion": 2,
   "requires": true,
   "packages": {
     "": {
       "name": "@dannadori/voice-changer-client-js",
-      "version": "1.0.0",
+      "version": "1.0.3",
       "license": "ISC",
       "dependencies": {
         "@types/readable-stream": "^2.3.15",
client/lib/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@dannadori/voice-changer-client-js",
-  "version": "1.0.0",
+  "version": "1.0.3",
   "description": "",
   "main": "dist/index.js",
   "directories": {
@@ -17,6 +17,7 @@
     "webpack:prod": "webpack --config webpack.prod.js",
     "build:dev": "npm-run-all build:worklet:dev clean webpack:dev",
     "build:prod": "npm-run-all build:worklet:prod clean webpack:prod",
+    "release": "npm version patch && npm publish --access=public",
     "test": "echo \"Error: no test specified\" && exit 1"
   },
   "keywords": [],
@@ -1,19 +0,0 @@ (deleted file: Python requirements)
Cython==0.29.32
fastapi==0.88.0
librosa==0.9.2
numpy==1.23.5
phonemizer==3.2.1
psutil==5.9.4
pyopenjtalk==0.3.0
pyOpenSSL==22.1.0
python-multipart==0.0.5
python-socketio==5.7.2
retry==0.9.2
scipy==1.9.3
tensorboard==2.11.0
torch==1.13.0
torchaudio==0.13.0
tqdm==4.64.1
Unidecode==1.3.6
uvicorn==0.20.0
websockets==10.4
@@ -1,516 +0,0 @@ (deleted file: server entry point)
import sys, os, struct, argparse, logging, shutil, base64, traceback
from dataclasses import dataclass
from datetime import datetime
from distutils.util import strtobool

import numpy as np
from scipy.io.wavfile import write, read

sys.path.append("MMVC_Trainer")
sys.path.append("MMVC_Trainer/text")

from fastapi.routing import APIRoute
from fastapi import HTTPException, Request, Response, FastAPI, UploadFile, File, Form
from fastapi.staticfiles import StaticFiles
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import socketio
from pydantic import BaseModel

from typing import Callable

from mods.Trainer_Speakers import mod_get_speakers
from mods.Trainer_Training import mod_post_pre_training, mod_post_start_training, mod_post_stop_training, mod_get_related_files, mod_get_tail_training_log
from mods.Trainer_Model import mod_get_model, mod_delete_model
from mods.Trainer_Models import mod_get_models
from mods.Trainer_MultiSpeakerSetting import mod_get_multi_speaker_setting, mod_post_multi_speaker_setting
from mods.Trainer_Speaker_Voice import mod_get_speaker_voice
from mods.Trainer_Speaker_Voices import mod_get_speaker_voices
from mods.Trainer_Speaker import mod_delete_speaker
from mods.FileUploader import upload_file, concat_file_chunks
from mods.VoiceChanger import VoiceChanger
from mods.ssl import create_self_signed_cert


# File Uploader

# Trainer Rest Internal


class UvicornSuppressFilter(logging.Filter):
    def filter(self, record):
        return False


logger = logging.getLogger("uvicorn.error")
logger.addFilter(UvicornSuppressFilter())
# logger.propagate = False
logger = logging.getLogger("multipart.multipart")
logger.propagate = False


@dataclass
class ExApplicationInfo():
    external_tensorboard_port: int


exApplitionInfo = ExApplicationInfo(external_tensorboard_port=0)


class VoiceModel(BaseModel):
    gpu: int
    srcId: int
    dstId: int
    timestamp: int
    prefixChunkSize: int
    buffer: str


class MyCustomNamespace(socketio.AsyncNamespace):
    def __init__(self, namespace):
        super().__init__(namespace)

    def loadModel(self, config, model):
        if hasattr(self, 'voiceChanger') == True:
            self.voiceChanger.destroy()
        self.voiceChanger = VoiceChanger(config, model)

    # def loadWhisperModel(self, model):
    #     self.whisper = Whisper()
    #     self.whisper.loadModel("tiny")
    #     print("load")

    def changeVoice(self, gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData):
        # if hasattr(self, 'whisper') == True:
        #     self.whisper.addData(unpackedData)
        if hasattr(self, 'voiceChanger') == True:
            return self.voiceChanger.on_request(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
        else:
            print("Voice Change is not loaded. Did you load a correct model?")
            return np.zeros(1).astype(np.int16)

    # def transcribe(self):
    #     if hasattr(self, 'whisper') == True:
    #         self.whisper.transcribe(0)
    #     else:
    #         print("whisper not found")

    def on_connect(self, sid, environ):
        # print('[{}] connet sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), sid))
        pass

    async def on_request_message(self, sid, msg):
        # print("on_request_message", torch.cuda.memory_allocated())
        gpu = int(msg[0])
        srcId = int(msg[1])
        dstId = int(msg[2])
        timestamp = int(msg[3])
        prefixChunkSize = int(msg[4])
        data = msg[5]
        # print(srcId, dstId, timestamp)
        unpackedData = np.array(struct.unpack(
            '<%sh' % (len(data) // struct.calcsize('<h')), data))
        audio1 = self.changeVoice(
            gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)

        bin = struct.pack('<%sh' % len(audio1), *audio1)
        await self.emit('response', [timestamp, bin])

    def on_disconnect(self, sid):
        # print('[{}] disconnect'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
        pass
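The `request_message` payload handled above is raw little-endian int16 PCM framed with `struct`. A minimal round-trip sketch of that framing (standalone, with hypothetical sample values; not part of the deleted file):

```python
import struct

import numpy as np

# Hypothetical one-chunk payload: int16 PCM samples, little-endian ('<h'),
# matching the '%sh' format strings used by on_request_message above.
samples = np.array([0, 1200, -1200, 32767, -32768], dtype=np.int16)

# Pack the way a client would before emitting 'request_message'.
data = struct.pack('<%sh' % len(samples), *samples)

# Unpack the way the server does: byte length divided by sizeof(int16).
unpacked = np.array(struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data))

assert (unpacked == samples).all()
```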
def setupArgParser():
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", type=str, default="MMVC",
                        help="Server type. MMVC|TRAIN")
    parser.add_argument("-p", type=int, default=8080, help="port")
    parser.add_argument("-c", type=str, help="path for the config.json")
    parser.add_argument("-m", type=str, help="path for the model file")
    parser.add_argument("--https", type=strtobool,
                        default=False, help="use https")
    parser.add_argument("--httpsKey", type=str,
                        default="ssl.key", help="path for the key of https")
    parser.add_argument("--httpsCert", type=str,
                        default="ssl.cert", help="path for the cert of https")
    parser.add_argument("--httpsSelfSigned", type=strtobool,
                        default=True, help="generate self-signed certificate")
    parser.add_argument("--colab", type=strtobool,
                        default=False, help="run on colab")
    return parser


def printMessage(message, level=0):
    if level == 0:
        print(f"\033[17m{message}\033[0m")
    elif level == 1:
        print(f"\033[34m {message}\033[0m")
    elif level == 2:
        print(f"\033[32m {message}\033[0m")
    else:
        print(f"\033[47m {message}\033[0m")


global app_socketio
global app_fastapi

parser = setupArgParser()
args = parser.parse_args()

printMessage(f"Phase name:{__name__}", level=2)
thisFilename = os.path.basename(__file__)[:-3]


class ValidationErrorLoggingRoute(APIRoute):
    def get_route_handler(self) -> Callable:
        original_route_handler = super().get_route_handler()

        async def custom_route_handler(request: Request) -> Response:
            try:
                return await original_route_handler(request)
            except Exception as exc:
                print("Exception", request.url, str(exc))
                body = await request.body()
                detail = {"errors": exc.errors(), "body": body.decode()}
                raise HTTPException(status_code=422, detail=detail)

        return custom_route_handler


if __name__ == thisFilename or args.colab == True:
    printMessage(f"PHASE3:{__name__}", level=2)
    TYPE = args.t
    PORT = args.p
    CONFIG = args.c
    MODEL = args.m

    if os.getenv("EX_TB_PORT"):
        EX_TB_PORT = os.environ["EX_TB_PORT"]
        exApplitionInfo.external_tensorboard_port = int(EX_TB_PORT)

    app_fastapi = FastAPI()
    app_fastapi.router.route_class = ValidationErrorLoggingRoute
    app_fastapi.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    app_fastapi.mount(
        "/front", StaticFiles(directory="../frontend/dist", html=True), name="static")

    app_fastapi.mount(
        "/trainer", StaticFiles(directory="../frontend/dist", html=True), name="static")

    app_fastapi.mount(
        "/recorder", StaticFiles(directory="../frontend/dist", html=True), name="static")

    sio = socketio.AsyncServer(
        async_mode='asgi',
        cors_allowed_origins='*'
    )
    namespace = MyCustomNamespace('/test')
    sio.register_namespace(namespace)
    if CONFIG and MODEL:
        namespace.loadModel(CONFIG, MODEL)
    # namespace.loadWhisperModel("base")

    app_socketio = socketio.ASGIApp(
        sio,
        other_asgi_app=app_fastapi,
        static_files={
            '/assets/icons/github.svg': {
                'filename': '../frontend/dist/assets/icons/github.svg',
                'content_type': 'image/svg+xml'
            },
            '': '../frontend/dist',
            '/': '../frontend/dist/index.html',
        }
    )

    @app_fastapi.get("/api/hello")
    async def index():
        return {"result": "Index"}

    ############
    # File Uploader
    # ##########
    UPLOAD_DIR = "upload_dir"
    os.makedirs(UPLOAD_DIR, exist_ok=True)
    MODEL_DIR = "MMVC_Trainer/logs"
    os.makedirs(MODEL_DIR, exist_ok=True)

    @app_fastapi.post("/upload_file")
    async def post_upload_file(
        file: UploadFile = File(...),
        filename: str = Form(...)
    ):
        return upload_file(UPLOAD_DIR, file, filename)

    @app_fastapi.post("/load_model")
    async def post_load_model(
        modelFilename: str = Form(...),
        modelFilenameChunkNum: int = Form(...),
        configFilename: str = Form(...)
    ):
        modelFilePath = concat_file_chunks(
            UPLOAD_DIR, modelFilename, modelFilenameChunkNum, UPLOAD_DIR)
        print(f'File saved to: {modelFilePath}')
        configFilePath = os.path.join(UPLOAD_DIR, configFilename)

        namespace.loadModel(configFilePath, modelFilePath)
        return {"load": f"{modelFilePath}, {configFilePath}"}

    @app_fastapi.post("/load_model_for_train")
    async def post_load_model_for_train(
        modelGFilename: str = Form(...),
        modelGFilenameChunkNum: int = Form(...),
        modelDFilename: str = Form(...),
        modelDFilenameChunkNum: int = Form(...),
    ):
        modelGFilePath = concat_file_chunks(
            UPLOAD_DIR, modelGFilename, modelGFilenameChunkNum, MODEL_DIR)
        modelDFilePath = concat_file_chunks(
            UPLOAD_DIR, modelDFilename, modelDFilenameChunkNum, MODEL_DIR)
        return {"File saved": f"{modelGFilePath}, {modelDFilePath}"}

    @app_fastapi.post("/extract_voices")
    async def post_load_model(
        zipFilename: str = Form(...),
        zipFileChunkNum: int = Form(...),
    ):
        zipFilePath = concat_file_chunks(
            UPLOAD_DIR, zipFilename, zipFileChunkNum, UPLOAD_DIR)
        shutil.unpack_archive(zipFilePath, "MMVC_Trainer/dataset/textful/")
        return {"Zip file unpacked": f"{zipFilePath}"}

    ############
    # Voice Changer
    # ##########

    @app_fastapi.post("/test")
    async def post_test(voice: VoiceModel):
        try:
            # print("POST REQUEST PROCESSING....")
            gpu = voice.gpu
            srcId = voice.srcId
            dstId = voice.dstId
            timestamp = voice.timestamp
            prefixChunkSize = voice.prefixChunkSize
            buffer = voice.buffer
            wav = base64.b64decode(buffer)

            if wav == 0:
                samplerate, data = read("dummy.wav")
                unpackedData = data
            else:
                unpackedData = np.array(struct.unpack(
                    '<%sh' % (len(wav) // struct.calcsize('<h')), wav))
                write("logs/received_data.wav", 24000,
                      unpackedData.astype(np.int16))

            changedVoice = namespace.changeVoice(
                gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)

            changedVoiceBase64 = base64.b64encode(changedVoice).decode('utf-8')
            data = {
                "gpu": gpu,
                "srcId": srcId,
                "dstId": dstId,
                "timestamp": timestamp,
                "prefixChunkSize": prefixChunkSize,
                "changedVoiceBase64": changedVoiceBase64
            }

            json_compatible_item_data = jsonable_encoder(data)

            return JSONResponse(content=json_compatible_item_data)

        except Exception as e:
            print("REQUEST PROCESSING!!!! EXCEPTION!!!", e)
            print(traceback.format_exc())
            return str(e)
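The `/test` endpoint defined above takes base64-encoded int16 PCM in a JSON body and returns the converted audio the same way. A minimal client sketch against it (hypothetical host, port, and speaker ids; assumes the `requests` package):

```python
import base64
import struct

import numpy as np
import requests

# Hypothetical chunk of 24000 Hz int16 PCM silence.
pcm = np.zeros(24000, dtype=np.int16)
buffer = base64.b64encode(struct.pack('<%sh' % len(pcm), *pcm)).decode('utf-8')

res = requests.post("http://localhost:8080/test", json={
    "gpu": -1,              # hypothetical: run on CPU
    "srcId": 107,           # hypothetical source speaker id
    "dstId": 100,           # hypothetical target speaker id
    "timestamp": 0,
    "prefixChunkSize": 24,  # hypothetical chunk setting
    "buffer": buffer,
})

# Decode the converted audio from the response.
converted = np.frombuffer(
    base64.b64decode(res.json()["changedVoiceBase64"]), dtype=np.int16)
```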
    # Trainer REST API. * Colab seems to be able to POST only to paths directly under the top level, so this is "REST-style".

    @app_fastapi.get("/get_speakers")
    async def get_speakers():
        return mod_get_speakers()

    @app_fastapi.delete("/delete_speaker")
    async def delete_speaker(speaker: str = Form(...)):
        return mod_delete_speaker(speaker)

    @app_fastapi.get("/get_speaker_voices")
    async def get_speaker_voices(speaker: str):
        return mod_get_speaker_voices(speaker)

    @app_fastapi.get("/get_speaker_voice")
    async def get_speaker_voices(speaker: str, voice: str):
        return mod_get_speaker_voice(speaker, voice)

    @app_fastapi.get("/get_multi_speaker_setting")
    async def get_multi_speaker_setting():
        return mod_get_multi_speaker_setting()

    @app_fastapi.post("/post_multi_speaker_setting")
    async def post_multi_speaker_setting(setting: str = Form(...)):
        return mod_post_multi_speaker_setting(setting)

    @app_fastapi.get("/get_models")
    async def get_models():
        return mod_get_models()

    @app_fastapi.get("/get_model")
    async def get_model(model: str):
        return mod_get_model(model)

    @app_fastapi.delete("/delete_model")
    async def delete_model(model: str = Form(...)):
        return mod_delete_model(model)

    @app_fastapi.post("/post_pre_training")
    async def post_pre_training(batch: int = Form(...)):
        return mod_post_pre_training(batch)

    @app_fastapi.post("/post_start_training")
    async def post_start_training(enable_finetuning: bool = Form(...), GModel: str = Form(...), DModel: str = Form(...)):
        print("POST START TRAINING..")
        return mod_post_start_training(enable_finetuning, GModel, DModel)

    @app_fastapi.post("/post_stop_training")
    async def post_stop_training():
        print("POST STOP TRAINING..")
        return mod_post_stop_training()

    @app_fastapi.get("/get_related_files")
    async def get_related_files():
        return mod_get_related_files()

    @app_fastapi.get("/get_tail_training_log")
    async def get_tail_training_log(num: int):
        return mod_get_tail_training_log(num)

    @app_fastapi.get("/get_ex_application_info")
    async def get_ex_application_info():
        json_compatible_item_data = jsonable_encoder(exApplitionInfo)
        return JSONResponse(content=json_compatible_item_data)

if __name__ == '__mp_main__':
    printMessage(f"PHASE2:{__name__}", level=2)

if __name__ == '__main__':
    printMessage(f"PHASE1:{__name__}", level=2)
    TYPE = args.t
    PORT = args.p
    CONFIG = args.c
    MODEL = args.m

    if TYPE != "MMVC" and TYPE != "TRAIN":
        print("Type(-t) should be MMVC or TRAIN")
        exit(1)

    printMessage(f"Start MMVC SocketIO Server", level=0)
    printMessage(f"CONFIG:{CONFIG}, MODEL:{MODEL}", level=1)

    if args.colab == False:
        if os.getenv("EX_PORT"):
            EX_PORT = os.environ["EX_PORT"]
            printMessage(
                f"External_Port:{EX_PORT} Internal_Port:{PORT}", level=1)
        else:
            printMessage(f"Internal_Port:{PORT}", level=1)

        if os.getenv("EX_TB_PORT"):
            EX_TB_PORT = os.environ["EX_TB_PORT"]
            printMessage(f"External_TensorBoard_Port:{EX_TB_PORT}", level=1)

        if os.getenv("EX_IP"):
            EX_IP = os.environ["EX_IP"]
            printMessage(f"External_IP:{EX_IP}", level=1)

        # Create the HTTPS key/cert
        if args.https and args.httpsSelfSigned == 1:
            # HTTPS (generate a self-signed certificate)
            os.makedirs("./key", exist_ok=True)
            key_base_name = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}"
            keyname = f"{key_base_name}.key"
            certname = f"{key_base_name}.cert"
            create_self_signed_cert(certname, keyname, certargs={"Country": "JP",
                                                                 "State": "Tokyo",
                                                                 "City": "Chuo-ku",
                                                                 "Organization": "F",
                                                                 "Org. Unit": "F"}, cert_dir="./key")
            key_path = os.path.join("./key", keyname)
            cert_path = os.path.join("./key", certname)
            printMessage(
                f"protocol: HTTPS(self-signed), key:{key_path}, cert:{cert_path}", level=1)
        elif args.https and args.httpsSelfSigned == 0:
            # HTTPS
            key_path = args.httpsKey
            cert_path = args.httpsCert
            printMessage(
                f"protocol: HTTPS, key:{key_path}, cert:{cert_path}", level=1)
        else:
            # HTTP
            printMessage(f"protocol: HTTP", level=1)

        # Show the addresses
        if args.https == 1:
            printMessage(
                f"open https://<IP>:<PORT>/ with your browser.", level=0)
        else:
            printMessage(
                f"open http://<IP>:<PORT>/ with your browser.", level=0)

        if TYPE == "MMVC":
            path = ""
        else:
            path = "trainer"
        if "EX_PORT" in locals() and "EX_IP" in locals() and args.https == 1:
            printMessage(f"In many cases it is one of the following", level=1)
            printMessage(f"https://localhost:{EX_PORT}/{path}", level=1)
            for ip in EX_IP.strip().split(" "):
                printMessage(f"https://{ip}:{EX_PORT}/{path}", level=1)
        elif "EX_PORT" in locals() and "EX_IP" in locals() and args.https == 0:
            printMessage(f"In many cases it is one of the following", level=1)
            printMessage(f"http://localhost:{EX_PORT}/{path}", level=1)

    # Start the server
    if args.https:
        # Start the HTTPS server
        uvicorn.run(
            f"{os.path.basename(__file__)[:-3]}:app_socketio",
            host="0.0.0.0",
            port=int(PORT),
            reload=True,
            ssl_keyfile=key_path,
            ssl_certfile=cert_path,
            log_level="critical"
        )
    else:
        # Start the HTTP server
        if args.colab == True:
            uvicorn.run(
                f"{os.path.basename(__file__)[:-3]}:app_fastapi",
                host="0.0.0.0",
                port=int(PORT),
                log_level="critical"
            )
        else:
            uvicorn.run(
                f"{os.path.basename(__file__)[:-3]}:app_socketio",
                host="0.0.0.0",
                port=int(PORT),
                reload=True,
                log_level="critical"
            )
@@ -1,43 +0,0 @@ (deleted file: bundled MIT licenses)
MIT License

Copyright (c) 2022 Isle Tennos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

MIT License

Copyright (c) 2021 Jaehyeon Kim

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,220 +0,0 @@ (deleted file: MMVC_Trainer README)
MMVC_Trainer
====

A model-training tool for an AI-based real-time voice changer.

## Description
This repository is for training the models used by "MMVC (RealTime-Many to Many Voice Conversion)",
an AI-based real-time voice changer.
Using Google Colaboratory, the machine-learning training phase can be run easily and without depending on your local environment.
## MMVC_Client
The client software that actually runs MMVC:
https://github.com/isletennos/MMVC_Client
## Concept
"Simple", "for anyone", "into any voice you like", "in real time"
## Demo
In production (v1.3.0.0)
https://www.nicovideo.jp/watch/sm40386035 (v1.2.0.0)

## MMVC terms of use and distribution of audio sources for MMVC (2022/08/10)
The terms of use for this software basically follow the MIT license.
1. Anyone is free to use this software in any way: copying, distribution, adding modifications, redistributing modified versions, commercial use, paid sales, and so on.
2. When using it on a platform where a license notice is possible, please include one of the credits below.
   **If a license notice is not possible, such as when used in VRChat, no notice is required.**
   (The author would appreciate it if you used pattern 2 where possible.)
3. The author makes no warranty whatsoever for this software,
   and accepts no responsibility for any problems that arise from using it.
4. Any audio data used with this software must be used with the permission of the owner of the original audio data,
   or within the terms of use of its distributor.

### Terms of use and download locations for the officially distributed MMVC voice data
Separately from the MMVC terms of use, using the voice data below requires agreeing to the terms of use of each voice-library provider.
* The companies and organizations below have given special permission to modify and redistribute their voice data for use with this software.
#### SSS LLC.
[[Terms of use](https://zunko.jp/guideline.html)][[Zundamon voice data](https://drive.google.com/file/d/1h8Ajyvoig7Hl3LSSt2vYX0sUHX3JDF3R/view?usp=sharing)] * The same voice data as the one bundled with this software
[[Terms of use](https://zunko.jp/guideline.html)][[Kyushu Sora voice data](https://drive.google.com/file/d/1MXfMRG_sjbsaLihm7wEASG2PwuCponZF/view?usp=sharing)]
[[Terms of use](https://zunko.jp/guideline.html)][[Shikoku Metan voice data](https://drive.google.com/file/d/1iCrpzhqXm-0YdktOPM8M1pMtgQIDF3r4/view?usp=sharing)]
#### Kasukabe Tsumugi project
[[Terms of use](https://tsumugi-official.studio.site/rule)][[Kasukabe Tsumugi voice data](https://drive.google.com/file/d/14zE0F_5ZCQWXf6m6SUPF5Y3gpL6yb7zk/view?usp=sharing)]

### About license notices
Only when using the characters Zundamon / Shikoku Metan / Kyushu Sora / Kasukabe Tsumugi, add a note that makes clear which tool produced the audio, e.g.
```
MMVC: Zundamon
MMVC: Zundamon / Shikoku Metan
```
together with one of the license patterns below.
Here too, **if a license notice is not possible, such as when used in VRChat, no notice is required.**

License pattern 1
```
Copyright (c) 2022 Isle.Tennos
Released under the MIT license
https://opensource.org/licenses/mit-license.php
```

License pattern 2
```
MMVCv1.x.x.x (version in use)
Copyright (c) 2022 Isle.Tennos
Released under the MIT license
https://opensource.org/licenses/mit-license.php
git: https://github.com/isletennos/MMVC_Trainer
community (discord): https://discord.gg/PgspuDSTEc
```
## Requirement
- A Google account
## Install
Download this repository, extract it, and upload the extracted directory to your Google Drive.
## Usage
### Tutorial: becoming Zundamon
This tutorial uses the Zundamon voice data (SSS LLC.),
so in addition to the MMVC terms of use you must comply with the [[Zundamon terms of use](https://zunko.jp/guideline.html)].
#### Ph1. Recording your own voice and placing the audio data
1. Record audio data of your own voice.
   Use the JVS corpus, the ITA corpus, or similar as a script and read out about 100 sentences.
   The recorded audio must be **24000 Hz, 16 bit, 1 ch** (see the resampling sketch after the directory layout below).
   * Here is the ITA corpus with its text split up for MMVC; feel free to use it:
   https://drive.google.com/file/d/14oXoQqLxRkP8NJK8qMYGee1_q2uEED1z/view?usp=sharing

2. Place the audio and text data in dataset/textful/000_myvoice.
   The final directory layout looks like this:
```
dataset
├── textful
│   ├── 000_myvoice
│   │   ├── text
│   │   │   ├── s_voice_001.txt
│   │   │   ├── s_voice_002.txt
│   │   │   ├── ...
│   │   └── wav
│   │       ├── s_voice_001.wav
│   │       ├── s_voice_002.wav
│   │       ├── ...
│   │── 001_target
│   │   ├── text
│   │   └── wav
│   │
│   └── 1205_zundamon
│       ├── text
│       │   ├── t_voice_001.txt
│       │   ├── t_voice_002.txt
│       │   ├── ...
│       └── wav
│           ├── t_voice_001.wav
│           ├── t_voice_002.wav
│           ├── ...
│
└── textless
```
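Regarding the **24000 Hz, 16 bit, 1 ch** requirement in step 1: a minimal resampling sketch, assuming the `librosa` and `soundfile` packages and hypothetical file names:

```python
import librosa
import soundfile as sf

# Load a hypothetical recording, downmixing to mono and resampling to 24 kHz.
audio, sr = librosa.load("raw_recording.wav", sr=24000, mono=True)

# Write it back as 16-bit PCM, the format the trainer expects.
sf.write("s_voice_001.wav", audio, 24000, subtype="PCM_16")
```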
#### Ph2. How to train the model
1. Download "G_180000.pth" and "D_180000.pth" from the link below:
   https://drive.google.com/drive/folders/1vXdL1zSrgsuyACMkiTUtVbHgpMSA1Y5I?usp=sharing
2. Place "G_180000.pth" and "D_180000.pth" in fine_model. **(This step is easy to forget, so watch out!)**
3. Run "Create_Configfile_zundamon.ipynb" from the notebook directory on Google Colab to create the config files needed for training.
4. In the train_config_zundamon.json created under configs, tune the following two settings:

   - "eval_interval"
     The interval at which the model is saved.
   - "batch_size"
     Adjust this to the GPU allocated by Colab.

   Optimize both for your environment; if you are unsure, leaving them as-is is fine.

5. Run "Train_MMVC.ipynb" from the notebook directory on Google Colab.
   Models are written to logs/.

#### Ph3. Evaluating the trained model
1. Run "MMVC_Interface.ipynb" from the notebook directory on Google Colab.
### Becoming the voice of a character you like
#### Ph1. Recording your own voice, placing the audio data, and placing the target audio data
1. Prepare audio data of your own voice with matching transcripts, plus audio data of the voice you want to convert to with matching transcripts.
   For both your own voice and the target voice, **24000 Hz, 16 bit, 1 ch** audio is strongly recommended.
2. Place the audio and text data in the directory layout below.
   There should be two directories directly under textful.
   (The 1205_zundamon directory may be omitted.)

```
dataset
├── textful
│   ├── 000_myvoice
│   │   ├── text
│   │   │   ├── s_voice_001.txt
│   │   │   ├── s_voice_002.txt
│   │   │   ├── ...
│   │   └── wav
│   │       ├── s_voice_001.wav
│   │       ├── s_voice_002.wav
│   │       ├── ...
│   │── 001_target
│   │   ├── text
│   │   │   ├── t_voice_001.txt
│   │   │   ├── t_voice_002.txt
│   │   │   ├── ...
│   │   └── wav
│   │       ├── t_voice_001.wav
│   │       ├── t_voice_002.wav
│   │       ├── ...
│   └── 1205_zundamon
│       ├── text
│       │   ├── t_voice_001.txt
│       │   ├── t_voice_002.txt
│       │   ├── ...
│       └── wav
│           ├── t_voice_001.wav
│           ├── t_voice_002.wav
│           ├── ...
│
└── textless
```
#### Ph2. How to train the model
Same as "Tutorial: becoming Zundamon, Ph2.", so omitted here.
#### Ph3. Evaluating the trained model
Same as "Tutorial: becoming Zundamon, Ph3.", so omitted here.
## Tutorial videos by volunteers
### v1.2.1.x
| Preparation, part 1 | [Niconico](https://www.nicovideo.jp/watch/sm40415108) | [YouTube](https://www.youtube.com/watch?v=gq1Hpn5CARw&ab_channel=popi) |
|:--------------|:------------|:------------|
| Audio needing correction | [Niconico](https://www.nicovideo.jp/watch/sm40420683) | [YouTube](https://youtu.be/NgzC7Nuk6gg) |
| Preparation, part 2 | [Niconico](https://www.nicovideo.jp/watch/sm40445164) | [YouTube](https://youtu.be/m4Jew7sTs9w) |
| Training, part 1 | [Niconico](https://www.nicovideo.jp/watch/sm40467662) | [YouTube](https://youtu.be/HRSPEy2jUvg) |
| Training, part 2 | [Niconico](https://www.nicovideo.jp/watch/sm40473168) | [YouTube](https://youtu.be/zQW59vrOSuA) |
| Training, final | [Niconico](https://www.nicovideo.jp/watch/sm40490554) | [YouTube](https://www.youtube.com/watch?v=uB3YfdKzo-g&ab_channel=popi) |
| Real-time conversion | [Niconico](https://www.nicovideo.jp/watch/sm40415108) | [YouTube](https://youtu.be/Al5DFCvKLFA) |
| Questions | [Niconico](https://www.nicovideo.jp/watch/sm40599514) | [YouTube](https://youtu.be/aGBcqu5M6-c) |
| Advanced: Kyushu Sora | [Niconico](https://www.nicovideo.jp/watch/sm40647601) | [YouTube](https://youtu.be/MEXKZoHVd-A) |
| Advanced: Otomachi Una | [Niconico](https://www.nicovideo.jp/watch/sm40714406) | [YouTube](https://youtu.be/JDMlRz-PkSE) |

## Q&A
Please see the site below:
https://mmvc.readthedocs.io/ja/latest/index.html
## MMVC community server (Discord)
A community server for everything MMVC: the latest development news, questions, ways to use MMVC, and so on.
https://discord.gg/PgspuDSTEc

## Special thanks
- JVS (Japanese versatile speech) corpus
  contributors: 高道慎之介 / 三井健太郎 / 齋藤佑樹 / 郡山知樹 / 丹治尚子 / 猿渡洋
  https://sites.google.com/site/shinnosuketakamichi/research-topics/jvs_corpus

- ITA corpus multimodal database
  contributors: 金井郁也 / 千葉隆壱 / 齊藤剛史 / 森勢将雅 / 小口純矢 / 能勢隆 / 尾上真惟子 / 小田恭央
  CharacterVoice: 東北イタコ (木戸衣吹) / ずんだもん (伊藤ゆいな) / 四国めたん (田中小雪) / 九州そら (西田望見)
  https://zunko.jp/multimodal_dev/login.php

- Tsukuyomi-chan corpus
  contributor: 夢前黎
  CharacterVoice: つくよみちゃん (夢前黎)
  https://tyc.rei-yumesaki.net/material/corpus/

## Reference
https://arxiv.org/abs/2106.06103
https://github.com/jaywalnut310/vits

## Author
Isle Tennos
Twitter: https://twitter.com/IsleTennos
@@ -1,303 +0,0 @@ (deleted file: VITS attention modules)
import copy
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

import commons
import modules
from modules import LayerNorm


class Encoder(nn.Module):
    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for i in range(self.n_layers):
            self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            y = self.attn_layers[i](x, x, attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class Decoder(nn.Module):
    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for i in range(self.n_layers):
            self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for i in range(self.n_layers):
            y = self.self_attn_layers[i](x, x, self_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_0[i](x + y)

            y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
            y = self.drop(y)
            x = self.norm_layers_1[i](x + y)

            y = self.ffn_layers[i](x, x_mask)
            y = self.drop(y)
            x = self.norm_layers_2[i](x + y)
        x = x * x_mask
        return x


class MultiHeadAttention(nn.Module):
    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s, t_t = (*key.size(), query.size(2))
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
        return x_final
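The pad-and-reshape trick in `_relative_position_to_absolute_position` above is easiest to follow on concrete shapes. A standalone sketch of the same steps on a hypothetical tiny tensor (not part of the deleted file):

```python
import torch
import torch.nn.functional as F

# Hypothetical scores for batch=1, heads=1, length=3: one row of 2*3-1 = 5
# relative positions (-2..+2) for each query position.
b, h, l = 1, 1, 3
x = torch.arange(b * h * l * (2 * l - 1), dtype=torch.float32).view(b, h, l, 2 * l - 1)

# Same steps as the method: pad one column, flatten, pad l-1, reshape, slice.
x = F.pad(x, (0, 1))                    # [b, h, l, 2l]
x_flat = x.view(b, h, l * 2 * l)        # flatten the last two dims
x_flat = F.pad(x_flat, (0, l - 1))      # extra elements to allow the reshape
x_final = x_flat.view(b, h, l + 1, 2 * l - 1)[:, :, :l, l - 1:]

print(x_final.shape)  # torch.Size([1, 1, 3, 3]): absolute [query, key] scores
```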
def _absolute_position_to_relative_position(self, x):
|
|
||||||
"""
|
|
||||||
x: [b, h, l, l]
|
|
||||||
ret: [b, h, l, 2*l-1]
|
|
||||||
"""
|
|
||||||
batch, heads, length, _ = x.size()
|
|
||||||
# padd along column
|
|
||||||
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
|
|
||||||
x_flat = x.view([batch, heads, length**2 + length*(length -1)])
|
|
||||||
# add 0's in the beginning that will skew the elements after reshape
|
|
||||||
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
|
|
||||||
x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
|
|
||||||
return x_final
|
|
||||||
|
|
||||||
def _attention_bias_proximal(self, length):
|
|
||||||
"""Bias for self-attention to encourage attention to close positions.
|
|
||||||
Args:
|
|
||||||
length: an integer scalar.
|
|
||||||
Returns:
|
|
||||||
a Tensor with shape [1, 1, length, length]
|
|
||||||
"""
|
|
||||||
r = torch.arange(length, dtype=torch.float32)
|
|
||||||
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
|
|
||||||
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
|
|
||||||
|
|
||||||
|
|
||||||
class FFN(nn.Module):
    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        if causal:
            self.padding = self._causal_padding
        else:
            self.padding = self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            x = x * torch.sigmoid(1.702 * x)
        else:
            x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(self.padding(x * x_mask))
        return x * x_mask

    def _causal_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = self.kernel_size - 1
        pad_r = 0
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x

    def _same_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = (self.kernel_size - 1) // 2
        pad_r = self.kernel_size // 2
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x
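As a quick shape check on the FFN block: with the default non-causal padding, both convolutions keep the time axis intact. A hedged sketch (channel sizes borrowed from the model config further down):

import torch

ffn = FFN(in_channels=192, out_channels=192, filter_channels=768, kernel_size=3)
x = torch.randn(2, 192, 50)    # [batch, channels, frames]
x_mask = torch.ones(2, 1, 50)
print(ffn(x, x_mask).shape)    # torch.Size([2, 192, 50])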
@@ -1,161 +0,0 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size * dilation - dilation) / 2)


def convert_pad_shape(pad_shape):
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)"""
    kl = (logs_q - logs_p) - 0.5
    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
    return kl


def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniform_samples))


def rand_gumbel_like(x):
    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
    return g


def slice_segments(x, ids_str, segment_size=4):
    ret = torch.zeros_like(x[:, :, :segment_size])
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size + 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str

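# --- Editor's note: rand_slice_segments() above is how training crops a random window
# from each utterance. A small sketch with made-up shapes:
import torch

x = torch.randn(2, 80, 100)                # [batch, channels, frames]
lengths = torch.tensor([100, 60])
segments, ids_str = rand_slice_segments(x, lengths, segment_size=32)
print(segments.shape, ids_str)             # torch.Size([2, 80, 32]) plus the start frames
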
def get_timing_signal_1d(
        length, channels, min_timescale=1.0, max_timescale=1.0e4):
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (num_timescales - 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)


def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


def subsequent_mask(length):
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def convert_pad_shape(pad_shape):
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def shift_1d(x):
    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
    return x


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    device = duration.device

    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path

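# --- Editor's note: generate_path() expands per-token durations into a hard monotonic
# alignment. A worked example with durations [2, 1, 3] over 6 output frames:
import torch

duration = torch.tensor([[[2., 1., 3.]]])  # [b=1, 1, t_x=3]
mask = torch.ones(1, 1, 6, 3)              # [b, 1, t_y=6, t_x=3]
path = generate_path(duration, mask)
print(path[0, 0].long())  # frames 0-1 -> token 0, frame 2 -> token 1, frames 3-5 -> token 2
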
def clip_grad_value_(parameters, clip_value, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1. / norm_type)
    return total_norm
@@ -1,110 +0,0 @@
{
  "train": {
    "log_interval": 1000,
    "eval_interval": 2000,
    "seed": 1234,
    "epochs": 10000,
    "learning_rate": 0.0002,
    "betas": [0.8, 0.99],
    "eps": 1e-09,
    "batch_size": 10,
    "fp16_run": true,
    "lr_decay": 0.999875,
    "segment_size": 8192,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "c_kl": 1.0
  },
  "data": {
    "training_files": "filelists/dec_not_propagation_label_and_change_melspec_textful.txt",
    "validation_files": "filelists/dec_not_propagation_label_and_change_melspec_textful_val.txt",
    "training_files_notext": "filelists/dec_not_propagation_label_and_change_melspec_textless.txt",
    "validation_files_notext": "filelists/dec_not_propagation_label_and_change_melspec_val_textless.txt",
    "text_cleaners": ["japanese_cleaners"],
    "max_wav_value": 32768.0,
    "sampling_rate": 24000,
    "filter_length": 512,
    "hop_length": 128,
    "win_length": 512,
    "n_mel_channels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "add_blank": true,
    "n_speakers": 110,
    "cleaned_text": false
  },
  "model": {
    "inter_channels": 192,
    "hidden_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 6,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [8, 4, 2, 2],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16, 16, 8, 8],
    "n_layers_q": 3,
    "use_spectral_norm": false,
    "n_flow": 8,
    "gin_channels": 256
  },
  "others": {
    "os_type": "linux"
  },
  "augmentation": {
    "enable": true,
    "gain_p": 0.5,
    "min_gain_in_db": -10,
    "max_gain_in_db": 10,
    "time_stretch_p": 0.5,
    "min_rate": 0.75,
    "max_rate": 1.25,
    "pitch_shift_p": 0.0,
    "min_semitones": -4.0,
    "max_semitones": 4.0,
    "add_gaussian_noise_p": 0.0,
    "min_amplitude": 0.001,
    "max_amplitude": 0.04,
    "frequency_mask_p": 0.0
  }
}
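For reference, the STFT settings above fix the model's frame rate: a 128-sample hop at a 24 kHz sampling rate gives 187.5 spectrogram frames per second. A small sketch reading those fields back (the config path is hypothetical):

import json

with open("configs/baseconfig.json", "r", encoding="utf-8") as f:
    hps = json.load(f)

sr = hps["data"]["sampling_rate"]  # 24000
hop = hps["data"]["hop_length"]    # 128
print(sr / hop, "spectrogram frames per second")  # 187.5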
@@ -1,343 +0,0 @@
import glob
import sys
import os
import argparse
import pyopenjtalk
import json


def mozi2phone(mozi):
    text = pyopenjtalk.g2p(mozi)
    text = "sil " + text + " sil"
    text = text.replace(' ', '-')
    return text

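# --- Editor's note: mozi2phone() wraps pyopenjtalk's grapheme-to-phoneme output in
# silence markers and joins with hyphens. Roughly (the exact phoneme string depends
# on the installed dictionary):
import pyopenjtalk

print(pyopenjtalk.g2p("こんにちは"))  # e.g. "k o N n i ch i w a"
print(mozi2phone("こんにちは"))       # e.g. "sil-k-o-N-n-i-ch-i-w-a-sil"
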
def create_json(filename, num_speakers, sr, config_path):
    if os.path.exists(config_path):
        with open(config_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        data['data']['training_files'] = 'filelists/' + filename + '_textful.txt'
        data['data']['validation_files'] = 'filelists/' + filename + '_textful_val.txt'
        data['data']['training_files_notext'] = 'filelists/' + filename + '_textless.txt'
        data['data']['validation_files_notext'] = 'filelists/' + filename + '_val_textless.txt'
        data['data']['sampling_rate'] = sr
        data['data']['n_speakers'] = num_speakers

        with open("./configs/" + filename + ".json", 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)


def create_dataset(filename):
    speaker_id = 107
    textful_dir_list = glob.glob("dataset/textful/*")
    textless_dir_list = glob.glob("dataset/textless/*")
    textful_dir_list.sort()
    textless_dir_list.sort()
    Correspondence_list = list()
    output_file_list = list()
    output_file_list_val = list()
    output_file_list_textless = list()
    output_file_list_val_textless = list()
    for d in textful_dir_list:
        wav_file_list = glob.glob(d + "/wav/*.wav")
        lab_file_list = glob.glob(d + "/text/*.txt")
        wav_file_list.sort()
        lab_file_list.sort()
        if len(wav_file_list) == 0:
            continue
        counter = 0
        for lab, wav in zip(lab_file_list, wav_file_list):
            with open(lab, 'r', encoding="utf-8") as f:
                mozi = f.read().split("\n")
            print(str(mozi))
            test = mozi2phone(str(mozi))
            print(test)
            print(wav + "|" + str(speaker_id) + "|" + test)
            if counter % 10 != 0:
                output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
            else:
                output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")
        speaker_id += 1
        if speaker_id > 108:
            break

    for d in textless_dir_list:
        wav_file_list = glob.glob(d + "/*.wav")
        wav_file_list.sort()
        counter = 0
        for wav in wav_file_list:
            print(wav + "|" + str(speaker_id) + "|a")
            if counter % 10 != 0:
                output_file_list_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            else:
                output_file_list_val_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")
        speaker_id += 1

    with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list)
    with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val)
    with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_textless)
    with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val_textless)
    with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(Correspondence_list)
    return speaker_id


def create_dataset_zundamon(filename):
    textful_dir_list = glob.glob("dataset/textful/*")
    textless_dir_list = glob.glob("dataset/textless/*")
    textful_dir_list.sort()
    textless_dir_list.sort()
    Correspondence_list = list()
    output_file_list = list()
    output_file_list_val = list()
    output_file_list_textless = list()
    output_file_list_val_textless = list()
    # paths
    my_path = "dataset/textful/00_myvoice"
    zundamon_path = "dataset/textful/1205_zundamon"

    # set list of wav and text
    # myvoice
    speaker_id = 107
    d = my_path
    wav_file_list = glob.glob(d + "/wav/*.wav")
    lab_file_list = glob.glob(d + "/text/*.txt")
    wav_file_list.sort()
    lab_file_list.sort()
    if len(wav_file_list) == 0:
        print("Error: no audio data in " + d + "/wav")
        exit()
    counter = 0
    for lab, wav in zip(lab_file_list, wav_file_list):
        with open(lab, 'r', encoding="utf-8") as f:
            mozi = f.read().split("\n")
        print(str(mozi))
        test = mozi2phone(str(mozi))
        print(test)
        print(wav + "|" + str(speaker_id) + "|" + test)
        if counter % 10 != 0:
            output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        else:
            output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        counter += 1
    Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")

    speaker_id = 100
    d = zundamon_path
    wav_file_list = glob.glob(d + "/wav/*.wav")
    lab_file_list = glob.glob(d + "/text/*.txt")
    wav_file_list.sort()
    lab_file_list.sort()
    if len(wav_file_list) == 0:
        print("Error: no audio data in " + d + "/wav")
        exit()
    counter = 0
    for lab, wav in zip(lab_file_list, wav_file_list):
        with open(lab, 'r', encoding="utf-8") as f:
            mozi = f.read().split("\n")
        print(str(mozi))
        test = mozi2phone(str(mozi))
        print(test)
        print(wav + "|" + str(speaker_id) + "|" + test)
        if counter % 10 != 0:
            output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        else:
            output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        counter += 1
    Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")

    for d in textless_dir_list:
        wav_file_list = glob.glob(d + "/*.wav")
        wav_file_list.sort()
        counter = 0
        for wav in wav_file_list:
            print(wav + "|" + str(speaker_id) + "|a")
            if counter % 10 != 0:
                output_file_list_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            else:
                output_file_list_val_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")
        speaker_id += 1

    with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list)
    with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val)
    with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_textless)
    with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val_textless)
    with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(Correspondence_list)
    return 110


def create_dataset_character(filename, tid):
    textful_dir_list = glob.glob("dataset/textful/*")
    textless_dir_list = glob.glob("dataset/textless/*")
    textful_dir_list.sort()
    textless_dir_list.sort()
    Correspondence_list = list()
    output_file_list = list()
    output_file_list_val = list()
    output_file_list_textless = list()
    output_file_list_val_textless = list()
    # paths
    my_path = "dataset/textful/00_myvoice"
    zundamon_path = "dataset/textful/01_target"

    # set list of wav and text
    # myvoice
    speaker_id = 107
    d = my_path
    wav_file_list = glob.glob(d + "/wav/*.wav")
    lab_file_list = glob.glob(d + "/text/*.txt")
    wav_file_list.sort()
    lab_file_list.sort()
    if len(wav_file_list) == 0:
        print("Error: no audio data in " + d + "/wav")
        exit()
    counter = 0
    for lab, wav in zip(lab_file_list, wav_file_list):
        with open(lab, 'r', encoding="utf-8") as f:
            mozi = f.read().split("\n")
        print(str(mozi))
        test = mozi2phone(str(mozi))
        print(test)
        print(wav + "|" + str(speaker_id) + "|" + test)
        if counter % 10 != 0:
            output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        else:
            output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        counter += 1
    Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")

    speaker_id = tid
    d = zundamon_path
    wav_file_list = glob.glob(d + "/wav/*.wav")
    lab_file_list = glob.glob(d + "/text/*.txt")
    wav_file_list.sort()
    lab_file_list.sort()
    if len(wav_file_list) == 0:
        print("Error: no audio data in " + d + "/wav")
        exit()
    counter = 0
    for lab, wav in zip(lab_file_list, wav_file_list):
        with open(lab, 'r', encoding="utf-8") as f:
            mozi = f.read().split("\n")
        print(str(mozi))
        test = mozi2phone(str(mozi))
        print(test)
        print(wav + "|" + str(speaker_id) + "|" + test)
        if counter % 10 != 0:
            output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        else:
            output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        counter += 1
    Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")

    for d in textless_dir_list:
        wav_file_list = glob.glob(d + "/*.wav")
        wav_file_list.sort()
        counter = 0
        for wav in wav_file_list:
            print(wav + "|" + str(speaker_id) + "|a")
            if counter % 10 != 0:
                output_file_list_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            else:
                output_file_list_val_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")
        speaker_id += 1

    with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list)
    with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val)
    with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_textless)
    with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val_textless)
    with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(Correspondence_list)
    return 110


def create_dataset_multi_character(filename, file_path):
    Correspondence_list = list()
    textless_dir_list = glob.glob("dataset/textless/*")
    textless_dir_list.sort()
    output_file_list = list()
    output_file_list_val = list()
    output_file_list_textless = list()
    output_file_list_val_textless = list()
    with open(file_path, "r") as f:
        for line in f.readlines():
            target_dir, sid = line.split("|")
            sid = sid.rstrip('\n')
            wav_file_list = glob.glob("dataset/textful/" + target_dir + "/wav/*.wav")
            lab_file_list = glob.glob("dataset/textful/" + target_dir + "/text/*.txt")
            wav_file_list.sort()
            lab_file_list.sort()
            if len(wav_file_list) == 0:
                print("Error: no audio data in " + target_dir + "/wav")
                exit()
            counter = 0
            for lab, wav in zip(lab_file_list, wav_file_list):
                with open(lab, 'r', encoding="utf-8") as f_text:
                    mozi = f_text.read().split("\n")
                print(str(mozi))
                test = mozi2phone(str(mozi))
                print(test)
                print(wav + "|" + str(sid) + "|" + test)
                if counter % 10 != 0:
                    output_file_list.append(wav + "|" + str(sid) + "|" + test + "\n")
                else:
                    output_file_list_val.append(wav + "|" + str(sid) + "|" + test + "\n")
                counter += 1
            Correspondence_list.append(str(sid) + "|" + target_dir + "\n")

    with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list)
    with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val)
    with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_textless)
    with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val_textless)
    with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(Correspondence_list)
    return 110

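# --- Editor's note: create_dataset_multi_character() above expects the --multi_target
# file to contain one "directory|speaker_id" pair per line, with directories resolved
# under dataset/textful/. Hypothetical example contents (names and ids illustrative):
#
#   00_myvoice|107
#   1205_zundamon|100
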
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', type=str, required=True,
                        help='base name for the generated filelists and config')
    parser.add_argument('-s', '--sr', type=int, default=24000,
                        help='sampling rate (default = 24000)')
    parser.add_argument('-t', '--target', type=int, default=9999,
                        help='pre-trained target id (zundamon = 100, sora = 101, methane = 102, tsumugi = 103)')
    parser.add_argument('-m', '--multi_target', type=str, default=None,
                        help='path to a file listing "directory|speaker_id" pairs for multi-speaker datasets')
    parser.add_argument('-c', '--config', type=str, default="./configs/baseconfig.json",
                        help='JSON file for configuration')
    args = parser.parse_args()
    filename = args.filename
    print(filename)
    if args.multi_target is not None:
        n_spk = create_dataset_multi_character(filename, args.multi_target)
    elif args.target != 9999 and args.target == 100:
        n_spk = create_dataset_zundamon(filename)
    elif args.target != 9999:
        n_spk = create_dataset_character(filename, args.target)
    else:
        n_spk = create_dataset(filename)

    create_json(filename, n_spk, args.sr, args.config)


if __name__ == '__main__':
    main()

@@ -1,305 +0,0 @@
import glob
import sys
import os
import argparse
import pyopenjtalk
import json


def mozi2phone(mozi):
    text = pyopenjtalk.g2p(mozi)
    text = "sil " + text + " sil"
    text = text.replace(' ', '-')
    return text


def create_json(filename, num_speakers, sr, config_path):
    if os.path.exists(config_path):
        with open(config_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        data['data']['training_files'] = 'filelists/' + filename + '_textful.txt'
        data['data']['validation_files'] = 'filelists/' + filename + '_textful_val.txt'
        data['data']['training_files_notext'] = 'filelists/' + filename + '_textless.txt'
        data['data']['validation_files_notext'] = 'filelists/' + filename + '_val_textless.txt'
        data['data']['sampling_rate'] = sr
        data['data']['n_speakers'] = num_speakers

        with open("./configs/" + filename + ".json", 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)


def create_dataset(filename, my_sid):
    speaker_id = my_sid
    textful_dir_list = glob.glob("dataset/textful/*")
    textless_dir_list = glob.glob("dataset/textless/*")
    textful_dir_list.sort()
    textless_dir_list.sort()
    Correspondence_list = list()
    output_file_list = list()
    output_file_list_val = list()
    output_file_list_textless = list()
    output_file_list_val_textless = list()
    for d in textful_dir_list:
        wav_file_list = glob.glob(d + "/wav/*.wav")
        lab_file_list = glob.glob(d + "/text/*.txt")
        wav_file_list.sort()
        lab_file_list.sort()
        if len(wav_file_list) == 0:
            continue
        counter = 0
        for lab, wav in zip(lab_file_list, wav_file_list):
            with open(lab, 'r', encoding="utf-8") as f:
                mozi = f.read().split("\n")
            print(str(mozi))
            test = mozi2phone(str(mozi))
            print(test)
            print(wav + "|" + str(speaker_id) + "|" + test)
            if counter % 10 != 0:
                output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
            else:
                output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")
        speaker_id += 1
        if speaker_id > 108:
            break

    for d in textless_dir_list:
        wav_file_list = glob.glob(d + "/*.wav")
        wav_file_list.sort()
        counter = 0
        for wav in wav_file_list:
            print(wav + "|" + str(speaker_id) + "|a")
            if counter % 10 != 0:
                output_file_list_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            else:
                output_file_list_val_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")
        speaker_id += 1

    with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list)
    with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val)
    with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_textless)
    with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val_textless)
    with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(Correspondence_list)
    return speaker_id + 1


# NOTE: this second definition shadows the create_dataset() above; only this
# version is actually reachable from main().
def create_dataset(filename, my_sid):
    textful_dir_list = glob.glob("dataset/textful/*")
    textless_dir_list = glob.glob("dataset/textless/*")
    textful_dir_list.sort()
    textless_dir_list.sort()
    Correspondence_list = list()
    output_file_list = list()
    output_file_list_val = list()
    output_file_list_textless = list()
    output_file_list_val_textless = list()
    # paths
    my_path = "dataset/textful/00_myvoice"
    target_path = "dataset/textful/01_target"
    print("myvoice : {}".format(str(os.path.isdir(my_path))))
    print("target_path : {}".format(str(os.path.isdir(target_path))))

    # set list of wav and text
    # myvoice
    speaker_id = my_sid
    d = my_path
    wav_file_list = glob.glob(d + "/wav/*.wav")
    lab_file_list = glob.glob(d + "/text/*.txt")
    wav_file_list.sort()
    lab_file_list.sort()
    if len(wav_file_list) == 0:
        print("Error: no audio data in " + d + "/wav")
        exit()
    if len(lab_file_list) == 0:
        print("Error: no text data in " + d + "/text")
        exit()
    counter = 0

    for lab, wav in zip(lab_file_list, wav_file_list):
        with open(lab, 'r', encoding="utf-8") as f:
            mozi = f.read().split("\n")
        print(str(mozi))
        test = mozi2phone(str(mozi))
        print(test)
        print(wav + "|" + str(speaker_id) + "|" + test)
        if counter % 10 != 0:
            output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        else:
            output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        counter += 1
    Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")

    speaker_id = 108
    d = target_path
    wav_file_list = glob.glob(d + "/wav/*.wav")
    lab_file_list = glob.glob(d + "/text/*.txt")
    wav_file_list.sort()
    lab_file_list.sort()
    if len(wav_file_list) == 0:
        print("Error: no audio data in " + d + "/wav")
        exit()
    counter = 0
    for lab, wav in zip(lab_file_list, wav_file_list):
        with open(lab, 'r', encoding="utf-8") as f:
            mozi = f.read().split("\n")
        print(str(mozi))
        test = mozi2phone(str(mozi))
        print(test)
        print(wav + "|" + str(speaker_id) + "|" + test)
        if counter % 10 != 0:
            output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        else:
            output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        counter += 1
    Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")

    for d in textless_dir_list:
        wav_file_list = glob.glob(d + "/*.wav")
        wav_file_list.sort()
        counter = 0
        for wav in wav_file_list:
            print(wav + "|" + str(speaker_id) + "|a")
            if counter % 10 != 0:
                output_file_list_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            else:
                output_file_list_val_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")
        speaker_id += 1

    with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list)
    with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val)
    with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_textless)
    with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val_textless)
    with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(Correspondence_list)
    return 110


def create_dataset_zundamon(filename, my_sid):
    textful_dir_list = glob.glob("dataset/textful/*")
    textless_dir_list = glob.glob("dataset/textless/*")
    textful_dir_list.sort()
    textless_dir_list.sort()
    Correspondence_list = list()
    output_file_list = list()
    output_file_list_val = list()
    output_file_list_textless = list()
    output_file_list_val_textless = list()
    # paths
    my_path = "dataset/textful/00_myvoice"
    zundamon_path = "dataset/textful/1205_zundamon"
    print("myvoice : {}".format(str(os.path.isdir(my_path))))
    print("zundamon_path : {}".format(str(os.path.isdir(zundamon_path))))

    # set list of wav and text
    # myvoice
    speaker_id = my_sid
    d = my_path
    wav_file_list = glob.glob(d + "/wav/*.wav")
    lab_file_list = glob.glob(d + "/text/*.txt")
    wav_file_list.sort()
    lab_file_list.sort()
    if len(wav_file_list) == 0:
        print("Error: no audio data in " + d + "/wav")
        exit()
    if len(lab_file_list) == 0:
        print("Error: no text data in " + d + "/text")
        exit()
    counter = 0

    for lab, wav in zip(lab_file_list, wav_file_list):
        with open(lab, 'r', encoding="utf-8") as f:
            mozi = f.read().split("\n")
        print(str(mozi))
        test = mozi2phone(str(mozi))
        print(test)
        print(wav + "|" + str(speaker_id) + "|" + test)
        if counter % 10 != 0:
            output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        else:
            output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        counter += 1
    Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")

    speaker_id = 100
    d = zundamon_path
    wav_file_list = glob.glob(d + "/wav/*.wav")
    lab_file_list = glob.glob(d + "/text/*.txt")
    wav_file_list.sort()
    lab_file_list.sort()
    if len(wav_file_list) == 0:
        print("Error: no audio data in " + d + "/wav")
        exit()
    counter = 0
    for lab, wav in zip(lab_file_list, wav_file_list):
        with open(lab, 'r', encoding="utf-8") as f:
            mozi = f.read().split("\n")
        print(str(mozi))
        test = mozi2phone(str(mozi))
        print(test)
        print(wav + "|" + str(speaker_id) + "|" + test)
        if counter % 10 != 0:
            output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        else:
            output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
        counter += 1
    Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")

    for d in textless_dir_list:
        wav_file_list = glob.glob(d + "/*.wav")
        wav_file_list.sort()
        counter = 0
        for wav in wav_file_list:
            print(wav + "|" + str(speaker_id) + "|a")
            if counter % 10 != 0:
                output_file_list_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            else:
                output_file_list_val_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + os.path.basename(d) + "\n")
        speaker_id += 1

    with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list)
    with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val)
    with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_textless)
    with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val_textless)
    with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(Correspondence_list)
    return 110


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--filename', type=str, required=True,
                        help='base name for the generated filelists and config')
    parser.add_argument('-s', '--sr', type=int, default=24000,
                        help='sampling rate (default = 24000)')
    parser.add_argument('-m', '--mysid', type=int, default=107,
                        help='speaker id assigned to my voice (default = 107)')
    # NOTE: argparse's type=bool treats any non-empty string (even "False") as True.
    parser.add_argument('-z', '--zundamon', type=bool, default=False,
                        help='build the dataset against the zundamon preset voice (default = False)')
    parser.add_argument('-c', '--config', type=str, default="./configs/baseconfig.json",
                        help='JSON file for configuration')
    args = parser.parse_args()
    filename = args.filename
    print(filename)
    if args.zundamon:
        n_spk = create_dataset_zundamon(filename, args.mysid)
    else:
        n_spk = create_dataset(filename, args.mysid)

    create_json(filename, n_spk, args.sr, args.config)


if __name__ == '__main__':
    main()

@@ -1,87 +0,0 @@
import glob
import sys


def read_lab(lab_f):
    with open(lab_f, 'r') as f:
        kw_list = f.read().split("\n")

    out_phono = []
    for i in range(len(kw_list) - 1):
        out_phono.append(kw_list[i].split()[2])
        out_phono.append("-")

    if out_phono[0] == 'silB' and out_phono[-2] == 'silE':
        out_phono[0] = 'sil'
        out_phono[-2] = 'sil'
        out_phono = out_phono[0:-1]
        out_phono_str = "".join(out_phono)
        return out_phono_str

    else:
        print("Error!")
        exit()  # the original had a bare `exit` (a no-op reference); it must be called to stop the script


def create_dataset(filename):
    speaker_id = 0
    textful_dir_list = glob.glob("dataset/textful/*")
    textless_dir_list = glob.glob("dataset/textless/*")
    textful_dir_list.sort()
    textless_dir_list.sort()
    Correspondence_list = list()
    output_file_list = list()
    output_file_list_val = list()
    output_file_list_textless = list()
    output_file_list_val_textless = list()
    for d in textful_dir_list:
        wav_file_list = glob.glob(d + "/wav/*")
        lab_file_list = glob.glob(d + "/text/*")
        wav_file_list.sort()
        lab_file_list.sort()
        if len(wav_file_list) == 0:
            continue
        counter = 0
        for lab, wav in zip(lab_file_list, wav_file_list):
            test = read_lab(lab)
            print(wav + "|" + str(speaker_id) + "|" + test)
            if counter % 10 != 0:
                output_file_list.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
            else:
                output_file_list_val.append(wav + "|" + str(speaker_id) + "|" + test + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + d + "\n")
        speaker_id += 1

    for d in textless_dir_list:
        wav_file_list = glob.glob(d + "/*")
        wav_file_list.sort()
        counter = 0
        for wav in wav_file_list:
            print(wav + "|" + str(speaker_id) + "|a")
            if counter % 10 != 0:
                output_file_list_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            else:
                output_file_list_val_textless.append(wav + "|" + str(speaker_id) + "|a" + "\n")
            counter += 1
        Correspondence_list.append(str(speaker_id) + "|" + d + "\n")
        speaker_id += 1

    with open('filelists/' + filename + '_textful.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list)
    with open('filelists/' + filename + '_textful_val.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val)
    with open('filelists/' + filename + '_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_textless)
    with open('filelists/' + filename + '_val_textless.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(output_file_list_val_textless)
    with open('filelists/' + filename + '_Correspondence.txt', 'w', encoding='utf-8', newline='\n') as f:
        f.writelines(Correspondence_list)
    return speaker_id - 1


def main(argv):
    filename = str(sys.argv[1])
    print(filename)
    n_spk = create_dataset(filename)
    return filename, n_spk


if __name__ == '__main__':
    sys.exit(main(sys.argv))

@@ -1,492 +0,0 @@
import time
import os
import random
import numpy as np
import torch
import torch.utils.data
import tqdm

import commons
from mel_processing import spectrogram_torch
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence, cleaned_text_to_sequence

# add
from retry import retry
import torchaudio


class TextAudioLoader(torch.utils.data.Dataset):
    """
    1) loads audio, text pairs
    2) normalizes text and converts them to sequences of integers
    3) computes spectrograms from audio files.
    """
    def __init__(self, audiopaths_and_text, hparams, use_test=True):
        self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
        self.text_cleaners = hparams.text_cleaners
        self.max_wav_value = hparams.max_wav_value
        self.sampling_rate = hparams.sampling_rate
        self.filter_length = hparams.filter_length
        self.hop_length = hparams.hop_length
        self.win_length = hparams.win_length
        self.use_test = use_test

        self.cleaned_text = getattr(hparams, "cleaned_text", False)

        self.add_blank = hparams.add_blank
        self.min_text_len = getattr(hparams, "min_text_len", 1)
        self.max_text_len = getattr(hparams, "max_text_len", 190)

        random.seed(1234)
        random.shuffle(self.audiopaths_and_text)
        self._filter()

    def _filter(self):
        """
        Filter text & store spec lengths
        """
        # Store spectrogram lengths for Bucketing
        # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
        # spec_length = wav_length // hop_length

        audiopaths_and_text_new = []
        lengths = []
        for audiopath, text in self.audiopaths_and_text:
            if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
                audiopaths_and_text_new.append([audiopath, text])
                lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
        self.audiopaths_and_text = audiopaths_and_text_new
        self.lengths = lengths

    def get_audio_text_pair(self, audiopath_and_text):
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text = self.get_text(text)
        if not self.use_test:
            # Presumably a dummy sequence was intended here; the original
            # `torch.as_tensor("a")` raises a TypeError on a string input.
            text = self.get_text("a")
        spec, wav = self.get_audio(audiopath)
        return (text, spec, wav)

    def get_audio(self, filename):
        audio, sampling_rate = load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            # NOTE: the original format string had three placeholders for two arguments.
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if os.path.exists(spec_filename):
            spec = torch.load(spec_filename)
        else:
            spec = spectrogram_torch(audio_norm, self.filter_length,
                                     self.sampling_rate, self.hop_length, self.win_length,
                                     center=False)
            spec = torch.squeeze(spec, 0)
            torch.save(spec, spec_filename)
        return spec, audio_norm

    def get_text(self, text):
        if self.cleaned_text:
            text_norm = cleaned_text_to_sequence(text)
        else:
            text_norm = text_to_sequence(text, self.text_cleaners)
        if self.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def __getitem__(self, index):
        return self.get_audio_text_pair(self.audiopaths_and_text[index])

    def __len__(self):
        return len(self.audiopaths_and_text)


class TextAudioCollate():
    """ Zero-pads model inputs and targets
    """
    def __init__(self, return_ids=False):
        self.return_ids = return_ids

    def __call__(self, batch):
        """Collates the training batch from normalized text and audio
        PARAMS
        ------
        batch: [text_normalized, spec_normalized, wav_normalized]
        """
        # Right zero-pad all one-hot text sequences to max input length
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[1].size(1) for x in batch]),
            dim=0, descending=True)

        max_text_len = max([len(x[0]) for x in batch])
        max_spec_len = max([x[1].size(1) for x in batch])
        max_wav_len = max([x[2].size(1) for x in batch])

        text_lengths = torch.LongTensor(len(batch))
        spec_lengths = torch.LongTensor(len(batch))
        wav_lengths = torch.LongTensor(len(batch))

        text_padded = torch.LongTensor(len(batch), max_text_len)
        spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
        wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
        text_padded.zero_()
        spec_padded.zero_()
        wav_padded.zero_()
        for i in range(len(ids_sorted_decreasing)):
            row = batch[ids_sorted_decreasing[i]]

            text = row[0]
            text_padded[i, :text.size(0)] = text
            text_lengths[i] = text.size(0)

            spec = row[1]
            spec_padded[i, :, :spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wav = row[2]
            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

        if self.return_ids:
            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths

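# --- Editor's note: a hedged sketch of how the loader and collate function wire into a
# DataLoader ("hparams" and the filelist path are illustrative, not fixed by this file):
import torch.utils.data

dataset = TextAudioLoader("filelists/train_textful.txt", hparams)
loader = torch.utils.data.DataLoader(
    dataset, batch_size=10, shuffle=True, collate_fn=TextAudioCollate())
text, text_len, spec, spec_len, wav, wav_len = next(iter(loader))
print(spec.shape)  # [batch, filter_length // 2 + 1, max_spec_len], e.g. [10, 257, ...]
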
|
|
||||||
"""Multi speaker version"""
|
|
||||||
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
|
||||||
"""
|
|
||||||
1) loads audio, speaker_id, text pairs
|
|
||||||
2) normalizes text and converts them to sequences of integers
|
|
||||||
3) computes spectrograms from audio files.
|
|
||||||
"""
|
|
||||||
def __init__(self, audiopaths_sid_text, hparams, no_text=False, augmentation=False, augmentation_params=None, no_use_textfile = False):
|
|
||||||
if no_use_textfile:
|
|
||||||
self.audiopaths_sid_text = list()
|
|
||||||
else:
|
|
||||||
self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
|
|
||||||
self.text_cleaners = hparams.text_cleaners
|
|
||||||
self.max_wav_value = hparams.max_wav_value
|
|
||||||
self.sampling_rate = hparams.sampling_rate
|
|
||||||
self.filter_length = hparams.filter_length
|
|
||||||
self.hop_length = hparams.hop_length
|
|
||||||
self.win_length = hparams.win_length
|
|
||||||
self.sampling_rate = hparams.sampling_rate
|
|
||||||
self.no_text = no_text
|
|
||||||
self.augmentation = augmentation
|
|
||||||
if augmentation :
|
|
||||||
self.gain_p = augmentation_params.gain_p
|
|
||||||
self.min_gain_in_db = augmentation_params.min_gain_in_db
|
|
||||||
self.max_gain_in_db = augmentation_params.max_gain_in_db
|
|
||||||
self.time_stretch_p = augmentation_params.time_stretch_p
|
|
||||||
self.min_rate = augmentation_params.min_rate
|
|
||||||
self.max_rate = augmentation_params.max_rate
|
|
||||||
self.pitch_shift_p = augmentation_params.pitch_shift_p
|
|
||||||
self.min_semitones = augmentation_params.min_semitones
|
|
||||||
self.max_semitones = augmentation_params.max_semitones
|
|
||||||
self.add_gaussian_noise_p = augmentation_params.add_gaussian_noise_p
|
|
||||||
self.min_amplitude = augmentation_params.min_amplitude
|
|
||||||
self.max_amplitude = augmentation_params.max_amplitude
|
|
||||||
self.frequency_mask_p = augmentation_params.frequency_mask_p
|
|
||||||
|
|
||||||
self.cleaned_text = getattr(hparams, "cleaned_text", False)
|
|
||||||
|
|
||||||
self.add_blank = hparams.add_blank
|
|
||||||
self.min_text_len = getattr(hparams, "min_text_len", 1)
|
|
||||||
self.max_text_len = getattr(hparams, "max_text_len", 1000)
|
|
||||||
|
|
||||||
random.seed(1234)
|
|
||||||
random.shuffle(self.audiopaths_sid_text)
|
|
||||||
self._filter()
|
|
||||||
|
|
||||||
@retry(tries=30, delay=10)
|
|
||||||
def _filter(self):
|
|
||||||
"""
|
|
||||||
Filter text & store spec lengths
|
|
||||||
"""
|
|
||||||
# Store spectrogram lengths for Bucketing
|
|
||||||
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
|
|
||||||
# spec_length = wav_length // hop_length
|
|
||||||
|
|
||||||
audiopaths_sid_text_new = []
|
|
||||||
lengths = []
|
|
||||||
|
|
||||||
for audiopath, sid, text in tqdm.tqdm(self.audiopaths_sid_text):
|
|
||||||
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
|
|
||||||
audiopaths_sid_text_new.append([audiopath, sid, text])
|
|
||||||
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
|
|
||||||
self.audiopaths_sid_text = audiopaths_sid_text_new
|
|
||||||
self.lengths = lengths
|
|
||||||
|
|
||||||
def get_audio_text_speaker_pair(self, audiopath_sid_text):
|
|
||||||
# separate filename, speaker_id and text
|
|
||||||
audiopath, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
|
|
||||||
text = self.get_text(text)
|
|
||||||
if self.no_text:
|
|
||||||
text = self.get_text("a")
|
|
||||||
spec, wav = self.get_audio(audiopath)
|
|
||||||
sid = self.get_sid(sid)
|
|
||||||
return (text, spec, wav, sid)
|
|
||||||
|
|
||||||
@retry(exceptions=(PermissionError), tries=100, delay=10)
|
|
||||||
def get_audio(self, filename):
|
|
||||||
# 音声データは±1.0内に正規化したtorchベクトルでunsqueeze(0)で外側1次元くるんだものを扱う
|
|
||||||
audio, sampling_rate = load_wav_to_torch(filename)
|
|
||||||
try:
|
|
||||||
if sampling_rate != self.sampling_rate:
|
|
||||||
raise ValueError("[Error] Exception: source {} SR doesn't match target {} SR".format(
|
|
||||||
sampling_rate, self.sampling_rate))
|
|
||||||
except ValueError as e:
|
|
||||||
print(e)
|
|
||||||
exit()
|
|
||||||
audio_norm = self.get_normalized_audio(audio, self.max_wav_value)
|
|
||||||
|
|
||||||
if self.augmentation:
|
|
||||||
audio_augmented = self.add_augmentation(audio_norm, sampling_rate)
|
|
||||||
audio_noised = self.add_noise(audio_augmented, sampling_rate)
|
|
||||||
# ノーマライズ後のaugmentationとnoise付加で範囲外になったところを削る
|
|
||||||
audio_augmented = torch.clamp(audio_augmented, -1, 1)
|
|
||||||
audio_noised = torch.clamp(audio_noised, -1, 1)
|
|
||||||
# audio(音声波形)は教師信号となるのでノイズは含まずaugmentationのみしたものを使用
|
|
||||||
audio_norm = audio_augmented
|
|
||||||
# spec(スペクトログラム)は入力信号となるのでaugmentationしてさらにノイズを付加したものを使用
|
|
||||||
spec = spectrogram_torch(audio_noised, self.filter_length,
|
|
||||||
self.sampling_rate, self.hop_length, self.win_length,
|
|
||||||
center=False)
|
|
||||||
spec_noised = self.add_spectrogram_noise(spec)
|
|
||||||
spec = torch.squeeze(spec_noised, 0)
|
|
||||||
else:
|
|
||||||
spec = spectrogram_torch(audio_norm, self.filter_length,
|
|
||||||
self.sampling_rate, self.hop_length, self.win_length,
|
|
||||||
center=False)
|
|
||||||
spec = torch.squeeze(spec, 0)
|
|
||||||
return spec, audio_norm

    def add_augmentation(self, audio, sampling_rate):
        gain_in_db = 0.0
        if random.random() <= self.gain_p:
            gain_in_db = random.uniform(self.min_gain_in_db, self.max_gain_in_db)
        time_stretch_rate = 1.0
        if random.random() <= self.time_stretch_p:
            time_stretch_rate = random.uniform(self.min_rate, self.max_rate)
        pitch_shift_semitones = 0
        if random.random() <= self.pitch_shift_p:
            # The sox "pitch" effect expects cents (1/100 semitone).
            pitch_shift_semitones = random.uniform(self.min_semitones, self.max_semitones) * 100
        augmentation_effects = [
            ["gain", f"{gain_in_db}"],
            ["tempo", f"{time_stretch_rate}"],
            ["pitch", f"{pitch_shift_semitones}"],
            ["rate", f"{sampling_rate}"]
        ]
        audio_augmented, _ = torchaudio.sox_effects.apply_effects_tensor(audio, sampling_rate, augmentation_effects)
        return audio_augmented
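    # Minimal standalone sketch of the same sox chain (assumed example values,
    # not the training config): +3 dB gain, 1.1x tempo, +200 cents pitch.
    #   >>> import torch, torchaudio
    #   >>> dummy = torch.zeros(1, 22050)  # one second of silence, mono
    #   >>> effects = [["gain", "3.0"], ["tempo", "1.1"],
    #   ...            ["pitch", "200"], ["rate", "22050"]]
    #   >>> out, sr = torchaudio.sox_effects.apply_effects_tensor(dummy, 22050, effects)
    #   >>> out.shape  # tempo 1.1 shortens the clip to roughly 20045 samples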

    def add_noise(self, audio, sampling_rate):
        # AddGaussianNoise
        audio = self.add_gaussian_noise(audio)
        return audio

    def add_gaussian_noise(self, audio):
        assert self.min_amplitude >= 0.0
        assert self.max_amplitude >= 0.0
        assert self.max_amplitude >= self.min_amplitude
        if random.random() > self.add_gaussian_noise_p:
            return audio
        amplitude = random.uniform(self.min_amplitude, self.max_amplitude)
        noise = torch.randn(audio.size())
        noised_audio = audio + amplitude * noise
        return noised_audio
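    # Note on the noise scale (the amplitude bounds come from hparams; the
    # numbers here are assumed for illustration): with amplitude drawn from
    # [0.001, 0.015], the added noise is zero-mean Gaussian with that standard
    # deviation relative to full scale (+-1.0), which is why get_audio clamps
    # the result afterwards.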

    def add_spectrogram_noise(self, spec):
        # FrequencyMask
        masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
        masked = masking(spec)
        return masked
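    # FrequencyMasking zeroes one random band of consecutive frequency bins,
    # up to freq_mask_param bins wide (SpecAugment-style); the tensor shape is
    # unchanged. Illustrative sketch (assumed sizes):
    #   >>> import torch, torchaudio
    #   >>> spec = torch.rand(1, 513, 100)
    #   >>> masked = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)(spec)
    #   >>> masked.shape  # torch.Size([1, 513, 100])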

    def get_normalized_audio(self, audio, max_wav_value):
        audio_norm = audio / max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        return audio_norm

    def get_text(self, text):
        if self.cleaned_text:
            text_norm = cleaned_text_to_sequence(text)
        else:
            text_norm = text_to_sequence(text, self.text_cleaners)
        if self.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm
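    # Sketch of the add_blank step (assuming the usual VITS-style intersperse
    # helper): a token sequence [5, 3, 7] becomes [0, 5, 0, 3, 0, 7, 0], i.e.
    # the blank id 0 is inserted between and around all tokens.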

    def get_sid(self, sid):
        sid = torch.LongTensor([int(sid)])
        return sid

    def __getitem__(self, index):
        return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])

    def __len__(self):
        return len(self.audiopaths_sid_text)


class TextAudioSpeakerCollate:
    """ Zero-pads model inputs and targets
    """

    def __init__(self, return_ids=False, no_text=False):
        self.return_ids = return_ids
        self.no_text = no_text

    def __call__(self, batch):
        """Collates a training batch from normalized text, audio and speaker identities

        PARAMS
        ------
        batch: [text_normalized, spec_normalized, wav_normalized, sid]
        """
        # Right zero-pad all one-hot text sequences to max input length
        _, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([x[1].size(1) for x in batch]),
            dim=0, descending=True)

        max_text_len = max([len(x[0]) for x in batch])
        max_spec_len = max([x[1].size(1) for x in batch])
        max_wav_len = max([x[2].size(1) for x in batch])

        text_lengths = torch.LongTensor(len(batch))
        spec_lengths = torch.LongTensor(len(batch))
        wav_lengths = torch.LongTensor(len(batch))
        sid = torch.LongTensor(len(batch))

        text_padded = torch.LongTensor(len(batch), max_text_len)
        spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
        wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
        text_padded.zero_()
        spec_padded.zero_()
        wav_padded.zero_()
        for i in range(len(ids_sorted_decreasing)):
            row = batch[ids_sorted_decreasing[i]]

            text = row[0]
            text_padded[i, :text.size(0)] = text
            text_lengths[i] = text.size(0)

            spec = row[1]
            spec_padded[i, :, :spec.size(1)] = spec
            spec_lengths[i] = spec.size(1)

            wav = row[2]
            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

            sid[i] = row[3]

        if self.return_ids:
            return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
        return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
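
# Usage sketch for the collate function (hypothetical names; batch_size is an
# assumed illustration value, not the training config):
#   >>> from torch.utils.data import DataLoader
#   >>> loader = DataLoader(dataset, batch_size=8, shuffle=False,
#   ...                     collate_fn=TextAudioSpeakerCollate())
#   >>> text, text_len, spec, spec_len, wav, wav_len, sid = next(iter(loader))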


class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
    """
    Maintain similar input lengths in a batch.
    Length groups are specified by boundaries.
    Ex) boundaries = [b1, b2, b3] -> every sample x in a batch satisfies
    either b1 < length(x) <= b2 or b2 < length(x) <= b3.

    It removes samples which are not included in the boundaries.
    Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 is discarded.
    """

    def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
        super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
        self.lengths = dataset.lengths
        self.batch_size = batch_size
        self.boundaries = boundaries

        self.buckets, self.num_samples_per_bucket = self._create_buckets()
        self.total_size = sum(self.num_samples_per_bucket)
        self.num_samples = self.total_size // self.num_replicas

    def _create_buckets(self):
        buckets = [[] for _ in range(len(self.boundaries) - 1)]
        for i in range(len(self.lengths)):
            length = self.lengths[i]
            idx_bucket = self._bisect(length)
            if idx_bucket != -1:
                buckets[idx_bucket].append(i)

        # Drop empty buckets (and their upper boundaries), scanning from the tail.
        for i in range(len(buckets) - 1, 0, -1):
            if len(buckets[i]) == 0:
                buckets.pop(i)
                self.boundaries.pop(i + 1)

        # Pad each bucket up to a multiple of the global batch size.
        num_samples_per_bucket = []
        for i in range(len(buckets)):
            len_bucket = len(buckets[i])
            total_batch_size = self.num_replicas * self.batch_size
            rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
            num_samples_per_bucket.append(len_bucket + rem)
        return buckets, num_samples_per_bucket
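    # Worked example of the padding arithmetic above (assumed numbers): with
    # num_replicas=2 and batch_size=4 the global batch is 8, so a bucket of 10
    # samples gets rem = (8 - (10 % 8)) % 8 = 6 and is later padded (by
    # repeating indices in __iter__) to 16 samples.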

    def __iter__(self):
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)

        indices = []
        if self.shuffle:
            for bucket in self.buckets:
                indices.append(torch.randperm(len(bucket), generator=g).tolist())
        else:
            for bucket in self.buckets:
                indices.append(list(range(len(bucket))))

        batches = []
        for i in range(len(self.buckets)):
            bucket = self.buckets[i]
            len_bucket = len(bucket)
            ids_bucket = indices[i]
            num_samples_bucket = self.num_samples_per_bucket[i]

            if len_bucket == 0:
                print("[Warn] bucket {} is empty; skipping.".format(i))
                continue

            # add extra samples to make it evenly divisible
            rem = num_samples_bucket - len_bucket
            ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]

            # subsample
            ids_bucket = ids_bucket[self.rank::self.num_replicas]

            # batching
            for j in range(len(ids_bucket) // self.batch_size):
                batch = [bucket[idx] for idx in ids_bucket[j * self.batch_size:(j + 1) * self.batch_size]]
                batches.append(batch)

        if self.shuffle:
            batch_ids = torch.randperm(len(batches), generator=g).tolist()
            batches = [batches[i] for i in batch_ids]
        self.batches = batches

        assert len(self.batches) * self.batch_size == self.num_samples
        return iter(self.batches)

    def _bisect(self, x, lo=0, hi=None):
        if hi is None:
            hi = len(self.boundaries) - 1

        if hi > lo:
            mid = (hi + lo) // 2
            if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
                return mid
            elif x <= self.boundaries[mid]:
                return self._bisect(x, lo, mid)
            else:
                return self._bisect(x, mid + 1, hi)
        else:
            return -1
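    # Example (assumed boundaries): with self.boundaries = [32, 300, 400, 500],
    # _bisect(350) returns bucket 1 because 300 < 350 <= 400, while _bisect(20)
    # and _bisect(600) both return -1, so those samples are discarded.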

    def __len__(self):
        return self.num_samples // self.batch_size
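
# Usage sketch for the sampler (hypothetical values; boundaries, batch_size,
# num_replicas and rank are illustration assumptions, not the training config):
#   >>> sampler = DistributedBucketSampler(dataset, batch_size=8,
#   ...                                    boundaries=[32, 300, 400, 500],
#   ...                                    num_replicas=1, rank=0)
#   >>> loader = DataLoader(dataset, batch_sampler=sampler,
#   ...                     collate_fn=TextAudioSpeakerCollate())
#   >>> sampler.set_epoch(0)  # reseeds the deterministic per-epoch shuffle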
@ -1,7 +0,0 @@
00_myvoice|107
01_target|108
02_target|109
03_target|0
04_target|1
05_target|2
1205_zundamon|100
@ -1,2 +0,0 @@
*
!.gitignore
@ -1,2 +0,0 @@
*
!.gitignore
@ -1,2 +0,0 @@
*
!.gitignore
@ -1,2 +0,0 @@
*
!.gitignore
@ -1 +0,0 @@
オンナノコガキッキッウレシソー。
@ -1 +0,0 @@
ツァツォニリョコーシタ。
@ -1 +0,0 @@
ミンシュウガテュルリーキュウデンニシンニュウシタ。
@ -1 +0,0 @@
ハイチキョーワコクデトゥーサンルーヴェルテュールガショーリヲオサメラレタノワ、ジッサイオーネツビョーノオカゲダッタ。
@ -1 +0,0 @@
レジャンドルワミンシュウヲテュルリーキュウデンニマネータ。
@ -1 +0,0 @@
ジョゲンワデキナイトデュパンワイッタ。
@ -1 +0,0 @@
フランスジンシェフトニホンジンシェフワゼンゼンチガウ。
@ -1 +0,0 @@
チュウゴクノガイコーダンニアタッシェトシテハケンサレタ。
@ -1 +0,0 @@
ファシズムセーリョクトノソーリョクセンニノゾム。
@ -1 +0,0 @@
カグショーニンノフィシェルワ、ニグルマトコウマヲカシテクレタ。
@ -1 +0,0 @@
ローカルロセンニワファンモオオイ。
@ -1 +0,0 @@
フェイントデアイテヲカワシテカラシュートデフィニッシュシタ。
@ -1 +0,0 @@
センハッピャクナナジュウナナ、プフェファーニヨリシントーゲンショーガハッケンサレタ。
@ -1 +0,0 @@
ユレルフェリーニノルノワワタシニトッテクギョーデス。
@ -1 +0,0 @@
ホルロアラティタルッフォトユウトクベツナオリョーリモデマシタ。
@ -1 +0,0 @@
フエノオトガナルトウサギノキッドガサッソクピョントハネタ。
@ -1 +0,0 @@
アノリョキャクワウワサノキャフェニイクヨーデス。
@ -1 +0,0 @@
モクヒョーワイットーショーデス。
@ -1 +0,0 @@
ウサギノキッドワキブンヨクピョン、マタピョントトビツヅケタ。
@ -1 +0,0 @@
アフタヌーンティーヲタノシミマショー。
@ -1 +0,0 @@
カノジョワティピカルナフェミニストデス。
@ -1 +0,0 @@
ジョシュタチトミッツィワサガシテイルショルイヲミツケラレナカッタ。
@ -1 +0,0 @@
フィレンツェ、パドヴァ、ヴェネツィアワドレモイタリアノトシデス。
@ -1 +0,0 @@
ガクフニツギノヨーニカイテアルノガ、エーフェリチェデス。
@ -1 +0,0 @@
ショペンハウエルトニーチェノテツガクショヲホンダナカラトリダシタ。
@ -1 +0,0 @@
サッソクメシツカイゼンインニシラセマショー。
@ -1 +0,0 @@
オモイワタイレヲヌイデ、アワセニキガエル。
@ -1 +0,0 @@
ボストンデ、トアルチョプスイヤエハイッテユウハンヲクッタ。
@ -1 +0,0 @@
ロクスッポキュウケーヲトラズハタライタ。
@ -1 +0,0 @@
カツテヒトリデコクフニシンニュウシタ。
@ -1 +0,0 @@
ダガ、キョーオマエガココエゴジュライニナッタノワ、ドンナゴヨーナノカナ?
@ -1 +0,0 @@
サブフランチャイザーヲフヤシテメザセヒャクテンポ。
@ -1 +0,0 @@
シコクデオヘンロヲアンギャシヨー。
@ -1 +0,0 @@
イツモノトオリギャンギャンナキダシマシタ。
@ -1 +0,0 @@
センセーワ、タッタママニュースヲミテイマシタ。
@ -1 +0,0 @@
ワタシワギョットメヲミヒライタ。
@ -1 +0,0 @@
トモダチエニューイヤーカードヲオクロー。
@ -1 +0,0 @@
カセーフワヤスミニオシャレナアウターウェアニミヲツツミヒトリデヤタイヲタノシミマシタ。
@ -1 +0,0 @@
ウォッカノオトモニワシオヅケノキュウリガアイマス。
@ -1 +0,0 @@
ヤマノムコーノミュンヒェンノヒトタチガコーゲキヲシカケタ。
@ -1 +0,0 @@
ボスニアコッキョーカラノコーゲキニヨリ、ジュウイチガツニヴァリェヴォガセンリョーサレタ。
@ -1 +0,0 @@
シルヴィウスワデュボアトヨバレテイタフランスノユグノーノイエニウマレタ。
@ -1 +0,0 @@
ソノホカニワタシニデキルコトワナカッタノデス、ユリエワナミダゴエニナッタ。
@ -1 +0,0 @@
ガルハカセヒャクタイチカク。
@ -1 +0,0 @@
ニホンセーフカラノヒャクチョーエンヲコエルヨサンヨーキュウ。
@ -1 +0,0 @@
シャキョーノウツクシサニワタシワギョーテンシテシマッタ。
@ -1 +0,0 @@
ソプラノカシュポリランダチョワカゲキアイーダノトクベツメーカシュトヒョーバンデス。
@ -1 +0,0 @@
アナタニワサイショヒャクポンドワタシマス。
@ -1 +0,0 @@
シャチョーカラノシジデス。
@ -1 +0,0 @@
ドーモキマグレトユウモノワタショーメフィスティックナモノデアルラシイ。
@ -1 +0,0 @@
カエルガピョコピョコトビマワッテイマス。
@ -1 +0,0 @@
マキョーニアシヲフミイレル。
@ -1 +0,0 @@
ヴァンダーヴォットタイムチュウワ、イワユルパーティーノヨーデハレヤカデス。
@ -1 +0,0 @@
スピリッツトワジョーリュウシュノコトデス。
@ -1 +0,0 @@
ヌルシアノベネディクトゥスワアポロンシンデンヲコワシ、ベネディクトカイノシュウドーインヲタテタ。
@ -1 +0,0 @@
チョードソノトキ、テストゥパーゴガコップヲモッテタチアガリマシタ。