Merge branch 'master' of github.com:w-okada/voice-changer
VoiceChangerDemo.ipynb (new file, 580 lines)
@@ -0,0 +1,580 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "VoiceChangerDemo",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyN+8irLJYUFlwMPzvHMSJof",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/w-okada/voice-changer/blob/dev/VoiceChangerDemo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "57p7pA1Qb5wa"
}
},
{
"cell_type": "code",
"source": [
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vV1t7PBRm-o6",
"outputId": "60fc80b2-a39e-4840-88c1-0d8d483a36ca"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Wed Aug 31 06:14:56 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 72C P8 12W / 70W | 0MiB / 15109MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"CONFIG=\"/content/drive/MyDrive/VoiceChanger/config.json\"\n",
"MODEL=\"/content/drive/MyDrive/VoiceChanger/G_326000.pth\""
],
"metadata": {
"id": "nSXATMWYb4Ik"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2wxD-gRSMU5R",
"outputId": "83bb80fa-9ced-43e2-a304-d53a3501b142"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
],
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
]
},
{
"cell_type": "code",
"source": [
"!git clone https://github.com/w-okada/voice-changer.git\n",
"%cd voice-changer/demo/\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "86wTFmqsNMnD",
"outputId": "3fc68f14-b6b7-48bb-e285-5bed78e74f26"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'voice-changer'...\n",
"remote: Enumerating objects: 266, done.\u001b[K\n",
"remote: Counting objects: 100% (266/266), done.\u001b[K\n",
"remote: Compressing objects: 100% (189/189), done.\u001b[K\n",
"remote: Total 266 (delta 123), reused 194 (delta 65), pack-reused 0\u001b[K\n",
"Receiving objects: 100% (266/266), 19.11 MiB | 35.44 MiB/s, done.\n",
"Resolving deltas: 100% (123/123), done.\n",
"/content/voice-changer/demo\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!git checkout dev\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CBsogR-zWH4r",
"outputId": "f4c9737b-831d-4938-d387-caf07693030e"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Branch 'dev' set up to track remote branch 'dev' from 'origin'.\n",
"Switched to a new branch 'dev'\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!mkdir -p ../frontend/dist\n",
"!cp -r ../docs/* ../frontend/dist/\n",
"!ls ../frontend/dist\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uCEKf3_JNoyq",
"outputId": "746e1946-5c3a-49af-df26-d86149f8adb1"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"assets\t coffee.png index.html index.js.LICENSE.txt\n",
"audiolet favicon.ico index.js\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!cp ../template/setting_colab.json ../frontend/dist/assets/setting.json"
],
"metadata": {
"id": "Bn4kV8TgXp8i"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!cat ../frontend/dist/assets/setting.json"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pjxPsOOaXXTj",
"outputId": "1bf85102-87ed-462c-e732-cffb878d95f3"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{\n",
" \"app_title\": \"voice-changer\",\n",
" \"majar_mode\": \"colab\",\n",
" \"voice_changer_server_url\": \"http://localhost:8080/test\",\n",
" \"sample_rate\": 48000,\n",
" \"buffer_size\": 1024,\n",
" \"prefix_chunk_size\": 24,\n",
" \"chunk_size\": 24,\n",
" \"speaker_ids\": [100, 107, 101, 102, 103],\n",
" \"speaker_names\": [\"ずんだもん\", \"user\", \"そら\", \"めたん\", \"つぐみ\"],\n",
" \"src_id\": 107,\n",
" \"dst_id\": 100,\n",
" \"vf_enable\": true,\n",
" \"voice_changer_mode\": \"realtime\",\n",
" \"gpu\": 0,\n",
" \"available_gpus\": [-1, 0, 1, 2, 3, 4],\n",
" \"avatar\": {\n",
" \"motion_capture_face\": true,\n",
" \"motion_capture_upperbody\": true,\n",
" \"lip_overwrite_with_voice\": true,\n",
" \"avatar_url\": \"./assets/vrm/zundamon/zundamon.vrm\",\n",
" \"backgournd_image_url\": \"./assets/images/bg_natural_sougen.jpg\",\n",
" \"background_color\": \"#0000dd\",\n",
" \"chroma_key\": \"#0000dd\",\n",
" \"avatar_canvas_size\": [1280, 720],\n",
" \"screen_canvas_size\": [1280, 720]\n",
" },\n",
" \"advance\": {\n",
" \"avatar_draw_skip_rate\": 3,\n",
" \"screen_draw_skip_rate\": 3,\n",
" \"visualizer_draw_skip_rate\": 3,\n",
" \"cross_fade_lower_value\": 0.1,\n",
" \"cross_fade_overlap_rate\": 0.03\n",
" }\n",
"}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Manual steps\n",
"\n",
"- Copy the config and the model into the demo folder\n",
"\n",
"- Copy the contents of docs into frontend\n",
"\n",
"- Copy setting.json into frontend\n"
],
"metadata": {
"id": "8Na2PbLZSWgZ"
}
},
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"!apt-get install -y espeak libsndfile1-dev\n",
|
||||
"!pip install flask\n",
|
||||
"!pip install python-socketio\n",
|
||||
"!pip install eventlet\n",
|
||||
"!pip install unidecode\n",
|
||||
"!pip install phonemizer\n",
|
||||
"!pip install retry\n",
|
||||
"!pip install flask\n",
|
||||
"!pip install flask_cors\n"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "LwZAAuqxX7yY",
|
||||
"outputId": "c67b2741-7a1e-448d-abf9-7b8d8f5e3d15"
|
||||
},
|
||||
"execution_count": 13,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"Reading package lists... Done\n",
|
||||
"Building dependency tree \n",
|
||||
"Reading state information... Done\n",
|
||||
"libsndfile1-dev is already the newest version (1.0.28-4ubuntu0.18.04.2).\n",
|
||||
"The following package was automatically installed and is no longer required:\n",
|
||||
" libnvidia-common-460\n",
|
||||
"Use 'apt autoremove' to remove it.\n",
|
||||
"The following additional packages will be installed:\n",
|
||||
" espeak-data libespeak1 libportaudio2 libsonic0\n",
|
||||
"The following NEW packages will be installed:\n",
|
||||
" espeak espeak-data libespeak1 libportaudio2 libsonic0\n",
|
||||
"0 upgraded, 5 newly installed, 0 to remove and 20 not upgraded.\n",
|
||||
"Need to get 1,219 kB of archives.\n",
|
||||
"After this operation, 3,031 kB of additional disk space will be used.\n",
|
||||
"Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0-1 [64.6 kB]\n",
|
||||
"Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libsonic0 amd64 0.2.0-6 [13.4 kB]\n",
|
||||
"Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak-data amd64 1.48.04+dfsg-5 [934 kB]\n",
|
||||
"Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libespeak1 amd64 1.48.04+dfsg-5 [145 kB]\n",
|
||||
"Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak amd64 1.48.04+dfsg-5 [61.6 kB]\n",
|
||||
"Fetched 1,219 kB in 1s (1,636 kB/s)\n",
|
||||
"Selecting previously unselected package libportaudio2:amd64.\n",
|
||||
"(Reading database ... 155676 files and directories currently installed.)\n",
|
||||
"Preparing to unpack .../libportaudio2_19.6.0-1_amd64.deb ...\n",
|
||||
"Unpacking libportaudio2:amd64 (19.6.0-1) ...\n",
|
||||
"Selecting previously unselected package libsonic0:amd64.\n",
|
||||
"Preparing to unpack .../libsonic0_0.2.0-6_amd64.deb ...\n",
|
||||
"Unpacking libsonic0:amd64 (0.2.0-6) ...\n",
|
||||
"Selecting previously unselected package espeak-data:amd64.\n",
|
||||
"Preparing to unpack .../espeak-data_1.48.04+dfsg-5_amd64.deb ...\n",
|
||||
"Unpacking espeak-data:amd64 (1.48.04+dfsg-5) ...\n",
|
||||
"Selecting previously unselected package libespeak1:amd64.\n",
|
||||
"Preparing to unpack .../libespeak1_1.48.04+dfsg-5_amd64.deb ...\n",
|
||||
"Unpacking libespeak1:amd64 (1.48.04+dfsg-5) ...\n",
|
||||
"Selecting previously unselected package espeak.\n",
|
||||
"Preparing to unpack .../espeak_1.48.04+dfsg-5_amd64.deb ...\n",
|
||||
"Unpacking espeak (1.48.04+dfsg-5) ...\n",
|
||||
"Setting up libportaudio2:amd64 (19.6.0-1) ...\n",
|
||||
"Setting up espeak-data:amd64 (1.48.04+dfsg-5) ...\n",
|
||||
"Setting up libsonic0:amd64 (0.2.0-6) ...\n",
|
||||
"Setting up libespeak1:amd64 (1.48.04+dfsg-5) ...\n",
|
||||
"Setting up espeak (1.48.04+dfsg-5) ...\n",
|
||||
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
|
||||
"Processing triggers for libc-bin (2.27-3ubuntu1.5) ...\n",
|
||||
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
|
||||
"Requirement already satisfied: flask in /usr/local/lib/python3.7/dist-packages (1.1.4)\n",
|
||||
"Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from flask) (7.1.2)\n",
|
||||
"Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from flask) (2.11.3)\n",
|
||||
"Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from flask) (1.1.0)\n",
|
||||
"Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from flask) (1.0.1)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->flask) (2.0.1)\n",
|
||||
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
|
||||
"Collecting python-socketio\n",
|
||||
" Downloading python_socketio-5.7.1-py3-none-any.whl (56 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 56 kB 5.0 MB/s \n",
|
||||
"\u001b[?25hCollecting bidict>=0.21.0\n",
|
||||
" Downloading bidict-0.22.0-py3-none-any.whl (36 kB)\n",
|
||||
"Collecting python-engineio>=4.3.0\n",
|
||||
" Downloading python_engineio-4.3.4-py3-none-any.whl (52 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 52 kB 2.0 MB/s \n",
|
||||
"\u001b[?25hInstalling collected packages: python-engineio, bidict, python-socketio\n",
|
||||
"Successfully installed bidict-0.22.0 python-engineio-4.3.4 python-socketio-5.7.1\n",
|
||||
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
|
||||
"Collecting eventlet\n",
|
||||
" Downloading eventlet-0.33.1-py2.py3-none-any.whl (226 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 226 kB 33.3 MB/s \n",
|
||||
"\u001b[?25hCollecting dnspython>=1.15.0\n",
|
||||
" Downloading dnspython-2.2.1-py3-none-any.whl (269 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 269 kB 52.5 MB/s \n",
|
||||
"\u001b[?25hRequirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from eventlet) (1.15.0)\n",
|
||||
"Requirement already satisfied: greenlet>=0.3 in /usr/local/lib/python3.7/dist-packages (from eventlet) (1.1.3)\n",
|
||||
"Installing collected packages: dnspython, eventlet\n",
|
||||
"Successfully installed dnspython-2.2.1 eventlet-0.33.1\n",
|
||||
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
|
||||
"Collecting unidecode\n",
|
||||
" Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 235 kB 28.6 MB/s \n",
|
||||
"\u001b[?25hInstalling collected packages: unidecode\n",
|
||||
"Successfully installed unidecode-1.3.4\n",
|
||||
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
|
||||
"Collecting phonemizer\n",
|
||||
" Downloading phonemizer-3.2.1-py3-none-any.whl (90 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 90 kB 9.5 MB/s \n",
|
||||
"\u001b[?25hCollecting segments\n",
|
||||
" Downloading segments-2.2.1-py2.py3-none-any.whl (15 kB)\n",
|
||||
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from phonemizer) (1.1.0)\n",
|
||||
"Collecting dlinfo\n",
|
||||
" Downloading dlinfo-1.2.1-py3-none-any.whl (3.6 kB)\n",
|
||||
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from phonemizer) (4.1.1)\n",
|
||||
"Requirement already satisfied: attrs>=18.1 in /usr/local/lib/python3.7/dist-packages (from phonemizer) (22.1.0)\n",
|
||||
"Collecting csvw>=1.5.6\n",
|
||||
" Downloading csvw-3.1.1-py2.py3-none-any.whl (56 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 56 kB 5.7 MB/s \n",
|
||||
"\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from segments->phonemizer) (2022.6.2)\n",
|
||||
"Collecting clldutils>=1.7.3\n",
|
||||
" Downloading clldutils-3.12.0-py2.py3-none-any.whl (197 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 197 kB 63.8 MB/s \n",
|
||||
"\u001b[?25hRequirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from clldutils>=1.7.3->segments->phonemizer) (2.8.2)\n",
|
||||
"Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.7/dist-packages (from clldutils>=1.7.3->segments->phonemizer) (0.8.10)\n",
|
||||
"Collecting colorlog\n",
|
||||
" Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)\n",
|
||||
"Collecting colorama\n",
|
||||
" Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)\n",
|
||||
"Requirement already satisfied: jsonschema in /usr/local/lib/python3.7/dist-packages (from csvw>=1.5.6->segments->phonemizer) (4.3.3)\n",
|
||||
"Collecting rdflib\n",
|
||||
" Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 500 kB 53.6 MB/s \n",
|
||||
"\u001b[?25hRequirement already satisfied: babel in /usr/local/lib/python3.7/dist-packages (from csvw>=1.5.6->segments->phonemizer) (2.10.3)\n",
|
||||
"Collecting language-tags\n",
|
||||
" Downloading language_tags-1.1.0-py2.py3-none-any.whl (210 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 210 kB 65.4 MB/s \n",
|
||||
"\u001b[?25hCollecting rfc3986<2\n",
|
||||
" Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)\n",
|
||||
"Requirement already satisfied: uritemplate>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from csvw>=1.5.6->segments->phonemizer) (3.0.1)\n",
|
||||
"Collecting isodate\n",
|
||||
" Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n",
|
||||
"\u001b[K |████████████████████████████████| 41 kB 763 kB/s \n",
|
||||
"\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from csvw>=1.5.6->segments->phonemizer) (2.23.0)\n",
|
||||
"Requirement already satisfied: pytz>=2015.7 in /usr/local/lib/python3.7/dist-packages (from babel->csvw>=1.5.6->segments->phonemizer) (2022.2.1)\n",
|
||||
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from isodate->csvw>=1.5.6->segments->phonemizer) (1.15.0)\n",
|
||||
"Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (0.18.1)\n",
|
||||
"Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (5.9.0)\n",
|
||||
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (4.12.0)\n",
|
||||
"Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema->csvw>=1.5.6->segments->phonemizer) (3.8.1)\n",
|
||||
"Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from rdflib->csvw>=1.5.6->segments->phonemizer) (57.4.0)\n",
|
||||
"Requirement already satisfied: pyparsing in /usr/local/lib/python3.7/dist-packages (from rdflib->csvw>=1.5.6->segments->phonemizer) (3.0.9)\n",
|
||||
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer) (3.0.4)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer) (2022.6.15)\n",
|
||||
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer) (2.10)\n",
|
||||
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer) (1.24.3)\n",
|
||||
"Installing collected packages: isodate, rfc3986, rdflib, language-tags, colorama, csvw, colorlog, clldutils, segments, dlinfo, phonemizer\n",
|
||||
"Successfully installed clldutils-3.12.0 colorama-0.4.5 colorlog-6.7.0 csvw-3.1.1 dlinfo-1.2.1 isodate-0.6.1 language-tags-1.1.0 phonemizer-3.2.1 rdflib-6.2.0 rfc3986-1.5.0 segments-2.2.1\n",
|
||||
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
|
||||
"Collecting retry\n",
|
||||
" Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)\n",
|
||||
"Requirement already satisfied: decorator>=3.4.2 in /usr/local/lib/python3.7/dist-packages (from retry) (4.4.2)\n",
|
||||
"Requirement already satisfied: py<2.0.0,>=1.4.26 in /usr/local/lib/python3.7/dist-packages (from retry) (1.11.0)\n",
|
||||
"Installing collected packages: retry\n",
|
||||
"Successfully installed retry-0.9.2\n",
|
||||
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
|
||||
"Requirement already satisfied: flask in /usr/local/lib/python3.7/dist-packages (1.1.4)\n",
|
||||
"Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from flask) (1.1.0)\n",
|
||||
"Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from flask) (2.11.3)\n",
|
||||
"Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from flask) (1.0.1)\n",
|
||||
"Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from flask) (7.1.2)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->flask) (2.0.1)\n",
|
||||
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
|
||||
"Collecting flask_cors\n",
|
||||
" Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)\n",
|
||||
"Requirement already satisfied: Flask>=0.9 in /usr/local/lib/python3.7/dist-packages (from flask_cors) (1.1.4)\n",
|
||||
"Requirement already satisfied: Six in /usr/local/lib/python3.7/dist-packages (from flask_cors) (1.15.0)\n",
|
||||
"Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.9->flask_cors) (1.1.0)\n",
|
||||
"Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.9->flask_cors) (2.11.3)\n",
|
||||
"Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.9->flask_cors) (1.0.1)\n",
|
||||
"Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.9->flask_cors) (7.1.2)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask>=0.9->flask_cors) (2.0.1)\n",
|
||||
"Installing collected packages: flask-cors\n",
|
||||
"Successfully installed flask-cors-3.0.10\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
"cell_type": "code",
"source": [
"get_ipython().system_raw(f'python3 serverFlask.py 8082 {CONFIG} {MODEL} >foo 2>&1 &')"
],
"metadata": {
"id": "iNOAB7zISI6J"
},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!cat foo"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "chu06KpAjEK6",
"outputId": "887c2d50-c49f-4a22-f0d0-8a3667511466"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[2022-08-31 06:17:58,669] INFO in serverFlask: INITIALIZE MODEL\n",
"[2022-08-31 06:18:08,764] INFO in utils: Loaded checkpoint '/content/drive/MyDrive/VoiceChanger/G_326000.pth' (iteration 1136)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from google.colab import output\n",
"\n",
"output.serve_kernel_port_as_window(8082)"
],
"metadata": {
"id": "nkRjZm95l87C",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "abf57f92-5cb6-4325-b64a-095d42f561d5"
},
"execution_count": 27,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.Javascript object>"
],
"application/javascript": [
"(async (port, path, text, element) => {\n",
"    if (!google.colab.kernel.accessAllowed) {\n",
"      return;\n",
"    }\n",
"    element.appendChild(document.createTextNode(''));\n",
"    const url = await google.colab.kernel.proxyPort(port);\n",
"    const anchor = document.createElement('a');\n",
"    anchor.href = new URL(path, url).toString();\n",
"    anchor.target = '_blank';\n",
"    anchor.setAttribute('data-href', url + path);\n",
"    anchor.textContent = text;\n",
"    element.appendChild(anchor);\n",
"  })(8082, \"/\", \"https://localhost:8082/\", window.element)"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"! ls ../frontend/dist/index.html"
],
"metadata": {
"id": "DKWni4moSyzO",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b5635a1e-6ac6-41db-a706-dc3e5fb866a5"
},
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"../frontend/dist/index.html\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "3hwJmseXZhJY"
},
"execution_count": null,
"outputs": []
}
]
}
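
The launch cell above backgrounds serverFlask.py with get_ipython().system_raw(...) and then checks the log by hand with !cat foo. Below is a minimal sketch of the same launch-and-wait pattern with an explicit poll of the log file; the file names and log strings come from the cells above, while the timeout and poll interval are illustrative assumptions.

import subprocess
import time

CONFIG = "/content/drive/MyDrive/VoiceChanger/config.json"
MODEL = "/content/drive/MyDrive/VoiceChanger/G_326000.pth"

def start_server(log_path="foo", timeout_sec=120):
    # Launch the Flask server exactly as the notebook cell does, redirecting
    # stdout/stderr into a log file that we can poll.
    log = open(log_path, "w")
    proc = subprocess.Popen(
        ["python3", "serverFlask.py", "8082", CONFIG, MODEL],
        stdout=log, stderr=subprocess.STDOUT)
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        with open(log_path) as f:
            if "Loaded checkpoint" in f.read():
                return proc  # model is ready; safe to open the proxied port
        time.sleep(2)
    proc.kill()
    raise TimeoutError("server did not finish loading; inspect " + log_path)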
demo/dummy.wav (binary, new executable file)

demo/logs/.gitignore (vendored, new executable file, 2 lines)
@@ -0,0 +1,2 @@
*
!.gitignore
demo/mod/attentions.py (new executable file, 303 lines)
@@ -0,0 +1,303 @@
|
||||
import copy
|
||||
import math
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
import commons
|
||||
import modules
|
||||
from modules import LayerNorm
|
||||
|
||||
|
||||
class Encoder(nn.Module):
|
||||
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
|
||||
super().__init__()
|
||||
self.hidden_channels = hidden_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.kernel_size = kernel_size
|
||||
self.p_dropout = p_dropout
|
||||
self.window_size = window_size
|
||||
|
||||
self.drop = nn.Dropout(p_dropout)
|
||||
self.attn_layers = nn.ModuleList()
|
||||
self.norm_layers_1 = nn.ModuleList()
|
||||
self.ffn_layers = nn.ModuleList()
|
||||
self.norm_layers_2 = nn.ModuleList()
|
||||
for i in range(self.n_layers):
|
||||
self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
|
||||
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
||||
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
|
||||
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
||||
|
||||
def forward(self, x, x_mask):
|
||||
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
||||
x = x * x_mask
|
||||
for i in range(self.n_layers):
|
||||
y = self.attn_layers[i](x, x, attn_mask)
|
||||
y = self.drop(y)
|
||||
x = self.norm_layers_1[i](x + y)
|
||||
|
||||
y = self.ffn_layers[i](x, x_mask)
|
||||
y = self.drop(y)
|
||||
x = self.norm_layers_2[i](x + y)
|
||||
x = x * x_mask
|
||||
return x
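
# Usage sketch for Encoder (hedged; the hyperparameters below are the common
# VITS base settings and are an assumption, not taken from this repo):
#   enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
#                 n_layers=6, kernel_size=3, p_dropout=0.1)
#   x = torch.randn(1, 192, 50)    # [batch, hidden_channels, time]
#   x_mask = torch.ones(1, 1, 50)  # [batch, 1, time]
#   y = enc(x, x_mask)             # -> [1, 192, 50]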
|
||||
|
||||
|
||||
class Decoder(nn.Module):
|
||||
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
|
||||
super().__init__()
|
||||
self.hidden_channels = hidden_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.n_heads = n_heads
|
||||
self.n_layers = n_layers
|
||||
self.kernel_size = kernel_size
|
||||
self.p_dropout = p_dropout
|
||||
self.proximal_bias = proximal_bias
|
||||
self.proximal_init = proximal_init
|
||||
|
||||
self.drop = nn.Dropout(p_dropout)
|
||||
self.self_attn_layers = nn.ModuleList()
|
||||
self.norm_layers_0 = nn.ModuleList()
|
||||
self.encdec_attn_layers = nn.ModuleList()
|
||||
self.norm_layers_1 = nn.ModuleList()
|
||||
self.ffn_layers = nn.ModuleList()
|
||||
self.norm_layers_2 = nn.ModuleList()
|
||||
for i in range(self.n_layers):
|
||||
self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
|
||||
self.norm_layers_0.append(LayerNorm(hidden_channels))
|
||||
self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
|
||||
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
||||
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
|
||||
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
||||
|
||||
def forward(self, x, x_mask, h, h_mask):
|
||||
"""
|
||||
x: decoder input
|
||||
h: encoder output
|
||||
"""
|
||||
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
|
||||
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
||||
x = x * x_mask
|
||||
for i in range(self.n_layers):
|
||||
y = self.self_attn_layers[i](x, x, self_attn_mask)
|
||||
y = self.drop(y)
|
||||
x = self.norm_layers_0[i](x + y)
|
||||
|
||||
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
|
||||
y = self.drop(y)
|
||||
x = self.norm_layers_1[i](x + y)
|
||||
|
||||
y = self.ffn_layers[i](x, x_mask)
|
||||
y = self.drop(y)
|
||||
x = self.norm_layers_2[i](x + y)
|
||||
x = x * x_mask
|
||||
return x
|
||||
|
||||
|
||||
class MultiHeadAttention(nn.Module):
|
||||
def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
|
||||
super().__init__()
|
||||
assert channels % n_heads == 0
|
||||
|
||||
self.channels = channels
|
||||
self.out_channels = out_channels
|
||||
self.n_heads = n_heads
|
||||
self.p_dropout = p_dropout
|
||||
self.window_size = window_size
|
||||
self.heads_share = heads_share
|
||||
self.block_length = block_length
|
||||
self.proximal_bias = proximal_bias
|
||||
self.proximal_init = proximal_init
|
||||
self.attn = None
|
||||
|
||||
self.k_channels = channels // n_heads
|
||||
self.conv_q = nn.Conv1d(channels, channels, 1)
|
||||
self.conv_k = nn.Conv1d(channels, channels, 1)
|
||||
self.conv_v = nn.Conv1d(channels, channels, 1)
|
||||
self.conv_o = nn.Conv1d(channels, out_channels, 1)
|
||||
self.drop = nn.Dropout(p_dropout)
|
||||
|
||||
if window_size is not None:
|
||||
n_heads_rel = 1 if heads_share else n_heads
|
||||
rel_stddev = self.k_channels**-0.5
|
||||
self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
|
||||
self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
|
||||
|
||||
nn.init.xavier_uniform_(self.conv_q.weight)
|
||||
nn.init.xavier_uniform_(self.conv_k.weight)
|
||||
nn.init.xavier_uniform_(self.conv_v.weight)
|
||||
if proximal_init:
|
||||
with torch.no_grad():
|
||||
self.conv_k.weight.copy_(self.conv_q.weight)
|
||||
self.conv_k.bias.copy_(self.conv_q.bias)
|
||||
|
||||
def forward(self, x, c, attn_mask=None):
|
||||
q = self.conv_q(x)
|
||||
k = self.conv_k(c)
|
||||
v = self.conv_v(c)
|
||||
|
||||
x, self.attn = self.attention(q, k, v, mask=attn_mask)
|
||||
|
||||
x = self.conv_o(x)
|
||||
return x
|
||||
|
||||
def attention(self, query, key, value, mask=None):
|
||||
# reshape [b, d, t] -> [b, n_h, t, d_k]
|
||||
b, d, t_s, t_t = (*key.size(), query.size(2))
|
||||
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
|
||||
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
|
||||
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
|
||||
|
||||
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
|
||||
if self.window_size is not None:
|
||||
assert t_s == t_t, "Relative attention is only available for self-attention."
|
||||
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
|
||||
rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
|
||||
scores_local = self._relative_position_to_absolute_position(rel_logits)
|
||||
scores = scores + scores_local
|
||||
if self.proximal_bias:
|
||||
assert t_s == t_t, "Proximal bias is only available for self-attention."
|
||||
scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
|
||||
if mask is not None:
|
||||
scores = scores.masked_fill(mask == 0, -1e4)
|
||||
if self.block_length is not None:
|
||||
assert t_s == t_t, "Local attention is only available for self-attention."
|
||||
block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
|
||||
scores = scores.masked_fill(block_mask == 0, -1e4)
|
||||
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
|
||||
p_attn = self.drop(p_attn)
|
||||
output = torch.matmul(p_attn, value)
|
||||
if self.window_size is not None:
|
||||
relative_weights = self._absolute_position_to_relative_position(p_attn)
|
||||
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
|
||||
output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
|
||||
output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
|
||||
return output, p_attn
|
||||
|
||||
def _matmul_with_relative_values(self, x, y):
|
||||
"""
|
||||
x: [b, h, l, m]
|
||||
y: [h or 1, m, d]
|
||||
ret: [b, h, l, d]
|
||||
"""
|
||||
ret = torch.matmul(x, y.unsqueeze(0))
|
||||
return ret
|
||||
|
||||
def _matmul_with_relative_keys(self, x, y):
|
||||
"""
|
||||
x: [b, h, l, d]
|
||||
y: [h or 1, m, d]
|
||||
ret: [b, h, l, m]
|
||||
"""
|
||||
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
|
||||
return ret
|
||||
|
||||
def _get_relative_embeddings(self, relative_embeddings, length):
|
||||
max_relative_position = 2 * self.window_size + 1
|
||||
# Pad first before slice to avoid using cond ops.
|
||||
pad_length = max(length - (self.window_size + 1), 0)
|
||||
slice_start_position = max((self.window_size + 1) - length, 0)
|
||||
slice_end_position = slice_start_position + 2 * length - 1
|
||||
if pad_length > 0:
|
||||
padded_relative_embeddings = F.pad(
|
||||
relative_embeddings,
|
||||
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
|
||||
else:
|
||||
padded_relative_embeddings = relative_embeddings
|
||||
used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
|
||||
return used_relative_embeddings
|
||||
|
||||
def _relative_position_to_absolute_position(self, x):
|
||||
"""
|
||||
x: [b, h, l, 2*l-1]
|
||||
ret: [b, h, l, l]
|
||||
"""
|
||||
batch, heads, length, _ = x.size()
|
||||
# Concat columns of pad to shift from relative to absolute indexing.
|
||||
x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
|
||||
|
||||
# Concat extra elements so they add up to shape (len+1, 2*len-1).
|
||||
x_flat = x.view([batch, heads, length * 2 * length])
|
||||
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
|
||||
|
||||
# Reshape and slice out the padded elements.
|
||||
x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
|
||||
return x_final
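
# Shape walk-through of the pad/reshape trick above (a hedged illustration):
# for length l = 3, x is [b, h, 3, 5] since 2*l-1 = 5 relative positions.
# Padding one column gives [b, h, 3, 6]; flattening to [b, h, 18] and padding
# l-1 = 2 trailing entries gives [b, h, 20] = [b, h, (l+1)*(2*l-1)], which
# reshapes to [b, h, 4, 5]; slicing [:, :, :l, l-1:] leaves the [b, h, 3, 3]
# absolute-position scores.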
|
||||
|
||||
def _absolute_position_to_relative_position(self, x):
|
||||
"""
|
||||
x: [b, h, l, l]
|
||||
ret: [b, h, l, 2*l-1]
|
||||
"""
|
||||
batch, heads, length, _ = x.size()
|
||||
# pad along column
|
||||
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
|
||||
x_flat = x.view([batch, heads, length**2 + length*(length -1)])
|
||||
# add 0's in the beginning that will skew the elements after reshape
|
||||
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
|
||||
x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
|
||||
return x_final
|
||||
|
||||
def _attention_bias_proximal(self, length):
|
||||
"""Bias for self-attention to encourage attention to close positions.
|
||||
Args:
|
||||
length: an integer scalar.
|
||||
Returns:
|
||||
a Tensor with shape [1, 1, length, length]
|
||||
"""
|
||||
r = torch.arange(length, dtype=torch.float32)
|
||||
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
|
||||
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
|
||||
|
||||
|
||||
class FFN(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
|
||||
super().__init__()
|
||||
self.in_channels = in_channels
|
||||
self.out_channels = out_channels
|
||||
self.filter_channels = filter_channels
|
||||
self.kernel_size = kernel_size
|
||||
self.p_dropout = p_dropout
|
||||
self.activation = activation
|
||||
self.causal = causal
|
||||
|
||||
if causal:
|
||||
self.padding = self._causal_padding
|
||||
else:
|
||||
self.padding = self._same_padding
|
||||
|
||||
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
|
||||
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
|
||||
self.drop = nn.Dropout(p_dropout)
|
||||
|
||||
def forward(self, x, x_mask):
|
||||
x = self.conv_1(self.padding(x * x_mask))
|
||||
if self.activation == "gelu":
|
||||
x = x * torch.sigmoid(1.702 * x)
|
||||
else:
|
||||
x = torch.relu(x)
|
||||
x = self.drop(x)
|
||||
x = self.conv_2(self.padding(x * x_mask))
|
||||
return x * x_mask
|
||||
|
||||
def _causal_padding(self, x):
|
||||
if self.kernel_size == 1:
|
||||
return x
|
||||
pad_l = self.kernel_size - 1
|
||||
pad_r = 0
|
||||
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
||||
x = F.pad(x, commons.convert_pad_shape(padding))
|
||||
return x
|
||||
|
||||
def _same_padding(self, x):
|
||||
if self.kernel_size == 1:
|
||||
return x
|
||||
pad_l = (self.kernel_size - 1) // 2
|
||||
pad_r = self.kernel_size // 2
|
||||
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
||||
x = F.pad(x, commons.convert_pad_shape(padding))
|
||||
return x
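

if __name__ == "__main__":
    # Hedged smoke test (not part of the original module): run the Encoder on
    # random tensors to check shapes. The hyperparameters are the usual VITS
    # base settings and are an assumption here.
    enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
                  n_layers=6, kernel_size=3, p_dropout=0.1)
    x = torch.randn(2, 192, 40)   # [batch, channels, time]
    x_mask = torch.ones(2, 1, 40)
    print(enc(x, x_mask).shape)   # expected: torch.Size([2, 192, 40])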
demo/mod/commons.py (new executable file, 161 lines)
@@ -0,0 +1,161 @@
|
||||
import math
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
|
||||
def init_weights(m, mean=0.0, std=0.01):
|
||||
classname = m.__class__.__name__
|
||||
if classname.find("Conv") != -1:
|
||||
m.weight.data.normal_(mean, std)
|
||||
|
||||
|
||||
def get_padding(kernel_size, dilation=1):
|
||||
return int((kernel_size*dilation - dilation)/2)
|
||||
|
||||
|
||||
def convert_pad_shape(pad_shape):
|
||||
l = pad_shape[::-1]
|
||||
pad_shape = [item for sublist in l for item in sublist]
|
||||
return pad_shape
|
||||
|
||||
|
||||
def intersperse(lst, item):
|
||||
result = [item] * (len(lst) * 2 + 1)
|
||||
result[1::2] = lst
|
||||
return result
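
# Example: intersperse([1, 2, 3], 0) -> [0, 1, 0, 2, 0, 3, 0]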
|
||||
|
||||
|
||||
def kl_divergence(m_p, logs_p, m_q, logs_q):
|
||||
"""KL(P||Q)"""
|
||||
kl = (logs_q - logs_p) - 0.5
|
||||
kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
|
||||
return kl
|
||||
|
||||
|
||||
def rand_gumbel(shape):
|
||||
"""Sample from the Gumbel distribution, protect from overflows."""
|
||||
uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
|
||||
return -torch.log(-torch.log(uniform_samples))
|
||||
|
||||
|
||||
def rand_gumbel_like(x):
|
||||
g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
|
||||
return g
|
||||
|
||||
|
||||
def slice_segments(x, ids_str, segment_size=4):
|
||||
ret = torch.zeros_like(x[:, :, :segment_size])
|
||||
for i in range(x.size(0)):
|
||||
idx_str = ids_str[i]
|
||||
idx_end = idx_str + segment_size
|
||||
ret[i] = x[i, :, idx_str:idx_end]
|
||||
return ret
|
||||
|
||||
|
||||
def rand_slice_segments(x, x_lengths=None, segment_size=4):
|
||||
b, d, t = x.size()
|
||||
if x_lengths is None:
|
||||
x_lengths = t
|
||||
ids_str_max = x_lengths - segment_size + 1
|
||||
ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
|
||||
ret = slice_segments(x, ids_str, segment_size)
|
||||
return ret, ids_str
|
||||
|
||||
|
||||
def get_timing_signal_1d(
|
||||
length, channels, min_timescale=1.0, max_timescale=1.0e4):
|
||||
position = torch.arange(length, dtype=torch.float)
|
||||
num_timescales = channels // 2
|
||||
log_timescale_increment = (
|
||||
math.log(float(max_timescale) / float(min_timescale)) /
|
||||
(num_timescales - 1))
|
||||
inv_timescales = min_timescale * torch.exp(
|
||||
torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
|
||||
scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
|
||||
signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
|
||||
signal = F.pad(signal, [0, 0, 0, channels % 2])
|
||||
signal = signal.view(1, channels, length)
|
||||
return signal
|
||||
|
||||
|
||||
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
|
||||
b, channels, length = x.size()
|
||||
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
|
||||
return x + signal.to(dtype=x.dtype, device=x.device)
|
||||
|
||||
|
||||
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
|
||||
b, channels, length = x.size()
|
||||
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
|
||||
return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
|
||||
|
||||
|
||||
def subsequent_mask(length):
|
||||
mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
|
||||
return mask
|
||||
|
||||
|
||||
@torch.jit.script
|
||||
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
||||
n_channels_int = n_channels[0]
|
||||
in_act = input_a + input_b
|
||||
t_act = torch.tanh(in_act[:, :n_channels_int, :])
|
||||
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
|
||||
acts = t_act * s_act
|
||||
return acts
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def shift_1d(x):
|
||||
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
|
||||
return x
|
||||
|
||||
|
||||
def sequence_mask(length, max_length=None):
|
||||
if max_length is None:
|
||||
max_length = length.max()
|
||||
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
|
||||
return x.unsqueeze(0) < length.unsqueeze(1)
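
# Example: sequence_mask(torch.tensor([2, 3])) ->
#   tensor([[ True,  True, False],
#           [ True,  True,  True]])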
|
||||
|
||||
|
||||
def generate_path(duration, mask):
|
||||
"""
|
||||
duration: [b, 1, t_x]
|
||||
mask: [b, 1, t_y, t_x]
|
||||
"""
|
||||
device = duration.device
|
||||
|
||||
b, _, t_y, t_x = mask.shape
|
||||
cum_duration = torch.cumsum(duration, -1)
|
||||
|
||||
cum_duration_flat = cum_duration.view(b * t_x)
|
||||
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
|
||||
path = path.view(b, t_x, t_y)
|
||||
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
|
||||
path = path.unsqueeze(1).transpose(2,3) * mask
|
||||
return path
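
# Hedged shape note: given per-token durations [b, 1, t_x] and an attention
# mask [b, 1, t_y, t_x], the cumsum-then-difference above produces a hard
# monotonic alignment path of 0/1 values with shape [b, 1, t_y, t_x].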
|
||||
|
||||
|
||||
def clip_grad_value_(parameters, clip_value, norm_type=2):
|
||||
if isinstance(parameters, torch.Tensor):
|
||||
parameters = [parameters]
|
||||
parameters = list(filter(lambda p: p.grad is not None, parameters))
|
||||
norm_type = float(norm_type)
|
||||
if clip_value is not None:
|
||||
clip_value = float(clip_value)
|
||||
|
||||
total_norm = 0
|
||||
for p in parameters:
|
||||
param_norm = p.grad.data.norm(norm_type)
|
||||
total_norm += param_norm.item() ** norm_type
|
||||
if clip_value is not None:
|
||||
p.grad.data.clamp_(min=-clip_value, max=clip_value)
|
||||
total_norm = total_norm ** (1. / norm_type)
|
||||
return total_norm
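

if __name__ == "__main__":
    # Hedged self-check (not part of the original module): exercise a few of
    # the helpers above and print their outputs.
    print(intersperse([1, 2, 3], 0))            # [0, 1, 0, 2, 0, 3, 0]
    print(sequence_mask(torch.tensor([2, 4])))  # boolean mask of shape [2, 4]
    print(get_timing_signal_1d(10, 8).shape)    # torch.Size([1, 8, 10])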
demo/mod/data_utils.py (new executable file, 492 lines)
@@ -0,0 +1,492 @@
|
||||
import time
|
||||
import os
|
||||
import random
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.data
|
||||
import tqdm
|
||||
|
||||
import commons
|
||||
from mel_processing import spectrogram_torch
|
||||
from utils import load_wav_to_torch, load_filepaths_and_text
|
||||
from text import text_to_sequence, cleaned_text_to_sequence
|
||||
import struct
|
||||
# additional imports (retry decorator / audio augmentation support)
|
||||
from retry import retry
|
||||
|
||||
import torchaudio
|
||||
from scipy.io.wavfile import write
|
||||
|
||||
class TextAudioLoader(torch.utils.data.Dataset):
|
||||
"""
|
||||
1) loads audio, text pairs
|
||||
2) normalizes text and converts them to sequences of integers
|
||||
3) computes spectrograms from audio files.
|
||||
"""
|
||||
def __init__(self, audiopaths_and_text, hparams, use_test = True):
|
||||
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
|
||||
self.text_cleaners = hparams.text_cleaners
|
||||
self.max_wav_value = hparams.max_wav_value
|
||||
self.sampling_rate = hparams.sampling_rate
|
||||
self.filter_length = hparams.filter_length
|
||||
self.hop_length = hparams.hop_length
|
||||
self.win_length = hparams.win_length
|
||||
self.sampling_rate = hparams.sampling_rate
|
||||
self.use_test = use_test
|
||||
|
||||
self.cleaned_text = getattr(hparams, "cleaned_text", False)
|
||||
|
||||
self.add_blank = hparams.add_blank
|
||||
self.min_text_len = getattr(hparams, "min_text_len", 1)
|
||||
self.max_text_len = getattr(hparams, "max_text_len", 190)
|
||||
|
||||
random.seed(1234)
|
||||
random.shuffle(self.audiopaths_and_text)
|
||||
self._filter()
|
||||
|
||||
|
||||
def _filter(self):
|
||||
"""
|
||||
Filter text & store spec lengths
|
||||
"""
|
||||
# Store spectrogram lengths for Bucketing
|
||||
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
|
||||
# spec_length = wav_length // hop_length
|
||||
|
||||
audiopaths_and_text_new = []
|
||||
lengths = []
|
||||
for audiopath, text in self.audiopaths_and_text:
|
||||
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
|
||||
audiopaths_and_text_new.append([audiopath, text])
|
||||
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
|
||||
self.audiopaths_and_text = audiopaths_and_text_new
|
||||
self.lengths = lengths
|
||||
|
||||
def get_audio_text_pair(self, audiopath_and_text):
|
||||
# separate filename and text
|
||||
audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
|
||||
text = self.get_text(text)
|
||||
if self.use_test != True:
|
||||
text = torch.as_tensor("a")
|
||||
spec, wav = self.get_audio(audiopath)
|
||||
return (text, spec, wav)
|
||||
|
||||
def get_audio(self, filename):
|
||||
audio, sampling_rate = load_wav_to_torch(filename)
|
||||
if sampling_rate != self.sampling_rate:
|
||||
raise ValueError("{} {} SR doesn't match target {} SR".format(
|
||||
sampling_rate, self.sampling_rate))
|
||||
audio_norm = audio / self.max_wav_value
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
spec_filename = filename.replace(".wav", ".spec.pt")
|
||||
if os.path.exists(spec_filename):
|
||||
spec = torch.load(spec_filename)
|
||||
else:
|
||||
spec = spectrogram_torch(audio_norm, self.filter_length,
|
||||
self.sampling_rate, self.hop_length, self.win_length,
|
||||
center=False)
|
||||
spec = torch.squeeze(spec, 0)
|
||||
torch.save(spec, spec_filename)
|
||||
return spec, audio_norm
|
||||
|
||||
def get_text(self, text):
|
||||
if self.cleaned_text:
|
||||
text_norm = cleaned_text_to_sequence(text)
|
||||
else:
|
||||
text_norm = text_to_sequence(text, self.text_cleaners)
|
||||
if self.add_blank:
|
||||
text_norm = commons.intersperse(text_norm, 0)
|
||||
text_norm = torch.LongTensor(text_norm)
|
||||
return text_norm
|
||||
|
||||
def __getitem__(self, index):
|
||||
return self.get_audio_text_pair(self.audiopaths_and_text[index])
|
||||
|
||||
def __len__(self):
|
||||
return len(self.audiopaths_and_text)
|
||||
|
||||
|
||||
class TextAudioCollate():
|
||||
""" Zero-pads model inputs and targets
|
||||
"""
|
||||
def __init__(self, return_ids=False):
|
||||
self.return_ids = return_ids
|
||||
|
||||
def __call__(self, batch):
|
||||
"""Collate's training batch from normalized text and aduio
|
||||
PARAMS
|
||||
------
|
||||
batch: [text_normalized, spec_normalized, wav_normalized]
|
||||
"""
|
||||
# Right zero-pad all one-hot text sequences to max input length
|
||||
_, ids_sorted_decreasing = torch.sort(
|
||||
torch.LongTensor([x[1].size(1) for x in batch]),
|
||||
dim=0, descending=True)
|
||||
|
||||
max_text_len = max([len(x[0]) for x in batch])
|
||||
max_spec_len = max([x[1].size(1) for x in batch])
|
||||
max_wav_len = max([x[2].size(1) for x in batch])
|
||||
|
||||
text_lengths = torch.LongTensor(len(batch))
|
||||
spec_lengths = torch.LongTensor(len(batch))
|
||||
wav_lengths = torch.LongTensor(len(batch))
|
||||
|
||||
text_padded = torch.LongTensor(len(batch), max_text_len)
|
||||
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
|
||||
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
|
||||
text_padded.zero_()
|
||||
spec_padded.zero_()
|
||||
wav_padded.zero_()
|
||||
for i in range(len(ids_sorted_decreasing)):
|
||||
row = batch[ids_sorted_decreasing[i]]
|
||||
|
||||
text = row[0]
|
||||
text_padded[i, :text.size(0)] = text
|
||||
text_lengths[i] = text.size(0)
|
||||
|
||||
spec = row[1]
|
||||
spec_padded[i, :, :spec.size(1)] = spec
|
||||
spec_lengths[i] = spec.size(1)
|
||||
|
||||
wav = row[2]
|
||||
wav_padded[i, :, :wav.size(1)] = wav
|
||||
wav_lengths[i] = wav.size(1)
|
||||
|
||||
if self.return_ids:
|
||||
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
|
||||
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths
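
# Hedged usage sketch: wire this collator into a DataLoader. The filelist
# name and batch size are illustrative assumptions.
#   dataset = TextAudioLoader("filelist.txt", hparams)
#   loader = torch.utils.data.DataLoader(
#       dataset, batch_size=16, collate_fn=TextAudioCollate())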
|
||||
|
||||
|
||||
"""Multi speaker version"""
|
||||
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
|
||||
"""
|
||||
1) loads audio, speaker_id, text pairs
|
||||
2) normalizes text and converts them to sequences of integers
|
||||
3) computes spectrograms from audio files.
|
||||
"""
|
||||
def __init__(self, audiopaths_sid_text, hparams, no_text=False, augmentation=False, augmentation_params=None, no_use_textfile = False):
|
||||
if no_use_textfile:
|
||||
self.audiopaths_sid_text = list()
|
||||
else:
|
||||
self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
|
||||
|
||||
self.text_cleaners = hparams.text_cleaners
|
||||
self.max_wav_value = hparams.max_wav_value
|
||||
self.sampling_rate = hparams.sampling_rate
|
||||
self.filter_length = hparams.filter_length
|
||||
self.hop_length = hparams.hop_length
|
||||
self.win_length = hparams.win_length
|
||||
self.sampling_rate = hparams.sampling_rate
|
||||
self.no_text = no_text
|
||||
self.augmentation = augmentation
|
||||
if augmentation :
|
||||
self.gain_p = augmentation_params.gain_p
|
||||
self.min_gain_in_db = augmentation_params.min_gain_in_db
|
||||
self.max_gain_in_db = augmentation_params.max_gain_in_db
|
||||
self.time_stretch_p = augmentation_params.time_stretch_p
|
||||
self.min_rate = augmentation_params.min_rate
|
||||
self.max_rate = augmentation_params.max_rate
|
||||
self.pitch_shift_p = augmentation_params.pitch_shift_p
|
||||
self.min_semitones = augmentation_params.min_semitones
|
||||
self.max_semitones = augmentation_params.max_semitones
|
||||
self.add_gaussian_noise_p = augmentation_params.add_gaussian_noise_p
|
||||
self.min_amplitude = augmentation_params.min_amplitude
|
||||
self.max_amplitude = augmentation_params.max_amplitude
|
||||
self.frequency_mask_p = augmentation_params.frequency_mask_p
|
||||
|
||||
self.cleaned_text = getattr(hparams, "cleaned_text", False)
|
||||
|
||||
self.add_blank = hparams.add_blank
|
||||
self.min_text_len = getattr(hparams, "min_text_len", 1)
|
||||
self.max_text_len = getattr(hparams, "max_text_len", 1000)
|
||||
|
||||
random.seed(1234)
|
||||
random.shuffle(self.audiopaths_sid_text)
|
||||
self._filter()
|
||||
|
||||
@retry(tries=30, delay=10)
|
||||
def _filter(self):
|
||||
"""
|
||||
Filter text & store spec lengths
|
||||
"""
|
||||
audiopaths_sid_text_new = []
|
||||
lengths = []
|
||||
|
||||
# for audiopath, sid, text in tqdm.tqdm(self.audiopaths_sid_text):
|
||||
for audiopath, sid, text in self.audiopaths_sid_text:
|
||||
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
|
||||
audiopaths_sid_text_new.append([audiopath, sid, text])
|
||||
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
|
||||
self.audiopaths_sid_text = audiopaths_sid_text_new
|
||||
self.lengths = lengths
|
||||
|
||||
def get_audio_text_speaker_pair(self, audiopath_sid_text):
|
||||
# separate filename, speaker_id and text
|
||||
wavdata, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
|
||||
text = self.get_text(text)
|
||||
if self.no_text:
|
||||
text = self.get_text("a")
|
||||
spec, wav = self.get_audio(wavdata)
|
||||
sid = self.get_sid(sid)
|
||||
return (text, spec, wav, sid)
|
||||
|
||||
@retry(exceptions=(PermissionError), tries=100, delay=10)
|
||||
def get_audio(self, wavdata):
|
||||
# Audio is handled as a torch vector normalized to within ±1.0, wrapped in one outer dimension via unsqueeze(0)
|
||||
audio = torch.FloatTensor(wavdata.astype(np.float32))
|
||||
sampling_rate=24000
|
||||
try:
|
||||
if sampling_rate != self.sampling_rate:
|
||||
raise ValueError("[Error] Exception: source {} SR doesn't match target {} SR".format(
|
||||
sampling_rate, self.sampling_rate))
|
||||
except ValueError as e:
|
||||
print(e)
|
||||
exit()
|
||||
audio_norm = self.get_normalized_audio(audio, self.max_wav_value)
|
||||
|
||||
if self.augmentation:
|
||||
audio_augmented = self.add_augmentation(audio_norm, sampling_rate)
|
||||
audio_noised = self.add_noise(audio_augmented, sampling_rate)
|
||||
# Clamp values pushed out of the valid range by post-normalization augmentation and added noise
|
||||
audio_augmented = torch.clamp(audio_augmented, -1, 1)
|
||||
audio_noised = torch.clamp(audio_noised, -1, 1)
|
||||
# audio (the waveform) serves as the training target, so use the augmented version without noise
|
||||
audio_norm = audio_augmented
|
||||
# spec (the spectrogram) serves as the model input, so use the augmented version with noise added on top
|
||||
spec = spectrogram_torch(audio_noised, self.filter_length,
|
||||
self.sampling_rate, self.hop_length, self.win_length,
|
||||
center=False)
|
||||
spec_noised = self.add_spectrogram_noise(spec)
|
||||
spec = torch.squeeze(spec_noised, 0)
|
||||
else:
|
||||
spec = spectrogram_torch(audio_norm, self.filter_length,
|
||||
self.sampling_rate, self.hop_length, self.win_length,
|
||||
center=False)
|
||||
spec = torch.squeeze(spec, 0)
|
||||
return spec, audio_norm
|
||||
|
||||
def add_augmentation(self, audio, sampling_rate):
|
||||
gain_in_db = 0.0
|
||||
if random.random() <= self.gain_p:
|
||||
gain_in_db = random.uniform(self.min_gain_in_db, self.max_gain_in_db)
|
||||
time_stretch_rate = 1.0
|
||||
if random.random() <= self.time_stretch_p:
|
||||
time_stretch_rate = random.uniform(self.min_rate, self.max_rate)
|
||||
pitch_shift_semitones = 0
|
||||
if random.random() <= self.pitch_shift_p:
|
||||
pitch_shift_semitones = random.uniform(self.min_semitones, self.max_semitones) * 100 # sox "pitch" expects units of 1/100 semitone (cents)
|
||||
augmentation_effects = [
|
||||
["gain", f"{gain_in_db}"],
|
||||
["tempo", f"{time_stretch_rate}"],
|
||||
["pitch", f"{pitch_shift_semitones}"],
|
||||
["rate", f"{sampling_rate}"]
|
||||
]
|
||||
audio_augmented, _ = torchaudio.sox_effects.apply_effects_tensor(audio, sampling_rate, augmentation_effects)
|
||||
return audio_augmented
|
||||
|
||||
def add_noise(self, audio, sampling_rate):
|
||||
# AddGaussianNoise
|
||||
audio = self.add_gaussian_noise(audio)
|
||||
return audio
|
||||
|
||||
def add_gaussian_noise(self, audio):
|
||||
assert self.min_amplitude >= 0.0
|
||||
assert self.max_amplitude >= 0.0
|
||||
assert self.max_amplitude >= self.min_amplitude
|
||||
if random.random() > self.add_gaussian_noise_p:
|
||||
return audio
|
||||
amplitude = random.uniform(self.min_amplitude, self.max_amplitude)
|
||||
noise = torch.randn(audio.size())
|
||||
noised_audio = audio + amplitude * noise
|
||||
return noised_audio
|
||||
|
||||
def add_spectrogram_noise(self, spec):
|
||||
# FrequencyMask
|
||||
masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
|
||||
masked = masking(spec)
|
||||
return masked
|
||||
|
||||
def get_normalized_audio(self, audio, max_wav_value):
|
||||
audio_norm = audio / max_wav_value
|
||||
audio_norm = audio_norm.unsqueeze(0)
|
||||
return audio_norm
|
||||
|
||||
def get_text(self, text):
|
||||
if self.cleaned_text:
|
||||
text_norm = cleaned_text_to_sequence(text)
|
||||
else:
|
||||
text_norm = text_to_sequence(text, self.text_cleaners)
|
||||
if self.add_blank:
|
||||
text_norm = commons.intersperse(text_norm, 0)
|
||||
text_norm = torch.LongTensor(text_norm)
|
||||
return text_norm
|
||||
|
||||
def get_sid(self, sid):
|
||||
sid = torch.LongTensor([int(sid)])
|
||||
return sid
|
||||
|
||||
def __getitem__(self, index):
|
||||
return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
|
||||
|
||||
def __len__(self):
|
||||
return len(self.audiopaths_sid_text)
|
||||
|
||||
|
||||
class TextAudioSpeakerCollate():
|
||||
""" Zero-pads model inputs and targets
|
||||
"""
|
||||
def __init__(self, return_ids=False, no_text = False):
|
||||
self.return_ids = return_ids
|
||||
self.no_text = no_text
|
||||
|
||||
def __call__(self, batch):
|
||||
"""Collate's training batch from normalized text, audio and speaker identities
|
||||
PARAMS
|
||||
------
|
||||
batch: [text_normalized, spec_normalized, wav_normalized, sid]
|
||||
"""
|
||||
# Right zero-pad all one-hot text sequences to max input length
|
||||
|
||||
_, ids_sorted_decreasing = torch.sort(
|
||||
torch.LongTensor([x[1].size(1) for x in batch]),
|
||||
dim=0, descending=True)
|
||||
|
||||
max_text_len = max([len(x[0]) for x in batch])
|
||||
max_spec_len = max([x[1].size(1) for x in batch])
|
||||
max_wav_len = max([x[2].size(1) for x in batch])
|
||||
|
||||
text_lengths = torch.LongTensor(len(batch))
|
||||
spec_lengths = torch.LongTensor(len(batch))
|
||||
wav_lengths = torch.LongTensor(len(batch))
|
||||
sid = torch.LongTensor(len(batch))
|
||||
|
||||
text_padded = torch.LongTensor(len(batch), max_text_len)
|
||||
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
|
||||
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
|
||||
text_padded.zero_()
|
||||
spec_padded.zero_()
|
||||
wav_padded.zero_()
|
||||
for i in range(len(ids_sorted_decreasing)):
|
||||
row = batch[ids_sorted_decreasing[i]]
|
||||
|
||||
text = row[0]
|
||||
text_padded[i, :text.size(0)] = text
|
||||
text_lengths[i] = text.size(0)
|
||||
|
||||
spec = row[1]
|
||||
spec_padded[i, :, :spec.size(1)] = spec
|
||||
spec_lengths[i] = spec.size(1)
|
||||
|
||||
wav = row[2]
|
||||
wav_padded[i, :, :wav.size(1)] = wav
|
||||
wav_lengths[i] = wav.size(1)
|
||||
|
||||
sid[i] = row[3]
|
||||
|
||||
if self.return_ids:
|
||||
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
|
||||
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
|
||||
|
||||
|
||||
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
|
||||
"""
|
||||
Maintain similar input lengths in a batch.
|
||||
Length groups are specified by boundaries.
|
||||
Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
|
||||
|
||||
It removes samples which are not included in the boundaries.
|
||||
Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
|
||||
"""
|
||||
def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
|
||||
super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
|
||||
self.lengths = dataset.lengths
|
||||
self.batch_size = batch_size
|
||||
self.boundaries = boundaries
|
||||
|
||||
self.buckets, self.num_samples_per_bucket = self._create_buckets()
|
||||
self.total_size = sum(self.num_samples_per_bucket)
|
||||
self.num_samples = self.total_size // self.num_replicas
|
||||
|
||||
def _create_buckets(self):
|
||||
buckets = [[] for _ in range(len(self.boundaries) - 1)]
|
||||
for i in range(len(self.lengths)):
|
||||
length = self.lengths[i]
|
||||
idx_bucket = self._bisect(length)
|
||||
if idx_bucket != -1:
|
||||
buckets[idx_bucket].append(i)
|
||||
|
||||
for i in range(len(buckets) - 1, 0, -1):
|
||||
if len(buckets[i]) == 0:
|
||||
buckets.pop(i)
|
||||
self.boundaries.pop(i+1)
|
||||
|
||||
num_samples_per_bucket = []
|
||||
for i in range(len(buckets)):
|
||||
len_bucket = len(buckets[i])
|
||||
total_batch_size = self.num_replicas * self.batch_size
|
||||
rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
|
||||
num_samples_per_bucket.append(len_bucket + rem)
|
||||
return buckets, num_samples_per_bucket
|
||||
|
||||
def __iter__(self):
|
||||
# deterministically shuffle based on epoch
|
||||
g = torch.Generator()
|
||||
g.manual_seed(self.epoch)
|
||||
|
||||
indices = []
|
||||
if self.shuffle:
|
||||
for bucket in self.buckets:
|
||||
indices.append(torch.randperm(len(bucket), generator=g).tolist())
|
||||
else:
|
||||
for bucket in self.buckets:
|
||||
indices.append(list(range(len(bucket))))
|
||||
|
||||
batches = []
|
||||
for i in range(len(self.buckets)):
|
||||
next_bucket = (i+1) % len(self.buckets)
|
||||
bucket = self.buckets[i]
|
||||
len_bucket = len(bucket)
|
||||
ids_bucket = indices[i]
|
||||
num_samples_bucket = self.num_samples_per_bucket[i]
|
||||
|
||||
if len_bucket == 0:
|
||||
print("[Warn] Exception: length of buckets {} is 0. ID:{} Skip.".format(i,i))
|
||||
continue
|
||||
|
||||
# add extra samples to make it evenly divisible
|
||||
rem = num_samples_bucket - len_bucket
|
||||
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
|
||||
|
||||
# subsample
|
||||
ids_bucket = ids_bucket[self.rank::self.num_replicas]
|
||||
|
||||
# batching
|
||||
for j in range(len(ids_bucket) // self.batch_size):
|
||||
batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
|
||||
batches.append(batch)
|
||||
|
||||
if self.shuffle:
|
||||
batch_ids = torch.randperm(len(batches), generator=g).tolist()
|
||||
batches = [batches[i] for i in batch_ids]
|
||||
self.batches = batches
|
||||
|
||||
assert len(self.batches) * self.batch_size == self.num_samples
|
||||
return iter(self.batches)
|
||||
|
||||
def _bisect(self, x, lo=0, hi=None):
|
||||
if hi is None:
|
||||
hi = len(self.boundaries) - 1
|
||||
|
||||
if hi > lo:
|
||||
mid = (hi + lo) // 2
|
||||
if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
|
||||
return mid
|
||||
elif x <= self.boundaries[mid]:
|
||||
return self._bisect(x, lo, mid)
|
||||
else:
|
||||
return self._bisect(x, mid + 1, hi)
|
||||
else:
|
||||
return -1
|
||||
|
||||
def __len__(self):
|
||||
return self.num_samples // self.batch_size
|
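Taken together, the loader, collate function, and bucket sampler above plug into a standard PyTorch DataLoader. A minimal sketch, assuming `dataset` is an already-constructed TextAudioSpeakerLoader; the batch size and boundaries here are illustrative, not this repo's actual configuration:

from torch.utils.data import DataLoader

sampler = DistributedBucketSampler(
    dataset,
    batch_size=16,
    boundaries=[32, 300, 400, 500, 600, 700, 800, 900, 1000],
    num_replicas=1,  # single-process example
    rank=0,
    shuffle=True)
loader = DataLoader(dataset, num_workers=2, pin_memory=True,
                    collate_fn=TextAudioSpeakerCollate(),
                    batch_sampler=sampler)
for text, text_len, spec, spec_len, wav, wav_len, sid in loader:
    break  # every tensor in the batch is zero-padded to the longest item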
114
demo/mod/mel_processing.py
Executable file
@@ -0,0 +1,114 @@
import math
import os
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data
import numpy as np
import librosa
import librosa.util as librosa_util
from librosa.util import normalize, pad_center, tiny
from scipy.signal import get_window
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn


MAX_WAV_VALUE = 32768.0


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec


def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    global mel_basis
    dtype_device = str(spec.dtype) + '_' + str(spec.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)
    return spec


def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
    spec = torch.view_as_real(spec)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)

    return spec
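For orientation, here is how the two main entry points above behave on dummy data. The STFT parameters are illustrative, and the positional librosa_mel_fn call above assumes the older librosa (pre-0.10) API:

import torch

y = torch.randn(1, 24000).clamp(-1., 1.)  # one second of fake audio at 24 kHz
spec = spectrogram_torch(y, n_fft=512, sampling_rate=24000,
                         hop_size=128, win_size=512, center=False)
print(spec.shape)  # [1, 257, frames]: n_fft // 2 + 1 linear-frequency bins
mel = spec_to_mel_torch(spec, n_fft=512, num_mels=80,
                        sampling_rate=24000, fmin=0.0, fmax=None)
print(mel.shape)   # [1, 80, frames]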
407
demo/mod/models.py
Executable file
@@ -0,0 +1,407 @@
import copy
import math
import torch
from torch import nn
from torch.nn import functional as F

import commons
import modules
import attentions
import monotonic_align

from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from commons import init_weights, get_padding


class TextEncoder(nn.Module):
    def __init__(self,
                 n_vocab,
                 out_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout):
        super().__init__()
        self.n_vocab = n_vocab
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout

        self.emb = nn.Embedding(n_vocab, hidden_channels)
        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)

        self.encoder = attentions.Encoder(
            hidden_channels,
            filter_channels,
            n_heads,
            n_layers,
            kernel_size,
            p_dropout)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths):
        x = self.emb(x) * math.sqrt(self.hidden_channels)  # [b, t, h]
        x = torch.transpose(x, 1, -1)  # [b, h, t]
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)

        x = self.encoder(x * x_mask, x_mask)
        stats = self.proj(x) * x_mask

        m, logs = torch.split(stats, self.out_channels, dim=1)
        return x, m, logs, x_mask


class ResidualCouplingBlock(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 n_flows=4,
                 gin_channels=0):
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.n_flows = n_flows
        self.gin_channels = gin_channels

        self.flows = nn.ModuleList()
        for i in range(n_flows):
            self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
            self.flows.append(modules.Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        if not reverse:
            for flow in self.flows:
                x, _ = flow(x, x_mask, g=g, reverse=reverse)
        else:
            for flow in reversed(self.flows):
                x = flow(x, x_mask, g=g, reverse=reverse)
        return x


class PosteriorEncoder(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 gin_channels=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
        x = self.pre(x) * x_mask
        x = self.enc(x, x_mask, g=g)
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask


class Generator(torch.nn.Module):
    def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
        super(Generator, self).__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
        resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            self.ups.append(weight_norm(
                ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
                                k, u, padding=(k-u)//2)))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel//(2**(i+1))
            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
                self.resblocks.append(resblock(ch, k, d))

        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
        self.ups.apply(init_weights)

        if gin_channels != 0:
            # self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
            gin_channels = 0

    def forward(self, x, g=None):
        x = self.conv_pre(x)
        if g is not None:
            # x = x + self.cond(g)
            g = None

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i*self.num_kernels+j](x)
                else:
                    xs += self.resblocks[i*self.num_kernels+j](x)
            x = xs / self.num_kernels
        x = F.leaky_relu(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
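One constraint worth making explicit: the Generator's transposed convolutions multiply the latent frame rate back up to the sample rate, so the product of upsample_rates must equal the STFT hop length used to build the input spectrograms. A quick sanity check with illustrative values (not read from this repo's config):

upsample_rates = [8, 8, 2, 2]   # hypothetical config values
hop_length = 256
factor = 1
for u in upsample_rates:
    factor *= u
assert factor == hop_length  # one latent frame -> hop_length waveform samples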

class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = spectral_norm if use_spectral_norm else weight_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(MultiPeriodDiscriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
        self.discriminators = nn.ModuleList(discs)

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class SynthesizerTrn(nn.Module):
    """
    Synthesizer for Training
    """

    def __init__(self,
                 n_vocab,
                 spec_channels,
                 segment_size,
                 inter_channels,
                 hidden_channels,
                 filter_channels,
                 n_heads,
                 n_layers,
                 kernel_size,
                 p_dropout,
                 resblock,
                 resblock_kernel_sizes,
                 resblock_dilation_sizes,
                 upsample_rates,
                 upsample_initial_channel,
                 upsample_kernel_sizes,
                 n_flow,
                 n_speakers=0,
                 gin_channels=0,
                 use_sdp=True,
                 **kwargs):

        super().__init__()
        self.n_vocab = n_vocab
        self.spec_channels = spec_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.n_speakers = n_speakers
        self.gin_channels = gin_channels

        self.use_sdp = use_sdp

        self.enc_p = TextEncoder(n_vocab,
                                 inter_channels,
                                 hidden_channels,
                                 filter_channels,
                                 n_heads,
                                 n_layers,
                                 kernel_size,
                                 p_dropout)
        self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
        self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
        self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, n_flows=n_flow, gin_channels=gin_channels)

        if n_speakers > 1:
            self.emb_g = nn.Embedding(n_speakers, gin_channels)

    def forward(self, x, x_lengths, y, y_lengths, sid=None):

        x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
        if self.n_speakers > 0:
            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
        else:
            g = None

        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
        z_p = self.flow(z, y_mask, g=g)

        with torch.no_grad():
            # negative cross-entropy
            s_p_sq_r = torch.exp(-2 * logs_p)  # [b, d, t]
            neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True)  # [b, 1, t_s]
            neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r)  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
            neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r))  # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
            neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True)  # [b, 1, t_s]
            neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4

            attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
            attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()

        # expand prior
        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)

        z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
        o = self.dec(z_slice, g=g)
        return o, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)

    def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
        assert self.n_speakers > 0, "n_speakers has to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        o_hat = self.dec(z_hat * y_mask, g=g_tgt)
        return o_hat, y_mask, (z, z_p, z_hat)

    def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt):
        assert self.n_speakers > 0, "n_speakers has to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
        o_hat = self.dec(z * y_mask, g=g_tgt)
        return o_hat, y_mask, (z)

    def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt):
        assert self.n_speakers > 0, "n_speakers has to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
        o_hat = self.dec(z * y_mask, g=g_src)
        return o_hat, y_mask, (z)

    def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt):
        assert self.n_speakers > 0, "n_speakers has to be larger than 0."
        g_src = self.emb_g(sid_src).unsqueeze(-1)
        g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
        z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
        z_p = self.flow(z, y_mask, g=g_src)
        z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
        z_p_hat = self.flow(z_hat, y_mask, g=g_tgt)
        z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True)
        o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt)
        return o_hat, y_mask, (z, z_p, z_hat)
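The voice_conversion path is the one the demo exercises: encode the source spectrogram with the source speaker embedding, push it through the flow, invert the flow under the target embedding, and decode. A hedged inference sketch; `net_g`, `spec`, and `spec_lengths` are assumed to exist and to match the model's hparams:

import torch

net_g.eval()
with torch.no_grad():
    sid_src = torch.LongTensor([0])  # source speaker id
    sid_tgt = torch.LongTensor([1])  # target speaker id
    # spec: [1, spec_channels, T], spec_lengths: [1]
    audio, y_mask, _ = net_g.voice_conversion(spec, spec_lengths, sid_src, sid_tgt)
    audio = audio[0, 0].cpu().numpy()  # decoded waveform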
390
demo/mod/modules.py
Executable file
@@ -0,0 +1,390 @@
import copy
import math
import numpy as np
import scipy
import torch
from torch import nn
from torch.nn import functional as F

from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm

import commons
from commons import init_weights, get_padding
from transforms import piecewise_rational_quadratic_transform


LRELU_SLOPE = 0.1


class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)


class ConvReluNorm(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
        super().__init__()
        self.in_channels = in_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout
        assert n_layers > 1, "Number of layers should be larger than 1."

        self.conv_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
        self.norm_layers.append(LayerNorm(hidden_channels))
        self.relu_drop = nn.Sequential(
            nn.ReLU(),
            nn.Dropout(p_dropout))
        for _ in range(n_layers-1):
            self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
            self.norm_layers.append(LayerNorm(hidden_channels))
        self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask):
        x_org = x
        for i in range(self.n_layers):
            x = self.conv_layers[i](x * x_mask)
            x = self.norm_layers[i](x)
            x = self.relu_drop(x)
        x = x_org + self.proj(x)
        return x * x_mask


class DDSConv(nn.Module):
    """
    Dilated and Depth-Separable Convolution
    """
    def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
        super().__init__()
        self.channels = channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.p_dropout = p_dropout

        self.drop = nn.Dropout(p_dropout)
        self.convs_sep = nn.ModuleList()
        self.convs_1x1 = nn.ModuleList()
        self.norms_1 = nn.ModuleList()
        self.norms_2 = nn.ModuleList()
        for i in range(n_layers):
            dilation = kernel_size ** i
            padding = (kernel_size * dilation - dilation) // 2
            self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
                                            groups=channels, dilation=dilation, padding=padding))
            self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
            self.norms_1.append(LayerNorm(channels))
            self.norms_2.append(LayerNorm(channels))

    def forward(self, x, x_mask, g=None):
        if g is not None:
            x = x + g
        for i in range(self.n_layers):
            y = self.convs_sep[i](x * x_mask)
            y = self.norms_1[i](y)
            y = F.gelu(y)
            y = self.convs_1x1[i](y)
            y = self.norms_2[i](y)
            y = F.gelu(y)
            y = self.drop(y)
            x = x + y
        return x * x_mask


class WN(torch.nn.Module):
    def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.gin_channels = gin_channels
        self.p_dropout = p_dropout

        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels != 0:
            cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
            self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')

        for i in range(n_layers):
            dilation = dilation_rate ** i
            padding = int((kernel_size * dilation - dilation) / 2)
            in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
                                       dilation=dilation, padding=padding)
            in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
            self.in_layers.append(in_layer)

            # last one is not necessary
            if i < n_layers - 1:
                res_skip_channels = 2 * hidden_channels
            else:
                res_skip_channels = hidden_channels

            res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
            res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
            self.res_skip_layers.append(res_skip_layer)

    def forward(self, x, x_mask, g=None, **kwargs):
        output = torch.zeros_like(x)
        n_channels_tensor = torch.IntTensor([self.hidden_channels])

        if g is not None:
            g = self.cond_layer(g)

        for i in range(self.n_layers):
            x_in = self.in_layers[i](x)
            if g is not None:
                cond_offset = i * 2 * self.hidden_channels
                g_l = g[:, cond_offset:cond_offset+2*self.hidden_channels, :]
            else:
                g_l = torch.zeros_like(x_in)

            acts = commons.fused_add_tanh_sigmoid_multiply(
                x_in,
                g_l,
                n_channels_tensor)
            acts = self.drop(acts)

            res_skip_acts = self.res_skip_layers[i](acts)
            if i < self.n_layers - 1:
                res_acts = res_skip_acts[:, :self.hidden_channels, :]
                x = (x + res_acts) * x_mask
                output = output + res_skip_acts[:, self.hidden_channels:, :]
            else:
                output = output + res_skip_acts
        return output * x_mask

    def remove_weight_norm(self):
        if self.gin_channels != 0:
            torch.nn.utils.remove_weight_norm(self.cond_layer)
        for l in self.in_layers:
            torch.nn.utils.remove_weight_norm(l)
        for l in self.res_skip_layers:
            torch.nn.utils.remove_weight_norm(l)


class ResBlock1(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c1, c2 in zip(self.convs1, self.convs2):
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c1(xt)
            xt = F.leaky_relu(xt, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c2(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)

    def forward(self, x, x_mask=None):
        for c in self.convs:
            xt = F.leaky_relu(x, LRELU_SLOPE)
            if x_mask is not None:
                xt = xt * x_mask
            xt = c(xt)
            x = xt + x
        if x_mask is not None:
            x = x * x_mask
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Log(nn.Module):
    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
            logdet = torch.sum(-y, [1, 2])
            return y, logdet
        else:
            x = torch.exp(x) * x_mask
            return x


class Flip(nn.Module):
    def forward(self, x, *args, reverse=False, **kwargs):
        x = torch.flip(x, [1])
        if not reverse:
            logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
            return x, logdet
        else:
            return x


class ElementwiseAffine(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.channels = channels
        self.m = nn.Parameter(torch.zeros(channels, 1))
        self.logs = nn.Parameter(torch.zeros(channels, 1))

    def forward(self, x, x_mask, reverse=False, **kwargs):
        if not reverse:
            y = self.m + torch.exp(self.logs) * x
            y = y * x_mask
            logdet = torch.sum(self.logs * x_mask, [1, 2])
            return y, logdet
        else:
            x = (x - self.m) * torch.exp(-self.logs) * x_mask
            return x


class ResidualCouplingLayer(nn.Module):
    def __init__(self,
                 channels,
                 hidden_channels,
                 kernel_size,
                 dilation_rate,
                 n_layers,
                 p_dropout=0,
                 gin_channels=0,
                 mean_only=False):
        assert channels % 2 == 0, "channels should be divisible by 2"
        super().__init__()
        self.channels = channels
        self.hidden_channels = hidden_channels
        self.kernel_size = kernel_size
        self.dilation_rate = dilation_rate
        self.n_layers = n_layers
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels]*2, 1)
        h = self.pre(x0) * x_mask
        h = self.enc(h, x_mask, g=g)
        stats = self.post(h) * x_mask
        if not self.mean_only:
            m, logs = torch.split(stats, [self.half_channels]*2, 1)
        else:
            m = stats
            logs = torch.zeros_like(m)

        if not reverse:
            x1 = m + x1 * torch.exp(logs) * x_mask
            x = torch.cat([x0, x1], 1)
            logdet = torch.sum(logs, [1, 2])
            return x, logdet
        else:
            x1 = (x1 - m) * torch.exp(-logs) * x_mask
            x = torch.cat([x0, x1], 1)
            return x


class ConvFlow(nn.Module):
    def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
        super().__init__()
        self.in_channels = in_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.n_layers = n_layers
        self.num_bins = num_bins
        self.tail_bound = tail_bound
        self.half_channels = in_channels // 2

        self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
        self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
        self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        x0, x1 = torch.split(x, [self.half_channels]*2, 1)
        h = self.pre(x0)
        h = self.convs(h, x_mask, g=g)
        h = self.proj(h) * x_mask

        b, c, t = x0.shape
        h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2)  # [b, cx?, t] -> [b, c, t, ?]

        unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
        unnormalized_derivatives = h[..., 2 * self.num_bins:]

        x1, logabsdet = piecewise_rational_quadratic_transform(x1,
                                                               unnormalized_widths,
                                                               unnormalized_heights,
                                                               unnormalized_derivatives,
                                                               inverse=reverse,
                                                               tails='linear',
                                                               tail_bound=self.tail_bound)

        x = torch.cat([x0, x1], 1) * x_mask
        logdet = torch.sum(logabsdet * x_mask, [1, 2])
        if not reverse:
            return x, logdet
        else:
            return x
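Since the coupling layers carry the whole invertibility story, a small self-check is useful. This sketch assumes the file above is importable as modules and uses illustrative sizes:

import torch
from modules import ResidualCouplingLayer

layer = ResidualCouplingLayer(channels=4, hidden_channels=8,
                              kernel_size=5, dilation_rate=1, n_layers=2)
x = torch.randn(1, 4, 10)
x_mask = torch.ones(1, 1, 10)
y, logdet = layer(x, x_mask)                # forward pass
x_rec = layer(y, x_mask, reverse=True)      # inverse pass
print(torch.allclose(x, x_rec, atol=1e-5))  # True: the coupling is invertible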
19
demo/mod/monotonic_align/__init__.py
Executable file
@@ -0,0 +1,19 @@
import numpy as np
import torch
from .monotonic_align.core import maximum_path_c


def maximum_path(neg_cent, mask):
    """ Cython optimized version.
    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]
    """
    device = neg_cent.device
    dtype = neg_cent.dtype
    neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(neg_cent.shape, dtype=np.int32)

    t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
    maximum_path_c(path, neg_cent, t_t_max, t_s_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)
42
demo/mod/monotonic_align/core.pyx
Executable file
@@ -0,0 +1,42 @@
cimport cython
from cython.parallel import prange


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
    cdef int x
    cdef int y
    cdef float v_prev
    cdef float v_cur
    cdef float tmp
    cdef int index = t_x - 1

    for y in range(t_y):
        for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
            if x == y:
                v_cur = max_neg_val
            else:
                v_cur = value[y-1, x]
            if x == 0:
                if y == 0:
                    v_prev = 0.
                else:
                    v_prev = max_neg_val
            else:
                v_prev = value[y-1, x-1]
            value[y, x] += max(v_prev, v_cur)

    for y in range(t_y - 1, -1, -1):
        path[y, index] = 1
        if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
            index = index - 1


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
    cdef int b = paths.shape[0]
    cdef int i
    for i in prange(b, nogil=True):
        maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])
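The repo ships prebuilt core extension binaries for CPython 3.7 and 3.9 (the two .so files listed next), and the setup.py that follows them is not a build script (it duplicates the maximum_path wrapper). If core.pyx ever needs rebuilding for another interpreter, a conventional Cython build file would look roughly like this sketch (hypothetical, not part of this diff):

from setuptools import setup, Extension
from Cython.Build import cythonize
import numpy

setup(
    name='monotonic_align',
    ext_modules=cythonize(Extension(
        'monotonic_align.core',
        sources=['core.pyx'],
        include_dirs=[numpy.get_include()],  # harmless here; core.pyx itself only uses typed memoryviews
    )),
)
# build in place with: python setup.py build_ext --inplace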
BIN
demo/mod/monotonic_align/monotonic_align/core.cpython-37m-x86_64-linux-gnu.so
Executable file
BIN
demo/mod/monotonic_align/monotonic_align/core.cpython-39-x86_64-linux-gnu.so
Executable file
23
demo/mod/monotonic_align/setup.py
Executable file
@@ -0,0 +1,23 @@
import numpy as np
import torch
import sys
print(sys.path)
sys.path.append("/backend/mod/")
print(sys.path)
from .monotonic_align.core import maximum_path_c


def maximum_path(neg_cent, mask):
    """ Cython optimized version.
    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]
    """
    device = neg_cent.device
    dtype = neg_cent.dtype
    neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
    path = np.zeros(neg_cent.shape, dtype=np.int32)

    t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
    t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
    maximum_path_c(path, neg_cent, t_t_max, t_s_max)
    return torch.from_numpy(path).to(device=device, dtype=dtype)
54
demo/mod/text/__init__.py
Executable file
@@ -0,0 +1,54 @@
""" from https://github.com/keithito/tacotron """
from text import cleaners
from text.symbols import symbols


# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}


def text_to_sequence(text, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = []

    clean_text = _clean_text(text, cleaner_names)
    for symbol in clean_text:
        symbol_id = _symbol_to_id[symbol]
        sequence += [symbol_id]
    return sequence


def cleaned_text_to_sequence(cleaned_text):
    '''Converts already-cleaned text to a sequence of IDs corresponding to the symbols in it.
    Args:
        cleaned_text: string (or list of symbol tokens) to convert to a sequence
    Returns:
        List of integers corresponding to the symbols in the text
    '''
    sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
    return sequence


def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    result = ''
    for symbol_id in sequence:
        s = _id_to_symbol[symbol_id]
        result += s
    return result


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
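A round-trip sketch, assuming the package is importable as text and that the Japanese phoneme symbol set defined at the end of symbols.py (below) is the one in effect:

from text import cleaned_text_to_sequence, sequence_to_text

phonemes = "k-o-n-n-i-ch-i-w-a".split("-")  # tokens as japanese_cleaners produces them
seq = cleaned_text_to_sequence(phonemes)
print(seq)                    # one integer id per phoneme token
print(sequence_to_text(seq))  # "konnichiwa" (tokens concatenated back)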
105
demo/mod/text/cleaners.py
Executable file
@@ -0,0 +1,105 @@
""" The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d. """
""" from https://github.com/keithito/tacotron """

'''
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
'''

import re
from unidecode import unidecode
from phonemizer import phonemize


# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    return normalize_numbers(text)


def lowercase(text):
    return text.lower()


def collapse_whitespace(text):
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    return unidecode(text)


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    '''Pipeline for English text, including abbreviation expansion.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_abbreviations(text)
    phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
    phonemes = collapse_whitespace(phonemes)
    return phonemes


def english_cleaners2(text):
    '''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_abbreviations(text)
    phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
    phonemes = collapse_whitespace(phonemes)
    return phonemes


def japanese_cleaners(text):
    phonemes = text.split('-')
    return phonemes
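Note that expand_numbers calls normalize_numbers, which is neither defined nor imported in this file, so it will raise NameError as written (the upstream Tacotron cleaners import it from a companion numbers module). The espeak-backed pipelines also require phonemizer plus an espeak install, but the pure-Python helpers can be tried directly:

print(expand_abbreviations("Dr. Smith met Mr. Jones."))
# -> "doctor Smith met mister Jones."
print(collapse_whitespace("too   many    spaces"))
# -> "too many spaces"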
64
demo/mod/text/symbols.py
Executable file
@@ -0,0 +1,64 @@
""" The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d """
""" from https://github.com/keithito/tacotron """

'''
Defines the set of symbols used in text input to the model.
'''
_pad = '_'
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"


# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

# Special symbol ids
SPACE_ID = symbols.index(" ")

symbols = [
    "A",
    "E",
    "I",
    "N",
    "O",
    "U",
    "a",
    "b",
    "by",
    "ch",
    "cl",
    "d",
    "dy",
    "e",
    "f",
    "g",
    "gy",
    "h",
    "hy",
    "i",
    "j",
    "k",
    "ky",
    "m",
    "my",
    "n",
    "ny",
    "o",
    "p",
    "py",
    "r",
    "ry",
    "s",
    "sh",
    "t",
    "ts",
    "ty",
    "u",
    "v",
    "w",
    "y",
    "z",
    "pau",
    "sil"
]
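Note that the second symbols assignment replaces the exported IPA-based list with this Japanese phoneme set, so SPACE_ID (computed from the first list) no longer indexes a space in the final list. The id maps in text/__init__.py are built from the final list, e.g.:

_symbol_to_id = {s: i for i, s in enumerate(symbols)}
print(_symbol_to_id["a"], _symbol_to_id["ch"], _symbol_to_id["pau"])
# -> 6 9 42 with the list above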
193
demo/mod/transforms.py
Executable file
@ -0,0 +1,193 @@
|
||||
import torch
|
||||
from torch.nn import functional as F
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
DEFAULT_MIN_BIN_WIDTH = 1e-3
|
||||
DEFAULT_MIN_BIN_HEIGHT = 1e-3
|
||||
DEFAULT_MIN_DERIVATIVE = 1e-3
|
||||
|
||||
|
||||
def piecewise_rational_quadratic_transform(inputs,
|
||||
unnormalized_widths,
|
||||
unnormalized_heights,
|
||||
unnormalized_derivatives,
|
||||
inverse=False,
|
||||
tails=None,
|
||||
tail_bound=1.,
|
||||
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
|
||||
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
|
||||
min_derivative=DEFAULT_MIN_DERIVATIVE):
|
||||
|
||||
if tails is None:
|
||||
spline_fn = rational_quadratic_spline
|
||||
spline_kwargs = {}
|
||||
else:
|
||||
spline_fn = unconstrained_rational_quadratic_spline
|
||||
spline_kwargs = {
|
||||
'tails': tails,
|
||||
'tail_bound': tail_bound
|
||||
}
|
||||
|
||||
outputs, logabsdet = spline_fn(
|
||||
inputs=inputs,
|
||||
unnormalized_widths=unnormalized_widths,
|
||||
unnormalized_heights=unnormalized_heights,
|
||||
unnormalized_derivatives=unnormalized_derivatives,
|
||||
inverse=inverse,
|
||||
min_bin_width=min_bin_width,
|
||||
min_bin_height=min_bin_height,
|
||||
min_derivative=min_derivative,
|
||||
**spline_kwargs
|
||||
)
|
||||
return outputs, logabsdet
|
||||
|
||||
|
||||
def searchsorted(bin_locations, inputs, eps=1e-6):
|
||||
bin_locations[..., -1] += eps
|
||||
return torch.sum(
|
||||
inputs[..., None] >= bin_locations,
|
||||
dim=-1
|
||||
) - 1
|
||||
|
||||
|
||||
def unconstrained_rational_quadratic_spline(inputs,
|
||||
unnormalized_widths,
|
||||
unnormalized_heights,
|
||||
unnormalized_derivatives,
|
||||
inverse=False,
|
||||
tails='linear',
|
||||
tail_bound=1.,
|
||||
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
|
||||
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
|
||||
min_derivative=DEFAULT_MIN_DERIVATIVE):
|
||||
inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
|
||||
outside_interval_mask = ~inside_interval_mask
|
||||
|
||||
outputs = torch.zeros_like(inputs)
|
||||
logabsdet = torch.zeros_like(inputs)
|
    if tails == 'linear':
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
        unnormalized_derivatives[..., -1] = constant

        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
        raise RuntimeError('{} tails are not implemented.'.format(tails))

    outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
        inputs=inputs[inside_interval_mask],
        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
        inverse=inverse,
        left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative
    )

    return outputs, logabsdet


def rational_quadratic_spline(inputs,
                              unnormalized_widths,
                              unnormalized_heights,
                              unnormalized_derivatives,
                              inverse=False,
                              left=0., right=1., bottom=0., top=1.,
                              min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                              min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                              min_derivative=DEFAULT_MIN_DERIVATIVE):
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError('Input to a transform is not within its domain')

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError('Minimal bin width too large for the number of bins')
    if min_bin_height * num_bins > 1.0:
        raise ValueError('Minimal bin height too large for the number of bins')

    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
    cumwidths = (right - left) * cumwidths + left
    cumwidths[..., 0] = left
    cumwidths[..., -1] = right
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    derivatives = min_derivative + F.softplus(unnormalized_derivatives)

    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    cumheights[..., -1] = top
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    if inverse:
        bin_idx = searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = searchsorted(cumwidths, inputs)[..., None]

    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]

    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
    delta = heights / widths
    input_delta = delta.gather(-1, bin_idx)[..., 0]

    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]

    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        a = (((inputs - input_cumheights) * (input_derivatives
                                             + input_derivatives_plus_one
                                             - 2 * input_delta)
              + input_heights * (input_delta - input_derivatives)))
        b = (input_heights * input_derivatives
             - (inputs - input_cumheights) * (input_derivatives
                                              + input_derivatives_plus_one
                                              - 2 * input_delta))
        c = - input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
                                     * theta_one_minus_theta)
        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
                                                     + 2 * input_delta * theta_one_minus_theta
                                                     + input_derivatives * (1 - root).pow(2))
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, -logabsdet
    else:
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

        numerator = input_heights * (input_delta * theta.pow(2)
                                     + input_derivatives * theta_one_minus_theta)
        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
                                     * theta_one_minus_theta)
        outputs = input_cumheights + numerator / denominator

        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
                                                     + 2 * input_delta * theta_one_minus_theta
                                                     + input_derivatives * (1 - theta).pow(2))
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, logabsdet
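The pair of functions above implements the standard monotonic rational-quadratic spline used by the flow modules. As a quick sanity check, a minimal round-trip sketch (not part of the repo; the tensor shapes and the unconstrained_rational_quadratic_spline entry point defined earlier in this file are assumptions):

# Hypothetical smoke test for the spline above (not project code).
import torch

batch, num_bins = 8, 10
inputs = torch.randn(batch)                   # unconstrained inputs
widths = torch.randn(batch, num_bins)         # unnormalized bin widths
heights = torch.randn(batch, num_bins)        # unnormalized bin heights
derivs = torch.randn(batch, num_bins - 1)     # interior knot derivatives (padded to num_bins+1 for 'linear' tails)

out, logdet = unconstrained_rational_quadratic_spline(
    inputs, widths, heights, derivs, tails='linear', tail_bound=5.0)
rec, inv_logdet = unconstrained_rational_quadratic_spline(
    out, widths, heights, derivs, inverse=True, tails='linear', tail_bound=5.0)
assert torch.allclose(inputs, rec, atol=1e-4)                 # forward/inverse round-trip
assert torch.allclose(logdet + inv_logdet, torch.zeros_like(logdet), atol=1e-4)

The forward and inverse log-determinants cancel, which is what makes the spline usable as an invertible flow layer.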
demo/mod/utils.py (new executable file, 270 lines)
@@ -0,0 +1,270 @@
import os
import glob
import sys
import argparse
import logging
import json
import subprocess
import numpy as np
from scipy.io.wavfile import read
import torch

MATPLOTLIB_FLAG = False

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging


def load_checkpoint(checkpoint_path, model, optimizer=None):
    assert os.path.isfile(checkpoint_path), f"No such file or directory: {checkpoint_path}"
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    learning_rate = checkpoint_dict['learning_rate']
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    saved_state_dict = checkpoint_dict['model']
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
        except KeyError:
            # Fall back to the freshly initialized weights for keys missing from the checkpoint.
            logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    logger.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
    logger.info("Saving model and optimizer state at iteration {} to {}".format(
        iteration, checkpoint_path))
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    torch.save({'model': state_dict,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, checkpoint_path)


def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
    for k, v in scalars.items():
        writer.add_scalar(k, v, global_step)
    for k, v in histograms.items():
        writer.add_histogram(k, v, global_step)
    for k, v in images.items():
        writer.add_image(k, v, global_step, dataformats='HWC')
    for k, v in audios.items():
        writer.add_audio(k, v, global_step, audio_sampling_rate)


def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    f_list = glob.glob(os.path.join(dir_path, regex))
    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
    x = f_list[-1]
    print(x)
    return x


def plot_spectrogram_to_numpy(spectrogram):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    # np.fromstring is deprecated for binary input; np.frombuffer is the supported equivalent.
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def plot_alignment_to_numpy(alignment, info=None):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
                   interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    plt.xlabel(xlabel)
    plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def load_wav_to_torch(full_path):
    sampling_rate, data = read(full_path)  # scipy.io.wavfile
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def load_filepaths_and_text(filename, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text


def get_hparams(init=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
                        help='JSON file for configuration')
    parser.add_argument('-m', '--model', type=str, required=True,
                        help='Model name')
    parser.add_argument('-fg', '--fine_tuning_g', type=str, default=None,
                        help='If fine tuning, please specify model(G)')
    parser.add_argument('-fd', '--fine_tuning_d', type=str, default=None,
                        help='If fine tuning, please specify model(D)')

    args = parser.parse_args()
    model_dir = os.path.join("./logs", args.model)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    config_path = args.config
    config_save_path = os.path.join(model_dir, "config.json")
    if init:
        with open(config_path, "r") as f:
            data = f.read()
        with open(config_save_path, "w") as f:
            f.write(data)
    else:
        with open(config_save_path, "r") as f:
            data = f.read()
    config = json.loads(data)

    # Added for fine tuning
    if args.fine_tuning_g is not None and args.fine_tuning_d is not None:
        config['fine_flag'] = True
        config['fine_model_g'] = args.fine_tuning_g
        config['fine_model_d'] = args.fine_tuning_d
    else:
        config['fine_flag'] = False

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_dir(model_dir):
    config_save_path = os.path.join(model_dir, "config.json")
    with open(config_save_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_file(config_path):
    with open(config_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    return hparams


def check_git_hash(model_dir):
    source_dir = os.path.dirname(os.path.realpath(__file__))
    if not os.path.exists(os.path.join(source_dir, ".git")):
        logger.warning("{} is not a git repository, therefore hash value comparison will be ignored.".format(
            source_dir))
        return

    cur_hash = subprocess.getoutput("git rev-parse HEAD")

    path = os.path.join(model_dir, "githash")
    if os.path.exists(path):
        saved_hash = open(path).read()
        if saved_hash != cur_hash:
            logger.warning("git hash values are different. {}(saved) != {}(current)".format(
                saved_hash[:8], cur_hash[:8]))
    else:
        open(path, "w").write(cur_hash)


def get_logger(model_dir, filename="train.log"):
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    h = logging.FileHandler(os.path.join(model_dir, filename))
    h.setLevel(logging.DEBUG)
    h.setFormatter(formatter)
    logger.addHandler(h)
    return logger


class HParams():
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if type(v) == dict:
                v = HParams(**v)
            self[k] = v

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()
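HParams is a small attribute-style wrapper over the JSON config: nested dicts become nested HParams, and entries are reachable both as attributes and as keys. A minimal usage sketch (assumed, not part of the repo):

# Hypothetical usage of the HParams class above.
config = {"train": {"segment_size": 8192}, "data": {"hop_length": 256}}
hps = HParams(**config)
print(hps.train.segment_size)     # 8192 (nested dicts become HParams)
print(hps["data"].hop_length)     # 256  (item access works too)
print("train" in hps, len(hps))   # True 2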
demo/requirements.txt (new file, 85 lines)
@@ -0,0 +1,85 @@
absl-py==1.2.0
appdirs==1.4.4
attrs==22.1.0
audioread==3.0.0
Babel==2.10.3
bidict==0.22.0
cachetools==5.2.0
certifi==2022.6.15
cffi==1.15.1
charset-normalizer==2.1.1
clldutils==3.12.0
colorama==0.4.5
colorlog==6.6.0
csvw==3.1.1
cycler==0.11.0
Cython==0.29.32
decorator==5.1.1
dlinfo==1.2.1
dnspython==2.2.1
eventlet==0.33.1
fonttools==4.36.0
google-auth==2.11.0
google-auth-oauthlib==0.4.6
greenlet==1.1.2
grpcio==1.47.0
idna==3.3
importlib-metadata==4.12.0
isodate==0.6.1
joblib==1.1.0
jsonschema==4.14.0
kiwisolver==1.4.4
language-tags==1.1.0
librosa==0.9.2
llvmlite==0.39.0
Markdown==3.4.1
MarkupSafe==2.1.1
matplotlib==3.5.3
numba==0.56.0
numpy==1.22.4
oauthlib==3.2.0
packaging==21.3
phonemizer==3.2.1
Pillow==9.2.0
pooch==1.6.0
protobuf==3.19.4
psutil==5.9.1
py==1.11.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
pyopenjtalk==0.2.0
pyparsing==3.0.9
pyrsistent==0.18.1
python-dateutil==2.8.2
python-engineio==4.3.4
python-socketio==5.7.1
pytz==2022.2.1
rdflib==6.2.0
regex==2022.8.17
requests==2.28.1
requests-oauthlib==1.3.1
resampy==0.4.0
retry==0.9.2
rfc3986==1.5.0
rsa==4.9
scikit-learn==1.1.2
scipy==1.9.0
segments==2.2.1
six==1.16.0
SoundFile==0.10.3.post1
tabulate==0.8.10
tensorboard==2.10.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
threadpoolctl==3.1.0
torch==1.12.1+cu113
torchaudio==0.12.1+cu113
torchvision==0.13.1+cu113
tqdm==4.64.0
typing-extensions==4.3.0
Unidecode==1.3.4
uritemplate==4.1.1
urllib3==1.26.11
Werkzeug==2.2.2
zipp==3.8.1
demo/serverFlask.py (new executable file, 136 lines)
@@ -0,0 +1,136 @@
from flask import Flask, request, abort, jsonify
from flask_cors import CORS
import logging
from logging.config import dictConfig
import sys
import base64

import torch
import numpy as np
from scipy.io.wavfile import write, read
from datetime import datetime

import traceback
import struct

sys.path.append("mod")
sys.path.append("mod/text")

import utils
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols

dictConfig({
    'version': 1,
    'formatters': {'default': {
        'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
    }},
    'handlers': {'wsgi': {
        'class': 'logging.StreamHandler',
        'stream': 'ext://flask.logging.wsgi_errors_stream',
        'formatter': 'default'
    }},
    'root': {
        'level': 'INFO',
        'handlers': ['wsgi']
    }
})

app = Flask(__name__, static_folder="../frontend/dist", static_url_path='/')
CORS(app, resources={r"/*": {"origins": "*"}})


class VoiceChanger():
    def __init__(self, config, model):
        self.hps = utils.get_hparams_from_file(config)
        self.net_g = SynthesizerTrn(
            len(symbols),
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            n_speakers=self.hps.data.n_speakers,
            **self.hps.model)
        self.net_g.eval()
        self.gpu_num = torch.cuda.device_count()
        print("GPU_NUM:", self.gpu_num)
        utils.load_checkpoint(model, self.net_g, None)

    def on_request(self, gpu, srcId, dstId, timestamp, wav):
        if wav == 0:
            # Warm-up call: use the bundled dummy clip instead of real input.
            samplerate, data = read("dummy.wav")
            unpackedData = data
        else:
            unpackedData = np.array(struct.unpack('<%sh' % (len(wav) // struct.calcsize('<h')), wav))
            write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))

        audio1 = unpackedData  # fallback so an exception below still returns audio
        try:
            if gpu < 0 or self.gpu_num == 0:
                with torch.no_grad():
                    dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
                    data = dataset.get_audio_text_speaker_pair([unpackedData, srcId, "a"])
                    data = TextAudioSpeakerCollate()([data])
                    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
                    sid_tgt1 = torch.LongTensor([dstId]).cpu()
                    audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0, 0].data * self.hps.data.max_wav_value).cpu().float().numpy()
            else:
                with torch.no_grad():
                    dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
                    data = dataset.get_audio_text_speaker_pair([unpackedData, srcId, "a"])
                    data = TextAudioSpeakerCollate()([data])
                    x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
                    sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
                    audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0, 0].data * self.hps.data.max_wav_value).cpu().float().numpy()
        except Exception as e:
            print("VC PROCESSING!!!! EXCEPTION!!!", e)
            print(traceback.format_exc())

        audio1 = audio1.astype(np.int16)
        return audio1


@app.route('/test', methods=['GET', 'POST'])
def test():
    try:
        if request.method == 'GET':
            return request.args.get('query', '')
        elif request.method == 'POST':
            print("POST REQUEST PROCESSING....")
            gpu = int(request.json['gpu'])
            srcId = int(request.json['srcId'])
            dstId = int(request.json['dstId'])
            timestamp = int(request.json['timestamp'])
            buffer = request.json['buffer']
            wav = base64.b64decode(buffer)
            changedVoice = voiceChanger.on_request(gpu, srcId, dstId, timestamp, wav)
            changedVoiceBase64 = base64.b64encode(changedVoice).decode('utf-8')
            data = {
                "gpu": gpu,
                "srcId": srcId,
                "dstId": dstId,
                "timestamp": timestamp,
                "changedVoiceBase64": changedVoiceBase64
            }
            return jsonify(data)
        else:
            return abort(400)
    except Exception as e:
        print("REQUEST PROCESSING!!!! EXCEPTION!!!", e)
        print(traceback.format_exc())
        return str(e)


if __name__ == '__main__':
    args = sys.argv
    PORT = args[1]
    CONFIG = args[2]
    MODEL = args[3]
    app.logger.info('INITIALIZE MODEL')
    voiceChanger = VoiceChanger(CONFIG, MODEL)
    voiceChanger.on_request(0, 0, 0, 0, 0)
    app.logger.info('START APP')
    app.run(debug=True, host='0.0.0.0', port=PORT)
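For reference, a hypothetical client for the /test endpoint above. The JSON field names come straight from the handler; the port and the raw little-endian 16-bit PCM payload layout are assumptions based on the struct.unpack format in on_request:

# Hypothetical client sketch (not project code).
import base64, struct, requests

pcm = struct.pack('<4h', 0, 1000, -1000, 0)   # tiny little-endian int16 clip
payload = {
    "gpu": -1, "srcId": 107, "dstId": 100, "timestamp": 0,
    "buffer": base64.b64encode(pcm).decode('utf-8'),
}
res = requests.post("http://localhost:8080/test", json=payload).json()
converted = base64.b64decode(res["changedVoiceBase64"])   # int16 PCM bytes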
demo/serverSIO.py (new executable file, 96 lines)
@@ -0,0 +1,96 @@
import eventlet
import socketio
import sys
from datetime import datetime
import struct

import torch
import numpy as np
from scipy.io.wavfile import write

sys.path.append("mod")
sys.path.append("mod/text")
import utils
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols


class MyCustomNamespace(socketio.Namespace):
    def __init__(self, namespace, config, model):
        super().__init__(namespace)
        self.hps = utils.get_hparams_from_file(config)
        self.net_g = SynthesizerTrn(
            len(symbols),
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            n_speakers=self.hps.data.n_speakers,
            **self.hps.model)
        self.net_g.eval()
        self.gpu_num = torch.cuda.device_count()
        print("GPU_NUM:", self.gpu_num)
        utils.load_checkpoint(model, self.net_g, None)

    def on_connect(self, sid, environ):
        print('[{}] connect sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'), sid))

    def on_request_message(self, sid, msg):
        gpu = int(msg[0])
        srcId = int(msg[1])
        dstId = int(msg[2])
        timestamp = int(msg[3])
        data = msg[4]
        unpackedData = np.array(struct.unpack('<%sh' % (len(data) // struct.calcsize('<h')), data))
        write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))

        if gpu < 0 or self.gpu_num == 0:
            with torch.no_grad():
                dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
                data = dataset.get_audio_text_speaker_pair([unpackedData, srcId, "a"])
                data = TextAudioSpeakerCollate()([data])
                x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
                sid_tgt1 = torch.LongTensor([dstId]).cpu()
                audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0, 0].data * self.hps.data.max_wav_value).cpu().float().numpy()
        else:
            with torch.no_grad():
                dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
                data = dataset.get_audio_text_speaker_pair([unpackedData, srcId, "a"])
                data = TextAudioSpeakerCollate()([data])
                x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
                sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
                audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0, 0].data * self.hps.data.max_wav_value).cpu().float().numpy()

        audio1 = audio1.astype(np.int16)
        bin = struct.pack('<%sh' % len(audio1), *audio1)

        self.emit('response', [timestamp, bin])

    def on_disconnect(self, sid):
        pass


if __name__ == '__main__':
    args = sys.argv
    PORT = args[1]
    CONFIG = args[2]
    MODEL = args[3]
    print(f"start... PORT:{PORT}, CONFIG:{CONFIG}, MODEL:{MODEL}")
    sio = socketio.Server(cors_allowed_origins='*')
    sio.register_namespace(MyCustomNamespace('/test', CONFIG, MODEL))
    app = socketio.WSGIApp(sio, static_files={
        '': '../frontend/dist',
    })
    eventlet.wsgi.server(eventlet.listen(('0.0.0.0', int(PORT))), app)
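A hypothetical python-socketio client for this namespace. The five-element message layout mirrors on_request_message above (socketio.Namespace maps the 'request_message' event to that handler); the port and payload values are assumptions:

# Hypothetical client sketch (not project code).
import struct
import socketio

sio = socketio.Client()

@sio.on('response', namespace='/test')
def on_response(msg):
    timestamp, wav_bytes = msg          # echoed timestamp + int16 PCM bytes
    print(timestamp, len(wav_bytes))

sio.connect('http://localhost:8080', namespaces=['/test'])
pcm = struct.pack('<4h', 0, 1000, -1000, 0)
sio.emit('request_message', [-1, 107, 100, 0, pcm], namespace='/test')
sio.wait()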
demo/setup.sh (new executable file, 13 lines)
@@ -0,0 +1,13 @@
#!/bin/bash

echo config: $1
echo model: $2
cp -r /resources/* .

if [[ -e ./setting.json ]]; then
    cp ./setting.json ../frontend/dist/assets/setting.json
fi

python3 serverSIO.py 8080 $1 $2
demo/setupFlask.sh (new executable file, 14 lines)
@@ -0,0 +1,14 @@
#!/bin/bash

echo config: $1
echo model: $2
cp -r /resources/* .

if [[ -e ./setting.json ]]; then
    cp ./setting.json ../frontend/dist/assets/setting.json
fi
pip install flask
pip install flask_cors
python3 serverFlask.py 8080 $1 $2
demo/start.sh (new executable file, 5 lines)
@@ -0,0 +1,5 @@
#!/bin/bash

# python3 serverSIO.py 8080 resources/train_config_zundamon.json resources/G_94000.pth
# python3 serverSIO.py 8080 resources/train_config_zundamon.json resources/G_164000.pth
python3 serverSIO.py 8080 resources/train_config_zundamon.json resources/G_210000.pth
BIN docs/assets/face_detection_short_range.binc083ebee6df759da467d (executable file, binary)
BIN docs/assets/hand_landmark_lite.bin037c09b22c066e04d3ff (executable file, binary)

docs/assets/icons/file-text.svg (new executable file, 473 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-file-text"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"></path><polyline points="14 2 14 8 20 8"></polyline><line x1="16" y1="13" x2="8" y2="13"></line><line x1="16" y1="17" x2="8" y2="17"></line><polyline points="10 9 9 9 8 9"></polyline></svg>

BIN docs/assets/icons/flect.png (executable file, 1.3 KiB)

docs/assets/icons/github.svg (new executable file, 522 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-github"><path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"></path></svg>

docs/assets/icons/home.svg (new executable file, 327 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-home"><path d="M3 9l9-7 9 7v11a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2z"></path><polyline points="9 22 9 12 15 12 15 22"></polyline></svg>

docs/assets/icons/linkedin.svg (new executable file, 395 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-linkedin"><path d="M16 8a6 6 0 0 1 6 6v7h-4v-7a2 2 0 0 0-2-2 2 2 0 0 0-2 2v7h-4v-7a6 6 0 0 1 6-6z"></path><rect x="2" y="9" width="4" height="12"></rect><circle cx="4" cy="4" r="2"></circle></svg>

docs/assets/icons/twitter.svg (new executable file, 403 B)
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-twitter"><path d="M23 3a10.9 10.9 0 0 1-3.14 1.53 4.48 4.48 0 0 0-7.86 3v1A10.66 10.66 0 0 1 3 4s-4 9 5 13a11.64 11.64 0 0 1-7 2c9 5 20 0 20-11.5a4.5 4.5 0 0 0-.08-.83A7.72 7.72 0 0 0 23 3z"></path></svg>

BIN docs/assets/images/bg_natural_sougen.jpg (executable file, 22 KiB)

BIN docs/assets/model_float16_quant.bin33ee0cfa3b13c82ace2a (executable file, binary)
BIN docs/assets/palm_detection_lite.binba92fbef448d5b4334bc (executable file, binary)
BIN docs/assets/pose_detection.bin5f6876fde03ed33ebc9a (executable file, binary)
BIN docs/assets/pose_landmark_lite.bin80f1d546bddf782578f3 (executable file, binary)
docs/assets/setting.json (new executable file, 35 lines)
@@ -0,0 +1,35 @@
{
    "app_title": "voice-changer",
    "majar_mode": "docker",
    "voice_changer_server_url": "http://localhost:8080/test",
    "sample_rate": 48000,
    "buffer_size": 1024,
    "prefix_chunk_size": 24,
    "chunk_size": 24,
    "speaker_ids": [100, 107, 101, 102, 103],
    "speaker_names": ["ずんだもん", "user", "そら", "めたん", "つぐみ"],
    "src_id": 107,
    "dst_id": 100,
    "vf_enable": true,
    "voice_changer_mode": "realtime",
    "gpu": 0,
    "available_gpus": [-1, 0, 1, 2, 3, 4],
    "avatar": {
        "motion_capture_face": true,
        "motion_capture_upperbody": true,
        "lip_overwrite_with_voice": true,
        "avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
        "backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
        "background_color": "#0000dd",
        "chroma_key": "#0000dd",
        "avatar_canvas_size": [1280, 720],
        "screen_canvas_size": [1280, 720]
    },
    "advance": {
        "avatar_draw_skip_rate": 3,
        "screen_draw_skip_rate": 3,
        "visualizer_draw_skip_rate": 3,
        "cross_fade_lower_value": 0.1,
        "cross_fade_overlap_rate": 0.03
    }
}
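Rough arithmetic for the streaming settings above. Interpreting chunk_size as the number of worklet buffers accumulated per server round-trip is an assumption based on how docs/audiolet/index.js (below) combines deltaChunkSize and bufferSize:

# Back-of-the-envelope reading of the config (assumed interpretation).
sample_rate = 48000       # Hz
buffer_size = 1024        # samples per audio-worklet buffer
chunk_size = 24           # buffers accumulated per server round-trip

samples_per_chunk = buffer_size * chunk_size          # 24576 samples
seconds_per_chunk = samples_per_chunk / sample_rate   # ~0.512 s of audio
print(samples_per_chunk, round(seconds_per_chunk, 3))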
BIN docs/assets/tflite-simd.wasm1f6a0e789251efcdaa4d (executable file, binary)

docs/assets/vrm/zundamon/ReadMe.txt (new executable file, 75 lines)
@@ -0,0 +1,75 @@
Thank you very much for purchasing the "Zundamon (humanoid)" model data set.
The Zundamon (humanoid) model data is provided in the following formats.

- Data for VRChat (PC/Oculus compatible)
  Unity editing data (.unitypackage) with the programs and materials required on VRChat; the PC version has DynamicBone preconfigured.
- MMD data
  Model data (.pmx) with rigid bodies, joints, materials, etc. preconfigured.
- VRM data
  Model data (.vrm) usable on any platform that supports the VRM format. SpringBone, colliders, materials, etc. are preconfigured.
- FBX data
  The fbx data used to create the three formats above. Use it as needed, e.g. for game development.
- blender data
  The model data (.blender) used to create the FBX data. Use as needed.
- PSD data
  A PSD file bundled with a UV-map layer. Use it when modifying the model textures.

The model texture data (.png) mentioned above is bundled with each model data set.

Model data details =====================================
* Data for VRChat (PC/Oculus compatible)
- Using the VRChat data additionally requires the authoring software "Unity2019.4.31f1".
- Avatar3.0 only. It cannot be used with Avatar2.0.
- Materials/shaders use "UnityChan Toon Shader 2.0.8".
  The shader above is not bundled with this model data, so please prepare it in advance.
- The PC version has DynamicBone preconfigured, but DynamicBone itself is not bundled.
  Please purchase and prepare it in advance.
- Shape keys are included: mouth: 20, eyes: 12, brows: 3, other: 1.
- On both the PC and Oculus versions, expressions can be switched via hand signs (7 kinds).
- For expressions not registered to hand signs, edit the switching in the Unity editor.
- The PC version supports full-body tracking. Basically, attach trackers to the waist and both ankles.
- In the PC version, costumes and accessories are separate parts and can be attached/detached in Unity.
- In the Oculus version, costumes and accessories cannot be attached/detached due to the specification.
- For detailed instructions on importing into VRChat, see the separate "VRChat character installation manual".

* MMD data
- Using the MMD data additionally requires MMD-compatible software such as "MikuMikuDance" or "MikuMikuMoving".
- Rigid bodies, joints, and IK are preconfigured.
- Semi-standard bones (root, groups, upper body 2, waist, shoulder cancel, arm twist, wrist twist, thumb 0, leg IK parent) are preconfigured.
- Costumes and accessories are separate parts; setting the "undress" parameter among the other morphs to 1 removes them.

* VRM data
- Using the VRM data requires an application that can load VRM models.
- The data was created with VRMSDK ver.0.92.
- SpringBone and SpringBoneCollider are preconfigured.
- Shaders use VRM MToon and the standard shader.
- BlendShapes: 12 basic (A, I, U, E, O, Blink, Joy, Angly, Sorrow, Fun) plus 8 additional (Wink_L, Wink_R, Star, Hachume, No_HightLight, Aozame, Hauu, Tear), 20 in total.

* FBX data
- The FBX data version is 2020. Some applications may fail to load it correctly.
- When using the FBX data, always import the texture files at the same time.
- Coordinate-system issues may occur when importing into an application (the Z axis is vertical and the Y axis is depth).
- Data other than mesh data, bone data, and materials (lights, cameras, motions, etc.) is not bundled.

* Blender data
- The Blender data version is 2.93. Blender versions earlier than 2.93 may fail to load it correctly.
- Bones (armatures) other than MMD's are supported. MMD is not supported because its bone structure is special.
- IK is not configured. Set up IK as needed.
- The base body and each costume/accessory are separate objects, so they can be attached/detached as you like.

=====================================

* For the terms of use, prohibited items, and disclaimers, see the separate "Zundamon (humanoid) model terms of use.txt".

* Contact
For problems, bugs, feedback, adjustments, commercial use, etc., please contact us via the website (https://zunko.jp).

* Update history
2021/12/31 First edition
BIN docs/assets/vrm/zundamon/zundamon.vrm (executable file, binary)

docs/assets/vrm/zundamon/ずんだもん(人型)モデル利用規約(Terms of Use).txt (new executable file, 68 lines)
@@ -0,0 +1,68 @@
ずんだもん(人型) モデルデータ利用規約

3Dモデリング:絹井けい
販売元:SSS LLC. (https://zunko.jp)※以下「当社」とします。

・本モデルを利用、もしくは改変した時点で本規約に同意したものとします。
・本ライセンスの内容は変更する場合があり、最新のものが適用されます。
・本ライセンスは日本語のものが正本とされ、翻訳と差異がある場合は常に日本語のものが優先されます。
・本モデルの二次創作物を利用することによって生じた何らかのトラブル・損失に対し、絹井けい、および当社は一切責任を負わないものとします。
・本モデルを改変した二次創作物の著作権法上の全ての権利は当社に帰属するものとします。

●利用規約●
本モデルでは基礎条項に加え、個別条項を元に以下の行為を許可します。

・二次的著作物の配布の許可
・成人向け表現(性的表現)の許可
・成人向け表現(暴力表現)の許可
・本モデルでの利用を目的とした衣装等を配布・頒布・販売する場合に限り、本モデルが含む以下のデータを流用することを許可します。ただし改変が著しく少ない場合を除きます。
 1.本モデルを構成するボーン・ウェイト
 2.モデルの素体パーツ「ずんだもん(人型)」メッシュデータ
・当モデルデータを営利目的で使用する場合、販売元であるSSS LLC.が運営する「東北ずん子」ウェブサイトにて記載されている「版権商用利用の手引き(https://zunko.jp/con_shoushi.html)」に従ってください。
 ただし以下の場合、事前受諾は不要です。
 1.印刷やディスクプレスなど、個人から依頼されたものを複製する場合
 2.動画や写真への映り込みなど、本モデルがメインコンテンツと判断されない場合

●お問合せ先●

ホームページ:https://zunko.jp/
版権商用利用の手引き:https://zunko.jp/con_shoushi.html

EN: Zundamon (humanoid) Model Data Terms of Use

3D modeling: Kei Kinui
Distributed by: SSS LLC. (https://zunko.jp) hereinafter referred to as "the Company".

*By using or modifying this model, you agree to be bound by these terms.
*The content of this license is subject to change, and the latest version shall apply.
*The Japanese version of this license shall be the original, and if there are any differences between the Japanese version and the translated version, the Japanese version shall always take precedence.
*Kei Kinui and the Company shall not be held responsible for any problems or losses that may arise from the use of secondary works of this model.
*All rights under copyright law for secondary works modified from this model shall belong to the Company.

*Terms of Use*
In addition to the basic terms, this model permits the following actions based on the individual terms.

*Permission to distribute derivative works
*Permission for adult expression (sexual expression)
*Permission for adult expression (violent expression)
*Permission is granted to use the following data included in this model only for the purpose of distributing or selling costumes, etc. for use with this model. However, this excludes cases where the modifications are extremely small.
 1: The bones and weights that make up the model
 2: Mesh data of the model's body part "Zundamon (humanoid)"
*If you wish to use this model data for commercial purposes, please follow the "Guide to the Commercial Use of Copyrighted Material (https://zunko.jp/con_shoushi.html)" described on the "Tohoku Zunko" website operated by SSS LLC, the distributor.
 However, prior consent is not required in the following cases:
 1: Reproduction of materials commissioned by individuals, such as printing or disc pressing.
 2: When this model is not judged to be the main content, such as incidental appearance in videos or photos.

*Contact*

HomePage (https://zunko.jp)
Guide to the Commercial Use of Copyrighted Material (https://zunko.jp/con_shoushi.html)
docs/audiolet/index.js (new executable file, 1 line)
@@ -0,0 +1 @@
(()=>{"use strict";class e extends AudioWorkletProcessor{initialized=!1;playBuffer=[];deltaChunkSize=24;bufferSize=1024;constructor(){super(),this.initialized=!0,this.port.onmessage=this.handleMessage.bind(this)}prevF32Data=null;handleMessage(e){if(e.data.deltaSize)return void(this.deltaChunkSize=e.data.deltaSize);const t=e.data.data,l=new Int16Array(t),n=new Float32Array(l.length);l.forEach(((e,t)=>{const l=e>=32768?-(65536-e)/32768:e/32767;n[t]=l}));let s=this.prevF32Data?this.prevF32Data.slice(this.prevF32Data.length-this.deltaChunkSize*this.bufferSize/2):null;const h=n.slice(n.length-this.deltaChunkSize*this.bufferSize*2/2,n.length-this.deltaChunkSize*this.bufferSize/2);if(s?.length!==h.length&&(s=null),s)for(let e=0;e<s.length;e++){let t=0;if(e<s.length/3)t=0;else if(e>s.length/3*2)t=1;else{const l=e-s.length/3;t=Math.min(l/(s.length/3),1)}const l=s[e]*(1-t),n=h[e]*t;h[e]=l+n}if(this.playBuffer.length>50)for(console.log("Buffer truncated");this.playBuffer.length>2;)this.playBuffer.shift();let i;for(let e=0;e<h.length;e++){const t=2*e%128;0===t&&(i=new Float32Array(128));const l=h[e],n=e+1<h.length?h[e+1]:h[e];i[t]=l,i[t+1]=(l+n)/2,i.length===t+2&&this.playBuffer.push(i)}this.prevF32Data=n}handleMessage_(e){const t=e.data.data,l=new Int16Array(t),n=new Float32Array(l.length);l.forEach(((e,t)=>{const l=e>=32768?-(65536-e)/32768:e/32767;n[t]=l}));let s=this.prevF32Data?this.prevF32Data.slice(this.prevF32Data.length/2):null;const h=n.slice(0,n.length/2);if(s?.length!==h.length&&(s=null),s)for(let e=0;e<s.length;e++){let t=0;if(e<s.length/3)t=0;else if(e>s.length/3*2)t=1;else{const l=e-s.length/3;t=Math.min(l/(s.length/100),1)}const l=s[e]*(1-t),n=h[e]*t;h[e]=l+n}if(this.playBuffer.length>100)for(console.log("Buffer truncated");this.playBuffer.length>2;)this.playBuffer.shift();let i;for(let e=0;e<h.length;e++){const t=2*e%128;0===t&&(i=new Float32Array(128));const l=h[e],n=e+1<h.length?h[e+1]:h[e];i[t]=l,i[t+1]=(l+n)/2,i.length===t+2&&this.playBuffer.push(i)}this.prevF32Data=n}process(e,t,l){if(!this.initialized)return console.log("worklet_process not ready"),!0;if(0===this.playBuffer.length)return console.log("no play buffer"),!0;const n=this.playBuffer.shift();return t[0][0].set(n),!0}}registerProcessor("voice-player-worklet-processor",e)})();
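The worklet above ships minified. A readable sketch of the two core steps it appears to perform (an assumed reconstruction, not project code): mapping wrapped unsigned 16-bit sample values into [-1, 1] floats, and linearly cross-fading each new chunk's head against the previous chunk's tail with a flat-ramp-flat schedule:

# Assumed reconstruction of the worklet's core logic (not project code).
import numpy as np

def int16_to_float32(samples: np.ndarray) -> np.ndarray:
    # samples: wrapped unsigned 16-bit PCM values in [0, 65535]
    s = samples.astype(np.int64)
    out = np.where(s >= 32768, -(65536 - s) / 32768.0, s / 32767.0)
    return out.astype(np.float32)

def crossfade(prev_tail: np.ndarray, head: np.ndarray) -> np.ndarray:
    # Weight 0 for the first third, linear ramp in the middle third, 1 for the last third.
    n = len(head)
    t = np.clip((np.arange(n) - n / 3) / (n / 3), 0.0, 1.0)
    return prev_tail * (1.0 - t) + head * t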
BIN docs/coffee.png (executable file, 8.5 KiB)
BIN docs/favicon.ico (executable file, 9.4 KiB)

docs/index.html (new executable file, 1 line)
@@ -0,0 +1 @@
<!doctype html><html lang="ja" style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>voice recorder</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div><noscript><strong>javascriptを有効にしてください</strong></noscript></body></html>
docs/index.js (new executable file, 2 lines)

docs/index.js.LICENSE.txt (new executable file, 70 lines)
@@ -0,0 +1,70 @@
/*!
localForage -- Offline Storage, Improved
Version 1.10.0
https://localforage.github.io/localForage
(c) 2013-2017 Mozilla, Apache License 2.0
*/

/*!
 * Font Awesome Free 6.1.2 by @fontawesome - https://fontawesome.com
 * License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
 * Copyright 2022 Fonticons, Inc.
 */

/*!
 * The buffer module from node.js, for the browser.
 *
 * @author Feross Aboukhadijeh <https://feross.org>
 * @license MIT
 */

/*! (c) 2019-2021 pixiv Inc. - https://github.com/pixiv/three-vrm/blob/release/LICENSE */

/*! *****************************************************************************
Copyright (c) Microsoft Corporation.

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
***************************************************************************** */

/*! For license information please see index.js.LICENSE.txt */

/*! ieee754. BSD-3-Clause License. Feross Aboukhadijeh <https://feross.org/opensource> */

/**
 * @license React
 * react-dom.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @license React
 * react.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/**
 * @license React
 * scheduler.production.min.js
 *
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */
package-lock.json (generated, new file, 1687 lines; diff collapsed)

@@ -5,6 +5,9 @@
   "main": "index.js",
   "scripts": {
     "build:docker": "date +%Y%m%d%H%M%S > trainer/dummy && DOCKER_BUILDKIT=1 docker build -f trainer/Dockerfile trainer/ -t voice-changer",
+    "copy:frontend": "docker run -v `pwd`/docs:/docs --entrypoint /bin/bash -ti voice-changer -c \"cp -r /voice-changer-internal/frontend/dist/* /docs\"",
+    "copy:backend": "docker run -v `pwd`/demo:/demo --entrypoint /bin/bash -ti voice-changer -c \"cp -r /voice-changer-internal/voice-change-service/* /demo/\"",
+    "create:demo": "run-p copy:frontend copy:backend",
     "push:docker": "bash script/001_pushDocker.sh",
     "test": "echo \"Error: no test specified\" && exit 1"
   },
@@ -18,5 +21,8 @@
   "bugs": {
     "url": "https://github.com/w-okada/voice-changer/issues"
   },
-  "homepage": "https://github.com/w-okada/voice-changer#readme"
+  "homepage": "https://github.com/w-okada/voice-changer#readme",
+  "devDependencies": {
+    "npm-run-all": "^4.1.5"
+  }
 }
start.sh (deleted file, 13 lines)
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-EXP_NAME=$1
-shift
-
-docker run -it --gpus all --shm-size=128M \
-    -v `pwd`/exp/${EXP_NAME}/dataset:/MMVC_Trainer/dataset \
-    -v `pwd`/exp/${EXP_NAME}/logs:/MMVC_Trainer/logs \
-    -v `pwd`/exp/${EXP_NAME}/filelists:/MMVC_Trainer/filelists \
-    -v `pwd`/vc_resources:/resources \
-    -e LOCAL_UID=$(id -u $USER) \
-    -e LOCAL_GID=$(id -g $USER) \
-    -p 6006:6006 -p 8080:8080 dannadori/voice-changer:20220826_093743 "$@"
start2.sh (new file, 159 lines)
@@ -0,0 +1,159 @@
#!/bin/bash

# Reference: https://programwiz.org/2022/03/22/how-to-write-shell-script-for-option-parsing/

DOCKER_IMAGE=dannadori/voice-changer:20220831_151141
TENSORBOARD_PORT=6006
VOICE_CHANGER_PORT=8080

set -eu

echo "------"
echo "$@"
echo "------"

usage() {
    echo "
usage:
    For training
        $0 [-t] -n <exp_name> [-b batch_size] [-r]
            -t: specify to run in training mode (train)
            -n: name of the training run (name)
            -b: batch size (batchsize)
            -r: specify to resume training (resume)
    For changing voice
        $0 [-v] [-c config] [-m model] [-g on/off]
            -v: specify to run in voice changer mode (voice changer)
            -c: file name of the config used for training (config)
            -m: file name of the trained model (model)
            -g: GPU on/off. Defaults to on, so no need to specify when using a GPU. (gpu)
    For help
        $0 [-h]
            -h: show this help
" >&2
}
warn () {
    echo "! ! ! $1 ! ! !"
    exit 1
}


training_flag=false
name=999_exp
batch_size=10
resume_flag=false

voice_change_flag=false
config=
model=
gpu=on

escape_flag=false

# Option parsing
while getopts tn:b:rvc:m:g:hx OPT; do
    case $OPT in
    t)
        training_flag=true
        ;;
    n)
        name="$OPTARG"
        ;;
    b)
        batch_size="$OPTARG"
        ;;
    r)
        resume_flag=true
        ;;
    v)
        voice_change_flag=true
        ;;
    c)
        config="$OPTARG"
        ;;
    m)
        model="$OPTARG"
        ;;
    g)
        gpu="$OPTARG"
        ;;
    h | \?)
        usage && exit 1
        ;;
    x)
        escape_flag=true
    esac
done


# Mode analysis
if $training_flag && $voice_change_flag; then
    warn "-t (training mode) and -v (voice changer mode) cannot be specified at the same time."
elif $training_flag; then
    echo "■■■ training mode ■■■"
elif $voice_change_flag; then
    echo "■■■ voice changer mode ■■■"
elif $escape_flag; then
    /bin/bash
else
    warn "Specify either -t (training mode) or -v (voice changer mode)."
fi


if $training_flag; then
    if $resume_flag; then
        echo "Resuming training"
        docker run -it --gpus all --shm-size=128M \
            -v `pwd`/exp/${name}/dataset:/MMVC_Trainer/dataset \
            -v `pwd`/exp/${name}/logs:/MMVC_Trainer/logs \
            -v `pwd`/exp/${name}/filelists:/MMVC_Trainer/filelists \
            -v `pwd`/vc_resources:/resources \
            -e LOCAL_UID=$(id -u $USER) \
            -e LOCAL_GID=$(id -g $USER) \
            -p ${TENSORBOARD_PORT}:6006 $DOCKER_IMAGE -t -b ${batch_size} -r
    else
        echo "Starting training"
        docker run -it --gpus all --shm-size=128M \
            -v `pwd`/exp/${name}/dataset:/MMVC_Trainer/dataset \
            -v `pwd`/exp/${name}/logs:/MMVC_Trainer/logs \
            -v `pwd`/exp/${name}/filelists:/MMVC_Trainer/filelists \
            -v `pwd`/vc_resources:/resources \
            -e LOCAL_UID=$(id -u $USER) \
            -e LOCAL_GID=$(id -g $USER) \
            -p ${TENSORBOARD_PORT}:6006 $DOCKER_IMAGE -t -b ${batch_size}
    fi
fi

if $voice_change_flag; then
    if [[ -z "$config" ]]; then
        warn "Specify a config file (-c)."
    fi
    if [[ -z "$model" ]]; then
        warn "Specify a model file (-m)."
    fi
    if [ "${gpu}" = "on" ]; then
        echo "Starting with the GPU mounted."

        docker run -it --gpus all --shm-size=128M \
            -v `pwd`/vc_resources:/resources \
            -e LOCAL_UID=$(id -u $USER) \
            -e LOCAL_GID=$(id -g $USER) \
            -p ${VOICE_CHANGER_PORT}:8080 $DOCKER_IMAGE -v -c ${config} -m ${model}
    elif [ "${gpu}" = "off" ]; then
        echo "Running on CPU only. The GPU will not be used."
        docker run -it --shm-size=128M \
            -v `pwd`/vc_resources:/resources \
            -e LOCAL_UID=$(id -u $USER) \
            -e LOCAL_GID=$(id -g $USER) \
            -p ${VOICE_CHANGER_PORT}:8080 $DOCKER_IMAGE -v -c ${config} -m ${model}
    else
        echo ${gpu}
        warn "-g must be specified as on or off."
    fi
fi
(file name not shown in this dump) @@ -1,7 +0,0 @@
-#!/bin/bash
-
-docker run -it --gpus all --shm-size=128M \
-    -v `pwd`/vc_resources:/resources \
-    -e LOCAL_UID=$(id -u $USER) \
-    -e LOCAL_GID=$(id -g $USER) \
-    -p 6006:6006 -p 8080:8080 dannadori/voice-changer:20220826_093743 "$@"

(file name not shown in this dump) @@ -1,7 +0,0 @@
-#!/bin/bash
-
-docker run -it --shm-size=128M \
-    -v `pwd`/vc_resources:/resources \
-    -e LOCAL_UID=$(id -u $USER) \
-    -e LOCAL_GID=$(id -g $USER) \
-    -p 6006:6006 -p 8080:8080 dannadori/voice-changer:20220826_093743 "$@"
(file name not shown in this dump) @@ -1,8 +1,10 @@
 {
     "app_title": "voice-changer",
     "majar_mode": "docker",
     "voice_changer_server_url": "http://localhost:8080/test",
     "sample_rate": 48000,
+    "buffer_size": 1024,
+    "prefix_chunk_size": 24,
     "chunk_size": 24,
     "speaker_ids": [100, 107, 101, 102, 103],
     "speaker_names": ["ずんだもん", "user", "そら", "めたん", "つぐみ"],
@@ -11,7 +13,7 @@
     "vf_enable": true,
     "voice_changer_mode": "realtime",
     "gpu": 0,
-    "available_gpus": [-1, 0, 1, 2, 3, 4, 5, 100, 200],
+    "available_gpus": [-1, 0, 1, 2, 3, 4],
     "avatar": {
         "motion_capture_face": true,
         "motion_capture_upperbody": true,
@@ -26,6 +28,8 @@
     "advance": {
         "avatar_draw_skip_rate": 3,
         "screen_draw_skip_rate": 3,
-        "visualizer_draw_skip_rate": 3
+        "visualizer_draw_skip_rate": 3,
+        "cross_fade_lower_value": 0.1,
+        "cross_fade_overlap_rate": 0.03
     }
 }
template/setting_colab.json (new file, 35 lines)
@@ -0,0 +1,35 @@
{
    "app_title": "voice-changer",
    "majar_mode": "colab",
    "voice_changer_server_url": "http://localhost:8080/test",
    "sample_rate": 48000,
    "buffer_size": 1024,
    "prefix_chunk_size": 24,
    "chunk_size": 24,
    "speaker_ids": [100, 107, 101, 102, 103],
    "speaker_names": ["ずんだもん", "user", "そら", "めたん", "つぐみ"],
    "src_id": 107,
    "dst_id": 100,
    "vf_enable": true,
    "voice_changer_mode": "realtime",
    "gpu": 0,
    "available_gpus": [-1, 0, 1, 2, 3, 4],
    "avatar": {
        "motion_capture_face": true,
        "motion_capture_upperbody": true,
        "lip_overwrite_with_voice": true,
        "avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
        "backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
        "background_color": "#0000dd",
        "chroma_key": "#0000dd",
        "avatar_canvas_size": [1280, 720],
        "screen_canvas_size": [1280, 720]
    },
    "advance": {
        "avatar_draw_skip_rate": 3,
        "screen_draw_skip_rate": 3,
        "visualizer_draw_skip_rate": 3,
        "cross_fade_lower_value": 0.1,
        "cross_fade_overlap_rate": 0.03
    }
}
(file name not shown in this dump) @@ -1,4 +1,4 @@
-FROM dannadori/voice-changer-internal:20220826_093634 as front
+FROM dannadori/voice-changer-internal:20220831_150941 as front
 FROM debian:bullseye-slim as base

 ARG DEBIAN_FRONTEND=noninteractive
(file name not shown in this dump) @@ -76,20 +76,20 @@ done
 # ## Remove the parsed option arguments from the command line
 # # shift $((OPTIND - 1))

-# Mode analysis
-if $training_flag && $voice_change_flag; then
-    warn "-t (training mode) and -v (voice changer mode) cannot be specified at the same time."
-    exit 1
-elif $training_flag; then
-    echo "■■■ training mode ■■■"
-elif $voice_change_flag; then
-    echo "■■■ voice changer mode ■■■"
-elif $escape_flag; then
-    /bin/bash
-else
-    warn "Specify either -t (training mode) or -v (voice changer mode)."
-    exit 1
-fi
+# # Mode analysis
+# if $training_flag && $voice_change_flag; then
+#     warn "-t (training mode) and -v (voice changer mode) cannot be specified at the same time."
+#     exit 1
+# elif $training_flag; then
+#     echo "■■■ training mode ■■■"
+# elif $voice_change_flag; then
+#     echo "■■■ voice changer mode ■■■"
+# elif $escape_flag; then
+#     /bin/bash
+# else
+#     warn "Specify either -t (training mode) or -v (voice changer mode)."
+#     exit 1
+# fi