Merge branch 'master' of github.com:w-okada/voice-changer

This commit is contained in:
wataru 2022-08-31 15:24:46 +09:00
commit 5ccdb12ff0
59 changed files with 5734 additions and 45 deletions

580
VoiceChangerDemo.ipynb Normal file

@@ -0,0 +1,580 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "VoiceChangerDemo",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyN+8irLJYUFlwMPzvHMSJof",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/w-okada/voice-changer/blob/dev/VoiceChangerDemo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [],
"metadata": {
"id": "57p7pA1Qb5wa"
}
},
{
"cell_type": "code",
"source": [
"!nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vV1t7PBRm-o6",
"outputId": "60fc80b2-a39e-4840-88c1-0d8d483a36ca"
},
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Wed Aug 31 06:14:56 2022 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 460.32.03 Driver Version: 460.32.03 CUDA Version: 11.2 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 72C P8 12W / 70W | 0MiB / 15109MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"| No running processes found |\n",
"+-----------------------------------------------------------------------------+\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"CONFIG=\"/content/drive/MyDrive/VoiceChanger/config.json\"\n",
"MODEL=\"/content/drive/MyDrive/VoiceChanger/G_326000.pth\""
],
"metadata": {
"id": "nSXATMWYb4Ik"
},
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2wxD-gRSMU5R",
"outputId": "83bb80fa-9ced-43e2-a304-d53a3501b142"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
],
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
]
},
{
"cell_type": "code",
"source": [
"!git clone https://github.com/w-okada/voice-changer.git\n",
"%cd voice-changer/demo/\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "86wTFmqsNMnD",
"outputId": "3fc68f14-b6b7-48bb-e285-5bed78e74f26"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Cloning into 'voice-changer'...\n",
"remote: Enumerating objects: 266, done.\u001b[K\n",
"remote: Counting objects: 100% (266/266), done.\u001b[K\n",
"remote: Compressing objects: 100% (189/189), done.\u001b[K\n",
"remote: Total 266 (delta 123), reused 194 (delta 65), pack-reused 0\u001b[K\n",
"Receiving objects: 100% (266/266), 19.11 MiB | 35.44 MiB/s, done.\n",
"Resolving deltas: 100% (123/123), done.\n",
"/content/voice-changer/demo\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!git checkout dev\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "CBsogR-zWH4r",
"outputId": "f4c9737b-831d-4938-d387-caf07693030e"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Branch 'dev' set up to track remote branch 'dev' from 'origin'.\n",
"Switched to a new branch 'dev'\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!mkdir -p ../frontend/dist\n",
"!cp -r ../docs/* ../frontend/dist/\n",
"!ls ../frontend/dist\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uCEKf3_JNoyq",
"outputId": "746e1946-5c3a-49af-df26-d86149f8adb1"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"assets\t coffee.png index.html index.js.LICENSE.txt\n",
"audiolet favicon.ico index.js\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!cp ../template/setting_colab.json ../frontend/dist/assets/setting.json"
],
"metadata": {
"id": "Bn4kV8TgXp8i"
},
"execution_count": 11,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!cat ../frontend/dist/assets/setting.json"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pjxPsOOaXXTj",
"outputId": "1bf85102-87ed-462c-e732-cffb878d95f3"
},
"execution_count": 12,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"{\n",
" \"app_title\": \"voice-changer\",\n",
" \"majar_mode\": \"colab\",\n",
" \"voice_changer_server_url\": \"http://localhost:8080/test\",\n",
" \"sample_rate\": 48000,\n",
" \"buffer_size\": 1024,\n",
" \"prefix_chunk_size\": 24,\n",
" \"chunk_size\": 24,\n",
" \"speaker_ids\": [100, 107, 101, 102, 103],\n",
" \"speaker_names\": [\"ずんだもん\", \"user\", \"そら\", \"めたん\", \"つぐみ\"],\n",
" \"src_id\": 107,\n",
" \"dst_id\": 100,\n",
" \"vf_enable\": true,\n",
" \"voice_changer_mode\": \"realtime\",\n",
" \"gpu\": 0,\n",
" \"available_gpus\": [-1, 0, 1, 2, 3, 4],\n",
" \"avatar\": {\n",
" \"motion_capture_face\": true,\n",
" \"motion_capture_upperbody\": true,\n",
" \"lip_overwrite_with_voice\": true,\n",
" \"avatar_url\": \"./assets/vrm/zundamon/zundamon.vrm\",\n",
" \"backgournd_image_url\": \"./assets/images/bg_natural_sougen.jpg\",\n",
" \"background_color\": \"#0000dd\",\n",
" \"chroma_key\": \"#0000dd\",\n",
" \"avatar_canvas_size\": [1280, 720],\n",
" \"screen_canvas_size\": [1280, 720]\n",
" },\n",
" \"advance\": {\n",
" \"avatar_draw_skip_rate\": 3,\n",
" \"screen_draw_skip_rate\": 3,\n",
" \"visualizer_draw_skip_rate\": 3,\n",
" \"cross_fade_lower_value\": 0.1,\n",
" \"cross_fade_overlap_rate\": 0.03\n",
" }\n",
"}\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"# Manual steps\n",
"\n",
"- Copy the config and model into the demo folder (see the Python sketch after the notebook listing)\n",
"\n",
"- Change docs to frontend\n",
"\n",
"- Copy setting.json to frontend\n"
],
"metadata": {
"id": "8Na2PbLZSWgZ"
}
},
{
"cell_type": "code",
"source": [
"!apt-get install -y espeak libsndfile1-dev\n",
"!pip install flask\n",
"!pip install python-socketio\n",
"!pip install eventlet\n",
"!pip install unidecode\n",
"!pip install phonemizer\n",
"!pip install retry\n",
"!pip install flask\n",
"!pip install flask_cors\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "LwZAAuqxX7yY",
"outputId": "c67b2741-7a1e-448d-abf9-7b8d8f5e3d15"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Reading package lists... Done\n",
"Building dependency tree \n",
"Reading state information... Done\n",
"libsndfile1-dev is already the newest version (1.0.28-4ubuntu0.18.04.2).\n",
"The following package was automatically installed and is no longer required:\n",
" libnvidia-common-460\n",
"Use 'apt autoremove' to remove it.\n",
"The following additional packages will be installed:\n",
" espeak-data libespeak1 libportaudio2 libsonic0\n",
"The following NEW packages will be installed:\n",
" espeak espeak-data libespeak1 libportaudio2 libsonic0\n",
"0 upgraded, 5 newly installed, 0 to remove and 20 not upgraded.\n",
"Need to get 1,219 kB of archives.\n",
"After this operation, 3,031 kB of additional disk space will be used.\n",
"Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0-1 [64.6 kB]\n",
"Get:2 http://archive.ubuntu.com/ubuntu bionic/main amd64 libsonic0 amd64 0.2.0-6 [13.4 kB]\n",
"Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak-data amd64 1.48.04+dfsg-5 [934 kB]\n",
"Get:4 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libespeak1 amd64 1.48.04+dfsg-5 [145 kB]\n",
"Get:5 http://archive.ubuntu.com/ubuntu bionic/universe amd64 espeak amd64 1.48.04+dfsg-5 [61.6 kB]\n",
"Fetched 1,219 kB in 1s (1,636 kB/s)\n",
"Selecting previously unselected package libportaudio2:amd64.\n",
"(Reading database ... 155676 files and directories currently installed.)\n",
"Preparing to unpack .../libportaudio2_19.6.0-1_amd64.deb ...\n",
"Unpacking libportaudio2:amd64 (19.6.0-1) ...\n",
"Selecting previously unselected package libsonic0:amd64.\n",
"Preparing to unpack .../libsonic0_0.2.0-6_amd64.deb ...\n",
"Unpacking libsonic0:amd64 (0.2.0-6) ...\n",
"Selecting previously unselected package espeak-data:amd64.\n",
"Preparing to unpack .../espeak-data_1.48.04+dfsg-5_amd64.deb ...\n",
"Unpacking espeak-data:amd64 (1.48.04+dfsg-5) ...\n",
"Selecting previously unselected package libespeak1:amd64.\n",
"Preparing to unpack .../libespeak1_1.48.04+dfsg-5_amd64.deb ...\n",
"Unpacking libespeak1:amd64 (1.48.04+dfsg-5) ...\n",
"Selecting previously unselected package espeak.\n",
"Preparing to unpack .../espeak_1.48.04+dfsg-5_amd64.deb ...\n",
"Unpacking espeak (1.48.04+dfsg-5) ...\n",
"Setting up libportaudio2:amd64 (19.6.0-1) ...\n",
"Setting up espeak-data:amd64 (1.48.04+dfsg-5) ...\n",
"Setting up libsonic0:amd64 (0.2.0-6) ...\n",
"Setting up libespeak1:amd64 (1.48.04+dfsg-5) ...\n",
"Setting up espeak (1.48.04+dfsg-5) ...\n",
"Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n",
"Processing triggers for libc-bin (2.27-3ubuntu1.5) ...\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: flask in /usr/local/lib/python3.7/dist-packages (1.1.4)\n",
"Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from flask) (7.1.2)\n",
"Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from flask) (2.11.3)\n",
"Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from flask) (1.1.0)\n",
"Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from flask) (1.0.1)\n",
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->flask) (2.0.1)\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting python-socketio\n",
" Downloading python_socketio-5.7.1-py3-none-any.whl (56 kB)\n",
"\u001b[K |████████████████████████████████| 56 kB 5.0 MB/s \n",
"\u001b[?25hCollecting bidict>=0.21.0\n",
" Downloading bidict-0.22.0-py3-none-any.whl (36 kB)\n",
"Collecting python-engineio>=4.3.0\n",
" Downloading python_engineio-4.3.4-py3-none-any.whl (52 kB)\n",
"\u001b[K |████████████████████████████████| 52 kB 2.0 MB/s \n",
"\u001b[?25hInstalling collected packages: python-engineio, bidict, python-socketio\n",
"Successfully installed bidict-0.22.0 python-engineio-4.3.4 python-socketio-5.7.1\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting eventlet\n",
" Downloading eventlet-0.33.1-py2.py3-none-any.whl (226 kB)\n",
"\u001b[K |████████████████████████████████| 226 kB 33.3 MB/s \n",
"\u001b[?25hCollecting dnspython>=1.15.0\n",
" Downloading dnspython-2.2.1-py3-none-any.whl (269 kB)\n",
"\u001b[K |████████████████████████████████| 269 kB 52.5 MB/s \n",
"\u001b[?25hRequirement already satisfied: six>=1.10.0 in /usr/local/lib/python3.7/dist-packages (from eventlet) (1.15.0)\n",
"Requirement already satisfied: greenlet>=0.3 in /usr/local/lib/python3.7/dist-packages (from eventlet) (1.1.3)\n",
"Installing collected packages: dnspython, eventlet\n",
"Successfully installed dnspython-2.2.1 eventlet-0.33.1\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting unidecode\n",
" Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)\n",
"\u001b[K |████████████████████████████████| 235 kB 28.6 MB/s \n",
"\u001b[?25hInstalling collected packages: unidecode\n",
"Successfully installed unidecode-1.3.4\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting phonemizer\n",
" Downloading phonemizer-3.2.1-py3-none-any.whl (90 kB)\n",
"\u001b[K |████████████████████████████████| 90 kB 9.5 MB/s \n",
"\u001b[?25hCollecting segments\n",
" Downloading segments-2.2.1-py2.py3-none-any.whl (15 kB)\n",
"Requirement already satisfied: joblib in /usr/local/lib/python3.7/dist-packages (from phonemizer) (1.1.0)\n",
"Collecting dlinfo\n",
" Downloading dlinfo-1.2.1-py3-none-any.whl (3.6 kB)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from phonemizer) (4.1.1)\n",
"Requirement already satisfied: attrs>=18.1 in /usr/local/lib/python3.7/dist-packages (from phonemizer) (22.1.0)\n",
"Collecting csvw>=1.5.6\n",
" Downloading csvw-3.1.1-py2.py3-none-any.whl (56 kB)\n",
"\u001b[K |████████████████████████████████| 56 kB 5.7 MB/s \n",
"\u001b[?25hRequirement already satisfied: regex in /usr/local/lib/python3.7/dist-packages (from segments->phonemizer) (2022.6.2)\n",
"Collecting clldutils>=1.7.3\n",
" Downloading clldutils-3.12.0-py2.py3-none-any.whl (197 kB)\n",
"\u001b[K |████████████████████████████████| 197 kB 63.8 MB/s \n",
"\u001b[?25hRequirement already satisfied: python-dateutil in /usr/local/lib/python3.7/dist-packages (from clldutils>=1.7.3->segments->phonemizer) (2.8.2)\n",
"Requirement already satisfied: tabulate>=0.7.7 in /usr/local/lib/python3.7/dist-packages (from clldutils>=1.7.3->segments->phonemizer) (0.8.10)\n",
"Collecting colorlog\n",
" Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)\n",
"Collecting colorama\n",
" Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)\n",
"Requirement already satisfied: jsonschema in /usr/local/lib/python3.7/dist-packages (from csvw>=1.5.6->segments->phonemizer) (4.3.3)\n",
"Collecting rdflib\n",
" Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)\n",
"\u001b[K |████████████████████████████████| 500 kB 53.6 MB/s \n",
"\u001b[?25hRequirement already satisfied: babel in /usr/local/lib/python3.7/dist-packages (from csvw>=1.5.6->segments->phonemizer) (2.10.3)\n",
"Collecting language-tags\n",
" Downloading language_tags-1.1.0-py2.py3-none-any.whl (210 kB)\n",
"\u001b[K |████████████████████████████████| 210 kB 65.4 MB/s \n",
"\u001b[?25hCollecting rfc3986<2\n",
" Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)\n",
"Requirement already satisfied: uritemplate>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from csvw>=1.5.6->segments->phonemizer) (3.0.1)\n",
"Collecting isodate\n",
" Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n",
"\u001b[K |████████████████████████████████| 41 kB 763 kB/s \n",
"\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from csvw>=1.5.6->segments->phonemizer) (2.23.0)\n",
"Requirement already satisfied: pytz>=2015.7 in /usr/local/lib/python3.7/dist-packages (from babel->csvw>=1.5.6->segments->phonemizer) (2022.2.1)\n",
"Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from isodate->csvw>=1.5.6->segments->phonemizer) (1.15.0)\n",
"Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (0.18.1)\n",
"Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (5.9.0)\n",
"Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (4.12.0)\n",
"Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema->csvw>=1.5.6->segments->phonemizer) (3.8.1)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from rdflib->csvw>=1.5.6->segments->phonemizer) (57.4.0)\n",
"Requirement already satisfied: pyparsing in /usr/local/lib/python3.7/dist-packages (from rdflib->csvw>=1.5.6->segments->phonemizer) (3.0.9)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer) (2022.6.15)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer) (2.10)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->csvw>=1.5.6->segments->phonemizer) (1.24.3)\n",
"Installing collected packages: isodate, rfc3986, rdflib, language-tags, colorama, csvw, colorlog, clldutils, segments, dlinfo, phonemizer\n",
"Successfully installed clldutils-3.12.0 colorama-0.4.5 colorlog-6.7.0 csvw-3.1.1 dlinfo-1.2.1 isodate-0.6.1 language-tags-1.1.0 phonemizer-3.2.1 rdflib-6.2.0 rfc3986-1.5.0 segments-2.2.1\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting retry\n",
" Downloading retry-0.9.2-py2.py3-none-any.whl (8.0 kB)\n",
"Requirement already satisfied: decorator>=3.4.2 in /usr/local/lib/python3.7/dist-packages (from retry) (4.4.2)\n",
"Requirement already satisfied: py<2.0.0,>=1.4.26 in /usr/local/lib/python3.7/dist-packages (from retry) (1.11.0)\n",
"Installing collected packages: retry\n",
"Successfully installed retry-0.9.2\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Requirement already satisfied: flask in /usr/local/lib/python3.7/dist-packages (1.1.4)\n",
"Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from flask) (1.1.0)\n",
"Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from flask) (2.11.3)\n",
"Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from flask) (1.0.1)\n",
"Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from flask) (7.1.2)\n",
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->flask) (2.0.1)\n",
"Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
"Collecting flask_cors\n",
" Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)\n",
"Requirement already satisfied: Flask>=0.9 in /usr/local/lib/python3.7/dist-packages (from flask_cors) (1.1.4)\n",
"Requirement already satisfied: Six in /usr/local/lib/python3.7/dist-packages (from flask_cors) (1.15.0)\n",
"Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.9->flask_cors) (1.1.0)\n",
"Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.9->flask_cors) (2.11.3)\n",
"Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.9->flask_cors) (1.0.1)\n",
"Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from Flask>=0.9->flask_cors) (7.1.2)\n",
"Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->Flask>=0.9->flask_cors) (2.0.1)\n",
"Installing collected packages: flask-cors\n",
"Successfully installed flask-cors-3.0.10\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"get_ipython().system_raw(f'python3 serverFlask.py 8082 {CONFIG} {MODEL} >foo 2>&1 &')"
],
"metadata": {
"id": "iNOAB7zISI6J"
},
"execution_count": 14,
"outputs": []
},
{
"cell_type": "code",
"source": [
"!cat foo"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "chu06KpAjEK6",
"outputId": "887c2d50-c49f-4a22-f0d0-8a3667511466"
},
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"[2022-08-31 06:17:58,669] INFO in serverFlask: INITIALIZE MODEL\n",
"[2022-08-31 06:18:08,764] INFO in utils: Loaded checkpoint '/content/drive/MyDrive/VoiceChanger/G_326000.pth' (iteration 1136)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"from google.colab import output\n",
"\n",
"output.serve_kernel_port_as_window(8082)"
],
"metadata": {
"id": "nkRjZm95l87C",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "abf57f92-5cb6-4325-b64a-095d42f561d5"
},
"execution_count": 27,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.Javascript object>"
],
"application/javascript": [
"(async (port, path, text, element) => {\n",
" if (!google.colab.kernel.accessAllowed) {\n",
" return;\n",
" }\n",
" element.appendChild(document.createTextNode(''));\n",
" const url = await google.colab.kernel.proxyPort(port);\n",
" const anchor = document.createElement('a');\n",
" anchor.href = new URL(path, url).toString();\n",
" anchor.target = '_blank';\n",
" anchor.setAttribute('data-href', url + path);\n",
" anchor.textContent = text;\n",
" element.appendChild(anchor);\n",
" })(8082, \"/\", \"https://localhost:8082/\", window.element)"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"! ls ../frontend/dist/index.html"
],
"metadata": {
"id": "DKWni4moSyzO",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "b5635a1e-6ac6-41db-a706-dc3e5fb866a5"
},
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"../frontend/dist/index.html\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "3hwJmseXZhJY"
},
"execution_count": null,
"outputs": []
}
]
}
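
The notebook's "Manual steps" cell lists actions without commands. A minimal Python sketch of the one step that has no corresponding cell, copying the config and model into the demo folder; the paths below are the ones referenced in the notebook's CONFIG/MODEL cell and clone step, but treat them as illustrative.

import shutil

# Paths taken from the notebook's CONFIG/MODEL cell and the git clone above; adjust to your setup.
DRIVE = "/content/drive/MyDrive/VoiceChanger"
DEMO = "/content/voice-changer/demo"

shutil.copy(f"{DRIVE}/config.json", DEMO)    # config into the demo folder
shutil.copy(f"{DRIVE}/G_326000.pth", DEMO)   # model checkpoint into the demo folder

The docs-to-frontend and setting.json copies are already performed by the `!cp` cells above, so the sketch covers only the config/model copy.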

BIN
demo/dummy.wav Executable file

Binary file not shown.

2
demo/logs/.gitignore vendored Executable file

@@ -0,0 +1,2 @@
*
!.gitignore

303
demo/mod/attentions.py Executable file

@@ -0,0 +1,303 @@
import copy
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
import commons
import modules
from modules import LayerNorm
class Encoder(nn.Module):
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.window_size = window_size
self.drop = nn.Dropout(p_dropout)
self.attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask):
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.attn_layers[i](x, x, attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class Decoder(nn.Module):
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
super().__init__()
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.drop = nn.Dropout(p_dropout)
self.self_attn_layers = nn.ModuleList()
self.norm_layers_0 = nn.ModuleList()
self.encdec_attn_layers = nn.ModuleList()
self.norm_layers_1 = nn.ModuleList()
self.ffn_layers = nn.ModuleList()
self.norm_layers_2 = nn.ModuleList()
for i in range(self.n_layers):
self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
self.norm_layers_0.append(LayerNorm(hidden_channels))
self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
self.norm_layers_1.append(LayerNorm(hidden_channels))
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
self.norm_layers_2.append(LayerNorm(hidden_channels))
def forward(self, x, x_mask, h, h_mask):
"""
x: decoder input
h: encoder output
"""
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
x = x * x_mask
for i in range(self.n_layers):
y = self.self_attn_layers[i](x, x, self_attn_mask)
y = self.drop(y)
x = self.norm_layers_0[i](x + y)
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
y = self.drop(y)
x = self.norm_layers_1[i](x + y)
y = self.ffn_layers[i](x, x_mask)
y = self.drop(y)
x = self.norm_layers_2[i](x + y)
x = x * x_mask
return x
class MultiHeadAttention(nn.Module):
def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
super().__init__()
assert channels % n_heads == 0
self.channels = channels
self.out_channels = out_channels
self.n_heads = n_heads
self.p_dropout = p_dropout
self.window_size = window_size
self.heads_share = heads_share
self.block_length = block_length
self.proximal_bias = proximal_bias
self.proximal_init = proximal_init
self.attn = None
self.k_channels = channels // n_heads
self.conv_q = nn.Conv1d(channels, channels, 1)
self.conv_k = nn.Conv1d(channels, channels, 1)
self.conv_v = nn.Conv1d(channels, channels, 1)
self.conv_o = nn.Conv1d(channels, out_channels, 1)
self.drop = nn.Dropout(p_dropout)
if window_size is not None:
n_heads_rel = 1 if heads_share else n_heads
rel_stddev = self.k_channels**-0.5
self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
nn.init.xavier_uniform_(self.conv_q.weight)
nn.init.xavier_uniform_(self.conv_k.weight)
nn.init.xavier_uniform_(self.conv_v.weight)
if proximal_init:
with torch.no_grad():
self.conv_k.weight.copy_(self.conv_q.weight)
self.conv_k.bias.copy_(self.conv_q.bias)
def forward(self, x, c, attn_mask=None):
q = self.conv_q(x)
k = self.conv_k(c)
v = self.conv_v(c)
x, self.attn = self.attention(q, k, v, mask=attn_mask)
x = self.conv_o(x)
return x
def attention(self, query, key, value, mask=None):
# reshape [b, d, t] -> [b, n_h, t, d_k]
b, d, t_s, t_t = (*key.size(), query.size(2))
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
if self.window_size is not None:
assert t_s == t_t, "Relative attention is only available for self-attention."
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
scores_local = self._relative_position_to_absolute_position(rel_logits)
scores = scores + scores_local
if self.proximal_bias:
assert t_s == t_t, "Proximal bias is only available for self-attention."
scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e4)
if self.block_length is not None:
assert t_s == t_t, "Local attention is only available for self-attention."
block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
scores = scores.masked_fill(block_mask == 0, -1e4)
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
p_attn = self.drop(p_attn)
output = torch.matmul(p_attn, value)
if self.window_size is not None:
relative_weights = self._absolute_position_to_relative_position(p_attn)
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
output = output.transpose(2, 3).contiguous().view(b, d, t_t) # [b, n_h, t_t, d_k] -> [b, d, t_t]
return output, p_attn
def _matmul_with_relative_values(self, x, y):
"""
x: [b, h, l, m]
y: [h or 1, m, d]
ret: [b, h, l, d]
"""
ret = torch.matmul(x, y.unsqueeze(0))
return ret
def _matmul_with_relative_keys(self, x, y):
"""
x: [b, h, l, d]
y: [h or 1, m, d]
ret: [b, h, l, m]
"""
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
return ret
def _get_relative_embeddings(self, relative_embeddings, length):
max_relative_position = 2 * self.window_size + 1
# Pad first before slice to avoid using cond ops.
pad_length = max(length - (self.window_size + 1), 0)
slice_start_position = max((self.window_size + 1) - length, 0)
slice_end_position = slice_start_position + 2 * length - 1
if pad_length > 0:
padded_relative_embeddings = F.pad(
relative_embeddings,
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
else:
padded_relative_embeddings = relative_embeddings
used_relative_embeddings = padded_relative_embeddings[:,slice_start_position:slice_end_position]
return used_relative_embeddings
def _relative_position_to_absolute_position(self, x):
"""
x: [b, h, l, 2*l-1]
ret: [b, h, l, l]
"""
batch, heads, length, _ = x.size()
# Concat columns of pad to shift from relative to absolute indexing.
x = F.pad(x, commons.convert_pad_shape([[0,0],[0,0],[0,0],[0,1]]))
# Concat extra elements so that the result adds up to shape (len+1, 2*len-1).
x_flat = x.view([batch, heads, length * 2 * length])
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0,0],[0,0],[0,length-1]]))
# Reshape and slice out the padded elements.
x_final = x_flat.view([batch, heads, length+1, 2*length-1])[:, :, :length, length-1:]
return x_final
def _absolute_position_to_relative_position(self, x):
"""
x: [b, h, l, l]
ret: [b, h, l, 2*l-1]
"""
batch, heads, length, _ = x.size()
# pad along column
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length-1]]))
x_flat = x.view([batch, heads, length**2 + length*(length -1)])
# add 0's in the beginning that will skew the elements after reshape
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
x_final = x_flat.view([batch, heads, length, 2*length])[:,:,:,1:]
return x_final
def _attention_bias_proximal(self, length):
"""Bias for self-attention to encourage attention to close positions.
Args:
length: an integer scalar.
Returns:
a Tensor with shape [1, 1, length, length]
"""
r = torch.arange(length, dtype=torch.float32)
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
class FFN(nn.Module):
def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.activation = activation
self.causal = causal
if causal:
self.padding = self._causal_padding
else:
self.padding = self._same_padding
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
self.drop = nn.Dropout(p_dropout)
def forward(self, x, x_mask):
x = self.conv_1(self.padding(x * x_mask))
if self.activation == "gelu":
x = x * torch.sigmoid(1.702 * x)
else:
x = torch.relu(x)
x = self.drop(x)
x = self.conv_2(self.padding(x * x_mask))
return x * x_mask
def _causal_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = self.kernel_size - 1
pad_r = 0
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, commons.convert_pad_shape(padding))
return x
def _same_padding(self, x):
if self.kernel_size == 1:
return x
pad_l = (self.kernel_size - 1) // 2
pad_r = self.kernel_size // 2
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
x = F.pad(x, commons.convert_pad_shape(padding))
return x
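
A short usage sketch for the relative-position Encoder defined in demo/mod/attentions.py above. The layer sizes mirror typical VITS settings but are assumptions here, not values from this commit's config, and the snippet presumes it is run from inside demo/mod so that commons and modules are importable.

import torch
from attentions import Encoder

# Typical VITS-style sizes (assumed for illustration)
enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.1, window_size=4)

x = torch.randn(1, 192, 50)    # [batch, hidden_channels, frames]
x_mask = torch.ones(1, 1, 50)  # 1 where a frame is valid, 0 where padded
y = enc(x, x_mask)             # -> [1, 192, 50]; masked positions are zeroed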

161
demo/mod/commons.py Executable file

@@ -0,0 +1,161 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F
def init_weights(m, mean=0.0, std=0.01):
classname = m.__class__.__name__
if classname.find("Conv") != -1:
m.weight.data.normal_(mean, std)
def get_padding(kernel_size, dilation=1):
return int((kernel_size*dilation - dilation)/2)
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def intersperse(lst, item):
result = [item] * (len(lst) * 2 + 1)
result[1::2] = lst
return result
def kl_divergence(m_p, logs_p, m_q, logs_q):
"""KL(P||Q)"""
kl = (logs_q - logs_p) - 0.5
kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
return kl
def rand_gumbel(shape):
"""Sample from the Gumbel distribution, protect from overflows."""
uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
return -torch.log(-torch.log(uniform_samples))
def rand_gumbel_like(x):
g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
return g
def slice_segments(x, ids_str, segment_size=4):
ret = torch.zeros_like(x[:, :, :segment_size])
for i in range(x.size(0)):
idx_str = ids_str[i]
idx_end = idx_str + segment_size
ret[i] = x[i, :, idx_str:idx_end]
return ret
def rand_slice_segments(x, x_lengths=None, segment_size=4):
b, d, t = x.size()
if x_lengths is None:
x_lengths = t
ids_str_max = x_lengths - segment_size + 1
ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
ret = slice_segments(x, ids_str, segment_size)
return ret, ids_str
def get_timing_signal_1d(
length, channels, min_timescale=1.0, max_timescale=1.0e4):
position = torch.arange(length, dtype=torch.float)
num_timescales = channels // 2
log_timescale_increment = (
math.log(float(max_timescale) / float(min_timescale)) /
(num_timescales - 1))
inv_timescales = min_timescale * torch.exp(
torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
signal = F.pad(signal, [0, 0, 0, channels % 2])
signal = signal.view(1, channels, length)
return signal
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
b, channels, length = x.size()
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return x + signal.to(dtype=x.dtype, device=x.device)
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
b, channels, length = x.size()
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
def subsequent_mask(length):
mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
return mask
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
n_channels_int = n_channels[0]
in_act = input_a + input_b
t_act = torch.tanh(in_act[:, :n_channels_int, :])
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
acts = t_act * s_act
return acts
def convert_pad_shape(pad_shape):
l = pad_shape[::-1]
pad_shape = [item for sublist in l for item in sublist]
return pad_shape
def shift_1d(x):
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
return x
def sequence_mask(length, max_length=None):
if max_length is None:
max_length = length.max()
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
return x.unsqueeze(0) < length.unsqueeze(1)
def generate_path(duration, mask):
"""
duration: [b, 1, t_x]
mask: [b, 1, t_y, t_x]
"""
device = duration.device
b, _, t_y, t_x = mask.shape
cum_duration = torch.cumsum(duration, -1)
cum_duration_flat = cum_duration.view(b * t_x)
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
path = path.view(b, t_x, t_y)
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
path = path.unsqueeze(1).transpose(2,3) * mask
return path
def clip_grad_value_(parameters, clip_value, norm_type=2):
if isinstance(parameters, torch.Tensor):
parameters = [parameters]
parameters = list(filter(lambda p: p.grad is not None, parameters))
norm_type = float(norm_type)
if clip_value is not None:
clip_value = float(clip_value)
total_norm = 0
for p in parameters:
param_norm = p.grad.data.norm(norm_type)
total_norm += param_norm.item() ** norm_type
if clip_value is not None:
p.grad.data.clamp_(min=-clip_value, max=clip_value)
total_norm = total_norm ** (1. / norm_type)
return total_norm
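
A small sketch of the masking and slicing helpers in demo/mod/commons.py above; the tensor sizes are arbitrary examples.

import torch
from commons import sequence_mask, rand_slice_segments

lengths = torch.tensor([6, 3])
mask = sequence_mask(lengths, max_length=6)   # [[T,T,T,T,T,T],[T,T,T,F,F,F]]

x = torch.randn(2, 80, 6)                     # [batch, channels, frames]
segments, start_ids = rand_slice_segments(x, lengths, segment_size=3)
# segments: [2, 80, 3]; start_ids: one random valid start frame per item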

492
demo/mod/data_utils.py Executable file

@@ -0,0 +1,492 @@
import time
import os
import random
import numpy as np
import torch
import torch.utils.data
import tqdm
import commons
from mel_processing import spectrogram_torch
from utils import load_wav_to_torch, load_filepaths_and_text
from text import text_to_sequence, cleaned_text_to_sequence
import struct
#add
from retry import retry
import random
import torchaudio
from scipy.io.wavfile import write
class TextAudioLoader(torch.utils.data.Dataset):
"""
1) loads audio, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths_and_text, hparams, use_test = True):
self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.use_test = use_test
self.cleaned_text = getattr(hparams, "cleaned_text", False)
self.add_blank = hparams.add_blank
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 190)
random.seed(1234)
random.shuffle(self.audiopaths_and_text)
self._filter()
def _filter(self):
"""
Filter text & store spec lengths
"""
# Store spectrogram lengths for Bucketing
# wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
# spec_length = wav_length // hop_length
audiopaths_and_text_new = []
lengths = []
for audiopath, text in self.audiopaths_and_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_and_text_new.append([audiopath, text])
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
self.audiopaths_and_text = audiopaths_and_text_new
self.lengths = lengths
def get_audio_text_pair(self, audiopath_and_text):
# separate filename and text
audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
text = self.get_text(text)
if self.use_test != True:
text = self.get_text("a")  # torch.as_tensor("a") would fail on a string; mirrors TextAudioSpeakerLoader
spec, wav = self.get_audio(audiopath)
return (text, spec, wav)
def get_audio(self, filename):
audio, sampling_rate = load_wav_to_torch(filename)
if sampling_rate != self.sampling_rate:
raise ValueError("{} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate))
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec_filename = filename.replace(".wav", ".spec.pt")
if os.path.exists(spec_filename):
spec = torch.load(spec_filename)
else:
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
torch.save(spec, spec_filename)
return spec, audio_norm
def get_text(self, text):
if self.cleaned_text:
text_norm = cleaned_text_to_sequence(text)
else:
text_norm = text_to_sequence(text, self.text_cleaners)
if self.add_blank:
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
return text_norm
def __getitem__(self, index):
return self.get_audio_text_pair(self.audiopaths_and_text[index])
def __len__(self):
return len(self.audiopaths_and_text)
class TextAudioCollate():
""" Zero-pads model inputs and targets
"""
def __init__(self, return_ids=False):
self.return_ids = return_ids
def __call__(self, batch):
"""Collates a training batch from normalized text and audio
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
max_text_len = max([len(x[0]) for x in batch])
max_spec_len = max([x[1].size(1) for x in batch])
max_wav_len = max([x[2].size(1) for x in batch])
text_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
wav_lengths = torch.LongTensor(len(batch))
text_padded = torch.LongTensor(len(batch), max_text_len)
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
text_padded.zero_()
spec_padded.zero_()
wav_padded.zero_()
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
text = row[0]
text_padded[i, :text.size(0)] = text
text_lengths[i] = text.size(0)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, :wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
if self.return_ids:
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, ids_sorted_decreasing
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths
"""Multi speaker version"""
class TextAudioSpeakerLoader(torch.utils.data.Dataset):
"""
1) loads audio, speaker_id, text pairs
2) normalizes text and converts them to sequences of integers
3) computes spectrograms from audio files.
"""
def __init__(self, audiopaths_sid_text, hparams, no_text=False, augmentation=False, augmentation_params=None, no_use_textfile = False):
if no_use_textfile:
self.audiopaths_sid_text = list()
else:
self.audiopaths_sid_text = load_filepaths_and_text(audiopaths_sid_text)
self.text_cleaners = hparams.text_cleaners
self.max_wav_value = hparams.max_wav_value
self.sampling_rate = hparams.sampling_rate
self.filter_length = hparams.filter_length
self.hop_length = hparams.hop_length
self.win_length = hparams.win_length
self.sampling_rate = hparams.sampling_rate
self.no_text = no_text
self.augmentation = augmentation
if augmentation :
self.gain_p = augmentation_params.gain_p
self.min_gain_in_db = augmentation_params.min_gain_in_db
self.max_gain_in_db = augmentation_params.max_gain_in_db
self.time_stretch_p = augmentation_params.time_stretch_p
self.min_rate = augmentation_params.min_rate
self.max_rate = augmentation_params.max_rate
self.pitch_shift_p = augmentation_params.pitch_shift_p
self.min_semitones = augmentation_params.min_semitones
self.max_semitones = augmentation_params.max_semitones
self.add_gaussian_noise_p = augmentation_params.add_gaussian_noise_p
self.min_amplitude = augmentation_params.min_amplitude
self.max_amplitude = augmentation_params.max_amplitude
self.frequency_mask_p = augmentation_params.frequency_mask_p
self.cleaned_text = getattr(hparams, "cleaned_text", False)
self.add_blank = hparams.add_blank
self.min_text_len = getattr(hparams, "min_text_len", 1)
self.max_text_len = getattr(hparams, "max_text_len", 1000)
random.seed(1234)
random.shuffle(self.audiopaths_sid_text)
self._filter()
@retry(tries=30, delay=10)
def _filter(self):
"""
Filter text & store spec lengths
"""
audiopaths_sid_text_new = []
lengths = []
# for audiopath, sid, text in tqdm.tqdm(self.audiopaths_sid_text):
for audiopath, sid, text in self.audiopaths_sid_text:
if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
audiopaths_sid_text_new.append([audiopath, sid, text])
lengths.append(os.path.getsize(audiopath) // (2 * self.hop_length))
self.audiopaths_sid_text = audiopaths_sid_text_new
self.lengths = lengths
def get_audio_text_speaker_pair(self, audiopath_sid_text):
# separate filename, speaker_id and text
wavdata, sid, text = audiopath_sid_text[0], audiopath_sid_text[1], audiopath_sid_text[2]
text = self.get_text(text)
if self.no_text:
text = self.get_text("a")
spec, wav = self.get_audio(wavdata)
sid = self.get_sid(sid)
return (text, spec, wav, sid)
@retry(exceptions=(PermissionError), tries=100, delay=10)
def get_audio(self, wavdata):
# Audio data is handled as a torch tensor normalized to within ±1.0 and wrapped with one extra outer dimension via unsqueeze(0)
audio = torch.FloatTensor(wavdata.astype(np.float32))
sampling_rate=24000
try:
if sampling_rate != self.sampling_rate:
raise ValueError("[Error] Exception: source {} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate))
except ValueError as e:
print(e)
exit()
audio_norm = self.get_normalized_audio(audio, self.max_wav_value)
if self.augmentation:
audio_augmented = self.add_augmentation(audio_norm, sampling_rate)
audio_noised = self.add_noise(audio_augmented, sampling_rate)
# Clamp values that went out of range after augmentation and noise were applied to the normalized audio
audio_augmented = torch.clamp(audio_augmented, -1, 1)
audio_noised = torch.clamp(audio_noised, -1, 1)
# audio (the waveform) is the training target, so use the augmented version without added noise
audio_norm = audio_augmented
# spec (the spectrogram) is the model input, so use the augmented version with noise added
spec = spectrogram_torch(audio_noised, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec_noised = self.add_spectrogram_noise(spec)
spec = torch.squeeze(spec_noised, 0)
else:
spec = spectrogram_torch(audio_norm, self.filter_length,
self.sampling_rate, self.hop_length, self.win_length,
center=False)
spec = torch.squeeze(spec, 0)
return spec, audio_norm
def add_augmentation(self, audio, sampling_rate):
gain_in_db = 0.0
if random.random() <= self.gain_p:
gain_in_db = random.uniform(self.min_gain_in_db, self.max_gain_in_db)
time_stretch_rate = 1.0
if random.random() <= self.time_stretch_p:
time_stretch_rate = random.uniform(self.min_rate, self.max_rate)
pitch_shift_semitones = 0
if random.random() <= self.pitch_shift_p:
pitch_shift_semitones = random.uniform(self.min_semitones, self.max_semitones) * 100 # the pitch effect takes units of 1/100 semitone
augmentation_effects = [
["gain", f"{gain_in_db}"],
["tempo", f"{time_stretch_rate}"],
["pitch", f"{pitch_shift_semitones}"],
["rate", f"{sampling_rate}"]
]
audio_augmented, _ = torchaudio.sox_effects.apply_effects_tensor(audio, sampling_rate, augmentation_effects)
return audio_augmented
def add_noise(self, audio, sampling_rate):
# AddGaussianNoise
audio = self.add_gaussian_noise(audio)
return audio
def add_gaussian_noise(self, audio):
assert self.min_amplitude >= 0.0
assert self.max_amplitude >= 0.0
assert self.max_amplitude >= self.min_amplitude
if random.random() > self.add_gaussian_noise_p:
return audio
amplitude = random.uniform(self.min_amplitude, self.max_amplitude)
noise = torch.randn(audio.size())
noised_audio = audio + amplitude * noise
return noised_audio
def add_spectrogram_noise(self, spec):
# FrequencyMask
masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=80)
masked = masking(spec)
return masked
def get_normalized_audio(self, audio, max_wav_value):
audio_norm = audio / max_wav_value
audio_norm = audio_norm.unsqueeze(0)
return audio_norm
def get_text(self, text):
if self.cleaned_text:
text_norm = cleaned_text_to_sequence(text)
else:
text_norm = text_to_sequence(text, self.text_cleaners)
if self.add_blank:
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
return text_norm
def get_sid(self, sid):
sid = torch.LongTensor([int(sid)])
return sid
def __getitem__(self, index):
return self.get_audio_text_speaker_pair(self.audiopaths_sid_text[index])
def __len__(self):
return len(self.audiopaths_sid_text)
class TextAudioSpeakerCollate():
""" Zero-pads model inputs and targets
"""
def __init__(self, return_ids=False, no_text = False):
self.return_ids = return_ids
self.no_text = no_text
def __call__(self, batch):
"""Collates a training batch from normalized text, audio and speaker identities
PARAMS
------
batch: [text_normalized, spec_normalized, wav_normalized, sid]
"""
# Right zero-pad all one-hot text sequences to max input length
_, ids_sorted_decreasing = torch.sort(
torch.LongTensor([x[1].size(1) for x in batch]),
dim=0, descending=True)
max_text_len = max([len(x[0]) for x in batch])
max_spec_len = max([x[1].size(1) for x in batch])
max_wav_len = max([x[2].size(1) for x in batch])
text_lengths = torch.LongTensor(len(batch))
spec_lengths = torch.LongTensor(len(batch))
wav_lengths = torch.LongTensor(len(batch))
sid = torch.LongTensor(len(batch))
text_padded = torch.LongTensor(len(batch), max_text_len)
spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
text_padded.zero_()
spec_padded.zero_()
wav_padded.zero_()
for i in range(len(ids_sorted_decreasing)):
row = batch[ids_sorted_decreasing[i]]
text = row[0]
text_padded[i, :text.size(0)] = text
text_lengths[i] = text.size(0)
spec = row[1]
spec_padded[i, :, :spec.size(1)] = spec
spec_lengths[i] = spec.size(1)
wav = row[2]
wav_padded[i, :, :wav.size(1)] = wav
wav_lengths[i] = wav.size(1)
sid[i] = row[3]
if self.return_ids:
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid, ids_sorted_decreasing
return text_padded, text_lengths, spec_padded, spec_lengths, wav_padded, wav_lengths, sid
class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
"""
Maintain similar input lengths in a batch.
Length groups are specified by boundaries.
Ex) boundaries = [b1, b2, b3] -> any batch is included in either {x | b1 < length(x) <= b2} or {x | b2 < length(x) <= b3}.
It removes samples which are not included in the boundaries.
Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
"""
def __init__(self, dataset, batch_size, boundaries, num_replicas=None, rank=None, shuffle=True):
super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
self.lengths = dataset.lengths
self.batch_size = batch_size
self.boundaries = boundaries
self.buckets, self.num_samples_per_bucket = self._create_buckets()
self.total_size = sum(self.num_samples_per_bucket)
self.num_samples = self.total_size // self.num_replicas
def _create_buckets(self):
buckets = [[] for _ in range(len(self.boundaries) - 1)]
for i in range(len(self.lengths)):
length = self.lengths[i]
idx_bucket = self._bisect(length)
if idx_bucket != -1:
buckets[idx_bucket].append(i)
for i in range(len(buckets) - 1, 0, -1):
if len(buckets[i]) == 0:
buckets.pop(i)
self.boundaries.pop(i+1)
num_samples_per_bucket = []
for i in range(len(buckets)):
len_bucket = len(buckets[i])
total_batch_size = self.num_replicas * self.batch_size
rem = (total_batch_size - (len_bucket % total_batch_size)) % total_batch_size
num_samples_per_bucket.append(len_bucket + rem)
return buckets, num_samples_per_bucket
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
if self.shuffle:
for bucket in self.buckets:
indices.append(torch.randperm(len(bucket), generator=g).tolist())
else:
for bucket in self.buckets:
indices.append(list(range(len(bucket))))
batches = []
for i in range(len(self.buckets)):
next_bucket = (i+1) % len(self.buckets)
bucket = self.buckets[i]
len_bucket = len(bucket)
ids_bucket = indices[i]
num_samples_bucket = self.num_samples_per_bucket[i]
if len_bucket == 0:
print("[Warn] Exception: length of buckets {} is 0. ID:{} Skip.".format(i,i))
continue
# add extra samples to make it evenly divisible
rem = num_samples_bucket - len_bucket
ids_bucket = ids_bucket + ids_bucket * (rem // len_bucket) + ids_bucket[:(rem % len_bucket)]
# subsample
ids_bucket = ids_bucket[self.rank::self.num_replicas]
# batching
for j in range(len(ids_bucket) // self.batch_size):
batch = [bucket[idx] for idx in ids_bucket[j*self.batch_size:(j+1)*self.batch_size]]
batches.append(batch)
if self.shuffle:
batch_ids = torch.randperm(len(batches), generator=g).tolist()
batches = [batches[i] for i in batch_ids]
self.batches = batches
assert len(self.batches) * self.batch_size == self.num_samples
return iter(self.batches)
def _bisect(self, x, lo=0, hi=None):
if hi is None:
hi = len(self.boundaries) - 1
if hi > lo:
mid = (hi + lo) // 2
if self.boundaries[mid] < x and x <= self.boundaries[mid+1]:
return mid
elif x <= self.boundaries[mid]:
return self._bisect(x, lo, mid)
else:
return self._bisect(x, mid + 1, hi)
else:
return -1
def __len__(self):
return self.num_samples // self.batch_size
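
A sketch of how the DistributedBucketSampler above groups indices by spectrogram length. The stand-in dataset and boundary values are illustrative; only the .lengths attribute is read by the sampler, and num_replicas/rank are passed explicitly so no process group is needed. It assumes the demo/mod dependencies (commons, mel_processing, utils, text, torchaudio, retry) are importable so that data_utils itself imports.

import torch
from data_utils import DistributedBucketSampler

class _LengthsOnly(torch.utils.data.Dataset):
    """Stand-in dataset; the sampler only reads .lengths (spec frame counts)."""
    def __init__(self, lengths):
        self.lengths = lengths
    def __len__(self):
        return len(self.lengths)
    def __getitem__(self, i):
        return i

dataset = _LengthsOnly([40, 45, 90, 95, 300, 310, 320, 330])
sampler = DistributedBucketSampler(dataset, batch_size=2,
                                   boundaries=[32, 64, 128, 256, 512],
                                   num_replicas=1, rank=0, shuffle=True)
sampler.set_epoch(0)
for batch in sampler:   # each batch holds indices whose lengths fall in one bucket
    print(batch)        # e.g. [1, 0], [3, 2], [5, 7], [4, 6]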

114
demo/mod/mel_processing.py Executable file

@@ -0,0 +1,114 @@
import math
import os
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data
import numpy as np
import librosa
import librosa.util as librosa_util
from librosa.util import normalize, pad_center, tiny
from scipy.signal import get_window
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn
MAX_WAV_VALUE = 32768.0
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
"""
PARAMS
------
C: compression factor
"""
return torch.log(torch.clamp(x, min=clip_val) * C)
def dynamic_range_decompression_torch(x, C=1):
"""
PARAMS
------
C: compression factor used to compress
"""
return torch.exp(x) / C
def spectral_normalize_torch(magnitudes):
output = dynamic_range_compression_torch(magnitudes)
return output
def spectral_de_normalize_torch(magnitudes):
output = dynamic_range_decompression_torch(magnitudes)
return output
mel_basis = {}
hann_window = {}
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
if torch.min(y) < -1.:
print('min value is ', torch.min(y))
if torch.max(y) > 1.:
print('max value is ', torch.max(y))
global hann_window
dtype_device = str(y.dtype) + '_' + str(y.device)
wnsize_dtype_device = str(win_size) + '_' + dtype_device
if wnsize_dtype_device not in hann_window:
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
y = y.squeeze(1)
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
spec = torch.view_as_real(spec)
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
return spec
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
global mel_basis
dtype_device = str(spec.dtype) + '_' + str(spec.device)
fmax_dtype_device = str(fmax) + '_' + dtype_device
if fmax_dtype_device not in mel_basis:
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
spec = spectral_normalize_torch(spec)
return spec
def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):
if torch.min(y) < -1.:
print('min value is ', torch.min(y))
if torch.max(y) > 1.:
print('max value is ', torch.max(y))
global mel_basis, hann_window
dtype_device = str(y.dtype) + '_' + str(y.device)
fmax_dtype_device = str(fmax) + '_' + dtype_device
wnsize_dtype_device = str(win_size) + '_' + dtype_device
if fmax_dtype_device not in mel_basis:
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
if wnsize_dtype_device not in hann_window:
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
y = y.squeeze(1)
spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=True)
spec = torch.view_as_real(spec)
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
spec = spectral_normalize_torch(spec)
return spec
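
For reference, a minimal usage sketch for the two helpers above. The STFT/mel settings (n_fft 1024, hop 256, win 1024, 80 mels, 22.05 kHz) are illustrative assumptions, not values prescribed by this file; the demo takes the real ones from its training config.
import torch
# assumes demo/mod is on sys.path so this file is importable as mel_processing
from mel_processing import spectrogram_torch, mel_spectrogram_torch
y = torch.randn(1, 22050).clamp(-1., 1.)   # [batch, samples], float waveform in [-1, 1]
spec = spectrogram_torch(y, n_fft=1024, sampling_rate=22050, hop_size=256, win_size=1024)
mel = mel_spectrogram_torch(y, n_fft=1024, num_mels=80, sampling_rate=22050,
                            hop_size=256, win_size=1024, fmin=0.0, fmax=None)
print(spec.shape, mel.shape)               # [1, 513, frames] and [1, 80, frames]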

407
demo/mod/models.py Executable file

@ -0,0 +1,407 @@
import copy
import math
import torch
from torch import nn
from torch.nn import functional as F
import commons
import modules
import attentions
import monotonic_align
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from commons import init_weights, get_padding
class TextEncoder(nn.Module):
def __init__(self,
n_vocab,
out_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout):
super().__init__()
self.n_vocab = n_vocab
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.emb = nn.Embedding(n_vocab, hidden_channels)
nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
self.encoder = attentions.Encoder(
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout)
self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths):
x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
x = torch.transpose(x, 1, -1) # [b, h, t]
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.encoder(x * x_mask, x_mask)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
return x, m, logs, x_mask
class ResidualCouplingBlock(nn.Module):
def __init__(self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
n_flows=4,
gin_channels=0):
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.n_flows = n_flows
self.gin_channels = gin_channels
self.flows = nn.ModuleList()
for i in range(n_flows):
self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
self.flows.append(modules.Flip())
def forward(self, x, x_mask, g=None, reverse=False):
if not reverse:
for flow in self.flows:
x, _ = flow(x, x_mask, g=g, reverse=reverse)
else:
for flow in reversed(self.flows):
x = flow(x, x_mask, g=g, reverse=reverse)
return x
class PosteriorEncoder(nn.Module):
def __init__(self,
in_channels,
out_channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
gin_channels=0):
super().__init__()
self.in_channels = in_channels
self.out_channels = out_channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels)
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
def forward(self, x, x_lengths, g=None):
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
x = self.pre(x) * x_mask
x = self.enc(x, x_mask, g=g)
stats = self.proj(x) * x_mask
m, logs = torch.split(stats, self.out_channels, dim=1)
z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
return z, m, logs, x_mask
class Generator(torch.nn.Module):
def __init__(self, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=0):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
resblock = modules.ResBlock1 if resblock == '1' else modules.ResBlock2
self.ups = nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
self.ups.append(weight_norm(
ConvTranspose1d(upsample_initial_channel//(2**i), upsample_initial_channel//(2**(i+1)),
k, u, padding=(k-u)//2)))
self.resblocks = nn.ModuleList()
for i in range(len(self.ups)):
ch = upsample_initial_channel//(2**(i+1))
for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
self.resblocks.append(resblock(ch, k, d))
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups.apply(init_weights)
if gin_channels != 0:
#self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
gin_channels = 0
def forward(self, x, g=None):
x = self.conv_pre(x)
if g is not None:
#x = x + self.cond(g)
g=None
for i in range(self.num_upsamples):
x = F.leaky_relu(x, modules.LRELU_SLOPE)
x = self.ups[i](x)
xs = None
for j in range(self.num_kernels):
if xs is None:
xs = self.resblocks[i*self.num_kernels+j](x)
else:
xs += self.resblocks[i*self.num_kernels+j](x)
x = xs / self.num_kernels
x = F.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def remove_weight_norm(self):
print('Removing weight norm...')
for l in self.ups:
remove_weight_norm(l)
for l in self.resblocks:
l.remove_weight_norm()
class DiscriminatorP(torch.nn.Module):
def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
super(DiscriminatorP, self).__init__()
self.period = period
self.use_spectral_norm = use_spectral_norm
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
])
self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
def forward(self, x):
fmap = []
# 1d to 2d
b, c, t = x.shape
if t % self.period != 0: # pad first
n_pad = self.period - (t % self.period)
x = F.pad(x, (0, n_pad), "reflect")
t = t + n_pad
x = x.view(b, c, t // self.period, self.period)
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, modules.LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class DiscriminatorS(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(DiscriminatorS, self).__init__()
norm_f = weight_norm if use_spectral_norm == False else spectral_norm
self.convs = nn.ModuleList([
norm_f(Conv1d(1, 16, 15, 1, padding=7)),
norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
])
self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
def forward(self, x):
fmap = []
for l in self.convs:
x = l(x)
x = F.leaky_relu(x, modules.LRELU_SLOPE)
fmap.append(x)
x = self.conv_post(x)
fmap.append(x)
x = torch.flatten(x, 1, -1)
return x, fmap
class MultiPeriodDiscriminator(torch.nn.Module):
def __init__(self, use_spectral_norm=False):
super(MultiPeriodDiscriminator, self).__init__()
periods = [2,3,5,7,11]
discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
self.discriminators = nn.ModuleList(discs)
def forward(self, y, y_hat):
y_d_rs = []
y_d_gs = []
fmap_rs = []
fmap_gs = []
for i, d in enumerate(self.discriminators):
y_d_r, fmap_r = d(y)
y_d_g, fmap_g = d(y_hat)
y_d_rs.append(y_d_r)
y_d_gs.append(y_d_g)
fmap_rs.append(fmap_r)
fmap_gs.append(fmap_g)
return y_d_rs, y_d_gs, fmap_rs, fmap_gs
class SynthesizerTrn(nn.Module):
"""
Synthesizer for Training
"""
def __init__(self,
n_vocab,
spec_channels,
segment_size,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
n_flow,
n_speakers=0,
gin_channels=0,
use_sdp=True,
**kwargs):
super().__init__()
self.n_vocab = n_vocab
self.spec_channels = spec_channels
self.hidden_channels = hidden_channels
self.filter_channels = filter_channels
self.n_heads = n_heads
self.n_layers = n_layers
self.kernel_size = kernel_size
self.p_dropout = p_dropout
self.resblock = resblock
self.resblock_kernel_sizes = resblock_kernel_sizes
self.resblock_dilation_sizes = resblock_dilation_sizes
self.upsample_rates = upsample_rates
self.upsample_initial_channel = upsample_initial_channel
self.upsample_kernel_sizes = upsample_kernel_sizes
self.segment_size = segment_size
self.n_speakers = n_speakers
self.gin_channels = gin_channels
self.use_sdp = use_sdp
self.enc_p = TextEncoder(n_vocab,
inter_channels,
hidden_channels,
filter_channels,
n_heads,
n_layers,
kernel_size,
p_dropout)
self.dec = Generator(inter_channels, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates, upsample_initial_channel, upsample_kernel_sizes, gin_channels=gin_channels)
self.enc_q = PosteriorEncoder(spec_channels, inter_channels, hidden_channels, 5, 1, 16, gin_channels=gin_channels)
self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, n_flows=n_flow, gin_channels=gin_channels)
if n_speakers > 1:
self.emb_g = nn.Embedding(n_speakers, gin_channels)
def forward(self, x, x_lengths, y, y_lengths, sid=None):
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
if self.n_speakers > 0:
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
else:
g = None
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
z_p = self.flow(z, y_mask, g=g)
with torch.no_grad():
# negative cross-entropy
s_p_sq_r = torch.exp(-2 * logs_p) # [b, d, t]
neg_cent1 = torch.sum(-0.5 * math.log(2 * math.pi) - logs_p, [1], keepdim=True) # [b, 1, t_s]
neg_cent2 = torch.matmul(-0.5 * (z_p ** 2).transpose(1, 2), s_p_sq_r) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
neg_cent3 = torch.matmul(z_p.transpose(1, 2), (m_p * s_p_sq_r)) # [b, t_t, d] x [b, d, t_s] = [b, t_t, t_s]
neg_cent4 = torch.sum(-0.5 * (m_p ** 2) * s_p_sq_r, [1], keepdim=True) # [b, 1, t_s]
neg_cent = neg_cent1 + neg_cent2 + neg_cent3 + neg_cent4
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
attn = monotonic_align.maximum_path(neg_cent, attn_mask.squeeze(1)).unsqueeze(1).detach()
# expand prior
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2)
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2)
z_slice, ids_slice = commons.rand_slice_segments(z, y_lengths, self.segment_size)
o = self.dec(z_slice, g=g)
return o, attn, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
def voice_conversion(self, y, y_lengths, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
g_src = self.emb_g(sid_src).unsqueeze(-1)
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
z_p = self.flow(z, y_mask, g=g_src)
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
o_hat = self.dec(z_hat * y_mask, g=g_tgt)
return o_hat, y_mask, (z, z_p, z_hat)
def voice_ra_pa_db(self, y, y_lengths, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
g_src = self.emb_g(sid_src).unsqueeze(-1)
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
o_hat = self.dec(z * y_mask, g=g_tgt)
return o_hat, y_mask, (z)
def voice_ra_pa_da(self, y, y_lengths, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
g_src = self.emb_g(sid_src).unsqueeze(-1)
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
o_hat = self.dec(z * y_mask, g=g_src)
return o_hat, y_mask, (z)
def voice_conversion_cycle(self, y, y_lengths, sid_src, sid_tgt):
assert self.n_speakers > 0, "n_speakers have to be larger than 0."
g_src = self.emb_g(sid_src).unsqueeze(-1)
g_tgt = self.emb_g(sid_tgt).unsqueeze(-1)
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src)
z_p = self.flow(z, y_mask, g=g_src)
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
z_p_hat = self.flow(z_hat, y_mask, g=g_tgt)
z_hat_hat = self.flow(z_p_hat, y_mask, g=g_src, reverse=True)
o_hat = self.dec(z_hat_hat * y_mask, g=g_tgt)
return o_hat, y_mask, (z, z_p, z_hat)
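
A rough sketch of how voice_conversion above is driven at inference time: the linear spectrogram of the source utterance is encoded with the source speaker embedding, pushed through the flow, inverted under the target speaker embedding, and decoded. All hyperparameter values below are illustrative placeholders (the demo loads the real ones from its training config), and demo/mod is assumed to be on sys.path.
import torch
from models import SynthesizerTrn
net_g = SynthesizerTrn(
    n_vocab=40, spec_channels=513, segment_size=32, inter_channels=192,
    hidden_channels=192, filter_channels=768, n_heads=2, n_layers=6,
    kernel_size=3, p_dropout=0.1, resblock='1',
    resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2], upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4], n_flow=4,
    n_speakers=110, gin_channels=256).eval()
with torch.no_grad():
    spec = torch.randn(1, 513, 120)            # [b, spec_channels, frames] source spectrogram
    spec_lengths = torch.LongTensor([120])
    sid_src, sid_tgt = torch.LongTensor([107]), torch.LongTensor([100])
    audio, y_mask, _ = net_g.voice_conversion(spec, spec_lengths, sid_src, sid_tgt)
    # audio: [1, 1, frames * prod(upsample_rates)] waveform, roughly in [-1, 1]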

390
demo/mod/modules.py Executable file

@ -0,0 +1,390 @@
import copy
import math
import numpy as np
import scipy
import torch
from torch import nn
from torch.nn import functional as F
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm
import commons
from commons import init_weights, get_padding
from transforms import piecewise_rational_quadratic_transform
LRELU_SLOPE = 0.1
class LayerNorm(nn.Module):
def __init__(self, channels, eps=1e-5):
super().__init__()
self.channels = channels
self.eps = eps
self.gamma = nn.Parameter(torch.ones(channels))
self.beta = nn.Parameter(torch.zeros(channels))
def forward(self, x):
x = x.transpose(1, -1)
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
return x.transpose(1, -1)
class ConvReluNorm(nn.Module):
def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
super().__init__()
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.out_channels = out_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = p_dropout
assert n_layers > 1, "Number of layers should be larger than 1."
self.conv_layers = nn.ModuleList()
self.norm_layers = nn.ModuleList()
self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
self.norm_layers.append(LayerNorm(hidden_channels))
self.relu_drop = nn.Sequential(
nn.ReLU(),
nn.Dropout(p_dropout))
for _ in range(n_layers-1):
self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
self.norm_layers.append(LayerNorm(hidden_channels))
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask):
x_org = x
for i in range(self.n_layers):
x = self.conv_layers[i](x * x_mask)
x = self.norm_layers[i](x)
x = self.relu_drop(x)
x = x_org + self.proj(x)
return x * x_mask
class DDSConv(nn.Module):
"""
Dilated and Depth-Separable Convolution
"""
def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
super().__init__()
self.channels = channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.p_dropout = p_dropout
self.drop = nn.Dropout(p_dropout)
self.convs_sep = nn.ModuleList()
self.convs_1x1 = nn.ModuleList()
self.norms_1 = nn.ModuleList()
self.norms_2 = nn.ModuleList()
for i in range(n_layers):
dilation = kernel_size ** i
padding = (kernel_size * dilation - dilation) // 2
self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
groups=channels, dilation=dilation, padding=padding
))
self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
self.norms_1.append(LayerNorm(channels))
self.norms_2.append(LayerNorm(channels))
def forward(self, x, x_mask, g=None):
if g is not None:
x = x + g
for i in range(self.n_layers):
y = self.convs_sep[i](x * x_mask)
y = self.norms_1[i](y)
y = F.gelu(y)
y = self.convs_1x1[i](y)
y = self.norms_2[i](y)
y = F.gelu(y)
y = self.drop(y)
x = x + y
return x * x_mask
class WN(torch.nn.Module):
def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=0, p_dropout=0):
super(WN, self).__init__()
assert(kernel_size % 2 == 1)
self.hidden_channels =hidden_channels
self.kernel_size = kernel_size,
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.gin_channels = gin_channels
self.p_dropout = p_dropout
self.in_layers = torch.nn.ModuleList()
self.res_skip_layers = torch.nn.ModuleList()
self.drop = nn.Dropout(p_dropout)
if gin_channels != 0:
cond_layer = torch.nn.Conv1d(gin_channels, 2*hidden_channels*n_layers, 1)
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
for i in range(n_layers):
dilation = dilation_rate ** i
padding = int((kernel_size * dilation - dilation) / 2)
in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
dilation=dilation, padding=padding)
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
self.in_layers.append(in_layer)
# last one is not necessary
if i < n_layers - 1:
res_skip_channels = 2 * hidden_channels
else:
res_skip_channels = hidden_channels
res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
self.res_skip_layers.append(res_skip_layer)
def forward(self, x, x_mask, g=None, **kwargs):
output = torch.zeros_like(x)
n_channels_tensor = torch.IntTensor([self.hidden_channels])
if g is not None:
g = self.cond_layer(g)
for i in range(self.n_layers):
x_in = self.in_layers[i](x)
if g is not None:
cond_offset = i * 2 * self.hidden_channels
g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
else:
g_l = torch.zeros_like(x_in)
acts = commons.fused_add_tanh_sigmoid_multiply(
x_in,
g_l,
n_channels_tensor)
acts = self.drop(acts)
res_skip_acts = self.res_skip_layers[i](acts)
if i < self.n_layers - 1:
res_acts = res_skip_acts[:,:self.hidden_channels,:]
x = (x + res_acts) * x_mask
output = output + res_skip_acts[:,self.hidden_channels:,:]
else:
output = output + res_skip_acts
return output * x_mask
def remove_weight_norm(self):
if self.gin_channels != 0:
torch.nn.utils.remove_weight_norm(self.cond_layer)
for l in self.in_layers:
torch.nn.utils.remove_weight_norm(l)
for l in self.res_skip_layers:
torch.nn.utils.remove_weight_norm(l)
class ResBlock1(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
super(ResBlock1, self).__init__()
self.convs1 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
padding=get_padding(kernel_size, dilation[2])))
])
self.convs1.apply(init_weights)
self.convs2 = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
padding=get_padding(kernel_size, 1)))
])
self.convs2.apply(init_weights)
def forward(self, x, x_mask=None):
for c1, c2 in zip(self.convs1, self.convs2):
xt = F.leaky_relu(x, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c1(xt)
xt = F.leaky_relu(xt, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c2(xt)
x = xt + x
if x_mask is not None:
x = x * x_mask
return x
def remove_weight_norm(self):
for l in self.convs1:
remove_weight_norm(l)
for l in self.convs2:
remove_weight_norm(l)
class ResBlock2(torch.nn.Module):
def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
super(ResBlock2, self).__init__()
self.convs = nn.ModuleList([
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
padding=get_padding(kernel_size, dilation[0]))),
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
padding=get_padding(kernel_size, dilation[1])))
])
self.convs.apply(init_weights)
def forward(self, x, x_mask=None):
for c in self.convs:
xt = F.leaky_relu(x, LRELU_SLOPE)
if x_mask is not None:
xt = xt * x_mask
xt = c(xt)
x = xt + x
if x_mask is not None:
x = x * x_mask
return x
def remove_weight_norm(self):
for l in self.convs:
remove_weight_norm(l)
class Log(nn.Module):
def forward(self, x, x_mask, reverse=False, **kwargs):
if not reverse:
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
logdet = torch.sum(-y, [1, 2])
return y, logdet
else:
x = torch.exp(x) * x_mask
return x
class Flip(nn.Module):
def forward(self, x, *args, reverse=False, **kwargs):
x = torch.flip(x, [1])
if not reverse:
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
return x, logdet
else:
return x
class ElementwiseAffine(nn.Module):
def __init__(self, channels):
super().__init__()
self.channels = channels
self.m = nn.Parameter(torch.zeros(channels,1))
self.logs = nn.Parameter(torch.zeros(channels,1))
def forward(self, x, x_mask, reverse=False, **kwargs):
if not reverse:
y = self.m + torch.exp(self.logs) * x
y = y * x_mask
logdet = torch.sum(self.logs * x_mask, [1,2])
return y, logdet
else:
x = (x - self.m) * torch.exp(-self.logs) * x_mask
return x
class ResidualCouplingLayer(nn.Module):
def __init__(self,
channels,
hidden_channels,
kernel_size,
dilation_rate,
n_layers,
p_dropout=0,
gin_channels=0,
mean_only=False):
assert channels % 2 == 0, "channels should be divisible by 2"
super().__init__()
self.channels = channels
self.hidden_channels = hidden_channels
self.kernel_size = kernel_size
self.dilation_rate = dilation_rate
self.n_layers = n_layers
self.half_channels = channels // 2
self.mean_only = mean_only
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, gin_channels=gin_channels)
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
self.post.weight.data.zero_()
self.post.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False):
x0, x1 = torch.split(x, [self.half_channels]*2, 1)
h = self.pre(x0) * x_mask
h = self.enc(h, x_mask, g=g)
stats = self.post(h) * x_mask
if not self.mean_only:
m, logs = torch.split(stats, [self.half_channels]*2, 1)
else:
m = stats
logs = torch.zeros_like(m)
if not reverse:
x1 = m + x1 * torch.exp(logs) * x_mask
x = torch.cat([x0, x1], 1)
logdet = torch.sum(logs, [1,2])
return x, logdet
else:
x1 = (x1 - m) * torch.exp(-logs) * x_mask
x = torch.cat([x0, x1], 1)
return x
class ConvFlow(nn.Module):
def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
super().__init__()
self.in_channels = in_channels
self.filter_channels = filter_channels
self.kernel_size = kernel_size
self.n_layers = n_layers
self.num_bins = num_bins
self.tail_bound = tail_bound
self.half_channels = in_channels // 2
self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
self.proj.weight.data.zero_()
self.proj.bias.data.zero_()
def forward(self, x, x_mask, g=None, reverse=False):
x0, x1 = torch.split(x, [self.half_channels]*2, 1)
h = self.pre(x0)
h = self.convs(h, x_mask, g=g)
h = self.proj(h) * x_mask
b, c, t = x0.shape
h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
unnormalized_derivatives = h[..., 2 * self.num_bins:]
x1, logabsdet = piecewise_rational_quadratic_transform(x1,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=reverse,
tails='linear',
tail_bound=self.tail_bound
)
x = torch.cat([x0, x1], 1) * x_mask
logdet = torch.sum(logabsdet * x_mask, [1,2])
if not reverse:
return x, logdet
else:
return x


@ -0,0 +1,19 @@
import numpy as np
import torch
from .monotonic_align.core import maximum_path_c
def maximum_path(neg_cent, mask):
""" Cython optimized version.
neg_cent: [b, t_t, t_s]
mask: [b, t_t, t_s]
"""
device = neg_cent.device
dtype = neg_cent.dtype
neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
path = np.zeros(neg_cent.shape, dtype=np.int32)
t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
maximum_path_c(path, neg_cent, t_t_max, t_s_max)
return torch.from_numpy(path).to(device=device, dtype=dtype)
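
A small sketch of calling the wrapper above, assuming the package containing it is importable as monotonic_align and its Cython extension (core) has been compiled. The shapes follow the docstring; the sizes are arbitrary, with the time axis at least as long as the text axis so a monotonic path exists.
import torch
from monotonic_align import maximum_path   # assumed import path; extension must be built first
b, t_t, t_s = 2, 7, 5
neg_cent = torch.randn(b, t_t, t_s)        # negative cross-entropy scores
mask = torch.ones(b, t_t, t_s)             # all positions valid in this toy example
path = maximum_path(neg_cent, mask)        # hard 0/1 monotonic alignment, same shape
print(path.shape)                          # torch.Size([2, 7, 5])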


@ -0,0 +1,42 @@
cimport cython
from cython.parallel import prange
@cython.boundscheck(False)
@cython.wraparound(False)
cdef void maximum_path_each(int[:,::1] path, float[:,::1] value, int t_y, int t_x, float max_neg_val=-1e9) nogil:
cdef int x
cdef int y
cdef float v_prev
cdef float v_cur
cdef float tmp
cdef int index = t_x - 1
for y in range(t_y):
for x in range(max(0, t_x + y - t_y), min(t_x, y + 1)):
if x == y:
v_cur = max_neg_val
else:
v_cur = value[y-1, x]
if x == 0:
if y == 0:
v_prev = 0.
else:
v_prev = max_neg_val
else:
v_prev = value[y-1, x-1]
value[y, x] += max(v_prev, v_cur)
for y in range(t_y - 1, -1, -1):
path[y, index] = 1
if index != 0 and (index == y or value[y-1, index] < value[y-1, index-1]):
index = index - 1
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef void maximum_path_c(int[:,:,::1] paths, float[:,:,::1] values, int[::1] t_ys, int[::1] t_xs) nogil:
cdef int b = paths.shape[0]
cdef int i
for i in prange(b, nogil=True):
maximum_path_each(paths[i], values[i], t_ys[i], t_xs[i])


@ -0,0 +1,23 @@
import numpy as np
import torch
import sys
print(sys.path)
sys.path.append("/backend/mod/")
print(sys.path)
from .monotonic_align.core import maximum_path_c
def maximum_path(neg_cent, mask):
""" Cython optimized version.
neg_cent: [b, t_t, t_s]
mask: [b, t_t, t_s]
"""
device = neg_cent.device
dtype = neg_cent.dtype
neg_cent = neg_cent.data.cpu().numpy().astype(np.float32)
path = np.zeros(neg_cent.shape, dtype=np.int32)
t_t_max = mask.sum(1)[:, 0].data.cpu().numpy().astype(np.int32)
t_s_max = mask.sum(2)[:, 0].data.cpu().numpy().astype(np.int32)
maximum_path_c(path, neg_cent, t_t_max, t_s_max)
return torch.from_numpy(path).to(device=device, dtype=dtype)

54
demo/mod/text/__init__.py Executable file

@ -0,0 +1,54 @@
""" from https://github.com/keithito/tacotron """
from text import cleaners
from text.symbols import symbols
# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
def text_to_sequence(text, cleaner_names):
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
Args:
text: string to convert to a sequence
cleaner_names: names of the cleaner functions to run the text through
Returns:
List of integers corresponding to the symbols in the text
'''
sequence = []
clean_text = _clean_text(text, cleaner_names)
for symbol in clean_text:
symbol_id = _symbol_to_id[symbol]
sequence += [symbol_id]
return sequence
def cleaned_text_to_sequence(cleaned_text):
'''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
Args:
text: string to convert to a sequence
Returns:
List of integers corresponding to the symbols in the text
'''
sequence = [_symbol_to_id[symbol] for symbol in cleaned_text]
return sequence
def sequence_to_text(sequence):
'''Converts a sequence of IDs back to a string'''
result = ''
for symbol_id in sequence:
s = _id_to_symbol[symbol_id]
result += s
return result
def _clean_text(text, cleaner_names):
for name in cleaner_names:
cleaner = getattr(cleaners, name)
if not cleaner:
raise Exception('Unknown cleaner: %s' % name)
text = cleaner(text)
return text
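
A quick usage sketch, assuming demo/mod is on sys.path so this package imports as text. With the japanese_cleaners defined in cleaners.py, the input is a hyphen-separated phoneme string and every phoneme must appear in the symbols list of symbols.py:
from text import text_to_sequence, sequence_to_text
seq = text_to_sequence("k-o-N-n-i-ch-i-w-a", ["japanese_cleaners"])
print(seq)                       # one integer ID per phoneme
print(sequence_to_text(seq))     # "koNnichiwa" - IDs mapped back and concatenated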

105
demo/mod/text/cleaners.py Executable file

@ -0,0 +1,105 @@
""" The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d. """
""" from https://github.com/keithito/tacotron """
'''
Cleaners are transformations that run over the input text at both training and eval time.
Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
1. "english_cleaners" for English text
2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
the Unidecode library (https://pypi.python.org/pypi/Unidecode)
3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
the symbols in symbols.py to match your data).
'''
import re
from unidecode import unidecode
from phonemizer import phonemize
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
('mrs', 'misess'),
('mr', 'mister'),
('dr', 'doctor'),
('st', 'saint'),
('co', 'company'),
('jr', 'junior'),
('maj', 'major'),
('gen', 'general'),
('drs', 'doctors'),
('rev', 'reverend'),
('lt', 'lieutenant'),
('hon', 'honorable'),
('sgt', 'sergeant'),
('capt', 'captain'),
('esq', 'esquire'),
('ltd', 'limited'),
('col', 'colonel'),
('ft', 'fort'),
]]
def expand_abbreviations(text):
for regex, replacement in _abbreviations:
text = re.sub(regex, replacement, text)
return text
def expand_numbers(text):
return normalize_numbers(text)
def lowercase(text):
return text.lower()
def collapse_whitespace(text):
return re.sub(_whitespace_re, ' ', text)
def convert_to_ascii(text):
return unidecode(text)
def basic_cleaners(text):
'''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
text = lowercase(text)
text = collapse_whitespace(text)
return text
def transliteration_cleaners(text):
'''Pipeline for non-English text that transliterates to ASCII.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = collapse_whitespace(text)
return text
def english_cleaners(text):
'''Pipeline for English text, including abbreviation expansion.'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_abbreviations(text)
phonemes = phonemize(text, language='en-us', backend='espeak', strip=True)
phonemes = collapse_whitespace(phonemes)
return phonemes
def english_cleaners2(text):
'''Pipeline for English text, including abbreviation expansion. + punctuation + stress'''
text = convert_to_ascii(text)
text = lowercase(text)
text = expand_abbreviations(text)
phonemes = phonemize(text, language='en-us', backend='espeak', strip=True, preserve_punctuation=True, with_stress=True)
phonemes = collapse_whitespace(phonemes)
return phonemes
def japanese_cleaners(text):
phonemes = text.split('-')
return phonemes
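
For illustration, the non-phonemizer pipelines above behave as follows (english_cleaners / english_cleaners2 additionally need phonemizer's espeak backend installed); the import assumes demo/mod is on sys.path:
from text import cleaners
print(cleaners.basic_cleaners("Hello,   WORLD"))        # "hello, world"  (lowercased, whitespace collapsed)
print(cleaners.transliteration_cleaners("Ça va déjà"))  # "ca va deja"    (ASCII-transliterated via Unidecode)
print(cleaners.japanese_cleaners("k-o-N"))              # ['k', 'o', 'N'] (simply splits on '-')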

64
demo/mod/text/symbols.py Executable file

@ -0,0 +1,64 @@
""" The following information was added with reference to https://github.com/jaywalnut310/vits/tree/1eef52ed50743f77fca9ff6773ba673497f6bf9d """
""" from https://github.com/keithito/tacotron """
'''
Defines the set of symbols used in text input to the model.
'''
_pad = '_'
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'"
# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
# Special symbol ids
SPACE_ID = symbols.index(" ")
symbols = [
"A",
"E",
"I",
"N",
"O",
"U",
"a",
"b",
"by",
"ch",
"cl",
"d",
"dy",
"e",
"f",
"g",
"gy",
"h",
"hy",
"i",
"j",
"k",
"ky",
"m",
"my",
"n",
"ny",
"o",
"p",
"py",
"r",
"ry",
"s",
"sh",
"t",
"ts",
"ty",
"u",
"v",
"w",
"y",
"z",
"pau",
"sil"
]

193
demo/mod/transforms.py Executable file

@ -0,0 +1,193 @@
import torch
from torch.nn import functional as F
import numpy as np
DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3
def piecewise_rational_quadratic_transform(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails=None,
tail_bound=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
if tails is None:
spline_fn = rational_quadratic_spline
spline_kwargs = {}
else:
spline_fn = unconstrained_rational_quadratic_spline
spline_kwargs = {
'tails': tails,
'tail_bound': tail_bound
}
outputs, logabsdet = spline_fn(
inputs=inputs,
unnormalized_widths=unnormalized_widths,
unnormalized_heights=unnormalized_heights,
unnormalized_derivatives=unnormalized_derivatives,
inverse=inverse,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative,
**spline_kwargs
)
return outputs, logabsdet
def searchsorted(bin_locations, inputs, eps=1e-6):
bin_locations[..., -1] += eps
return torch.sum(
inputs[..., None] >= bin_locations,
dim=-1
) - 1
def unconstrained_rational_quadratic_spline(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
tails='linear',
tail_bound=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
outside_interval_mask = ~inside_interval_mask
outputs = torch.zeros_like(inputs)
logabsdet = torch.zeros_like(inputs)
if tails == 'linear':
unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
constant = np.log(np.exp(1 - min_derivative) - 1)
unnormalized_derivatives[..., 0] = constant
unnormalized_derivatives[..., -1] = constant
outputs[outside_interval_mask] = inputs[outside_interval_mask]
logabsdet[outside_interval_mask] = 0
else:
raise RuntimeError('{} tails are not implemented.'.format(tails))
outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
inputs=inputs[inside_interval_mask],
unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
inverse=inverse,
left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
min_bin_width=min_bin_width,
min_bin_height=min_bin_height,
min_derivative=min_derivative
)
return outputs, logabsdet
def rational_quadratic_spline(inputs,
unnormalized_widths,
unnormalized_heights,
unnormalized_derivatives,
inverse=False,
left=0., right=1., bottom=0., top=1.,
min_bin_width=DEFAULT_MIN_BIN_WIDTH,
min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
min_derivative=DEFAULT_MIN_DERIVATIVE):
if torch.min(inputs) < left or torch.max(inputs) > right:
raise ValueError('Input to a transform is not within its domain')
num_bins = unnormalized_widths.shape[-1]
if min_bin_width * num_bins > 1.0:
raise ValueError('Minimal bin width too large for the number of bins')
if min_bin_height * num_bins > 1.0:
raise ValueError('Minimal bin height too large for the number of bins')
widths = F.softmax(unnormalized_widths, dim=-1)
widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
cumwidths = torch.cumsum(widths, dim=-1)
cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
cumwidths = (right - left) * cumwidths + left
cumwidths[..., 0] = left
cumwidths[..., -1] = right
widths = cumwidths[..., 1:] - cumwidths[..., :-1]
derivatives = min_derivative + F.softplus(unnormalized_derivatives)
heights = F.softmax(unnormalized_heights, dim=-1)
heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
cumheights = torch.cumsum(heights, dim=-1)
cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
cumheights = (top - bottom) * cumheights + bottom
cumheights[..., 0] = bottom
cumheights[..., -1] = top
heights = cumheights[..., 1:] - cumheights[..., :-1]
if inverse:
bin_idx = searchsorted(cumheights, inputs)[..., None]
else:
bin_idx = searchsorted(cumwidths, inputs)[..., None]
input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
delta = heights / widths
input_delta = delta.gather(-1, bin_idx)[..., 0]
input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
input_heights = heights.gather(-1, bin_idx)[..., 0]
if inverse:
a = (((inputs - input_cumheights) * (input_derivatives
+ input_derivatives_plus_one
- 2 * input_delta)
+ input_heights * (input_delta - input_derivatives)))
b = (input_heights * input_derivatives
- (inputs - input_cumheights) * (input_derivatives
+ input_derivatives_plus_one
- 2 * input_delta))
c = - input_delta * (inputs - input_cumheights)
discriminant = b.pow(2) - 4 * a * c
assert (discriminant >= 0).all()
root = (2 * c) / (-b - torch.sqrt(discriminant))
outputs = root * input_bin_widths + input_cumwidths
theta_one_minus_theta = root * (1 - root)
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta)
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - root).pow(2))
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, -logabsdet
else:
theta = (inputs - input_cumwidths) / input_bin_widths
theta_one_minus_theta = theta * (1 - theta)
numerator = input_heights * (input_delta * theta.pow(2)
+ input_derivatives * theta_one_minus_theta)
denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
* theta_one_minus_theta)
outputs = input_cumheights + numerator / denominator
derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
+ 2 * input_delta * theta_one_minus_theta
+ input_derivatives * (1 - theta).pow(2))
logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
return outputs, logabsdet
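
For orientation, the forward branch of rational_quadratic_spline above implements the monotonic rational-quadratic spline of Durkan et al. (Neural Spline Flows). With w_k, h_k the selected bin's width and height, s_k = h_k / w_k (input_delta in the code), d_k, d_{k+1} the knot derivatives and \theta the relative position in the bin, it computes approximately:
\theta = \frac{x - x_k}{w_k}, \qquad s_k = \frac{h_k}{w_k}
y = y_k + \frac{h_k\,\bigl(s_k \theta^2 + d_k\,\theta(1-\theta)\bigr)}{s_k + (d_{k+1} + d_k - 2 s_k)\,\theta(1-\theta)}
\log\Bigl|\tfrac{dy}{dx}\Bigr| = \log\Bigl(s_k^2\,\bigl(d_{k+1}\theta^2 + 2 s_k\,\theta(1-\theta) + d_k (1-\theta)^2\bigr)\Bigr) - 2\,\log\Bigl(s_k + (d_{k+1}+d_k-2 s_k)\,\theta(1-\theta)\Bigr)
The inverse branch solves the corresponding quadratic in \theta, which is exactly the a, b, c assembled in the `if inverse:` block.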

270
demo/mod/utils.py Executable file

@ -0,0 +1,270 @@
import os
import glob
import sys
import argparse
import logging
import json
import subprocess
import numpy as np
from scipy.io.wavfile import read
import torch
MATPLOTLIB_FLAG = False
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging
def load_checkpoint(checkpoint_path, model, optimizer=None):
assert os.path.isfile(checkpoint_path), f"No such file or directory: {checkpoint_path}"
checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
iteration = checkpoint_dict['iteration']
learning_rate = checkpoint_dict['learning_rate']
if optimizer is not None:
optimizer.load_state_dict(checkpoint_dict['optimizer'])
saved_state_dict = checkpoint_dict['model']
if hasattr(model, 'module'):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
new_state_dict= {}
for k, v in state_dict.items():
try:
new_state_dict[k] = saved_state_dict[k]
except:
logger.info("%s is not in the checkpoint" % k)
new_state_dict[k] = v
if hasattr(model, 'module'):
model.module.load_state_dict(new_state_dict)
else:
model.load_state_dict(new_state_dict)
logger.info("Loaded checkpoint '{}' (iteration {})" .format(
checkpoint_path, iteration))
return model, optimizer, learning_rate, iteration
def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
logger.info("Saving model and optimizer state at iteration {} to {}".format(
iteration, checkpoint_path))
if hasattr(model, 'module'):
state_dict = model.module.state_dict()
else:
state_dict = model.state_dict()
torch.save({'model': state_dict,
'iteration': iteration,
'optimizer': optimizer.state_dict(),
'learning_rate': learning_rate}, checkpoint_path)
def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
for k, v in scalars.items():
writer.add_scalar(k, v, global_step)
for k, v in histograms.items():
writer.add_histogram(k, v, global_step)
for k, v in images.items():
writer.add_image(k, v, global_step, dataformats='HWC')
for k, v in audios.items():
writer.add_audio(k, v, global_step, audio_sampling_rate)
def latest_checkpoint_path(dir_path, regex="G_*.pth"):
f_list = glob.glob(os.path.join(dir_path, regex))
f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
x = f_list[-1]
print(x)
return x
def plot_spectrogram_to_numpy(spectrogram):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(10,2))
im = ax.imshow(spectrogram, aspect="auto", origin="lower",
interpolation='none')
plt.colorbar(im, ax=ax)
plt.xlabel("Frames")
plt.ylabel("Channels")
plt.tight_layout()
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def plot_alignment_to_numpy(alignment, info=None):
global MATPLOTLIB_FLAG
if not MATPLOTLIB_FLAG:
import matplotlib
matplotlib.use("Agg")
MATPLOTLIB_FLAG = True
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.WARNING)
import matplotlib.pylab as plt
import numpy as np
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
interpolation='none')
fig.colorbar(im, ax=ax)
xlabel = 'Decoder timestep'
if info is not None:
xlabel += '\n\n' + info
plt.xlabel(xlabel)
plt.ylabel('Encoder timestep')
plt.tight_layout()
fig.canvas.draw()
data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
plt.close()
return data
def load_wav_to_torch(full_path):
sampling_rate, data = read(full_path) # scipy.io.wavfile
return torch.FloatTensor(data.astype(np.float32)), sampling_rate
def load_filepaths_and_text(filename, split="|"):
with open(filename, encoding='utf-8') as f:
filepaths_and_text = [line.strip().split(split) for line in f]
return filepaths_and_text
def get_hparams(init=True):
parser = argparse.ArgumentParser()
parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
help='JSON file for configuration')
parser.add_argument('-m', '--model', type=str, required=True,
help='Model name')
parser.add_argument('-fg', '--fine_tuning_g', type=str, default=None,
help='If fine tuning, please specify model(G)')
parser.add_argument('-fd', '--fine_tuning_d', type=str, default=None,
help='If fine tuning, please specify model(D)')
args = parser.parse_args()
model_dir = os.path.join("./logs", args.model)
if not os.path.exists(model_dir):
os.makedirs(model_dir)
config_path = args.config
config_save_path = os.path.join(model_dir, "config.json")
if init:
with open(config_path, "r") as f:
data = f.read()
with open(config_save_path, "w") as f:
f.write(data)
else:
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
#Added about fine tuning
if args.fine_tuning_g != None and args.fine_tuning_d != None:
config['fine_flag'] = True
config['fine_model_g'] = args.fine_tuning_g
config['fine_model_d'] = args.fine_tuning_d
else:
config['fine_flag'] = False
hparams = HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_dir(model_dir):
config_save_path = os.path.join(model_dir, "config.json")
with open(config_save_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams =HParams(**config)
hparams.model_dir = model_dir
return hparams
def get_hparams_from_file(config_path):
with open(config_path, "r") as f:
data = f.read()
config = json.loads(data)
hparams =HParams(**config)
return hparams
def check_git_hash(model_dir):
source_dir = os.path.dirname(os.path.realpath(__file__))
if not os.path.exists(os.path.join(source_dir, ".git")):
logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
source_dir
))
return
cur_hash = subprocess.getoutput("git rev-parse HEAD")
path = os.path.join(model_dir, "githash")
if os.path.exists(path):
saved_hash = open(path).read()
if saved_hash != cur_hash:
logger.warn("git hash values are different. {}(saved) != {}(current)".format(
saved_hash[:8], cur_hash[:8]))
else:
open(path, "w").write(cur_hash)
def get_logger(model_dir, filename="train.log"):
global logger
logger = logging.getLogger(os.path.basename(model_dir))
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
if not os.path.exists(model_dir):
os.makedirs(model_dir)
h = logging.FileHandler(os.path.join(model_dir, filename))
h.setLevel(logging.DEBUG)
h.setFormatter(formatter)
logger.addHandler(h)
return logger
class HParams():
def __init__(self, **kwargs):
for k, v in kwargs.items():
if type(v) == dict:
v = HParams(**v)
self[k] = v
def keys(self):
return self.__dict__.keys()
def items(self):
return self.__dict__.items()
def values(self):
return self.__dict__.values()
def __len__(self):
return len(self.__dict__)
def __getitem__(self, key):
return getattr(self, key)
def __setitem__(self, key, value):
return setattr(self, key, value)
def __contains__(self, key):
return key in self.__dict__
def __repr__(self):
return self.__dict__.__repr__()
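
A small sketch of how the HParams container above is consumed elsewhere in the demo (nested dicts become attribute-accessible, e.g. hps.data.filter_length); the keys and values here are illustrative, not the real training config:
hps = HParams(**{
    "data":  {"filter_length": 1024, "hop_length": 256, "n_speakers": 110},
    "train": {"segment_size": 8192},
})
print(hps.data.filter_length // 2 + 1)   # 513 - nested dicts are wrapped as HParams
print("train" in hps, len(hps))          # True 2
hps.model_dir = "./logs/example"         # extra attributes can be attached later (as get_hparams does)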

85
demo/requirements.txt Normal file

@ -0,0 +1,85 @@
absl-py==1.2.0
appdirs==1.4.4
attrs==22.1.0
audioread==3.0.0
Babel==2.10.3
bidict==0.22.0
cachetools==5.2.0
certifi==2022.6.15
cffi==1.15.1
charset-normalizer==2.1.1
clldutils==3.12.0
colorama==0.4.5
colorlog==6.6.0
csvw==3.1.1
cycler==0.11.0
Cython==0.29.32
decorator==5.1.1
dlinfo==1.2.1
dnspython==2.2.1
eventlet==0.33.1
fonttools==4.36.0
google-auth==2.11.0
google-auth-oauthlib==0.4.6
greenlet==1.1.2
grpcio==1.47.0
idna==3.3
importlib-metadata==4.12.0
isodate==0.6.1
joblib==1.1.0
jsonschema==4.14.0
kiwisolver==1.4.4
language-tags==1.1.0
librosa==0.9.2
llvmlite==0.39.0
Markdown==3.4.1
MarkupSafe==2.1.1
matplotlib==3.5.3
numba==0.56.0
numpy==1.22.4
oauthlib==3.2.0
packaging==21.3
phonemizer==3.2.1
Pillow==9.2.0
pooch==1.6.0
protobuf==3.19.4
psutil==5.9.1
py==1.11.0
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycparser==2.21
pyopenjtalk==0.2.0
pyparsing==3.0.9
pyrsistent==0.18.1
python-dateutil==2.8.2
python-engineio==4.3.4
python-socketio==5.7.1
pytz==2022.2.1
rdflib==6.2.0
regex==2022.8.17
requests==2.28.1
requests-oauthlib==1.3.1
resampy==0.4.0
retry==0.9.2
rfc3986==1.5.0
rsa==4.9
scikit-learn==1.1.2
scipy==1.9.0
segments==2.2.1
six==1.16.0
SoundFile==0.10.3.post1
tabulate==0.8.10
tensorboard==2.10.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
threadpoolctl==3.1.0
torch==1.12.1+cu113
torchaudio==0.12.1+cu113
torchvision==0.13.1+cu113
tqdm==4.64.0
typing-extensions==4.3.0
Unidecode==1.3.4
uritemplate==4.1.1
urllib3==1.26.11
Werkzeug==2.2.2
zipp==3.8.1

136
demo/serverFlask.py Executable file

@ -0,0 +1,136 @@
from flask import Flask, request, Markup, abort, jsonify
from flask_cors import CORS
import logging
from logging.config import dictConfig
import sys
import base64
import torch
import numpy as np
from scipy.io.wavfile import write, read
from datetime import datetime
import traceback
import struct
sys.path.append("mod")
sys.path.append("mod/text")
import utils
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
dictConfig({
'version': 1,
'formatters': {'default': {
'format': '[%(asctime)s] %(levelname)s in %(module)s: %(message)s',
}},
'handlers': {'wsgi': {
'class': 'logging.StreamHandler',
'stream': 'ext://flask.logging.wsgi_errors_stream',
'formatter': 'default'
}},
'root': {
'level': 'INFO',
'handlers': ['wsgi']
}
})
app = Flask(__name__, static_folder="../frontend/dist", static_url_path='/')
CORS(app, resources={r"/*": {"origins": "*"}})
class VoiceChanger():
def __init__(self, config, model):
self.hps =utils.get_hparams_from_file(config)
self.net_g = SynthesizerTrn(
len(symbols),
self.hps.data.filter_length // 2 + 1,
self.hps.train.segment_size // self.hps.data.hop_length,
n_speakers=self.hps.data.n_speakers,
**self.hps.model)
self.net_g.eval()
self.gpu_num = torch.cuda.device_count()
print("GPU_NUM:",self.gpu_num)
utils.load_checkpoint( model, self.net_g, None)
def on_request(self, gpu, srcId, dstId, timestamp, wav):
if wav==0:
samplerate, data=read("dummy.wav")
unpackedData = data
else:
unpackedData = np.array(struct.unpack('<%sh'%(len(wav) // struct.calcsize('<h') ), wav))
write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))
try:
if gpu<0 or self.gpu_num==0 :
with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
else:
with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
except Exception as e:
print("VC PROCESSING!!!! EXCEPTION!!!", e)
print(traceback.format_exc())
audio1 = audio1.astype(np.int16)
return audio1
@app.route('/test', methods=['GET', 'POST'])
def test():
try:
if request.method == 'GET':
return request.args.get('query', '')
elif request.method == 'POST':
print("POST REQUEST PROCESSING....")
gpu = int(request.json['gpu'])
srcId = int(request.json['srcId'])
dstId = int(request.json['dstId'])
timestamp = int(request.json['timestamp'])
buffer = request.json['buffer']
wav = base64.b64decode(buffer)
# print(wav)
# print(base64.b64encode(wav))
changedVoice = voiceChanger.on_request(gpu, srcId, dstId, timestamp, wav)
changedVoiceBase64 = base64.b64encode(changedVoice).decode('utf-8')
# print("changedVoice",changedVoice)
# print("CV64",changedVoiceBase64)
data = {
"gpu":gpu,
"srcId":srcId,
"dstId":dstId,
"timestamp":timestamp,
"changedVoiceBase64":changedVoiceBase64
}
return jsonify(data)
else:
return abort(400)
except Exception as e:
print("REQUEST PROCESSING!!!! EXCEPTION!!!", e)
print(traceback.format_exc())
return str(e)
if __name__ == '__main__':
args = sys.argv
PORT = args[1]
CONFIG = args[2]
MODEL = args[3]
app.logger.info('INITIALIZE MODEL')
voiceChanger = VoiceChanger(CONFIG, MODEL)
voiceChanger.on_request(0,0,0,0,0)
app.logger.info('START APP')
app.run(debug=True, host='0.0.0.0',port=PORT)
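
A hedged sketch of a client calling the /test endpoint above. The JSON field names mirror what test() reads from request.json and what it returns; the host and port match setupFlask.sh, while the input file, the requests dependency and the speaker IDs are assumptions for illustration:
import base64, struct
import requests                                   # client-side dependency, assumed installed
from scipy.io.wavfile import read
sr, data = read("input.wav")                      # hypothetical mono 16-bit recording
pcm = struct.pack('<%sh' % len(data), *data)      # little-endian int16, as the server unpacks it
payload = {
    "gpu": -1,                                    # < 0 selects the CPU path in on_request
    "srcId": 107,
    "dstId": 100,
    "timestamp": 0,
    "buffer": base64.b64encode(pcm).decode('utf-8'),
}
res = requests.post("http://localhost:8080/test", json=payload).json()
converted = base64.b64decode(res["changedVoiceBase64"])   # int16 PCM of the converted voice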

96
demo/serverSIO.py Executable file

@ -0,0 +1,96 @@
import eventlet
import socketio
import sys
from datetime import datetime
import struct
import torch
import numpy as np
from scipy.io.wavfile import write
sys.path.append("mod")
sys.path.append("mod/text")
import utils
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
class MyCustomNamespace(socketio.Namespace):
def __init__(self, namespace, config, model):
super().__init__(namespace)
self.hps =utils.get_hparams_from_file(config)
self.net_g = SynthesizerTrn(
len(symbols),
self.hps.data.filter_length // 2 + 1,
self.hps.train.segment_size // self.hps.data.hop_length,
n_speakers=self.hps.data.n_speakers,
**self.hps.model)
self.net_g.eval()
self.gpu_num = torch.cuda.device_count()
print("GPU_NUM:",self.gpu_num)
utils.load_checkpoint( model, self.net_g, None)
def on_connect(self, sid, environ):
print('[{}] connect sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S') , sid))
# print('[{}] connect env : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S') , environ))
def on_request_message(self, sid, msg):
# print("MESSGaa", msg)
gpu = int(msg[0])
srcId = int(msg[1])
dstId = int(msg[2])
timestamp = int(msg[3])
data = msg[4]
# print(srcId, dstId, timestamp)
unpackedData = np.array(struct.unpack('<%sh'%(len(data) // struct.calcsize('<h') ), data))
write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))
# self.emit('response', msg)
if gpu<0 or self.gpu_num==0 :
with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
else:
with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
audio1 = audio1.astype(np.int16)
bin = struct.pack('<%sh'%len(audio1), *audio1)
# print("return timestamp", timestamp)
self.emit('response',[timestamp, bin])
def on_disconnect(self, sid):
# print('[{}] disconnect'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
pass;
if __name__ == '__main__':
args = sys.argv
PORT = args[1]
CONFIG = args[2]
MODEL = args[3]
print(f"start... PORT:{PORT}, CONFIG:{CONFIG}, MODEL:{MODEL}")
# sio = socketio.Server(cors_allowed_origins='http://localhost:8080')
sio = socketio.Server(cors_allowed_origins='*')
sio.register_namespace(MyCustomNamespace('/test', CONFIG, MODEL))
app = socketio.WSGIApp(sio,static_files={
'': '../frontend/dist',
})
eventlet.wsgi.server(eventlet.listen(('0.0.0.0',int(PORT))), app)
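
For completeness, a rough client-side sketch of the Socket.IO protocol served above: the client emits request_message with [gpu, srcId, dstId, timestamp, pcm_bytes] on the /test namespace and gets response back as [timestamp, pcm_bytes]. The python-socketio client usage, file names and 24 kHz output rate are illustrative assumptions:
import struct
import numpy as np
import socketio                                   # python-socketio client, assumed installed
from scipy.io.wavfile import read, write
sio = socketio.Client()
@sio.on('response', namespace='/test')
def on_response(msg):
    timestamp, bin_data = msg
    audio = np.frombuffer(bin_data, dtype=np.int16)
    write("converted.wav", 24000, audio)          # assumed output sample rate
sio.connect("http://localhost:8080", namespaces=['/test'])
sr, data = read("input.wav")                      # hypothetical mono 16-bit recording
pcm = struct.pack('<%sh' % len(data), *data)
sio.emit('request_message', [0, 107, 100, 0, pcm], namespace='/test')
sio.wait()                                        # keep the client alive until interrupted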

13
demo/setup.sh Executable file

@ -0,0 +1,13 @@
#!/bin/bash
echo config: $1
echo model: $2
cp -r /resources/* .
if [[ -e ./setting.json ]]; then
cp ./setting.json ../frontend/dist/assets/setting.json
fi
python3 serverSIO.py 8080 $1 $2

14
demo/setupFlask.sh Executable file

@ -0,0 +1,14 @@
#!/bin/bash
echo config: $1
echo model: $2
cp -r /resources/* .
if [[ -e ./setting.json ]]; then
cp ./setting.json ../frontend/dist/assets/setting.json
fi
pip install flask
pip install flask_cors
python3 serverFlask.py 8080 $1 $2

5
demo/start.sh Executable file

@ -0,0 +1,5 @@
#!/bin/bash
# python3 serverSIO.py 8080 resources/train_config_zundamon.json resources/G_94000.pth
# python3 serverSIO.py 8080 resources/train_config_zundamon.json resources/G_164000.pth
python3 serverSIO.py 8080 resources/train_config_zundamon.json resources/G_210000.pth

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-file-text"><path d="M14 2H6a2 2 0 0 0-2 2v16a2 2 0 0 0 2 2h12a2 2 0 0 0 2-2V8z"></path><polyline points="14 2 14 8 20 8"></polyline><line x1="16" y1="13" x2="8" y2="13"></line><line x1="16" y1="17" x2="8" y2="17"></line><polyline points="10 9 9 9 8 9"></polyline></svg>


BIN
docs/assets/icons/flect.png Executable file


1
docs/assets/icons/github.svg Executable file

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-github"><path d="M9 19c-5 1.5-5-2.5-7-3m14 6v-3.87a3.37 3.37 0 0 0-.94-2.61c3.14-.35 6.44-1.54 6.44-7A5.44 5.44 0 0 0 20 4.77 5.07 5.07 0 0 0 19.91 1S18.73.65 16 2.48a13.38 13.38 0 0 0-7 0C6.27.65 5.09 1 5.09 1A5.07 5.07 0 0 0 5 4.77a5.44 5.44 0 0 0-1.5 3.78c0 5.42 3.3 6.61 6.44 7A3.37 3.37 0 0 0 9 18.13V22"></path></svg>


1
docs/assets/icons/home.svg Executable file

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-home"><path d="M3 9l9-7 9 7v11a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2z"></path><polyline points="9 22 9 12 15 12 15 22"></polyline></svg>

1
docs/assets/icons/linkedin.svg Executable file
View File

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-linkedin"><path d="M16 8a6 6 0 0 1 6 6v7h-4v-7a2 2 0 0 0-2-2 2 2 0 0 0-2 2v7h-4v-7a6 6 0 0 1 6-6z"></path><rect x="2" y="9" width="4" height="12"></rect><circle cx="4" cy="4" r="2"></circle></svg>

1
docs/assets/icons/twitter.svg Executable file
View File

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="#000000" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-twitter"><path d="M23 3a10.9 10.9 0 0 1-3.14 1.53 4.48 4.48 0 0 0-7.86 3v1A10.66 10.66 0 0 1 3 4s-4 9 5 13a11.64 11.64 0 0 1-7 2c9 5 20 0 20-11.5a4.5 4.5 0 0 0-.08-.83A7.72 7.72 0 0 0 23 3z"></path></svg>

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

35
docs/assets/setting.json Executable file
View File

@ -0,0 +1,35 @@
{
    "app_title": "voice-changer",
    "majar_mode": "docker",
    "voice_changer_server_url": "http://localhost:8080/test",
    "sample_rate": 48000,
    "buffer_size": 1024,
    "prefix_chunk_size": 24,
    "chunk_size": 24,
    "speaker_ids": [100, 107, 101, 102, 103],
    "speaker_names": ["ずんだもん", "user", "そら", "めたん", "つぐみ"],
    "src_id": 107,
    "dst_id": 100,
    "vf_enable": true,
    "voice_changer_mode": "realtime",
    "gpu": 0,
    "available_gpus": [-1, 0, 1, 2, 3, 4],
    "avatar": {
        "motion_capture_face": true,
        "motion_capture_upperbody": true,
        "lip_overwrite_with_voice": true,
        "avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
        "backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
        "background_color": "#0000dd",
        "chroma_key": "#0000dd",
        "avatar_canvas_size": [1280, 720],
        "screen_canvas_size": [1280, 720]
    },
    "advance": {
        "avatar_draw_skip_rate": 3,
        "screen_draw_skip_rate": 3,
        "visualizer_draw_skip_rate": 3,
        "cross_fade_lower_value": 0.1,
        "cross_fade_overlap_rate": 0.03
    }
}
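
As a small illustration, separate from the repository files, a few relationships implied by this configuration can be sanity-checked when it is loaded. The field names follow the JSON above; the reading of -1 as "run on CPU" is an assumption.

# Minimal sketch: load docs/assets/setting.json and sanity-check a few fields.
import json

with open('docs/assets/setting.json', encoding='utf-8') as f:
    cfg = json.load(f)

assert cfg['src_id'] in cfg['speaker_ids']                    # 107 -> "user"
assert cfg['dst_id'] in cfg['speaker_ids']                    # 100 -> "ずんだもん"
assert cfg['gpu'] in cfg['available_gpus']                    # -1 presumably means CPU (assumption)
assert len(cfg['speaker_ids']) == len(cfg['speaker_names'])   # ids and names must line up
print(cfg['voice_changer_server_url'], cfg['sample_rate'], 'Hz')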

Binary file not shown.

View File

@ -0,0 +1,75 @@
Thank you very much for purchasing the "Zundamon (humanoid)" model data set.
The Zundamon (humanoid) model data is provided in the following formats.
- Data for VRChat (PC/Oculus compatible)
  Unity editing data (.unitypackage) with the programs and materials required for VRChat set up; the PC version also has DynamicBone configured.
- MMD data
  Model data (.pmx) with rigid bodies, joints, materials, etc. already configured.
- VRM data
  Model data (.vrm) usable on platforms that support VRM. SpringBone, colliders, materials, etc. are already configured.
- FBX data
  The fbx data used to create the formats above. Use it as needed, for example for game development.
- blender data
  The model data (.blender) used to create the FBX data. Use it as needed.
- PSD data
  A PSD file that includes the UV map layers. Use it when modifying the model textures.

The texture data (.png) for the model is bundled with each of the model data sets above.
Model data details =====================================
● Data for VRChat (PC/Oculus compatible)
 - To use the VRChat data, the authoring software "Unity 2019.4.31f1" is required separately.
 - It is for Avatar 3.0 only and cannot be used with Avatar 2.0.
 - The materials/shaders use "Unity-chan Toon Shader 2.0.8".
   That shader is not bundled with this model data, so please obtain it in advance.
 - The PC version has DynamicBone configured, but DynamicBone itself is not bundled.
   Please purchase and prepare it in advance.
 - Shape keys are included: mouth: 20, eyes: 12, eyebrows: 3, other: 1.
 - Both the PC and Oculus versions can switch facial expressions via hand signs.
 - Expressions not registered to a hand sign can be switched by editing in the Unity editor.
 - The PC version supports full-body tracking. Basically, attach trackers to the waist and both ankles.
 - In the PC version, costumes and accessories are separate parts and can be put on or taken off in Unity.
 - In the Oculus version, costumes and accessories cannot be removed due to the specification.
 - For details on importing into VRChat, see the separate "VRChat character setup manual".
● MMD data
 - To use the MMD data, MMD-compatible software such as "MikuMikuDance" or "MikuMikuMoving" is required separately.
 - Rigid bodies, joints, and IK are already configured.
 - Semi-standard bones (all-parent, groups, upper body 2, waist, shoulder cancel, arm twist, wrist twist, thumb 0, leg IK parent) are configured.
 - Costumes and accessories are separate parts; setting the "undress" parameter among the other morphs to 1 removes them.
● VRM data
 - To use the VRM data, an application that can handle VRM models is required separately.
 - The data was created with VRM SDK ver. 0.92.
 - SpringBone and SpringBoneCollider are already configured.
 - The shaders used are VRM MToon and the standard shader.
 - 20 BlendShapes are available in total: the basic A, I, U, E, O, Blink, Joy, Angly, Sorrow, Fun, plus the additional Wink_L, Wink_R, Star, Hachume, No_HightLight, Aozame, Hauu, Tear.
● FBX data
 - The FBX data version is 2020. Some applications may not be able to load it correctly.
 - When using the FBX data, always import the texture files at the same time.
 - Coordinate-system issues may occur when importing into an application (the Z axis is vertical and the Y axis is depth).
 - Data other than meshes, bones, and materials (lights, cameras, motions, etc.) is not included.
● Blender data
 - The Blender data version is 2.93. Blender versions earlier than 2.93 may not load it correctly.
 - Bone armatures other than MMD's are supported; MMD is not supported because its bone structure is special.
 - IK is not configured. Set up IK as needed.
 - The base body and each costume and accessory are separate objects, so they can be put on or taken off as you like.

● For the terms of use, prohibited acts, and disclaimers, see the separate "ずんだもん(人型)モデル利用規約.txt" (Zundamon (humanoid) model terms of use).
● Contact
For inquiries about defects, bugs, feedback, adjustments, commercial use, etc., please contact us via the website (https://zunko.jp).
● Update history
2021/12/31 First edition

Binary file not shown.

View File

@ -0,0 +1,68 @@
ずんだもん(人型) モデルデータ利用規約
3Dモデリング:絹井けい
販売元SSS LLC. (https://zunko.jp)※以下「当社」とします。
・本モデルを利用、もしくは改変した時点で本規約に同意したものとします。
・本ライセンスの内容は変更する場合があり、最新のものが適用されます。
・本ライセンスは日本語のものが正本とされ、翻訳と差異がある場合は常に日本語のものが優先されます。
・本モデルの二次創作物を利用することによって生じた何らかのトラブル・損失に対し、絹井けい、および当社は一切責任を負わないものとします。
・本モデルを改変した二次創作物の著作権法上の全ての権利は当社に帰属するものとします。
●利用規約●
本モデルでは基礎条項に加え、個別条項を元に以下の行為を許可します。
・二次的著作物の配布の許可
・成人向け表現(性的表現)の許可
・成人向け表現(暴力表現)の許可
・本モデルでの利用を目的とした衣装等を配布・頒布・販売する場合に限り、本モデルが含む以下のデータを流用することを許可します。ただし改変が著しく少ない場合を除きます。
1.本モデルを構成するボーン・ウェイト
2.モデルの素体パーツ「ずんだもん(人型)」メッシュデータ
・当モデルデータを営利目的で使用する場合、販売元であるSSS LLC.が運営する「東北ずん子」ウェブサイトにて記載されている「版権商用利用の手引き(https://zunko.jp/con_shoushi.html)」に従ってください。
 ただし以下の場合、事前受諾は不要です。
 1.印刷やディスクプレスなど、個人から依頼されたものを複製する場合
 2.動画や写真への映り込みなど、本モデルがメインコンテンツと判断されない場合
●お問合せ先●
ホームページhttps://zunko.jp/
版権商用利用の手引きhttps://zunko.jp/con_shoushi.html
EN:Zundamon (humanoid) Model Data Terms of Use
3D modeling: Kei Kinui
Distributed by: SSS LLC. (https://zunko.jp) hereinafter referred to as "the Company".
*By using or modifying this model, you agree to be bound by these terms.
*The content of this license is subject to change, and the latest version shall apply.
*The Japanese version of this license shall be the original, and if there are any differences between the Japanese version and the translated version, the Japanese version shall always take precedence.
*Kei Kinui and the Company shall not be held responsible for any problems or losses that may arise from the use of secondary works of this model.
*All rights under copyright law for secondary works modified from this model shall belong to the Company.
*Terms of Use*
In addition to the basic terms, this model permits the following actions based on the individual terms.
*Permission to distribute derivative works
*Permission for adult expression (sexual expression)
*Permission to use adult-oriented expressions (violent expressions)
*Permission is granted to use the following data included in this model only for the purpose of distributing or selling costumes, etc. for use in this model. However, this excludes cases where the modifications are extremely small.
1:The bones and weights that make up the model
2:Mesh data of the model's body part "Zundamon (humanoid)
*If you wish to use this model data for commercial purposes, please follow the "Guide to the Commercial Use of Copyrighted Material (https://zunko.jp/con_shoushi.html)" described on the "Tohoku Zunko" website operated by SSS LLC, the distributor.
 However, prior consent is not required in the following cases
 1:Reproduction of materials commissioned by individuals, such as printing or disc pressing.
 2:When this model is not judged to be the main content, such as reflection in videos or photos.
*contact us*
HomePage(https://zunko.jp)
Guide to the Commercial Use of Copyrighted Material(https://zunko.jp/con_shoushi.html)

1
docs/audiolet/index.js Executable file
View File

@ -0,0 +1 @@
(()=>{"use strict";class e extends AudioWorkletProcessor{initialized=!1;playBuffer=[];deltaChunkSize=24;bufferSize=1024;constructor(){super(),this.initialized=!0,this.port.onmessage=this.handleMessage.bind(this)}prevF32Data=null;handleMessage(e){if(e.data.deltaSize)return void(this.deltaChunkSize=e.data.deltaSize);const t=e.data.data,l=new Int16Array(t),n=new Float32Array(l.length);l.forEach(((e,t)=>{const l=e>=32768?-(65536-e)/32768:e/32767;n[t]=l}));let s=this.prevF32Data?this.prevF32Data.slice(this.prevF32Data.length-this.deltaChunkSize*this.bufferSize/2):null;const h=n.slice(n.length-this.deltaChunkSize*this.bufferSize*2/2,n.length-this.deltaChunkSize*this.bufferSize/2);if(s?.length!==h.length&&(s=null),s)for(let e=0;e<s.length;e++){let t=0;if(e<s.length/3)t=0;else if(e>s.length/3*2)t=1;else{const l=e-s.length/3;t=Math.min(l/(s.length/3),1)}const l=s[e]*(1-t),n=h[e]*t;h[e]=l+n}if(this.playBuffer.length>50)for(console.log("Buffer truncated");this.playBuffer.length>2;)this.playBuffer.shift();let i;for(let e=0;e<h.length;e++){const t=2*e%128;0===t&&(i=new Float32Array(128));const l=h[e],n=e+1<h.length?h[e+1]:h[e];i[t]=l,i[t+1]=(l+n)/2,i.length===t+2&&this.playBuffer.push(i)}this.prevF32Data=n}handleMessage_(e){const t=e.data.data,l=new Int16Array(t),n=new Float32Array(l.length);l.forEach(((e,t)=>{const l=e>=32768?-(65536-e)/32768:e/32767;n[t]=l}));let s=this.prevF32Data?this.prevF32Data.slice(this.prevF32Data.length/2):null;const h=n.slice(0,n.length/2);if(s?.length!==h.length&&(s=null),s)for(let e=0;e<s.length;e++){let t=0;if(e<s.length/3)t=0;else if(e>s.length/3*2)t=1;else{const l=e-s.length/3;t=Math.min(l/(s.length/100),1)}const l=s[e]*(1-t),n=h[e]*t;h[e]=l+n}if(this.playBuffer.length>100)for(console.log("Buffer truncated");this.playBuffer.length>2;)this.playBuffer.shift();let i;for(let e=0;e<h.length;e++){const t=2*e%128;0===t&&(i=new Float32Array(128));const l=h[e],n=e+1<h.length?h[e+1]:h[e];i[t]=l,i[t+1]=(l+n)/2,i.length===t+2&&this.playBuffer.push(i)}this.prevF32Data=n}process(e,t,l){if(!this.initialized)return console.log("worklet_process not ready"),!0;if(0===this.playBuffer.length)return console.log("no play buffer"),!0;const n=this.playBuffer.shift();return t[0][0].set(n),!0}}registerProcessor("voice-player-worklet-processor",e)})();
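
The minified worklet above is hard to read, so here is a rough Python restatement, separate from the repository files, of the three steps it appears to perform: int16-to-float conversion, a thirds-based crossfade between the tail of the previously received audio and a window of the newly received audio, and a naive 2x upsample (the worklet then slices the result into 128-sample frames for playback). Names and slice offsets are simplified and the playback-buffer bookkeeping is omitted.

# Rough, readable restatement of the minified AudioWorklet logic (not part of the repository).
import numpy as np

def int16_to_float32(pcm: np.ndarray) -> np.ndarray:
    # scale signed 16-bit samples into roughly [-1.0, 1.0]
    return pcm.astype(np.float32) / 32767.0

def crossfade(prev_tail: np.ndarray, new_seg: np.ndarray) -> np.ndarray:
    # gain is 0 over the first third, ramps linearly over the middle third,
    # and is 1 over the last third, mirroring the worklet's piecewise weight;
    # prev_tail and new_seg are assumed to have the same length
    n = len(new_seg)
    idx = np.arange(n)
    gain = np.clip((idx - n / 3) / (n / 3), 0.0, 1.0)
    return prev_tail * (1.0 - gain) + new_seg * gain

def upsample_2x(seg: np.ndarray) -> np.ndarray:
    # duplicate each sample, filling every second slot with the average of
    # neighbouring samples (the same simple linear interpolation the worklet uses)
    nxt = np.append(seg[1:], seg[-1])
    out = np.empty(2 * len(seg), dtype=np.float32)
    out[0::2] = seg
    out[1::2] = (seg + nxt) / 2.0
    return out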

BIN
docs/coffee.png Executable file

Binary file not shown.

BIN
docs/favicon.ico Executable file

Binary file not shown.

1
docs/index.html Executable file
View File

@ -0,0 +1 @@
<!doctype html><html lang="ja" style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>voice recorder</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div><noscript><strong>javascriptを有効にしてください</strong></noscript></body></html>

2
docs/index.js Executable file

File diff suppressed because one or more lines are too long

70
docs/index.js.LICENSE.txt Executable file
View File

@ -0,0 +1,70 @@
/*!
localForage -- Offline Storage, Improved
Version 1.10.0
https://localforage.github.io/localForage
(c) 2013-2017 Mozilla, Apache License 2.0
*/
/*!
* Font Awesome Free 6.1.2 by @fontawesome - https://fontawesome.com
* License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License)
* Copyright 2022 Fonticons, Inc.
*/
/*!
* The buffer module from node.js, for the browser.
*
* @author Feross Aboukhadijeh <https://feross.org>
* @license MIT
*/
/*! (c) 2019-2021 pixiv Inc. - https://github.com/pixiv/three-vrm/blob/release/LICENSE */
/*! *****************************************************************************
Copyright (c) Microsoft Corporation.
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
***************************************************************************** */
/*! For license information please see index.js.LICENSE.txt */
/*! ieee754. BSD-3-Clause License. Feross Aboukhadijeh <https://feross.org/opensource> */
/**
* @license React
* react-dom.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* react.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
/**
* @license React
* scheduler.production.min.js
*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/

1687
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -5,6 +5,9 @@
"main": "index.js",
"scripts": {
"build:docker": "date +%Y%m%d%H%M%S > trainer/dummy && DOCKER_BUILDKIT=1 docker build -f trainer/Dockerfile trainer/ -t voice-changer",
"copy:frontend":"docker run -v `pwd`/docs:/docs --entrypoint /bin/bash -ti voice-changer -c \"cp -r /voice-changer-internal/frontend/dist/* /docs\"",
"copy:backend":"docker run -v `pwd`/demo:/demo --entrypoint /bin/bash -ti voice-changer -c \"cp -r /voice-changer-internal/voice-change-service/* /demo/\"",
"create:demo":"run-p copy:frontend copy:backend",
"push:docker": "bash script/001_pushDocker.sh",
"test": "echo \"Error: no test specified\" && exit 1"
},
@ -18,5 +21,8 @@
"bugs": {
"url": "https://github.com/w-okada/voice-changer/issues"
},
"homepage": "https://github.com/w-okada/voice-changer#readme"
"homepage": "https://github.com/w-okada/voice-changer#readme",
"devDependencies": {
"npm-run-all": "^4.1.5"
}
}

View File

@ -1,13 +0,0 @@
#!/bin/bash
EXP_NAME=$1
shift
docker run -it --gpus all --shm-size=128M \
-v `pwd`/exp/${EXP_NAME}/dataset:/MMVC_Trainer/dataset \
-v `pwd`/exp/${EXP_NAME}/logs:/MMVC_Trainer/logs \
-v `pwd`/exp/${EXP_NAME}/filelists:/MMVC_Trainer/filelists \
-v `pwd`/vc_resources:/resources \
-e LOCAL_UID=$(id -u $USER) \
-e LOCAL_GID=$(id -g $USER) \
-p 6006:6006 -p 8080:8080 dannadori/voice-changer:20220826_093743 "$@"

159
start2.sh Normal file
View File

@ -0,0 +1,159 @@
#!/bin/bash
# Reference: https://programwiz.org/2022/03/22/how-to-write-shell-script-for-option-parsing/
DOCKER_IMAGE=dannadori/voice-changer:20220831_151141
TENSORBOARD_PORT=6006
VOICE_CHANGER_PORT=8080
set -eu
echo "------"
echo "$@"
echo "------"
usage() {
echo "
usage:
For training
$0 [-t] -n <exp_name> [-b batch_size] [-r]
-t: トレーニングモードで実行する場合に指定してください。(train)
-n: トレーニングの名前です。(name)
-b: バッチサイズです。(batchsize)
-r: トレーニング再開の場合に指定してください。(resume)
For changing voice
$0 [-v] [-c config] [-m model] [-g on/off]
-v: ボイスチェンジャーモードで実行する場合に指定してください。(voice changer)
-c: トレーニングで使用したConfigのファイル名です。(config)
-m: トレーニング済みのモデルのファイル名です。(model)
-g: GPU使用/不使用。デフォルトはonなのでGPUを使う場合は指定不要。(gpu)
For help
$0 [-h]
-h: show this help
" >&2
}
warn () {
    echo "! ! ! $1 ! ! !"
    exit 1
}
training_flag=false
name=999_exp
batch_size=10
resume_flag=false
voice_change_flag=false
config=
model=
gpu=on
escape_flag=false
# Parse options
while getopts tn:b:rvc:m:g:hx OPT; do
    case $OPT in
        t)
            training_flag=true
            ;;
        n)
            name="$OPTARG"
            ;;
        b)
            batch_size="$OPTARG"
            ;;
        r)
            resume_flag=true
            ;;
        v)
            voice_change_flag=true
            ;;
        c)
            config="$OPTARG"
            ;;
        m)
            model="$OPTARG"
            ;;
        g)
            gpu="$OPTARG"
            ;;
        h | \?)
            usage && exit 1
            ;;
        x)
            escape_flag=true
    esac
done
# Determine mode
if $training_flag && $voice_change_flag; then
    warn "-tトレーニングモード と -vボイチェンモードは同時に指定できません。"
elif $training_flag; then
    echo "■■■ ト レ ー ニ ン グ モ ー ド ■■■"
elif $voice_change_flag; then
    echo "■■■ ボ イ チ ェ ン モ ー ド ■■■"
elif $escape_flag; then
    /bin/bash
else
    warn "-tトレーニングモード と -vボイチェンモードのいずれかを指定してください。"
fi
if $training_flag; then
    if $resume_flag; then
        echo "トレーニングを再開します"
        docker run -it --gpus all --shm-size=128M \
            -v `pwd`/exp/${name}/dataset:/MMVC_Trainer/dataset \
            -v `pwd`/exp/${name}/logs:/MMVC_Trainer/logs \
            -v `pwd`/exp/${name}/filelists:/MMVC_Trainer/filelists \
            -v `pwd`/vc_resources:/resources \
            -e LOCAL_UID=$(id -u $USER) \
            -e LOCAL_GID=$(id -g $USER) \
            -p ${TENSORBOARD_PORT}:6006 $DOCKER_IMAGE -t -b ${batch_size} -r
    else
        echo "トレーニングを開始します"
        docker run -it --gpus all --shm-size=128M \
            -v `pwd`/exp/${name}/dataset:/MMVC_Trainer/dataset \
            -v `pwd`/exp/${name}/logs:/MMVC_Trainer/logs \
            -v `pwd`/exp/${name}/filelists:/MMVC_Trainer/filelists \
            -v `pwd`/vc_resources:/resources \
            -e LOCAL_UID=$(id -u $USER) \
            -e LOCAL_GID=$(id -g $USER) \
            -p ${TENSORBOARD_PORT}:6006 $DOCKER_IMAGE -t -b ${batch_size}
    fi
fi
if $voice_change_flag; then
    if [[ -z "$config" ]]; then
        warn "コンフィグファイル(-c)を指定してください"
    fi
    if [[ -z "$model" ]]; then
        warn "モデルファイル(-m)を指定してください"
    fi
    if [ "${gpu}" = "on" ]; then
        echo "GPUをマウントして起動します。"
        docker run -it --gpus all --shm-size=128M \
            -v `pwd`/vc_resources:/resources \
            -e LOCAL_UID=$(id -u $USER) \
            -e LOCAL_GID=$(id -g $USER) \
            -p ${VOICE_CHANGER_PORT}:8080 $DOCKER_IMAGE -v -c ${config} -m ${model}
    elif [ "${gpu}" = "off" ]; then
        echo "CPUのみで稼働します。GPUは使用できません。"
        docker run -it --shm-size=128M \
            -v `pwd`/vc_resources:/resources \
            -e LOCAL_UID=$(id -u $USER) \
            -e LOCAL_GID=$(id -g $USER) \
            -p ${VOICE_CHANGER_PORT}:8080 $DOCKER_IMAGE -v -c ${config} -m ${model}
    else
        echo ${gpu}
        warn "-g は onかoffで指定して下さい。"
    fi
fi

View File

@ -1,7 +0,0 @@
#!/bin/bash
docker run -it --gpus all --shm-size=128M \
-v `pwd`/vc_resources:/resources \
-e LOCAL_UID=$(id -u $USER) \
-e LOCAL_GID=$(id -g $USER) \
-p 6006:6006 -p 8080:8080 dannadori/voice-changer:20220826_093743 "$@"

View File

@ -1,7 +0,0 @@
#!/bin/bash
docker run -it --shm-size=128M \
-v `pwd`/vc_resources:/resources \
-e LOCAL_UID=$(id -u $USER) \
-e LOCAL_GID=$(id -g $USER) \
-p 6006:6006 -p 8080:8080 dannadori/voice-changer:20220826_093743 "$@"

View File

@ -1,8 +1,10 @@
{
"app_title": "voice-changer",
"majar_mode": "docker",
"voice_changer_server_url": "http://localhost:8080/test",
"sample_rate": 48000,
"buffer_size": 1024,
"prefix_chunk_size": 24,
"chunk_size": 24,
"speaker_ids": [100, 107, 101, 102, 103],
"speaker_names": ["ずんだもん", "user", "そら", "めたん", "つぐみ"],
@ -11,7 +13,7 @@
"vf_enable": true,
"voice_changer_mode": "realtime",
"gpu": 0,
"available_gpus": [-1, 0, 1, 2, 3, 4, 5, 100, 200],
"available_gpus": [-1, 0, 1, 2, 3, 4],
"avatar": {
"motion_capture_face": true,
"motion_capture_upperbody": true,
@ -26,6 +28,8 @@
"advance": {
"avatar_draw_skip_rate": 3,
"screen_draw_skip_rate": 3,
"visualizer_draw_skip_rate": 3
"visualizer_draw_skip_rate": 3,
"cross_fade_lower_value": 0.1,
"cross_fade_overlap_rate": 0.03
}
}

View File

@ -0,0 +1,35 @@
{
    "app_title": "voice-changer",
    "majar_mode": "colab",
    "voice_changer_server_url": "http://localhost:8080/test",
    "sample_rate": 48000,
    "buffer_size": 1024,
    "prefix_chunk_size": 24,
    "chunk_size": 24,
    "speaker_ids": [100, 107, 101, 102, 103],
    "speaker_names": ["ずんだもん", "user", "そら", "めたん", "つぐみ"],
    "src_id": 107,
    "dst_id": 100,
    "vf_enable": true,
    "voice_changer_mode": "realtime",
    "gpu": 0,
    "available_gpus": [-1, 0, 1, 2, 3, 4],
    "avatar": {
        "motion_capture_face": true,
        "motion_capture_upperbody": true,
        "lip_overwrite_with_voice": true,
        "avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
        "backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
        "background_color": "#0000dd",
        "chroma_key": "#0000dd",
        "avatar_canvas_size": [1280, 720],
        "screen_canvas_size": [1280, 720]
    },
    "advance": {
        "avatar_draw_skip_rate": 3,
        "screen_draw_skip_rate": 3,
        "visualizer_draw_skip_rate": 3,
        "cross_fade_lower_value": 0.1,
        "cross_fade_overlap_rate": 0.03
    }
}

View File

@ -1,4 +1,4 @@
FROM dannadori/voice-changer-internal:20220826_093634 as front
FROM dannadori/voice-changer-internal:20220831_150941 as front
FROM debian:bullseye-slim as base
ARG DEBIAN_FRONTEND=noninteractive

View File

@ -76,20 +76,20 @@ done
# ## コマンドライン引数から、オプション引数分を削除
# # shift $((OPTIND - 1))
# モード解析
if $training_flag && $voice_change_flag; then
warn "-tトレーニングモード と -vボイチェンモードは同時に指定できません。"
exit 1
elif $training_flag; then
echo "■■■ ト レ ー ニ ン グ モ ー ド ■■■"
elif $voice_change_flag; then
echo "■■■ ボ イ チ ェ ン モ ー ド ■■■"
elif $escape_flag; then
/bin/bash
else
warn "-tトレーニングモード と -vボイチェンモードのいずれかを指定してください。"
exit 1
fi
# # モード解析
# if $training_flag && $voice_change_flag; then
# warn "-tトレーニングモード と -vボイチェンモードは同時に指定できません。"
# exit 1
# elif $training_flag; then
# echo "■■■ ト レ ー ニ ン グ モ ー ド ■■■"
# elif $voice_change_flag; then
# echo "■■■ ボ イ チ ェ ン モ ー ド ■■■"
# elif $escape_flag; then
# /bin/bash
# else
# warn "-tトレーニングモード と -vボイチェンモードのいずれかを指定してください。"
# exit 1
# fi