This commit is contained in:
wataru 2022-09-18 22:09:33 +09:00
parent f8ed234ada
commit c510a52b10
10 changed files with 5196 additions and 14 deletions

158
demo/SoftVcServerFastAPI.py Executable file
View File

@ -0,0 +1,158 @@
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
import logging
from logging.config import dictConfig
import os, sys, math, base64, struct, traceback, time
import torch, torchaudio
import numpy as np
from scipy.io.wavfile import write, read
from datetime import datetime
# Command line: <script> PORT [MODE].
# MODE == "colab" loads the models through torch.hub and serves the frontend
# from the colab checkout; any other value (or no value) uses the Docker
# layout with models stored under /models.
args = sys.argv
PORT = args[1]
# MODE is optional: a launcher that passes only the port must not make the
# module fail with IndexError at import time.
MODE = args[2] if len(args) > 2 else "docker"

logger = logging.getLogger('uvicorn')

app = FastAPI()
# CORS: every origin, method and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# "/front" is mounted exactly once, with the directory that belongs to the
# selected mode. Two mounts on the same path would leave the second one
# unreachable, and StaticFiles rejects a directory that does not exist.
if MODE == "colab":
    print("ENV: colab")
    app.mount("/front", StaticFiles(directory="voice-changer/frontend/dist", html=True), name="static")
    hubert_model = torch.hub.load("bshall/hubert:main", "hubert_soft").cuda()
    acoustic_model = torch.hub.load("bshall/acoustic-model:main", "hubert_soft").cuda()
    hifigan_model = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft").cuda()
else:
    print("ENV: Docker")
    app.mount("/front", StaticFiles(directory="../frontend/dist", html=True), name="static")

    # NOTE(review): these imports presumably exist so that torch.load can
    # resolve the classes of the pickled models below - confirm before
    # removing any of them.
    sys.path.append("/hubert")
    from hubert import hubert_discrete, hubert_soft, kmeans100

    sys.path.append("/acoustic-model")
    from acoustic import hubert_discrete, hubert_soft

    sys.path.append("/hifigan")
    from hifigan import hifigan

    # torch.load unpickles the files; only trusted model files belong here.
    hubert_model = torch.load("/models/bshall_hubert_main.pt").cuda()
    acoustic_model = torch.load("/models/bshall_acoustic-model_main.pt").cuda()
    hifigan_model = torch.load("/models/bshall_hifigan_main.pt").cuda()
def applyVol(i, chunk, vols):
    """Gate one chunk of audio by the volume measured for its position.

    Args:
        i: Index into ``vols`` for this chunk.
        chunk: Tensor holding the chunk's samples.
        vols: Indexable sequence of volume values.

    Returns:
        The chunk with a new leading dimension. It is multiplied by zeros
        when half of ``vols[i]`` is below 0.0001, and by ones otherwise.
    """
    is_silent = vols[i] / 2 < 0.0001
    make_gate = torch.zeros if is_silent else torch.ones
    gated = torch.mul(make_gate(chunk.size()), chunk)
    return gated.unsqueeze(0)
@app.get("/test")
def get_test(query: str = ''):
    """Echo the ``query`` query-string parameter back to the caller.

    The value is declared as a function parameter so that FastAPI fills it
    in from the URL. Reading it from a Flask-style ``request.args`` cannot
    work here: this module has no ``request`` object, so that lookup always
    ended in the NameError handler.

    Args:
        query: Value of ``?query=...``; empty string when absent.

    Returns:
        The received ``query`` value, unchanged.
    """
    return query
# Request body accepted by POST /test.
class VoiceModel(BaseModel):
    # gpu, srcId, dstId and timestamp are copied unchanged into the response
    # by post_test.
    gpu: int
    srcId: int
    dstId: int
    timestamp: int
    # Base64 text; post_test decodes it and reads the bytes as little-endian
    # signed 16-bit samples.
    buffer: str
@app.post("/test")
def post_test(voice:VoiceModel):
    """Convert one buffer of speech with Soft-VC and return it base64-encoded.

    ``voice.buffer`` is base64 text wrapping little-endian 16-bit PCM that is
    handled as 24 kHz audio. The samples are written to ``received_data.wav``,
    reloaded with torchaudio, resampled to 16 kHz, passed through the
    hubert / acoustic / hifigan models and resampled back to 24 kHz. The
    result is then gated chunk by chunk with ``applyVol`` using the peak of
    the source mel spectrogram, written to ``converted_data.wav`` and
    returned as base64 of 16-bit PCM.

    NOTE(review): both wav files use fixed names, so overlapping requests
    share them.

    Args:
        voice: Parsed request body.

    Returns:
        A JSONResponse with gpu, srcId, dstId and timestamp echoed back plus
        "changedVoiceBase64". If anything raises, the exception text is
        returned instead.
    """
    try:
        print("POST REQUEST PROCESSING....")
        gpu = voice.gpu
        srcId = voice.srcId
        dstId = voice.dstId
        timestamp = voice.timestamp

        wav = base64.b64decode(voice.buffer)
        # Drop a dangling odd byte so a truncated buffer still decodes.
        wav = wav[:len(wav) - len(wav) % 2]
        # Read the samples straight from the bytes instead of building a
        # Python tuple with one int object per sample.
        unpackedData = np.frombuffer(wav, dtype="<i2")

        received_data_file = "received_data.wav"
        write(received_data_file, 24000, unpackedData.astype(np.int16))
        # torchaudio.load reportedly defaults to normalize=True and hands back
        # float32 data, which is why the file round trip is used.
        # https://pytorch.org/audio/stable/backend.html
        source, _ = torchaudio.load(received_data_file)
        source_16k = torchaudio.functional.resample(source, 24000, 16000)
        source_16k = source_16k.unsqueeze(0).cuda()

        # SOFT-VC
        with torch.inference_mode():
            units = hubert_model.units(source_16k)
            mel = acoustic_model.generate(units).transpose(1, 2)
            target = hifigan_model(mel)

        dest = torchaudio.functional.resample(target, 16000, 24000)
        dest = dest.squeeze().cpu()

        # Measure the volume of the source.
        source = source.cpu()
        specgram = torchaudio.transforms.MelSpectrogram(sample_rate=24000)(source)
        vol_apply_window_size = math.ceil(len(source[0]) / specgram.size()[2])
        specgram = specgram.transpose(1, 2)
        vols = [torch.max(frame) for frame in specgram[0]]

        chunks = torch.split(dest, vol_apply_window_size, 0)
        # The converted audio is not guaranteed to have the same length as
        # the source; if it yields more chunks than there are volume values,
        # reuse the last value instead of failing with IndexError.
        last_vol = len(vols) - 1
        chunks = [applyVol(min(i, last_vol), c, vols) for i, c in enumerate(chunks)]
        dest = torch.cat(chunks, 1)

        arr = np.array(dest.squeeze())
        int_size = 2**(16 - 1) - 1
        # Clip before the cast: a float outside [-1, 1] would otherwise wrap
        # around when converted to int16.
        arr = np.clip(arr, -1.0, 1.0)
        arr = (arr * int_size).astype(np.int16)
        write("converted_data.wav", 24000, arr)
        # Encode with the same little-endian layout the input was decoded with.
        changedVoiceBase64 = base64.b64encode(arr.astype("<i2").tobytes()).decode('utf-8')

        data = {
            "gpu":gpu,
            "srcId":srcId,
            "dstId":dstId,
            "timestamp":timestamp,
            "changedVoiceBase64":changedVoiceBase64
        }
        json_compatible_item_data = jsonable_encoder(data)
        return JSONResponse(content=json_compatible_item_data)
    except Exception as e:
        print("REQUEST PROCESSING!!!! EXCEPTION!!!", e)
        print(traceback.format_exc())
        return str(e)
if __name__ == '__main__':
    # Only the port is needed to start the server. The mode argument is not
    # read again here: nothing below uses it, and indexing it raised
    # IndexError whenever the script was started with the port alone.
    args = sys.argv
    PORT = args[1]

    logger.info('INITIALIZE MODEL')
    logger.info('START APP')
    # With reload=True uvicorn must be given an import string
    # ("<module>:app") rather than the app object; the module name is this
    # file's name without its ".py" suffix.
    uvicorn.run(f"{os.path.basename(__file__)[:-3]}:app", host="0.0.0.0", port=int(PORT), reload=True, log_level="info")

144
demo/serverFastAPI.py Executable file
View File

@ -0,0 +1,144 @@
from flask import Flask, request, Markup, abort, jsonify, send_from_directory
from flask_cors import CORS
import logging
from logging.config import dictConfig
import sys
import base64
import torch
import numpy as np
from scipy.io.wavfile import write, read
from datetime import datetime
import traceback
import struct
sys.path.append("mod")
sys.path.append("mod/text")
import utils
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
# Logging setup: the root logger emits INFO and above through one stream
# handler that writes to Flask's WSGI error stream, using the "default"
# formatter.
dictConfig(dict(
    version=1,
    formatters=dict(
        default=dict(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s'),
    ),
    handlers=dict(
        # 'class' is a Python keyword, so this entry stays a dict literal.
        wsgi={
            'class': 'logging.StreamHandler',
            'stream': 'ext://flask.logging.wsgi_errors_stream',
            'formatter': 'default',
        },
    ),
    root=dict(level='INFO', handlers=['wsgi']),
))
app = Flask(__name__)


@app.route("/<path:path>")
def static_dir(path):
    """Serve the requested ``path`` out of the ../frontend/dist directory."""
    return send_from_directory("../frontend/dist", path)


@app.route('/', methods=['GET'])
def redirect_to_index():
    """Serve ../frontend/dist/index.html for the site root."""
    return send_from_directory("../frontend/dist", 'index.html')


# Allow cross-origin requests from any origin on every route.
CORS(app, resources={r"/*": {"origins": "*"}})
class VoiceChanger():
    """Wraps a SynthesizerTrn generator and converts audio between speaker ids."""

    def __init__(self, config, model):
        """Build the generator from a hyper-parameter file and load its weights.

        Args:
            config: Passed to ``utils.get_hparams_from_file``.
            model: Checkpoint passed to ``utils.load_checkpoint``.
        """
        self.hps = utils.get_hparams_from_file(config)
        self.net_g = SynthesizerTrn(
            len(symbols),
            self.hps.data.filter_length // 2 + 1,
            self.hps.train.segment_size // self.hps.data.hop_length,
            n_speakers=self.hps.data.n_speakers,
            **self.hps.model)
        self.net_g.eval()

        self.gpu_num = torch.cuda.device_count()
        print("GPU_NUM:",self.gpu_num)
        utils.load_checkpoint( model, self.net_g, None)

    def on_request(self, gpu, srcId, dstId, timestamp, wav):
        """Convert one buffer of audio from speaker ``srcId`` to ``dstId``.

        Args:
            gpu: CUDA device index. A negative value, or a host without any
                CUDA device, selects the CPU.
            srcId: Source speaker id.
            dstId: Target speaker id.
            timestamp: Not used by this method.
            wav: Bytes of little-endian signed 16-bit samples, or the
                integer 0 to run on the contents of "dummy.wav" instead.

        Returns:
            The converted audio as a numpy int16 array.

        Raises:
            Exception: Whatever the conversion raised. It is logged and
                re-raised so the caller sees the real error, not an
                UnboundLocalError from the missing result.
        """
        if wav==0:
            samplerate, data=read("dummy.wav")
            unpackedData = data
        else:
            unpackedData = np.array(struct.unpack('<%sh'%(len(wav) // struct.calcsize('<h') ), wav))
            write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))

        try:
            # One code path for both devices; only the target device differs.
            if gpu<0 or self.gpu_num==0 :
                device = torch.device("cpu")
            else:
                device = torch.device("cuda", gpu)

            with torch.no_grad():
                dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
                data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
                data = TextAudioSpeakerCollate()([data])
                # Only the spectrogram, its lengths and the source speaker id
                # are needed from the collated batch.
                _, _, spec, spec_lengths, _, _, sid_src = [t.to(device) for t in data]
                sid_tgt1 = torch.LongTensor([dstId]).to(device)
                converted = self.net_g.to(device).voice_conversion(
                    spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)
                audio1 = (converted[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
        except Exception as e:
            print("VC PROCESSING!!!! EXCEPTION!!!", e)
            print(traceback.format_exc())
            raise

        # Clip to the int16 range first; an out-of-range float would wrap
        # around in the cast.
        int16_range = np.iinfo(np.int16)
        audio1 = np.clip(audio1, int16_range.min, int16_range.max).astype(np.int16)
        return audio1
@app.route('/test', methods=['GET', 'POST'])
def test():
    """Echo endpoint (GET) and voice-conversion endpoint (POST).

    GET returns the ``query`` argument, or an empty string without one.
    POST reads gpu, srcId, dstId, timestamp and a base64 ``buffer`` from the
    JSON body, runs ``voiceChanger.on_request`` on the decoded bytes and
    answers with the same four numbers plus "changedVoiceBase64". Any
    exception is printed and its text becomes the response body.
    """
    try:
        method = request.method
        if method == 'GET':
            return request.args.get('query', '')
        if method != 'POST':
            # NOTE(review): abort() signals the 400 by raising, so the broad
            # handler below appears to catch it and answer with the error
            # text instead - confirm that this is intended.
            return abort(400)

        print("POST REQUEST PROCESSING....")
        body = request.json
        gpu, srcId, dstId, timestamp = (
            int(body[field]) for field in ('gpu', 'srcId', 'dstId', 'timestamp')
        )
        wav = base64.b64decode(body['buffer'])

        changedVoice = voiceChanger.on_request(gpu, srcId, dstId, timestamp, wav)
        return jsonify({
            "gpu":gpu,
            "srcId":srcId,
            "dstId":dstId,
            "timestamp":timestamp,
            "changedVoiceBase64":base64.b64encode(changedVoice).decode('utf-8')
        })
    except Exception as e:
        print("REQUEST PROCESSING!!!! EXCEPTION!!!", e)
        print(traceback.format_exc())
        return str(e)
if __name__ == '__main__':
    # Command line: <script> PORT CONFIG MODEL
    args = sys.argv
    PORT, CONFIG, MODEL = args[1], args[2], args[3]

    app.logger.info('INITIALIZE MODEL')
    voiceChanger = VoiceChanger(CONFIG, MODEL)
    # One conversion before serving: wav=0 makes on_request read "dummy.wav".
    voiceChanger.on_request(0, 0, 0, 0, 0)

    app.logger.info('START APP')
    # NOTE(review): debug=True together with host 0.0.0.0 makes Flask's
    # debug mode reachable from other machines - confirm this is only ever
    # run in a trusted environment.
    app.run(host='0.0.0.0', port=PORT, debug=True)

View File

@ -37,9 +37,6 @@ dictConfig({
} }
}) })
#app = Flask(__name__, static_folder="../frontend/dist", static_url_path='/')
app = Flask(__name__) app = Flask(__name__)
@app.route("/<path:path>") @app.route("/<path:path>")
def static_dir(path): def static_dir(path):

View File

@ -15,8 +15,6 @@ from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn from models import SynthesizerTrn
from text.symbols import symbols from text.symbols import symbols
class MyCustomNamespace(socketio.Namespace): class MyCustomNamespace(socketio.Namespace):
def __init__(self, namespace, config, model): def __init__(self, namespace, config, model):
super().__init__(namespace) super().__init__(namespace)
@ -91,6 +89,7 @@ if __name__ == '__main__':
sio.register_namespace(MyCustomNamespace('/test', CONFIG, MODEL)) sio.register_namespace(MyCustomNamespace('/test', CONFIG, MODEL))
app = socketio.WSGIApp(sio,static_files={ app = socketio.WSGIApp(sio,static_files={
'': '../frontend/dist', '': '../frontend/dist',
'/': '../frontend/dist/index.html',
}) })
eventlet.wsgi.server(eventlet.listen(('0.0.0.0',int(PORT))), app) eventlet.wsgi.server(eventlet.listen(('0.0.0.0',int(PORT))), app)

View File

@ -1,13 +1,31 @@
#!/bin/bash #!/bin/bash
echo config: $1 CONFIG=$1
echo model: $2 MODEL=$2
TYPE=$3
echo config: $CONFIG
echo model: $MODEL
echo type: $TYPE
cp -r /resources/* . cp -r /resources/* .
if [[ -e ./setting.json ]]; then if [[ -e ./setting.json ]]; then
cp ./setting.json ../frontend/dist/assets/setting.json cp ./setting.json ../frontend/dist/assets/setting.json
fi fi
python3 serverSIO.py 8080 $1 $2 if [ "${TYPE}" = "SOFT_VC" ] ; then
echo "SOFT_VCを起動します"
python3 SoftVcServerFlask.py 8080
elif [ "${TYPE}" = "SOFT_VC_FAST_API" ] ; then
echo "SOFT_VC_FAST_APIを起動します"
python3 SoftVcServerFastAPI.py 8080
else
echo "MMVCを起動します"
python3 serverSIO.py 8080 $CONFIG $MODEL
fi

View File

@ -1 +1,13 @@
<!doctype html><html lang="ja" style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>voice recorder</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div><noscript><strong>javascriptを有効にしてください</strong></noscript></body></html> <!DOCTYPE html>
<html lang="ja" style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>voice recorder</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
<noscript>
<strong>javascriptを有効にしてください</strong>
</noscript>
</body>
</html>

4820
frontend/dist/index.js vendored

File diff suppressed because one or more lines are too long

View File

@ -2,9 +2,9 @@
# 参考:https://programwiz.org/2022/03/22/how-to-write-shell-script-for-option-parsing/ # 参考:https://programwiz.org/2022/03/22/how-to-write-shell-script-for-option-parsing/
DOCKER_IMAGE=dannadori/voice-changer:20220903_150931 DOCKER_IMAGE=dannadori/voice-changer:20220918_220447
TENSORBOARD_PORT=6006 TENSORBOARD_PORT=6006
VOICE_CHANGER_PORT=8080 VOICE_CHANGER_PORT=8081
set -eu set -eu

View File

@ -0,0 +1,38 @@
{
"app_title": "voice-changer",
"majar_mode": "colab",
"voice_changer_server_url": "/test",
"sample_rate": 48000,
"buffer_size": 1024,
"prefix_chunk_size": 60,
"chunk_size": 60,
"speaker_ids": [999, 107],
"speaker_names": ["---", "user"],
"src_id": 107,
"dst_id": 999,
"vf_enable": true,
"voice_changer_mode": "realtime",
"gpu": 0,
"available_gpus": [0],
"avatar": {
"enable_avatar": true,
"motion_capture_face": true,
"motion_capture_upperbody": true,
"lip_overwrite_with_voice": true,
"avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
"backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
"background_color": "#0000dd",
"chroma_key": "#0000dd",
"avatar_canvas_size": [1280, 720],
"screen_canvas_size": [1280, 720]
},
"advance": {
"avatar_draw_skip_rate": 3,
"screen_draw_skip_rate": 3,
"visualizer_draw_skip_rate": 3,
"cross_fade_lower_value": 0.1,
"cross_fade_offset_rate": 0.3,
"cross_fade_end_rate": 0.6,
"cross_fade_type": 2
}
}

View File

@ -1,4 +1,4 @@
FROM dannadori/voice-changer-internal:20220903_150759 as front FROM dannadori/voice-changer-internal:20220918_215800 as front
FROM debian:bullseye-slim as base FROM debian:bullseye-slim as base
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive