Mirror of https://github.com/w-okada/voice-changer.git (synced 2025-01-23 13:35:12 +03:00)
Commit c664fabf4b
@@ -13,14 +13,19 @@ import torch
 import numpy as np
 from scipy.io.wavfile import write, read
 
-sys.path.append("mod")
-sys.path.append("mod/text")
+# sys.path.append("mod")
+# sys.path.append("mod/text")
+
+sys.path.append("/MMVC_Trainer")
+sys.path.append("/MMVC_Trainer/text")
 
 import utils
 import commons
 from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
 from models import SynthesizerTrn
 from text.symbols import symbols
 
 from mel_processing import spectrogram_torch
 from text import text_to_sequence, cleaned_text_to_sequence
 
 class VoiceChanger():
     def __init__(self, config, model):
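Note on the path change above: the commit stops importing the bundled `mod` package and instead resolves utils, commons, and the other MMVC modules from the container checkout at /MMVC_Trainer. A minimal sketch of the resulting import behavior (standard Python semantics, not code from this commit):

import sys

# Appended entries are searched last, so an identically named module
# earlier on sys.path would still shadow the /MMVC_Trainer copies.
sys.path.append("/MMVC_Trainer")
sys.path.append("/MMVC_Trainer/text")

import utils  # now resolves to /MMVC_Trainer/utils.py if nothing earlier provides it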
@@ -48,16 +53,52 @@ class VoiceChanger():
         try:
             if gpu<0 or self.gpu_num==0 :
                 with torch.no_grad():
-                    dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
-                    data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
+                    # dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
+                    # data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
+
+                    text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
+                    text_norm = commons.intersperse(text_norm, 0)
+                    text_norm = torch.LongTensor(text_norm)
+
+                    audio = torch.FloatTensor(unpackedData.astype(np.float32))
+                    audio_norm = audio /self.hps.data.max_wav_value
+                    audio_norm = audio_norm.unsqueeze(0)
+
+
+                    spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
+                        self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
+                        center=False)
+                    spec = torch.squeeze(spec, 0)
+                    sid = torch.LongTensor([int(srcId)])
+
+                    data = (text_norm, spec, audio_norm, sid)
+
                     data = TextAudioSpeakerCollate()([data])
                     x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
                     sid_tgt1 = torch.LongTensor([dstId]).cpu()
                     audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
             else:
                 with torch.no_grad():
-                    dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
-                    data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
+                    # dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
+                    # data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
+
+                    text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
+                    text_norm = commons.intersperse(text_norm, 0)
+                    text_norm = torch.LongTensor(text_norm)
+
+                    audio = torch.FloatTensor(unpackedData.astype(np.float32))
+                    audio_norm = audio /self.hps.data.max_wav_value
+                    audio_norm = audio_norm.unsqueeze(0)
+
+
+                    spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
+                        self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
+                        center=False)
+                    spec = torch.squeeze(spec, 0)
+                    sid = torch.LongTensor([int(srcId)])
+
+                    data = (text_norm, spec, audio_norm, sid)
+
                     data = TextAudioSpeakerCollate()([data])
                     x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
                     sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
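The hunk above inlines what TextAudioSpeakerLoader.get_audio_text_speaker_pair previously did. For readability, a sketch of the same preprocessing as a standalone function, assuming unpackedData is int16 PCM at the model's sampling rate (the division by max_wav_value implies this) and that /MMVC_Trainer is on sys.path; the function name is hypothetical, not part of the commit:

import numpy as np
import torch

import commons
from mel_processing import spectrogram_torch
from text import text_to_sequence


def build_vc_input(unpackedData: np.ndarray, srcId: int, hps):
    """Build the (text, spec, wav, sid) tuple that TextAudioSpeakerCollate expects."""
    # Conversion ignores the text, but the collate function still wants
    # a token sequence, so a dummy "a" is encoded.
    text_norm = text_to_sequence("a", hps.data.text_cleaners)
    text_norm = torch.LongTensor(commons.intersperse(text_norm, 0))

    # int16 PCM -> float waveform in [-1, 1], with a leading channel dim.
    audio = torch.FloatTensor(unpackedData.astype(np.float32))
    audio_norm = (audio / hps.data.max_wav_value).unsqueeze(0)

    # Linear spectrogram with the same STFT parameters used at training time.
    spec = spectrogram_torch(audio_norm, hps.data.filter_length,
                             hps.data.sampling_rate, hps.data.hop_length,
                             hps.data.win_length, center=False)
    spec = torch.squeeze(spec, 0)

    sid = torch.LongTensor([int(srcId)])
    return text_norm, spec, audio_norm, sid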
@@ -9,12 +9,17 @@ import torch
 import numpy as np
 from scipy.io.wavfile import write
 
 sys.path.append("mod")
 sys.path.append("mod/text")
+sys.path.append("/MMVC_Trainer")
+sys.path.append("/MMVC_Trainer/text")
+
+
 import utils
 import commons
 from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
 from models import SynthesizerTrn
 from text.symbols import symbols
 from mel_processing import spectrogram_torch
 from text import text_to_sequence, cleaned_text_to_sequence
 
 class MyCustomNamespace(socketio.Namespace):
     def __init__(self, namespace, config, model):
@@ -50,17 +55,49 @@ class MyCustomNamespace(socketio.Namespace):
 
         if gpu<0 or self.gpu_num==0 :
             with torch.no_grad():
-                dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
-                data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
+
+                text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
+                text_norm = commons.intersperse(text_norm, 0)
+                text_norm = torch.LongTensor(text_norm)
+
+                audio = torch.FloatTensor(unpackedData.astype(np.float32))
+                audio_norm = audio /self.hps.data.max_wav_value
+                audio_norm = audio_norm.unsqueeze(0)
+
+
+                spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
+                    self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
+                    center=False)
+                spec = torch.squeeze(spec, 0)
+                sid = torch.LongTensor([int(srcId)])
+
+                data = (text_norm, spec, audio_norm, sid)
+
                 data = TextAudioSpeakerCollate()([data])
                 x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
                 sid_tgt1 = torch.LongTensor([dstId]).cpu()
                 audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
         else:
             with torch.no_grad():
-                dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
-                data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
+
+                text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
+                text_norm = commons.intersperse(text_norm, 0)
+                text_norm = torch.LongTensor(text_norm)
+
+                audio = torch.FloatTensor(unpackedData.astype(np.float32))
+                audio_norm = audio /self.hps.data.max_wav_value
+                audio_norm = audio_norm.unsqueeze(0)
+
+
+                spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
+                    self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
+                    center=False)
+                spec = torch.squeeze(spec, 0)
+                sid = torch.LongTensor([int(srcId)])
+
+                data = (text_norm, spec, audio_norm, sid)
                 data = TextAudioSpeakerCollate()([data])
+
                 x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
                 sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
                 audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
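The CPU and GPU branches in both files differ only in where the tensors and net_g are placed (.cpu() vs .cuda(gpu)). One way the duplication could be collapsed, sketched with torch.device as a suggestion rather than what the commit does (run_conversion is a hypothetical helper):

import torch


def run_conversion(net_g, data, dstId, max_wav_value, gpu, gpu_num):
    # Same rule as the commit: fall back to CPU when no GPU is requested
    # or none is available.
    dev = torch.device("cpu") if gpu < 0 or gpu_num == 0 else torch.device(f"cuda:{gpu}")
    with torch.no_grad():
        x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [t.to(dev) for t in data]
        sid_tgt = torch.LongTensor([dstId]).to(dev)
        out = net_g.to(dev).voice_conversion(spec, spec_lengths,
                                             sid_src=sid_src, sid_tgt=sid_tgt)
        # Scale back to int16 range and return as a NumPy array on the CPU.
        return (out[0][0, 0].data * max_wav_value).cpu().float().numpy()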
frontend/dist/index.html (vendored, 14 lines changed)
@@ -1 +1,13 @@
-<!doctype html><html lang="ja" style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>voice recorder</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div><noscript><strong>javascriptを有効にしてください</strong></noscript></body></html>
+<!DOCTYPE html>
+<html lang="ja" style="width: 100%; height: 100%; overflow: hidden">
+    <head>
+        <meta charset="utf-8" />
+        <title>voice recorder</title>
+        <script defer src="index.js"></script></head>
+    <body style="width: 100%; height: 100%; margin: 0px">
+        <div id="app" style="width: 100%; height: 100%"></div>
+        <noscript>
+            <strong>javascriptを有効にしてください</strong>
+        </noscript>
+    </body>
+</html>
frontend/dist/index.js (vendored, 4820 lines changed)
File diff suppressed because one or more lines are too long
@@ -1,7 +1,7 @@
 #!/bin/bash
 set -eu
 
-DOCKER_IMAGE=dannadori/voice-changer:20221003_002318
+DOCKER_IMAGE=dannadori/voice-changer:20221028_191234
 #DOCKER_IMAGE=voice-changer
 
 
@@ -1,4 +1,4 @@
-FROM dannadori/voice-changer-internal:20221002_193031 as front
+FROM dannadori/voice-changer-internal:20221028_190940 as front
 FROM debian:bullseye-slim as base
 
 ARG DEBIAN_FRONTEND=noninteractive
@@ -8,7 +8,7 @@ RUN apt-get install -y python3-pip git
 RUN apt-get install -y espeak
 RUN apt-get install -y cmake
 
-RUN git clone --depth 1 https://github.com/isletennos/MMVC_Trainer.git -b v1.3.1.0
+RUN git clone --depth 1 https://github.com/isletennos/MMVC_Trainer.git -b v1.3.1.3
 
 RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
 
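The pip line above pulls CUDA 11.3 wheels from the cu113 extra index. A quick runtime check inside the built container, using only standard PyTorch attributes, confirms the wheel's CUDA build and whether a GPU is actually visible:

import torch

# Prints e.g. "1.12.1+cu113 11.3 True" when a GPU is visible to the container.
print(torch.__version__, torch.version.cuda, torch.cuda.is_available())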
@@ -49,18 +49,22 @@ COPY fine_model/D_180000.pth /MMVC_Trainer/fine_model/D_180000.pth
 
 ### Copy from base
 COPY --from=base --chmod=777 /usr/local/lib/python3.9/dist-packages /usr/local/lib/python3.9/dist-packages
 COPY --from=base --chmod=777 /MMVC_Trainer /MMVC_Trainer
 
 ### Copy from frontend
 ##### MMVC Trainer
 COPY --from=front --chmod=777 /MMVC_Trainer /MMVC_Trainer
 RUN chmod 0777 /MMVC_Trainer
 
 WORKDIR /MMVC_Trainer
 ADD /setup.sh /MMVC_Trainer/
 ADD /exec.sh /MMVC_Trainer/
 
 ### Copy from frontend
 ##### Voice changer Internal
 COPY --from=front --chmod=777 /voice-changer-internal/frontend/dist /voice-changer-internal/frontend/dist
 COPY --from=front --chmod=777 /voice-changer-internal/voice-change-service /voice-changer-internal/voice-change-service
 RUN chmod 0777 /voice-changer-internal/voice-change-service
 
 ##### Soft VC
 COPY --from=front /hubert /hubert
 COPY --from=front /acoustic-model /acoustic-model
 COPY --from=front /hifigan /hifigan