Merge pull request #73 from w-okada/dev

update
This commit is contained in:
w-okada 2022-10-28 19:26:06 +09:00 committed by GitHub
commit c664fabf4b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 4931 additions and 21 deletions

View File

@@ -13,14 +13,19 @@ import torch
import numpy as np import numpy as np
from scipy.io.wavfile import write, read from scipy.io.wavfile import write, read
sys.path.append("mod") # sys.path.append("mod")
sys.path.append("mod/text") # sys.path.append("mod/text")
sys.path.append("/MMVC_Trainer")
sys.path.append("/MMVC_Trainer/text")
import utils import utils
import commons
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn from models import SynthesizerTrn
from text.symbols import symbols from text.symbols import symbols
from mel_processing import spectrogram_torch
from text import text_to_sequence, cleaned_text_to_sequence
class VoiceChanger(): class VoiceChanger():
def __init__(self, config, model): def __init__(self, config, model):
@@ -48,16 +53,52 @@ class VoiceChanger():
try: try:
if gpu<0 or self.gpu_num==0 : if gpu<0 or self.gpu_num==0 :
with torch.no_grad(): with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True) # dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"]) # data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data]) data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data] x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cpu() sid_tgt1 = torch.LongTensor([dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy() audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
else: else:
with torch.no_grad(): with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True) # dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"]) # data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data]) data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data] x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu) sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)

View File

@@ -9,12 +9,17 @@ import torch
import numpy as np import numpy as np
from scipy.io.wavfile import write from scipy.io.wavfile import write
sys.path.append("mod") sys.path.append("/MMVC_Trainer")
sys.path.append("mod/text") sys.path.append("/MMVC_Trainer/text")
import utils import utils
import commons
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn from models import SynthesizerTrn
from text.symbols import symbols from text.symbols import symbols
from mel_processing import spectrogram_torch
from text import text_to_sequence, cleaned_text_to_sequence
class MyCustomNamespace(socketio.Namespace): class MyCustomNamespace(socketio.Namespace):
def __init__(self, namespace, config, model): def __init__(self, namespace, config, model):
@@ -50,17 +55,49 @@ class MyCustomNamespace(socketio.Namespace):
if gpu<0 or self.gpu_num==0 : if gpu<0 or self.gpu_num==0 :
with torch.no_grad(): with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"]) text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data]) data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data] x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cpu() sid_tgt1 = torch.LongTensor([dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy() audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
else: else:
with torch.no_grad(): with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"]) text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data]) data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data] x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu) sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy() audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()

View File

@@ -1 +1,13 @@
<!doctype html><html lang="ja" style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>voice recorder</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div><noscript><strong>javascriptを有効にしてください</strong></noscript></body></html> <!DOCTYPE html>
<html lang="ja" style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>voice recorder</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
<noscript>
<strong>javascriptを有効にしてください</strong>
</noscript>
</body>
</html>

4820
frontend/dist/index.js vendored

File diff suppressed because one or more lines are too long

View File

@@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
set -eu set -eu
DOCKER_IMAGE=dannadori/voice-changer:20221003_002318 DOCKER_IMAGE=dannadori/voice-changer:20221028_191234
#DOCKER_IMAGE=voice-changer #DOCKER_IMAGE=voice-changer

View File

@@ -1,4 +1,4 @@
FROM dannadori/voice-changer-internal:20221002_193031 as front FROM dannadori/voice-changer-internal:20221028_190940 as front
FROM debian:bullseye-slim as base FROM debian:bullseye-slim as base
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
@@ -8,7 +8,7 @@ RUN apt-get install -y python3-pip git
RUN apt-get install -y espeak RUN apt-get install -y espeak
RUN apt-get install -y cmake RUN apt-get install -y cmake
RUN git clone --depth 1 https://github.com/isletennos/MMVC_Trainer.git -b v1.3.1.0 RUN git clone --depth 1 https://github.com/isletennos/MMVC_Trainer.git -b v1.3.1.3
RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
@@ -49,18 +49,22 @@ COPY fine_model/D_180000.pth /MMVC_Trainer/fine_model/D_180000.pth
### Copy from base ### Copy from base
COPY --from=base --chmod=777 /usr/local/lib/python3.9/dist-packages /usr/local/lib/python3.9/dist-packages COPY --from=base --chmod=777 /usr/local/lib/python3.9/dist-packages /usr/local/lib/python3.9/dist-packages
COPY --from=base --chmod=777 /MMVC_Trainer /MMVC_Trainer
### Copy from frontend
##### MMVC Trainer
COPY --from=front --chmod=777 /MMVC_Trainer /MMVC_Trainer
RUN chmod 0777 /MMVC_Trainer RUN chmod 0777 /MMVC_Trainer
WORKDIR /MMVC_Trainer WORKDIR /MMVC_Trainer
ADD /setup.sh /MMVC_Trainer/ ADD /setup.sh /MMVC_Trainer/
ADD /exec.sh /MMVC_Trainer/ ADD /exec.sh /MMVC_Trainer/
### Copy from frontend ##### Voice changer Internal
COPY --from=front --chmod=777 /voice-changer-internal/frontend/dist /voice-changer-internal/frontend/dist COPY --from=front --chmod=777 /voice-changer-internal/frontend/dist /voice-changer-internal/frontend/dist
COPY --from=front --chmod=777 /voice-changer-internal/voice-change-service /voice-changer-internal/voice-change-service COPY --from=front --chmod=777 /voice-changer-internal/voice-change-service /voice-changer-internal/voice-change-service
RUN chmod 0777 /voice-changer-internal/voice-change-service RUN chmod 0777 /voice-changer-internal/voice-change-service
##### Soft VC
COPY --from=front /hubert /hubert COPY --from=front /hubert /hubert
COPY --from=front /acoustic-model /acoustic-model COPY --from=front /acoustic-model /acoustic-model
COPY --from=front /hifigan /hifigan COPY --from=front /hifigan /hifigan