Merge pull request #73 from w-okada/dev

update
This commit is contained in:
w-okada 2022-10-28 19:26:06 +09:00 committed by GitHub
commit c664fabf4b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 4931 additions and 21 deletions

View File

@@ -13,14 +13,19 @@ import torch
import numpy as np import numpy as np
from scipy.io.wavfile import write, read from scipy.io.wavfile import write, read
sys.path.append("mod") # sys.path.append("mod")
sys.path.append("mod/text") # sys.path.append("mod/text")
sys.path.append("/MMVC_Trainer")
sys.path.append("/MMVC_Trainer/text")
import utils import utils
import commons
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn from models import SynthesizerTrn
from text.symbols import symbols from text.symbols import symbols
from mel_processing import spectrogram_torch
from text import text_to_sequence, cleaned_text_to_sequence
class VoiceChanger(): class VoiceChanger():
def __init__(self, config, model): def __init__(self, config, model):
@@ -48,16 +53,52 @@ class VoiceChanger():
try: try:
if gpu<0 or self.gpu_num==0 : if gpu<0 or self.gpu_num==0 :
with torch.no_grad(): with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True) # dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"]) # data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data]) data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data] x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cpu() sid_tgt1 = torch.LongTensor([dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy() audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
else: else:
with torch.no_grad(): with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True) # dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"]) # data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"])
text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data]) data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data] x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu) sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)

View File

@@ -9,12 +9,17 @@ import torch
import numpy as np import numpy as np
from scipy.io.wavfile import write from scipy.io.wavfile import write
sys.path.append("mod") sys.path.append("/MMVC_Trainer")
sys.path.append("mod/text") sys.path.append("/MMVC_Trainer/text")
import utils import utils
import commons
from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate from data_utils import TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn from models import SynthesizerTrn
from text.symbols import symbols from text.symbols import symbols
from mel_processing import spectrogram_torch
from text import text_to_sequence, cleaned_text_to_sequence
class MyCustomNamespace(socketio.Namespace): class MyCustomNamespace(socketio.Namespace):
def __init__(self, namespace, config, model): def __init__(self, namespace, config, model):
@@ -50,17 +55,49 @@ class MyCustomNamespace(socketio.Namespace):
if gpu<0 or self.gpu_num==0 : if gpu<0 or self.gpu_num==0 :
with torch.no_grad(): with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"]) text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data]) data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data] x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cpu() for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cpu() sid_tgt1 = torch.LongTensor([dstId]).cpu()
audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy() audio1 = (self.net_g.cpu().voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
else: else:
with torch.no_grad(): with torch.no_grad():
dataset = TextAudioSpeakerLoader("dummy.txt", self.hps.data, no_use_textfile=True)
data = dataset.get_audio_text_speaker_pair([ unpackedData, srcId, "a"]) text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio /self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
center=False)
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data]) data = TextAudioSpeakerCollate()([data])
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data] x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu) sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy() audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()

View File

@@ -1 +1,13 @@
<!doctype html><html lang="ja" style="width:100%;height:100%;overflow:hidden"><head><meta charset="utf-8"/><title>voice recorder</title><script defer="defer" src="index.js"></script></head><body style="width:100%;height:100%;margin:0"><div id="app" style="width:100%;height:100%"></div><noscript><strong>javascriptを有効にしてください</strong></noscript></body></html> <!DOCTYPE html>
<html lang="ja" style="width: 100%; height: 100%; overflow: hidden">
<head>
<meta charset="utf-8" />
<title>voice recorder</title>
<script defer src="index.js"></script></head>
<body style="width: 100%; height: 100%; margin: 0px">
<div id="app" style="width: 100%; height: 100%"></div>
<noscript>
<strong>javascriptを有効にしてください</strong>
</noscript>
</body>
</html>

4820
frontend/dist/index.js vendored

File diff suppressed because one or more lines are too long

View File

@@ -1,7 +1,7 @@
#!/bin/bash #!/bin/bash
set -eu set -eu
DOCKER_IMAGE=dannadori/voice-changer:20221003_002318 DOCKER_IMAGE=dannadori/voice-changer:20221028_191234
#DOCKER_IMAGE=voice-changer #DOCKER_IMAGE=voice-changer

View File

@@ -1,4 +1,4 @@
FROM dannadori/voice-changer-internal:20221002_193031 as front FROM dannadori/voice-changer-internal:20221028_190940 as front
FROM debian:bullseye-slim as base FROM debian:bullseye-slim as base
ARG DEBIAN_FRONTEND=noninteractive ARG DEBIAN_FRONTEND=noninteractive
@@ -8,7 +8,7 @@ RUN apt-get install -y python3-pip git
RUN apt-get install -y espeak RUN apt-get install -y espeak
RUN apt-get install -y cmake RUN apt-get install -y cmake
RUN git clone --depth 1 https://github.com/isletennos/MMVC_Trainer.git -b v1.3.1.0 RUN git clone --depth 1 https://github.com/isletennos/MMVC_Trainer.git -b v1.3.1.3
RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113 RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
@@ -49,18 +49,22 @@ COPY fine_model/D_180000.pth /MMVC_Trainer/fine_model/D_180000.pth
### Copy from base ### Copy from base
COPY --from=base --chmod=777 /usr/local/lib/python3.9/dist-packages /usr/local/lib/python3.9/dist-packages COPY --from=base --chmod=777 /usr/local/lib/python3.9/dist-packages /usr/local/lib/python3.9/dist-packages
COPY --from=base --chmod=777 /MMVC_Trainer /MMVC_Trainer
### Copy from frontend
##### MMVC Trainer
COPY --from=front --chmod=777 /MMVC_Trainer /MMVC_Trainer
RUN chmod 0777 /MMVC_Trainer RUN chmod 0777 /MMVC_Trainer
WORKDIR /MMVC_Trainer WORKDIR /MMVC_Trainer
ADD /setup.sh /MMVC_Trainer/ ADD /setup.sh /MMVC_Trainer/
ADD /exec.sh /MMVC_Trainer/ ADD /exec.sh /MMVC_Trainer/
### Copy from frontend ##### Voice changer Internal
COPY --from=front --chmod=777 /voice-changer-internal/frontend/dist /voice-changer-internal/frontend/dist COPY --from=front --chmod=777 /voice-changer-internal/frontend/dist /voice-changer-internal/frontend/dist
COPY --from=front --chmod=777 /voice-changer-internal/voice-change-service /voice-changer-internal/voice-change-service COPY --from=front --chmod=777 /voice-changer-internal/voice-change-service /voice-changer-internal/voice-change-service
RUN chmod 0777 /voice-changer-internal/voice-change-service RUN chmod 0777 /voice-changer-internal/voice-change-service
##### Soft VC
COPY --from=front /hubert /hubert COPY --from=front /hubert /hubert
COPY --from=front /acoustic-model /acoustic-model COPY --from=front /acoustic-model /acoustic-model
COPY --from=front /hifigan /hifigan COPY --from=front /hifigan /hifigan