This commit is contained in:
wataru 2022-11-03 04:05:42 +09:00
parent 1ec80ca306
commit e4c5ce27e1
7 changed files with 329 additions and 76 deletions

View File

@ -22,6 +22,7 @@ import numpy as np
from mods.ssl import create_self_signed_cert
from mods.VoiceChanger import VoiceChanger
from mods.Whisper import Whisper
class UvicornSuppressFilter(logging.Filter):
def filter(self, record):
@ -40,6 +41,7 @@ class VoiceModel(BaseModel):
srcId: int
dstId: int
timestamp: int
prefixChunkSize: int
buffer: str
@ -52,8 +54,23 @@ class MyCustomNamespace(socketio.AsyncNamespace):
self.voiceChanger.destroy()
self.voiceChanger = VoiceChanger(config, model)
def changeVoice(self, gpu, srcId, dstId, timestamp, unpackedData):
return self.voiceChanger.on_request(gpu, srcId, dstId, timestamp, unpackedData)
def loadWhisperModel(self, model):
self.whisper = Whisper()
self.whisper.loadModel("tiny")  # note: the model argument is ignored; "tiny" is hardcoded
print("load")
def changeVoice(self, gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData):
if hasattr(self, 'whisper'):
self.whisper.addData(unpackedData)
return self.voiceChanger.on_request(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
def transcribe(self):
if hasattr(self, 'whisper'):
self.whisper.transcribe(0)
else:
print("whisper not found")
def on_connect(self, sid, environ):
# print('[{}] connect sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S') , sid))
@ -65,10 +82,11 @@ class MyCustomNamespace(socketio.AsyncNamespace):
srcId = int(msg[1])
dstId = int(msg[2])
timestamp = int(msg[3])
data = msg[4]
prefixChunkSize = int(msg[4])
data = msg[5]
# print(srcId, dstId, timestamp)
unpackedData = np.array(struct.unpack('<%sh'%(len(data) // struct.calcsize('<h') ), data))
audio1 = self.changeVoice(gpu, srcId, dstId, timestamp, unpackedData)
audio1 = self.changeVoice(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
bin = struct.pack('<%sh'%len(audio1), *audio1)
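
The binary request message grows from five to six elements: prefixChunkSize is inserted at index 4 and the raw audio bytes move to index 5. Below is a minimal client-side sketch of the new layout, assuming a python-socketio client; the event name "request_message" and the port are hypothetical, as neither appears in this diff.

import struct
import numpy as np
import socketio

chunk = np.zeros(1024, dtype=np.int16)            # one block of 16-bit PCM
data = struct.pack('<%sh' % len(chunk), *chunk)   # little-endian int16 bytes

msg = [0,       # gpu
       107,     # srcId
       100,     # dstId
       0,       # timestamp
       24,      # prefixChunkSize -- the new field at msg[4]
       data]    # raw audio bytes, shifted from msg[4] to msg[5]

sio = socketio.Client(ssl_verify=False)           # self-signed cert, see mods/ssl
sio.connect("https://localhost:18888")            # port is an assumption
sio.emit("request_message", msg)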
@ -136,6 +154,9 @@ if __name__ == thisFilename or args.colab == True:
sio.register_namespace(namespace)
if CONFIG and MODEL:
namespace.loadModel(CONFIG, MODEL)
namespace.loadWhisperModel("base")
app_socketio = socketio.ASGIApp(
sio,
other_asgi_app=app_fastapi,
@ -167,7 +188,6 @@ if __name__ == thisFilename or args.colab == True:
return {"Error": "uploaded file is not found."}
@app_fastapi.post("/upload_file")
async def post_upload_file(
file:UploadFile = File(...),
@ -206,6 +226,16 @@ if __name__ == thisFilename or args.colab == True:
return {"File saved to": f"{target_file_name}"}
@app_fastapi.get("/transcribe")
def get_transcribe():
try:
namespace.transcribe()
except Exception as e:
print("TRANSCRIBE PROCESSING!!!! EXCEPTION!!!", e)
print(traceback.format_exc())
return str(e)
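
A hypothetical smoke test for the new endpoint; the host, port, and the self-signed-certificate workaround (verify=False) are assumptions.

import requests

res = requests.get("https://localhost:18888/transcribe", verify=False)
print(res.status_code, res.text)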
@app_fastapi.post("/test")
async def post_test(voice:VoiceModel):
try:
@ -214,6 +244,7 @@ if __name__ == thisFilename or args.colab == True:
srcId = voice.srcId
dstId = voice.dstId
timestamp = voice.timestamp
prefixChunkSize = voice.prefixChunkSize
buffer = voice.buffer
wav = base64.b64decode(buffer)
@ -224,7 +255,7 @@ if __name__ == thisFilename or args.colab == True:
unpackedData = np.array(struct.unpack('<%sh'%(len(wav) // struct.calcsize('<h') ), wav))
write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))
changedVoice = namespace.changeVoice(gpu, srcId, dstId, timestamp, unpackedData)
changedVoice = namespace.changeVoice(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
changedVoiceBase64 = base64.b64encode(changedVoice).decode('utf-8')
data = {
@ -232,6 +263,7 @@ if __name__ == thisFilename or args.colab == True:
"srcId":srcId,
"dstId":dstId,
"timestamp":timestamp,
"prefixChunkSize":prefixChunkSize,
"changedVoiceBase64":changedVoiceBase64
}
@ -320,7 +352,7 @@ if __name__ == '__main__':
reload=True,
ssl_keyfile = key_path,
ssl_certfile = cert_path,
log_level="critical"
log_level="critical"
)
else:
# Start HTTP server

View File

@ -24,30 +24,30 @@ class VoiceChanger():
self.net_g.eval()
self.gpu_num = torch.cuda.device_count()
utils.load_checkpoint( model, self.net_g, None)
text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
self.text_norm = torch.LongTensor(text_norm)
self.audio_buffer = torch.zeros(1, 0)
self.prev_audio = np.zeros(1)
print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num})")
def destroy(self):
del self.net_g
def on_request(self, gpu, srcId, dstId, timestamp, wav):
# if wav==0:
# samplerate, data=read("dummy.wav")
# unpackedData = data
# else:
# unpackedData = np.array(struct.unpack('<%sh'%(len(wav) // struct.calcsize('<h') ), wav))
# write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))
def on_request(self, gpu, srcId, dstId, timestamp, prefixChunkSize, wav):
unpackedData = wav
convertSize = unpackedData.shape[0] + (prefixChunkSize * 512)
try:
text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
text_norm = commons.intersperse(text_norm, 0)
text_norm = torch.LongTensor(text_norm)
audio = torch.FloatTensor(unpackedData.astype(np.float32))
audio_norm = audio / self.hps.data.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1)
audio_norm = self.audio_buffer[:,-convertSize:]
self.audio_buffer = audio_norm
spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
@ -55,7 +55,7 @@ class VoiceChanger():
spec = torch.squeeze(spec, 0)
sid = torch.LongTensor([int(srcId)])
data = (text_norm, spec, audio_norm, sid)
data = (self.text_norm, spec, audio_norm, sid)
data = TextAudioSpeakerCollate()([data])
if gpu<0 or self.gpu_num==0 :
@ -68,6 +68,17 @@ class VoiceChanger():
x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
# if len(self.prev_audio) > unpackedData.shape[0]:
# prevLastFragment = self.prev_audio[-unpackedData.shape[0]:]
# curSecondLastFragment = audio1[-unpackedData.shape[0]*2:-unpackedData.shape[0]]
# print("prev, cur", prevLastFragment.shape, curSecondLastFragment.shape)
# self.prev_audio = audio1
# print("self.prev_audio", self.prev_audio.shape)
audio1 = audio1[-unpackedData.shape[0]*2:]
except Exception as e:
print("VC PROCESSING!!!! EXCEPTION!!!", e)
print(traceback.format_exc())
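
The conversion window now keeps history across requests: each incoming chunk is appended to audio_buffer, the buffer is trimmed to the last convertSize = len(chunk) + prefixChunkSize * 512 samples (512 presumably matching hps.data.hop_length, though that link is an assumption), and only the trailing two chunk-lengths of converted audio are returned so the caller can cross-fade. A numpy-only sketch of that bookkeeping, with an identity function standing in for voice_conversion:

import numpy as np

class BufferSketch:
    """Mimics the audio_buffer handling in VoiceChanger.on_request."""

    def __init__(self):
        self.audio_buffer = np.zeros(0, dtype=np.float32)

    def on_request(self, prefix_chunk_size, chunk):
        # context window: the incoming chunk plus prefixChunkSize blocks of 512 samples
        convert_size = chunk.shape[0] + prefix_chunk_size * 512
        self.audio_buffer = np.concatenate([self.audio_buffer, chunk])[-convert_size:]
        converted = self.audio_buffer.copy()    # stand-in for net_g.voice_conversion
        # return the trailing two chunk-lengths so the caller can cross-fade
        return converted[-chunk.shape[0] * 2:]

vc = BufferSketch()
for _ in range(14):
    out = vc.on_request(24, np.ones(1024, dtype=np.float32))
print(out.shape)   # (2048,) once the buffer holds at least two chunks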

demo/mods/Whisper.py (new executable file, 36 lines)
View File

@ -0,0 +1,36 @@
import whisper
import numpy as np
import torchaudio
from scipy.io.wavfile import write
_MODELS = {
"tiny": "/whisper/tiny.pt",
"base": "/whisper/base.pt",
"small": "/whisper/small.pt",
"medium": "/whisper/medium.pt",
}
class Whisper():
def __init__(self):
self.storedSizeFromTry = 0
def loadModel(self, model):
# self.model = whisper.load_model(_MODELS[model], device="cpu")
self.model = whisper.load_model(_MODELS[model])
self.data = np.zeros(1).astype(float)
def addData(self, unpackedData):
self.data = np.concatenate([self.data, unpackedData], 0)
def transcribe(self, audio):  # the audio argument is currently unused (callers pass 0)
received_data_file = "received_data.wav"
write(received_data_file, 24000, self.data.astype(np.int16))
source, sr = torchaudio.load(received_data_file)
target = torchaudio.functional.resample(source, 24000, 16000)  # note: this resampled tensor is never used below
result = self.model.transcribe(received_data_file)
print("WHISPER1:::", result["text"])
print("WHISPER2:::", result["segments"])
self.data = np.zeros(1).astype(float)
return result["text"]

View File

@ -6,21 +6,44 @@
"buffer_size": 1024,
"prefix_chunk_size": 24,
"chunk_size": 24,
"speaker_ids": [100, 107, 101, 102, 103],
"speaker_names": ["ずんだもん", "user", "そら", "めたん", "つむぎ"],
"speakers": [
{
"id": 100,
"name": "ずんだもん"
},
{
"id": 107,
"name": "user"
},
{
"id": 101,
"name": "そら"
},
{
"id": 102,
"name": "めたん"
},
{
"id": 103,
"name": "つむぎ"
}
],
"src_id": 107,
"dst_id": 100,
"vf_enable": true,
"voice_changer_mode": "realtime",
"gpu": 0,
"available_gpus": [-1, 0, 1, 2, 3, 4],
"screen": {
"enable_screen": true,
"backgournd_image_url": "./assets/images/bg_natural_sougen.jpg"
},
"avatar": {
"enable_avatar": true,
"motion_capture_face": true,
"motion_capture_upperbody": true,
"lip_overwrite_with_voice": true,
"enable_avatar": false,
"motion_capture_face": false,
"motion_capture_upperbody": false,
"lip_overwrite_with_voice": false,
"avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
"backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
"background_color": "#0000dd",
"chroma_key": "#0000dd",
"avatar_canvas_size": [1280, 720],
@ -34,5 +57,9 @@
"cross_fade_offset_rate": 0.3,
"cross_fade_end_rate": 0.6,
"cross_fade_type": 2
},
"transcribe": {
"lang": "日本語(ja-JP)",
"expire_time": 5
}
}
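
The parallel speaker_ids / speaker_names arrays are merged into a single speakers list of objects, so consumers iterate one structure instead of zipping two. A minimal sketch of reading the restructured config; the setting.json filename is an assumption.

import json

with open("setting.json", encoding="utf-8") as f:
    conf = json.load(f)

for spk in conf["speakers"]:
    print(spk["id"], spk["name"])   # e.g. 100 ずんだもん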

View File

@ -6,21 +6,44 @@
"buffer_size": 1024,
"prefix_chunk_size": 24,
"chunk_size": 24,
"speaker_ids": [100, 107, 101, 102, 103],
"speaker_names": ["ずんだもん", "user", "そら", "めたん", "つむぎ"],
"speakers": [
{
"id": 100,
"name": "ずんだもん"
},
{
"id": 107,
"name": "user"
},
{
"id": 101,
"name": "そら"
},
{
"id": 102,
"name": "めたん"
},
{
"id": 103,
"name": "つむぎ"
}
],
"src_id": 107,
"dst_id": 100,
"vf_enable": true,
"voice_changer_mode": "realtime",
"gpu": 0,
"available_gpus": [-1, 0, 1, 2, 3, 4],
"screen": {
"enable_screen": true,
"backgournd_image_url": "./assets/images/bg_natural_sougen.jpg"
},
"avatar": {
"enable_avatar": true,
"motion_capture_face": true,
"motion_capture_upperbody": true,
"lip_overwrite_with_voice": true,
"enable_avatar": false,
"motion_capture_face": false,
"motion_capture_upperbody": false,
"lip_overwrite_with_voice": false,
"avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
"backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
"background_color": "#0000dd",
"chroma_key": "#0000dd",
"avatar_canvas_size": [1280, 720],
@ -34,5 +57,9 @@
"cross_fade_offset_rate": 0.3,
"cross_fade_end_rate": 0.6,
"cross_fade_type": 2
},
"transcribe": {
"lang": "日本語(ja-JP)",
"expire_time": 5
}
}

frontend/dist/index.js (vendored, 202 lines changed)

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,4 @@
FROM dannadori/voice-changer-internal:20221030_232317 as front
FROM dannadori/voice-changer-internal:20221103_035426 as front
FROM debian:bullseye-slim as base
ARG DEBIAN_FRONTEND=noninteractive