Mirror of https://github.com/w-okada/voice-changer.git

commit e4c5ce27e1 (parent 1ec80ca306)

    update
@@ -22,6 +22,7 @@ import numpy as np
 from mods.ssl import create_self_signed_cert
 from mods.VoiceChanger import VoiceChanger
+from mods.Whisper import Whisper


 class UvicornSuppressFilter(logging.Filter):
     def filter(self, record):
@@ -40,6 +41,7 @@ class VoiceModel(BaseModel):
     srcId: int
     dstId: int
     timestamp: int
+    prefixChunkSize: int
     buffer: str


@@ -52,8 +54,23 @@ class MyCustomNamespace(socketio.AsyncNamespace):
         self.voiceChanger.destroy()
         self.voiceChanger = VoiceChanger(config, model)

-    def changeVoice(self, gpu, srcId, dstId, timestamp, unpackedData):
-        return self.voiceChanger.on_request(gpu, srcId, dstId, timestamp, unpackedData)
+    def loadWhisperModel(self, model):
+        self.whisper = Whisper()
+        self.whisper.loadModel("tiny")
+        print("load")
+
+    def changeVoice(self, gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData):
+        if hasattr(self, 'whisper') == True:
+            self.whisper.addData(unpackedData)
+
+        return self.voiceChanger.on_request(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
+
+    def transcribe(self):
+        if hasattr(self, 'whisper') == True:
+            self.whisper.transcribe(0)
+        else:
+            print("whisper not found")

     def on_connect(self, sid, environ):
         # print('[{}] connet sid : {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S') , sid))
@@ -65,10 +82,11 @@ class MyCustomNamespace(socketio.AsyncNamespace):
         srcId = int(msg[1])
         dstId = int(msg[2])
         timestamp = int(msg[3])
-        data = msg[4]
+        prefixChunkSize = int(msg[4])
+        data = msg[5]
         # print(srcId, dstId, timestamp)
         unpackedData = np.array(struct.unpack('<%sh'%(len(data) // struct.calcsize('<h') ), data))
-        audio1 = self.changeVoice(gpu, srcId, dstId, timestamp, unpackedData)
+        audio1 = self.changeVoice(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)

        bin = struct.pack('<%sh'%len(audio1), *audio1)
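For context, the socket.io audio event now carries six fields instead of five: prefixChunkSize is inserted at index 4 and the raw PCM bytes move to index 5. A minimal client-side sketch of packing a matching payload (assuming msg[0] is gpu, which the surrounding handler implies but this hunk does not show, and leaving the actual event name aside):

    import struct

    import numpy as np

    def pack_request(gpu, srcId, dstId, timestamp, prefixChunkSize, pcm: np.ndarray):
        # PCM travels as little-endian int16 bytes, matching the
        # struct.unpack('<%sh' ...) call in the server-side handler.
        data = struct.pack('<%sh' % len(pcm), *pcm.astype(np.int16))
        # Field order mirrors the handler: msg[1]=srcId ... msg[4]=prefixChunkSize, msg[5]=data.
        return [gpu, srcId, dstId, timestamp, prefixChunkSize, data]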
@@ -136,6 +154,9 @@ if __name__ == thisFilename or args.colab == True:
     sio.register_namespace(namespace)
     if CONFIG and MODEL:
         namespace.loadModel(CONFIG, MODEL)
+    namespace.loadWhisperModel("base")
+
+
     app_socketio = socketio.ASGIApp(
         sio,
         other_asgi_app=app_fastapi,
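One quirk worth flagging: this call passes "base", but loadWhisperModel in the namespace above ignores its argument and hardcodes loadModel("tiny"). If the intent was to honor the caller's choice, a one-line change would do it (hypothetical, not part of this commit):

    def loadWhisperModel(self, model):
        self.whisper = Whisper()
        self.whisper.loadModel(model)  # forward the requested size instead of hardcoding "tiny"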
@@ -167,7 +188,6 @@ if __name__ == thisFilename or args.colab == True:
             return {"Error": "uploaded file is not found."}


-
     @app_fastapi.post("/upload_file")
     async def post_upload_file(
         file:UploadFile = File(...),
@@ -206,6 +226,16 @@ if __name__ == thisFilename or args.colab == True:
         return {"File saved to": f"{target_file_name}"}


+    @app_fastapi.get("/transcribe")
+    def get_transcribe():
+        try:
+            namespace.transcribe()
+        except Exception as e:
+            print("TRANSCRIBE PROCESSING!!!! EXCEPTION!!!", e)
+            print(traceback.format_exc())
+            return str(e)
+
+
     @app_fastapi.post("/test")
     async def post_test(voice:VoiceModel):
         try:
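The new GET /transcribe endpoint takes no parameters; it drains whatever audio the namespace has buffered through Whisper and prints the result server-side, returning a body only on failure. A minimal sketch of triggering it (host, port, and the self-signed-certificate workaround are all assumptions about how the demo is run):

    import requests

    # verify=False because the demo can run behind a self-signed certificate;
    # the port is an assumption, match whatever the server was started with.
    resp = requests.get("https://localhost:8080/transcribe", verify=False)
    print(resp.status_code, resp.text)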
@@ -214,6 +244,7 @@ if __name__ == thisFilename or args.colab == True:
             srcId = voice.srcId
             dstId = voice.dstId
             timestamp = voice.timestamp
+            prefixChunkSize = voice.prefixChunkSize
             buffer = voice.buffer
             wav = base64.b64decode(buffer)

@@ -224,7 +255,7 @@ if __name__ == thisFilename or args.colab == True:
             unpackedData = np.array(struct.unpack('<%sh'%(len(wav) // struct.calcsize('<h') ), wav))
             write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))

-            changedVoice = namespace.changeVoice(gpu, srcId, dstId, timestamp, unpackedData)
+            changedVoice = namespace.changeVoice(gpu, srcId, dstId, timestamp, prefixChunkSize, unpackedData)
             changedVoiceBase64 = base64.b64encode(changedVoice).decode('utf-8')

             data = {
@@ -232,6 +263,7 @@ if __name__ == thisFilename or args.colab == True:
                 "srcId":srcId,
                 "dstId":dstId,
                 "timestamp":timestamp,
+                "prefixChunkSize":prefixChunkSize,
                 "changedVoiceBase64":changedVoiceBase64
             }

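Putting the /test changes together: the endpoint now expects prefixChunkSize in the JSON body and echoes it back alongside the converted audio. A sketch of a round-trip client, assuming VoiceModel also has a gpu field (the hunk cuts off just above srcId) and with URL and sample values as assumptions:

    import base64

    import numpy as np
    import requests

    pcm = np.zeros(1024, dtype=np.int16)          # one silent chunk, for illustration
    payload = {
        "gpu": -1,                                # takes the CPU path in the server
        "srcId": 107,
        "dstId": 100,
        "timestamp": 0,
        "prefixChunkSize": 24,                    # matches "prefix_chunk_size" in setting.json
        "buffer": base64.b64encode(pcm.tobytes()).decode("utf-8"),
    }
    resp = requests.post("https://localhost:8080/test", json=payload, verify=False)
    converted = base64.b64decode(resp.json()["changedVoiceBase64"])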
@@ -320,7 +352,7 @@ if __name__ == '__main__':
             reload=True,
             ssl_keyfile = key_path,
             ssl_certfile = cert_path,
-            log_level="critical"
+            log_level="critical"
         )
     else:
         # Start the HTTP server
demo/mods/VoiceChanger.py

@@ -24,30 +24,30 @@ class VoiceChanger():
         self.net_g.eval()
         self.gpu_num = torch.cuda.device_count()
         utils.load_checkpoint( model, self.net_g, None)

+        text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
+        text_norm = commons.intersperse(text_norm, 0)
+        self.text_norm = torch.LongTensor(text_norm)
+        self.audio_buffer = torch.zeros(1, 0)
+        self.prev_audio = np.zeros(1)
+
         print(f"VoiceChanger Initialized (GPU_NUM:{self.gpu_num})")

     def destroy(self):
         del self.net_g

-    def on_request(self, gpu, srcId, dstId, timestamp, wav):
-        # if wav==0:
-        #     samplerate, data=read("dummy.wav")
-        #     unpackedData = data
-        # else:
-        #     unpackedData = np.array(struct.unpack('<%sh'%(len(wav) // struct.calcsize('<h') ), wav))
-        # write("logs/received_data.wav", 24000, unpackedData.astype(np.int16))
-
+    def on_request(self, gpu, srcId, dstId, timestamp, prefixChunkSize, wav):
         unpackedData = wav
+        convertSize = unpackedData.shape[0] + (prefixChunkSize * 512)

         try:
-            text_norm = text_to_sequence("a", self.hps.data.text_cleaners)
-            text_norm = commons.intersperse(text_norm, 0)
-            text_norm = torch.LongTensor(text_norm)
-
             audio = torch.FloatTensor(unpackedData.astype(np.float32))
             audio_norm = audio /self.hps.data.max_wav_value
             audio_norm = audio_norm.unsqueeze(0)
+            self.audio_buffer = torch.cat([self.audio_buffer, audio_norm], axis=1)
+            audio_norm = self.audio_buffer[:,-convertSize:]
+            self.audio_buffer = audio_norm

             spec = spectrogram_torch(audio_norm, self.hps.data.filter_length,
                 self.hps.data.sampling_rate, self.hps.data.hop_length, self.hps.data.win_length,
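The rolling input buffer is the core of this change: each request appends the fresh chunk, then keeps only the most recent convertSize samples, where convertSize = chunk_length + prefixChunkSize * 512 (512 samples per prefix chunk). A worked sketch of the arithmetic, with the chunk size as an assumption taken from "buffer_size" in setting.json:

    chunk_length = 1024          # samples per incoming request (assumed)
    prefixChunkSize = 24         # prefix chunks of 512 samples each

    convertSize = chunk_length + prefixChunkSize * 512   # 1024 + 12288 = 13312 samples
    # At 24 kHz that is roughly 0.55 s of left context fed into every conversion,
    # so the model sees continuous audio instead of isolated 1024-sample slices.
    print(convertSize, convertSize / 24000)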
@@ -55,7 +55,7 @@ class VoiceChanger():
             spec = torch.squeeze(spec, 0)
             sid = torch.LongTensor([int(srcId)])

-            data = (text_norm, spec, audio_norm, sid)
+            data = (self.text_norm , spec, audio_norm, sid)
             data = TextAudioSpeakerCollate()([data])

             if gpu<0 or self.gpu_num==0 :
@@ -68,6 +68,17 @@ class VoiceChanger():
                 x, x_lengths, spec, spec_lengths, y, y_lengths, sid_src = [x.cuda(gpu) for x in data]
                 sid_tgt1 = torch.LongTensor([dstId]).cuda(gpu)
                 audio1 = (self.net_g.cuda(gpu).voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt1)[0][0,0].data * self.hps.data.max_wav_value).cpu().float().numpy()
+
+            # if len(self.prev_audio) > unpackedData.shape[0]:
+            #     prevLastFragment = self.prev_audio[-unpackedData.shape[0]:]
+            #     curSecondLastFragment = audio1[-unpackedData.shape[0]*2:-unpackedData.shape[0]]
+            #     print("prev, cur", prevLastFragment.shape, curSecondLastFragment.shape)
+            # self.prev_audio = audio1
+            # print("self.prev_audio", self.prev_audio.shape)
+
+            audio1 = audio1[-unpackedData.shape[0]*2:]
+
+
         except Exception as e:
             print("VC PROCESSING!!!! EXCEPTION!!!", e)
             print(traceback.format_exc())
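Trimming the output to audio1[-unpackedData.shape[0]*2:] returns the converted current chunk plus exactly one chunk of preceding audio; the commented-out prev_audio block suggests the extra chunk is intended for overlap handling on the receiving side, which fits the cross_fade_* settings in setting.json below. With the same assumed numbers as above:

    chunk_length = 1024                 # assumed request size, as before
    returned = 2 * chunk_length         # 2048 samples leave the server
    # The client can cross-fade the first chunk_length samples against the
    # tail of the previous reply and play the remaining chunk_length samples.
    overlap, fresh = returned - chunk_length, chunk_length
    print(returned, overlap, fresh)     # 2048 1024 1024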
demo/mods/Whisper.py (new executable file, 36 lines)
@@ -0,0 +1,36 @@
+import whisper
+import numpy as np
+import torchaudio
+from scipy.io.wavfile import write
+
+_MODELS = {
+    "tiny": "/whisper/tiny.pt",
+    "base": "/whisper/base.pt",
+    "small": "/whisper/small.pt",
+    "medium": "/whisper/medium.pt",
+}
+
+
+class Whisper():
+    def __init__(self):
+        self.storedSizeFromTry = 0
+
+    def loadModel(self, model):
+        # self.model = whisper.load_model(_MODELS[model], device="cpu")
+        self.model = whisper.load_model(_MODELS[model])
+        self.data = np.zeros(1).astype(np.float)
+
+    def addData(self, unpackedData):
+        self.data = np.concatenate([self.data, unpackedData], 0)
+
+    def transcribe(self, audio):
+        received_data_file = "received_data.wav"
+        write(received_data_file, 24000, self.data.astype(np.int16))
+        source, sr = torchaudio.load(received_data_file)
+        target = torchaudio.functional.resample(source, 24000, 16000)
+        result = self.model.transcribe(received_data_file)
+        print("WHISPER1:::", result["text"])
+        print("WHISPER2:::", result["segments"])
+        self.data = np.zeros(1).astype(np.float)
+        return result["text"]
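Two things stand out in this new wrapper. The 16 kHz resample result (target) is computed but never used; model.transcribe is handed the 24 kHz file path, and Whisper resamples internally via ffmpeg, so the call still works. Also, np.float is deprecated NumPy spelling (removed in NumPy 1.24), so plain float is safer on newer environments. A minimal usage sketch, assuming the script runs from the demo directory and that checkpoints actually exist at the /whisper/*.pt paths hardcoded in _MODELS:

    import numpy as np

    from mods.Whisper import Whisper

    w = Whisper()
    w.loadModel("tiny")                          # resolves to /whisper/tiny.pt
    w.addData(np.zeros(24000, dtype=np.int16))   # one second of silent 24 kHz PCM
    text = w.transcribe(0)                       # the `audio` argument is unused by the method
    print(text)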
frontend/dist/assets/setting.json (vendored, 41 changed lines)
@@ -6,21 +6,44 @@
     "buffer_size": 1024,
+    "prefix_chunk_size": 24,
     "chunk_size": 24,
-    "speaker_ids": [100, 107, 101, 102, 103],
-    "speaker_names": ["ずんだもん", "user", "そら", "めたん", "つむぎ"],
+    "speakers": [
+        {
+            "id": 100,
+            "name": "ずんだもん"
+        },
+        {
+            "id": 107,
+            "name": "user"
+        },
+        {
+            "id": 101,
+            "name": "そら"
+        },
+        {
+            "id": 102,
+            "name": "めたん"
+        },
+        {
+            "id": 103,
+            "name": "つむぎ"
+        }
+    ],
     "src_id": 107,
     "dst_id": 100,
     "vf_enable": true,
     "voice_changer_mode": "realtime",
     "gpu": 0,
     "available_gpus": [-1, 0, 1, 2, 3, 4],
     "screen": {
         "enable_screen": true,
         "backgournd_image_url": "./assets/images/bg_natural_sougen.jpg"
     },
     "avatar": {
-        "enable_avatar": true,
-        "motion_capture_face": true,
-        "motion_capture_upperbody": true,
-        "lip_overwrite_with_voice": true,
+        "enable_avatar": false,
+        "motion_capture_face": false,
+        "motion_capture_upperbody": false,
+        "lip_overwrite_with_voice": false,
         "avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
         "backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
         "background_color": "#0000dd",
         "chroma_key": "#0000dd",
         "avatar_canvas_size": [1280, 720],
@@ -34,5 +57,9 @@
         "cross_fade_offset_rate": 0.3,
         "cross_fade_end_rate": 0.6,
         "cross_fade_type": 2
-    }
+    },
+    "transcribe": {
+        "lang": "日本語(ja-JP)",
+        "expire_time": 5
+    }
 }
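The speakers schema change replaces two parallel arrays (speaker_ids, speaker_names) with one list of objects, so an id and its name can no longer drift out of sync. A sketch of consuming both shapes in Python, with field names from the JSON above and the file path as an assumption:

    import json

    with open("frontend/dist/assets/setting.json") as f:
        setting = json.load(f)

    # New shape: one object per speaker.
    names_by_id = {s["id"]: s["name"] for s in setting["speakers"]}

    # The old shape would have required zipping two parallel lists:
    # names_by_id = dict(zip(setting["speaker_ids"], setting["speaker_names"]))
    print(names_by_id[setting["dst_id"]])  # -> "ずんだもん"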
frontend/dist/assets/setting_mmvc.json (vendored, 41 changed lines)
@@ -6,21 +6,44 @@
     "buffer_size": 1024,
+    "prefix_chunk_size": 24,
     "chunk_size": 24,
-    "speaker_ids": [100, 107, 101, 102, 103],
-    "speaker_names": ["ずんだもん", "user", "そら", "めたん", "つむぎ"],
+    "speakers": [
+        {
+            "id": 100,
+            "name": "ずんだもん"
+        },
+        {
+            "id": 107,
+            "name": "user"
+        },
+        {
+            "id": 101,
+            "name": "そら"
+        },
+        {
+            "id": 102,
+            "name": "めたん"
+        },
+        {
+            "id": 103,
+            "name": "つむぎ"
+        }
+    ],
     "src_id": 107,
     "dst_id": 100,
     "vf_enable": true,
     "voice_changer_mode": "realtime",
     "gpu": 0,
     "available_gpus": [-1, 0, 1, 2, 3, 4],
     "screen": {
         "enable_screen": true,
         "backgournd_image_url": "./assets/images/bg_natural_sougen.jpg"
     },
     "avatar": {
-        "enable_avatar": true,
-        "motion_capture_face": true,
-        "motion_capture_upperbody": true,
-        "lip_overwrite_with_voice": true,
+        "enable_avatar": false,
+        "motion_capture_face": false,
+        "motion_capture_upperbody": false,
+        "lip_overwrite_with_voice": false,
         "avatar_url": "./assets/vrm/zundamon/zundamon.vrm",
         "backgournd_image_url": "./assets/images/bg_natural_sougen.jpg",
         "background_color": "#0000dd",
         "chroma_key": "#0000dd",
         "avatar_canvas_size": [1280, 720],
@@ -34,5 +57,9 @@
         "cross_fade_offset_rate": 0.3,
         "cross_fade_end_rate": 0.6,
         "cross_fade_type": 2
-    }
+    },
+    "transcribe": {
+        "lang": "日本語(ja-JP)",
+        "expire_time": 5
+    }
 }
frontend/dist/index.js (vendored, 202 changed lines)
File diff suppressed because one or more lines are too long
@@ -1,4 +1,4 @@
-FROM dannadori/voice-changer-internal:20221030_232317 as front
+FROM dannadori/voice-changer-internal:20221103_035426 as front
 FROM debian:bullseye-slim as base

 ARG DEBIAN_FRONTEND=noninteractive