Spaces:
Runtime error
Runtime error
File size: 3,841 Bytes
254ac8d cac3bb3 254ac8d ed0b2d9 85e039a cac3bb3 254ac8d 2f6b046 254ac8d cac3bb3 d0a19ad cac3bb3 d0a19ad 85e039a cac3bb3 85e039a cac3bb3 d0a19ad cac3bb3 d0a19ad cac3bb3 85e039a d0a19ad cac3bb3 d0a19ad 85e039a 254ac8d cac3bb3 254ac8d cac3bb3 254ac8d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
import torch
import gradio as gr
from transformers import pipeline
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from datetime import datetime
import time
import psutil
from mtranslate import translate
# ASR checkpoint: Whisper medium fine-tuned for Indonesian.
MODEL_NAME = "cahya/whisper-medium-id" #this always needs to stay in line 8 :D sorry for the hackiness
lang = "id"  # source language the ASR pipeline transcribes
title = "Indonesian Whisperer"
description = "Cross Language Speech to Speech using OpenAI Whisper and Coqui TTS"
info = "more info at [indonesian Whisperer](https://github.com/cahya-wirawan/indonesian-whisperer)"
badge = "https://img.shields.io/badge/Powered%20by-Indonesian%20Whisperer-red"
# transformers pipelines accept a CUDA device index (int) or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"
# Heavyweight module-level side effect: downloads/loads the Whisper model once at startup.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,  # split long audio into 30 s chunks for chunked inference
    device=device,
)
# Force the decoder to transcribe (not translate) in Indonesian regardless of detected language.
pipe.model.config.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
def transcribe(microphone, file_upload):
    """Transcribe an audio file with the module-level Whisper pipeline.

    The microphone recording wins when both inputs are given; in that case a
    warning is prepended to the returned transcription. Returns an error
    string when neither input is provided.
    """
    if microphone is None and file_upload is None:
        return "ERROR: You have to either use the microphone or upload an audio file"

    prefix = ""
    if microphone is not None and file_upload is not None:
        prefix = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    audio_path = file_upload if microphone is None else microphone
    return prefix + pipe(audio_path)["text"]
# Target languages supported by the Coqui TTS plugin (used to populate the radio choices).
LANGUAGES = list(CoquiTTS.langs.keys())
default_lang = "en"  # pre-selected target language in the UI
# Module-level TTS engine instance, shared across requests.
coquiTTS = CoquiTTS()
def tts(language: str, audio_microphone: str, audio_file: str):
    """Transcribe Indonesian speech, translate it, and synthesize target-language audio.

    Args:
        language: target language code selected in the UI (a key of CoquiTTS.langs).
        audio_microphone: filepath of the microphone recording, or None.
        audio_file: filepath of the uploaded audio, or None.

    Returns:
        (transcription, translation, wav_path). On input error the translation
        is empty and no audio file is produced (wav_path is None).
    """
    print(f"### {datetime.now()} TTS", language, audio_file)
    transcription = transcribe(audio_microphone, audio_file)
    print(f"### {datetime.now()} transcribed:", transcription)
    # Bug fix: previously the "ERROR: ..." string from transcribe() was itself
    # translated and spoken. Surface the error directly instead.
    if transcription.startswith("ERROR:"):
        return transcription, "", None
    # Translate from Indonesian ("id") into the requested target language.
    translation = translate(transcription, language, "id")
    # delete=False: Gradio reads the wav after this handler returns, so the
    # temp file must outlive the context manager (leaks files by design here).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        coquiTTS.get_tts(translation, fp, speaker={"language": language})
        print(f"### {datetime.now()} fp.name:", fp.name)
    return transcription, translation, fp.name
# Build and launch the Gradio UI: inputs (mic/upload + target language) on the
# left, transcription/translation/audio outputs on the right.
with gr.Blocks() as blocks:
    gr.Markdown("<h1 style='text-align: center; margin-bottom: 1rem'>"
                + title
                + "</h1>")
    gr.Markdown(description)
    with gr.Row():# equal_height=False
        with gr.Column():# variant="panel"
            audio_microphone = gr.Audio(label="Microphone", source="microphone", type="filepath", optional=True)
            audio_upload = gr.Audio(label="Upload", source="upload", type="filepath", optional=True)
            # NOTE(review): leftover debug print of the component object at build
            # time — consider removing.
            print("upload:", audio_upload)
            radio = gr.Radio(
                label="Target Language",
                choices=LANGUAGES,
                value=default_lang
            )
            with gr.Row(): # mobile_collapse=False
                submit = gr.Button("Submit", variant="primary")
        with gr.Column():
            text_source = gr.Textbox(label="Source Language")
            text_target = gr.Textbox(label="Target Language")
            audio = gr.Audio(label="Target Audio", interactive=False)
    # Snapshot of host memory at build time (not live-updated).
    memory = psutil.virtual_memory()
    gr.Markdown(info)
    # NOTE(review): this rebinds the module-level `info`; harmless only because
    # gr.Markdown(info) above already consumed the original value.
    system_status = info = f"""
*Memory: {memory.total/(1024*1024*1024):.2f}GB, used: {memory.percent}%, available: {memory.available/(1024*1024*1024):.2f}GB*
"""
    gr.Markdown(system_status)
    gr.Markdown("<center>"
                +f'<img src={badge} alt="visitors badge"/>'
                +"</center>")
    # actions
    # Wire the submit button: tts(language, mic_path, upload_path) ->
    # (source text, translated text, synthesized audio).
    submit.click(
        tts,
        [radio, audio_microphone, audio_upload],
        [text_source, text_target, audio],
    )
    # Show the selected language's sample sentence when the radio changes.
    radio.change(lambda lang: CoquiTTS.langs[lang]["sentence"], radio)
    blocks.launch()
|