Spaces:
Sleeping
Sleeping
import gradio as gr | |
import requests | |
import uuid | |
import os | |
# Remote speech endpoints: ASR transcribes the uploaded audio, the TTS "speak"
# endpoint synthesizes speech, and the "wave" endpoint serves the result.
ASR_API = 'http://astarwiz.com:9998/asr'
TTS_SPEAK_SERVICE = "http://astarwiz.com:9603/speak"
TTS_WAVE_SERVICE = "http://astarwiz.com:9603/wave"

# Language codes offered in the UI, mapped to the language names that are
# written into the translation prompt.
LANGUAGE_MAP = dict(
    en="English",
    ma="Malay",
    ta="Tamil",
    zh="Chinese",
)

# Password that unlocks Developer Mode; None when DEV_PWD is not set.
DEVELOPER_PASSWORD = os.environ.get("DEV_PWD")
def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64, timeout=60):
    """Send one user prompt to the vLLM completions endpoint and return its reply.

    Args:
        input_text: Text placed in the user turn of the prompt.
        min_new_tokens: Sent to the server as ``min_tokens``.
        max_new_tokens: Sent to the server as ``max_tokens``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The stripped text of the first returned choice, or a fixed error
        message when the request fails or the response has no usable choice.
    """
    error_message = "The system got some error during vLLM generation. Please try it again."

    print(input_text)
    # System + user turns wrapped in the <|im_start|>/<|im_end|> markers the model expects.
    one_vllm_input = f"<|im_start|>system\nYou are a translation expert.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant"
    vllm_api = 'http://astarwiz.com:2333/' + "v1/completions"
    data = {
        "prompt": one_vllm_input,
        'model': "./Edu-4B-NewTok-V2-20240904/",
        'min_tokens': min_new_tokens,
        'max_tokens': max_new_tokens,
        'temperature': 0.1,
        'top_p': 0.75,
        'repetition_penalty': 1.1,
        # NOTE(review): presumably the token id of <|im_end|> -- confirm against the model tokenizer.
        "stop_token_ids": [151645, ],
    }

    try:
        # Without a timeout a stalled server would block this UI callback forever.
        response = requests.post(
            vllm_api,
            headers={"Content-Type": "application/json"},
            json=data,
            timeout=timeout,
        ).json()
    except (requests.RequestException, ValueError) as exc:
        # Connection problems, timeouts, and non-JSON bodies all end up here.
        print(f"vLLM request failed: {exc}")
        return error_message

    print(response)
    try:
        return response["choices"][0]['text'].strip()
    except (KeyError, IndexError, TypeError, AttributeError):
        # Missing or empty "choices", or an entry without a text field.
        return error_message
# TTS voice per target language; any other language code falls back to the default voice.
_TTS_SPEAKERS = {
    'en': 'MS',
    'ma': 'msFemale',
    'ta': 'ta_female1',
}
_DEFAULT_TTS_SPEAKER = 'childChinese2'

# Seconds to wait on the ASR and TTS services before treating the call as failed.
_SPEECH_REQUEST_TIMEOUT = 60


def transcribe_and_speak(audio, source_lang, target_lang):
    """Transcribe an audio file, translate the text, and synthesize the translation.

    Args:
        audio: Path of the audio file to transcribe; a falsy value means no input.
        source_lang: Language code of the recording (a key of ``LANGUAGE_MAP``).
        target_lang: Language code to translate into (a key of ``LANGUAGE_MAP``).

    Returns:
        A ``(transcription, translation, audio_url)`` tuple. When no audio is
        given or ASR fails, the first element is a status message and the other
        two are ``None``. When only TTS fails, both texts are returned and
        ``audio_url`` is ``None``.
    """
    if not audio:
        return "Please provide an audio input.", None, None

    # ASR
    data = {
        # The ASR service is sent 'ms' where the UI uses the code 'ma'.
        'language': 'ms' if source_lang == 'ma' else source_lang,
        'model_name': 'whisper-large-v2-local-cs',
    }
    try:
        # The context manager closes the upload handle even if the request raises.
        with open(audio, 'rb') as audio_stream:
            asr_response = requests.post(
                ASR_API,
                files={'file': audio_stream},
                data=data,
                timeout=_SPEECH_REQUEST_TIMEOUT,
            )
    except (OSError, requests.RequestException) as exc:
        print(f"ASR request failed: {exc}")
        return "ASR failed", None, None

    # Check the status before parsing: an error response may not contain JSON.
    if asr_response.status_code != 200:
        print(f"ASR returned HTTP {asr_response.status_code}")
        return "ASR failed", None, None
    try:
        asr_payload = asr_response.json()
        transcription = asr_payload['text']
    except (ValueError, KeyError, TypeError) as exc:
        print(f"ASR response could not be read: {exc!r}")
        return "ASR failed", None, None
    print(asr_payload)

    translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {transcription}"
    translated_text = inference_via_llm_api(translation_prompt)
    print(f"Translation: {translated_text}")

    # TTS
    tts_params = {
        'language': target_lang,
        'speed': 1.1,
        'speaker': _TTS_SPEAKERS.get(target_lang, _DEFAULT_TTS_SPEAKER),
        'text': translated_text,
    }
    try:
        tts_response = requests.get(
            TTS_SPEAK_SERVICE,
            params=tts_params,
            timeout=_SPEECH_REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        print(f"TTS request failed: {exc}")
        return transcription, translated_text, None
    if tts_response.status_code != 200:
        # The third value feeds an audio output, so signal failure with None
        # (as the other failure paths do) rather than a message string.
        print(f"TTS returned HTTP {tts_response.status_code}")
        return transcription, translated_text, None

    # The speak endpoint answers with a file name that the wave endpoint serves.
    audio_file = tts_response.text.strip()
    audio_url = f"{TTS_WAVE_SERVICE}?file={audio_file}"
    return transcription, translated_text, audio_url
def check_password(password):
    """Return True when *password* equals the configured developer password.

    Non-string input is always rejected, and so is every password when no
    developer password is configured. The comparison is constant-time so the
    response time does not reveal how much of a guess was correct.
    """
    import hmac  # local import keeps this function self-contained

    if not isinstance(password, str):
        return False
    if DEVELOPER_PASSWORD is None:
        return False
    # compare_digest only accepts ASCII str, so compare the UTF-8 bytes instead.
    return hmac.compare_digest(
        password.encode("utf-8"),
        DEVELOPER_PASSWORD.encode("utf-8"),
    )
def user_interface(audio, source_lang, target_lang):
    """User Mode handler: run the full pipeline and return only the audio result."""
    pipeline_result = transcribe_and_speak(audio, source_lang, target_lang)
    # The transcription and translation texts are not shown in User Mode.
    _transcription, _translation, speech = pipeline_result
    return speech
# Build the two-tab UI. Components are rendered in the order they are created
# inside each layout context.
with gr.Blocks() as demo:
    gr.Markdown("# Speech Translation")

    # User Mode: audio in, translated speech out; intermediate texts are not shown.
    with gr.Tab("User Mode"):
        gr.Markdown("Speak into the microphone or upload an audio file. The app will translate and speak it back to you.")
        with gr.Row():
            # type="filepath" hands the handler a path to the recorded/uploaded file.
            user_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
            user_source_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Source Language", value="en")
            user_target_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Target Language", value="zh")
        with gr.Row():
            user_button = gr.Button("Translate and Speak")
        with gr.Row():
            user_audio_output = gr.Audio(label="Translated Speech")
        user_button.click(
            fn=user_interface,
            inputs=[user_audio_input, user_source_lang, user_target_lang],
            outputs=[user_audio_output]
        )

    # Developer Mode: same pipeline, but the transcription and translation are
    # shown too. The controls stay hidden until the password check succeeds.
    with gr.Tab("Developer Mode"):
        password_input = gr.Textbox(type="password", label="Enter Developer Password")
        login_button = gr.Button("Login")
        login_error = gr.Markdown(visible=False)
        # Created hidden; login() below returns an update that makes it visible.
        dev_interface = gr.Column(visible=False)
        with dev_interface:
            gr.Markdown("Developer Mode: Transcription, Translation, and TTS")
            with gr.Row():
                dev_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
                dev_source_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Source Language", value="en")
                dev_target_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Target Language", value="zh")
            with gr.Row():
                dev_button = gr.Button("Transcribe, Translate, and Speak")
            with gr.Row():
                dev_text_output = gr.Textbox(label="Transcription")
            with gr.Row():
                dev_translation_output = gr.Textbox(label="Translation")
            with gr.Row():
                dev_audio_output = gr.Audio(label="Translated Speech")
            # The three return values of transcribe_and_speak map onto the three outputs in order.
            dev_button.click(
                fn=transcribe_and_speak,
                inputs=[dev_audio_input, dev_source_lang, dev_target_lang],
                outputs=[dev_text_output, dev_translation_output, dev_audio_output]
            )

        def login(password):
            """Return updates for (dev_interface, login_error) based on the password check."""
            if check_password(password):
                # Correct password: reveal the developer controls and hide the error.
                return gr.Column(visible=True), gr.Markdown(visible=False)
            else:
                # Wrong password: keep the controls hidden and show the error text.
                return gr.Column(visible=False), gr.Markdown("Incorrect password. Please try again.", visible=True)

        login_button.click(
            fn=login,
            inputs=[password_input],
            outputs=[dev_interface, login_error]
        )

# The whole app sits behind a login taken from the DEV_USER / DEV_PWD environment variables.
demo.launch(auth=(os.getenv("DEV_USER"), os.getenv("DEV_PWD")))