import os
import uuid

import gradio as gr
import requests

ASR_API = "http://astarwiz.com:9998/asr"
TTS_SPEAK_SERVICE = 'http://astarwiz.com:9603/speak'
TTS_WAVE_SERVICE = 'http://astarwiz.com:9603/wave'

# Maps the language codes used by the UI dropdowns to the language names
# inserted into the translation prompt.
LANGUAGE_MAP = {
    "en": "English",
    "ma": "Malay",
    "ta": "Tamil",
    "zh": "Chinese",
}

# Password for developer mode, read from the DEV_PWD environment variable
# (None when the variable is not set).
DEVELOPER_PASSWORD = os.getenv("DEV_PWD")

# Upper bound (seconds) on a single LLM request so a stalled server cannot
# block the calling UI handler indefinitely.
_LLM_TIMEOUT_SECONDS = 120

# Message returned to the caller whenever no completion could be obtained.
_LLM_ERROR_MESSAGE = (
    "The system got some error during vLLM generation. Please try it again."
)


def inference_via_llm_api(input_text, min_new_tokens=2, max_new_tokens=64):
    """Send *input_text* to the vLLM completions endpoint and return its reply.

    The text is wrapped in a chat-style prompt with a fixed "translation
    expert" system message before being posted.

    Args:
        input_text: The user message to embed in the prompt.
        min_new_tokens: Value sent as ``min_tokens`` in the request.
        max_new_tokens: Value sent as ``max_tokens`` in the request.

    Returns:
        The stripped text of the first completion choice, or a fixed error
        message when the request fails, the reply is not JSON, or the reply
        carries no completion choices.
    """
    print(input_text)

    one_vllm_input = (
        "<|im_start|>system\nYou are a translation expert.<|im_end|>\n"
        f"<|im_start|>user\n{input_text}<|im_end|>\n"
        "<|im_start|>assistant"
    )
    vllm_api = 'http://astarwiz.com:2333/' + "v1/completions"
    data = {
        "prompt": one_vllm_input,
        'model': "./Edu-4B-NewTok-V2-20240904/",
        'min_tokens': min_new_tokens,
        'max_tokens': max_new_tokens,
        'temperature': 0.1,
        'top_p': 0.75,
        'repetition_penalty': 1.1,
        # NOTE(review): presumably the id of the <|im_end|> token - confirm
        # against the model's tokenizer.
        "stop_token_ids": [151645],
    }

    try:
        response = requests.post(
            vllm_api,
            headers={"Content-Type": "application/json"},
            json=data,
            timeout=_LLM_TIMEOUT_SECONDS,
        ).json()
    except (requests.RequestException, ValueError) as err:
        # Network failure, timeout, or a body that is not valid JSON.
        print(f"vLLM request failed: {err}")
        return _LLM_ERROR_MESSAGE

    print(response)

    # An error payload has no "choices"; also guard against an empty list so
    # indexing below cannot raise.
    choices = response.get("choices") if isinstance(response, dict) else None
    if not choices:
        return _LLM_ERROR_MESSAGE
    return choices[0]['text'].strip()
# Upper bound (seconds) on a single ASR or TTS request so a stalled service
# cannot block the UI handler indefinitely.
_SPEECH_TIMEOUT_SECONDS = 120

# TTS speaker name per target language; any other language code falls back
# to the default speaker.
_TTS_SPEAKERS = {
    'en': 'MS',
    'ma': 'msFemale',
    'ta': 'ta_female1',
}
_DEFAULT_TTS_SPEAKER = 'childChinese2'


def _transcribe(audio_path, source_lang):
    """Post the audio file to the ASR service; return its text or None on failure."""
    data = {
        # The ASR service is sent 'ms' where the UI uses 'ma'.
        'language': 'ms' if source_lang == 'ma' else source_lang,
        'model_name': 'whisper-large-v2-local-cs',
    }
    try:
        # The context manager closes the upload handle even if the request fails.
        with open(audio_path, 'rb') as audio_file:
            asr_response = requests.post(
                ASR_API,
                files={'file': audio_file},
                data=data,
                timeout=_SPEECH_TIMEOUT_SECONDS,
            )
    except (OSError, requests.RequestException) as err:
        print(f"ASR request failed: {err}")
        return None

    # Check the status before parsing: an error page is not guaranteed to be JSON.
    if asr_response.status_code != 200:
        print(f"ASR returned HTTP {asr_response.status_code}")
        return None

    try:
        payload = asr_response.json()
    except ValueError as err:
        print(f"ASR returned a non-JSON body: {err}")
        return None

    print(payload)
    return payload.get('text') if isinstance(payload, dict) else None


def _synthesize(text, target_lang):
    """Ask the TTS service to speak *text*; return the wave URL or None on failure."""
    tts_params = {
        'language': target_lang,
        'speed': 1.1,
        'speaker': _TTS_SPEAKERS.get(target_lang, _DEFAULT_TTS_SPEAKER),
        'text': text,
    }
    try:
        tts_response = requests.get(
            TTS_SPEAK_SERVICE,
            params=tts_params,
            timeout=_SPEECH_TIMEOUT_SECONDS,
        )
    except requests.RequestException as err:
        print(f"TTS request failed: {err}")
        return None

    if tts_response.status_code != 200:
        return None

    # The response body is used as the `file` query value of the wave service.
    audio_file = tts_response.text.strip()
    if not audio_file:
        return None
    return f"{TTS_WAVE_SERVICE}?file={audio_file}"


def transcribe_and_speak(audio, source_lang, target_lang):
    """Transcribe an audio file, translate the text, and synthesize speech.

    Args:
        audio: Path of the recorded or uploaded audio file (falsy when the
            user supplied nothing).
        source_lang: Language code of the input speech (a LANGUAGE_MAP key).
        target_lang: Language code to translate into (a LANGUAGE_MAP key).

    Returns:
        A ``(transcription, translation, audio_url)`` tuple. When there is no
        input or ASR fails, the first element is a status message and the
        other two are None. When TTS fails, ``audio_url`` is None and a
        warning toast is raised, because the value feeds an audio component
        that would otherwise treat a message string as a file path.
    """
    if not audio:
        return "Please provide an audio input.", None, None

    # ASR
    transcription = _transcribe(audio, source_lang)
    if transcription is None:
        return "ASR failed", None, None

    translation_prompt = f"Translate the following text from {LANGUAGE_MAP[source_lang]} to {LANGUAGE_MAP[target_lang]}: {transcription}"
    translated_text = inference_via_llm_api(translation_prompt)
    print(f"Translation: {translated_text}")

    # TTS
    audio_url = _synthesize(translated_text, target_lang)
    if audio_url is None:
        gr.Warning("TTS failed")
    return transcription, translated_text, audio_url


def check_password(password):
    """Return True only when *password* matches the configured developer password.

    Developer mode stays locked when no password is configured (DEV_PWD unset
    or empty), so an empty submission can never succeed.
    """
    import hmac  # local import: keeps this block independent of the file header

    if not DEVELOPER_PASSWORD or password is None:
        return False
    # Constant-time comparison; bytes are used because compare_digest rejects
    # non-ASCII str arguments.
    return hmac.compare_digest(
        password.encode("utf-8"),
        DEVELOPER_PASSWORD.encode("utf-8"),
    )


def user_interface(audio, source_lang, target_lang):
    """Run the full pipeline and return only the synthesized audio URL."""
    _, _, audio_url = transcribe_and_speak(audio, source_lang, target_lang)
    return audio_url


with gr.Blocks() as demo:
    gr.Markdown("# ASR and TTS Demo")

    with gr.Tab("User Mode"):
        gr.Markdown("Speak into the microphone or upload an audio file. The app will translate and speak it back to you.")
        with gr.Row():
            user_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
            user_source_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Source Language", value="en")
            user_target_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Target Language", value="zh")
        with gr.Row():
            user_button = gr.Button("Translate and Speak")
        with gr.Row():
            user_audio_output = gr.Audio(label="Translated Speech")

        user_button.click(
            fn=user_interface,
            inputs=[user_audio_input, user_source_lang, user_target_lang],
            outputs=[user_audio_output],
        )

    with gr.Tab("Developer Mode"):
        password_input = gr.Textbox(type="password", label="Enter Developer Password")
        login_button = gr.Button("Login")
        login_error = gr.Markdown(visible=False)

        # Hidden until a successful login makes it visible.
        dev_interface = gr.Column(visible=False)
        with dev_interface:
            gr.Markdown("Developer Mode: Transcription, Translation, and TTS")
            with gr.Row():
                dev_audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
                dev_source_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Source Language", value="en")
                dev_target_lang = gr.Dropdown(choices=["en", "ma", "ta", "zh"], label="Target Language", value="zh")
            with gr.Row():
                dev_button = gr.Button("Transcribe, Translate, and Speak")
            with gr.Row():
                dev_text_output = gr.Textbox(label="Transcription")
            with gr.Row():
                dev_translation_output = gr.Textbox(label="Translation")
            with gr.Row():
                dev_audio_output = gr.Audio(label="Translated Speech")

            dev_button.click(
                fn=transcribe_and_speak,
                inputs=[dev_audio_input, dev_source_lang, dev_target_lang],
                outputs=[dev_text_output, dev_translation_output, dev_audio_output],
            )

        def login(password):
            """Show the developer column on a correct password, else an error note."""
            if check_password(password):
                return gr.Column(visible=True), gr.Markdown(visible=False)
            return gr.Column(visible=False), gr.Markdown("Incorrect password. Please try again.", visible=True)

        login_button.click(
            fn=login,
            inputs=[password_input],
            outputs=[dev_interface, login_error],
        )

demo.launch()