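"""Gradio app for multimodal question answering.

Takes a text context plus a question given either as text or as an audio
recording. Audio questions are transcribed with a wav2vec2 ASR pipeline,
answers are extracted with an extractive QA model, and the answer is also
returned as speech generated with gTTS.
"""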
import gradio as gr
from transformers import pipeline
from gtts import gTTS
import tempfile
import os
# Initialize the speech-to-text transcriber
transcriber = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-english")
# Initialize the question-answering model
qa_model = pipeline("question-answering", model="AVISHKAARAM/avishkaarak-ekta-hindi")
def answer_question(context, question=None, audio=None):
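    """Answer a question about `context`.

    The question comes from `audio` (a file path, transcribed to text) when
    provided, otherwise from the `question` textbox. Returns the answer text
    and the path to an MP3 of the spoken answer, or an error message and None.
    """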
    try:
        # If audio is provided, transcribe it to get the question text
        if audio:
            question_text = transcriber(audio)["text"]
        else:
            question_text = question

        # Guard against an empty question (no text typed and no audio recorded)
        if not question_text or not question_text.strip():
            return "Please provide a question as text or audio.", None

        # Generate an answer to the question from the given context
        qa_result = qa_model(question=question_text, context=context)
        answer = qa_result["answer"]

        # Convert the answer to speech and save it to a temporary MP3 file
        tts = gTTS(text=answer, lang="en")
        audio_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
        tts.save(audio_path)

        return answer, audio_path
    except Exception as e:
        return str(e), None
# Define the Gradio interface
context_input = gr.Textbox(label="Context", lines=3)
question_input = gr.Textbox(label="Question")
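# type="filepath" makes Gradio pass the recorded/uploaded audio as a file path,
# which the ASR pipeline accepts directly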
audio_input = gr.Audio(type="filepath", label="Question (Audio Input)")
output_text = gr.Textbox(label="Answer")
output_audio = gr.Audio(label="Answer (Audio Output)")
interface = gr.Interface(
    fn=answer_question,
    inputs=[context_input, question_input, audio_input],
    outputs=[output_text, output_audio],
    title="Multimodal Question Answering",
    description="Provide a context and either a text question or an audio question to get an answer.",
    examples=[
        ["The capital of France is Paris.", "What is the capital of France?", None],
        ["OpenAI is famous for developing GPT-3.", "What is OpenAI known for?", None],
    ],
)
# Launch the Gradio app
if __name__ == "__main__":
    interface.launch()