from transformers import BlipForQuestionAnswering, AutoProcessor  # For the pre-trained VQA model and processor
from PIL import Image  # For image handling
import gradio as gr  # For creating the interface
from gtts import gTTS  # For converting text to speech
import os  # For file handling

# Load the model and processor from Hugging Face
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Define the function that handles the image and question input, and returns an audio response
def answer_question_with_audio(image, question):
    # If the input is a file path, open the image
    if isinstance(image, str):
        image = Image.open(image)

    # Process the image and question using the processor to get inputs for the model
    inputs = processor(image, question, return_tensors="pt")

    # Generate the model's response to the question
    out = model.generate(**inputs)

    # Decode the model's output to get a human-readable answer
    answer_text = processor.decode(out[0], skip_special_tokens=True)

    # Convert the text answer to audio using gTTS
    tts = gTTS(text=answer_text, lang='en')

    # Save the audio file
    audio_path = "answer.mp3"
    tts.save(audio_path)

    # Return the path to the audio file
    return audio_path

# Create a Gradio interface with image and text inputs, and an audio output
interface = gr.Interface(
    fn=answer_question_with_audio,  # Function to call when the interface is used
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Question")],  # Inputs: Image and Textbox
    outputs=gr.Audio(label="Answer (Audio)"),  # Output: Audio response
    title="Visual Question Answering with Audio",  # Title of the interface
    description="Upload an image and ask a question. The answer will be provided as an audio response."  # Description
)

# Launch the Gradio interface with public sharing enabled
interface.launch(share=True)
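
# Optional sanity check: since answer_question_with_audio() also accepts a file path,
# you can test the model-to-audio pipeline without launching the Gradio UI.
# This is a minimal sketch; "sample.jpg" and the question are hypothetical examples.
# Run it in place of interface.launch(share=True) above (launch() blocks the script),
# then play the generated MP3 to hear the answer.
#
# audio_file = answer_question_with_audio("sample.jpg", "What is in the picture?")
# print("Audio answer saved to:", audio_file)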