from transformers import BlipForQuestionAnswering, AutoProcessor  # For the pre-trained VQA model and processor
from PIL import Image  # For image handling
import gradio as gr  # For creating the interface
from gtts import gTTS  # For converting text to speech
import os  # For file handling

# Load the model and processor from Hugging Face
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

# Define the function that handles the image and question input, and returns an audio response
def answer_question_with_audio(image, question):
    # If the input is a file path, open the image
    if isinstance(image, str):
        image = Image.open(image)

    # Process the image and question using the processor to get inputs for the model
    inputs = processor(image, question, return_tensors="pt")

    # Generate the model's response to the question
    out = model.generate(**inputs)

    # Decode the model's output to get a human-readable answer
    answer_text = processor.decode(out[0], skip_special_tokens=True)

    # Convert the text answer to audio using gTTS
    tts = gTTS(text=answer_text, lang='en')

    # Save the audio file
    audio_path = "answer.mp3"
    tts.save(audio_path)

    # Return the path to the audio file
    return audio_path

# Create a Gradio interface with image and text inputs, and an audio output
interface = gr.Interface(
    fn=answer_question_with_audio,  # Function to call when the interface is used
    inputs=[gr.Image(type="pil"), gr.Textbox(label="Question")],  # Inputs: Image and Textbox
    outputs=gr.Audio(label="Answer (Audio)"),  # Output: Audio response
    title="Visual Question Answering with Audio",  # Title of the interface
    description="Upload an image and ask a question. The answer will be provided as an audio response."  # Description
)

# Launch the Gradio interface with public sharing enabled
interface.launch(share=True)
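
# Optional sanity check: since answer_question_with_audio() also accepts a file path,
# you can test the model-to-audio pipeline without launching the Gradio UI.
# This is a minimal sketch; "sample.jpg" and the question are hypothetical examples.
# Run it in place of interface.launch(share=True) above (launch() blocks the script),
# then play the generated MP3 to hear the answer.
#
# audio_file = answer_question_with_audio("sample.jpg", "What is in the picture?")
# print("Audio answer saved to:", audio_file)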