from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import gradio as gr

# Checkpoint identifier passed to both from_pretrained calls below, kept in
# one place so the processor and the model cannot drift apart.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"

# Created once when the module is loaded; the rest of the file reads these
# two module-level names.
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)

def generate_caption(image) -> str:
    """Generate a text caption for ``image`` using the module-level BLIP model.

    Args:
        image: The image to caption. It is handed straight to ``processor``
            without opening or converting it here. ``None`` is accepted and
            reported as an error message (presumably what the UI passes when
            nothing was uploaded -- TODO confirm against the Gradio version
            in use).

    Returns:
        The decoded caption on success. If no image was supplied, or if
        preprocessing, generation or decoding raised, a string starting with
        ``"Error generating caption: "`` is returned instead of raising.
    """
    # Guard the empty submission explicitly so the user gets a clear message
    # rather than whatever the processor happens to raise for a missing image.
    if image is None:
        return "Error generating caption: no image provided"

    try:
        # Preprocess into PyTorch tensors ("pt") and let the model generate
        # token ids for the caption.
        inputs = processor(images=image, return_tensors="pt")
        outputs = model.generate(**inputs)

        # Only the first generated sequence is decoded; special tokens are
        # stripped from the returned text.
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # UI boundary: report the failure as text in the output box instead of
        # letting the exception propagate to the caller.
        return f"Error generating caption: {e}"

# Build the input/output components first, then wire them to the callback.
image_input = gr.Image(type="pil")  # type="pil" selects PIL as the image format
caption_output = gr.Textbox(label="Generated Caption")  # plain-text result box

iface = gr.Interface(
    fn=generate_caption,
    inputs=image_input,
    outputs=caption_output,
    title="Image Captioning",
    description="Generate captions for your images",
)

# Start the app. NOTE: this call is unguarded, so it runs whenever the module
# is executed or imported.
iface.launch()