"""Gradio app: upload an image, type a question, get a BLIP-generated answer.

NOTE(review): the checkpoint used here is the *captioning* model
(blip-image-captioning-base), so the "question" is consumed as a caption
prefix rather than truly answered; for real VQA the blip-vqa-base checkpoint
with BlipForQuestionAnswering would be the expected pairing — confirm intent.
"""

import sys

import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import gradio as gr

# Resolve the device once at import time; both the model move below and
# per-request tensor placement in process_image() reuse this constant.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the BLIP model and processor. Failure here (no network, bad cache)
# is fatal for the app, so report and exit with a non-zero status.
try:
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    model = model.to(device=DEVICE)
    model.eval()
except Exception as e:
    print(f"Error loading model or processor: {e}")
    sys.exit(1)  # proper script exit, not the interactive `exit()` helper


def process_image(image, question):
    """Generate a BLIP answer string for (image, question).

    Args:
        image: numpy array from the Gradio image widget, or None if the
            user pressed submit without uploading anything.
        question: free-text question from the Gradio textbox.

    Returns:
        The decoded model output, or a human-readable error string on
        failure (Gradio displays whatever string is returned).
    """
    # Guard: Gradio passes None when no image was uploaded; without this
    # check Image.fromarray(None) raises AttributeError.
    if image is None:
        return "Please upload an image first."

    # Convert Gradio's numpy image to an RGB PIL Image.
    image = Image.fromarray(image).convert('RGB')

    # Preprocess the image and question, placing tensors on the model's device.
    inputs = processor(image, question, return_tensors="pt").to(DEVICE)

    # Perform inference without tracking gradients.
    try:
        with torch.no_grad():
            outputs = model.generate(**inputs)
        answer = processor.decode(outputs[0], skip_special_tokens=True)
        return answer
    except Exception as e:
        return f"Error during model inference: {e}"


# Define the Gradio interface.
interface = gr.Interface(
    fn=process_image,
    inputs=[gr.Image(type='numpy'), gr.Textbox(label="Question")],
    outputs=gr.Textbox(),
    title="Image Question Answering",
    description="Upload an image and ask a question about it. The model will provide an answer."
)

# Launch the Gradio app only when run as a script (not on import).
if __name__ == "__main__":
    interface.launch()