JASHWANTH877 committed on
Commit
e322deb
·
verified ·
1 Parent(s): 311726f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +52 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
import torch
from huggingface_hub import login
from PIL import Image
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

# Authenticate with the Hugging Face Hub when a token is configured in the
# environment; gated checkpoints (like PaliGemma) require this.
hf_token = os.getenv("access_token")
if hf_token:
    login(token=hf_token, add_to_git_credential=True)

# Download the PaliGemma checkpoint and its matching processor once at
# startup; eval() disables dropout for deterministic inference.
model_id = "google/paligemma-3b-mix-224"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
processor = AutoProcessor.from_pretrained(model_id)
18
def generate_conversational_response(image, user_input):
    """Generate a text response about *image* conditioned on *user_input*.

    Args:
        image: A ``PIL.Image.Image``, or any path/file-like object that
            ``PIL.Image.open`` accepts. ``None`` means no image was uploaded.
        user_input: The user's question or starting prompt for the model.

    Returns:
        str: The model's decoded reply with special tokens stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    # Gradio passes None when the user submits without uploading an image;
    # fail with a clear UI message instead of a confusing PIL traceback.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    # Accept file paths / file objects as well as ready-made PIL images.
    if not isinstance(image, Image.Image):
        image = Image.open(image)

    # Tokenize the text prompt together with the image. (The redundant
    # f-string wrapper around user_input was removed — Textbox yields str.)
    model_inputs = processor(text=user_input, images=image, return_tensors="pt")
    input_len = model_inputs["input_ids"].shape[-1]

    # Greedy decoding (do_sample=False); no gradients needed for inference.
    with torch.inference_mode():
        generation = model.generate(**model_inputs, max_new_tokens=1024, do_sample=False)

    # Slice off the prompt tokens so only the newly generated reply remains.
    generation = generation[0][input_len:]
    return processor.decode(generation, skip_special_tokens=True)
37
+
38
# Wire the model up to a simple two-input Gradio UI and start serving it.
image_input = gr.Image(type="pil", label="Upload Image")
text_input = gr.Textbox(
    lines=2,
    placeholder="Enter your question or starting input here",
    label="Starting Input",
)

interface = gr.Interface(
    fn=generate_conversational_response,
    inputs=[image_input, text_input],
    outputs="text",
    title="Image-Based Conversational AI",
    description=(
        "Upload an image from your local system and provide a starting input. "
        "The model will generate a caption and respond to your query based on the image."
    ),
)

interface.launch()
52
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ gradio
3
+ pillow
4
+ transformers