JASHWANTH877 committed on
Commit
e322deb
·
verified ·
1 Parent(s): 311726f

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +52 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import gradio as gr
import torch
from huggingface_hub import login
from PIL import Image
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

# Authenticate with the Hugging Face Hub when a token is configured in the
# environment; gated checkpoints (like PaliGemma) require this.
hf_token = os.getenv("access_token")
if hf_token:
    login(token=hf_token, add_to_git_credential=True)

# Download the PaliGemma checkpoint and its matching processor once at
# startup; eval() disables dropout for deterministic inference.
model_id = "google/paligemma-3b-mix-224"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
processor = AutoProcessor.from_pretrained(model_id)
18
def generate_conversational_response(image, user_input):
    """Generate a text response about *image* conditioned on *user_input*.

    Args:
        image: A ``PIL.Image.Image``, or any path/file-like object that
            ``PIL.Image.open`` accepts. ``None`` means no image was uploaded.
        user_input: The user's question or starting prompt for the model.

    Returns:
        str: The model's decoded reply with special tokens stripped.

    Raises:
        gr.Error: If no image was provided.
    """
    # Gradio passes None when the user submits without uploading an image;
    # fail with a clear UI message instead of a confusing PIL traceback.
    if image is None:
        raise gr.Error("Please upload an image before submitting.")

    # Accept file paths / file objects as well as ready-made PIL images.
    if not isinstance(image, Image.Image):
        image = Image.open(image)

    # Tokenize the text prompt together with the image. (The redundant
    # f-string wrapper around user_input was removed — Textbox yields str.)
    model_inputs = processor(text=user_input, images=image, return_tensors="pt")
    input_len = model_inputs["input_ids"].shape[-1]

    # Greedy decoding (do_sample=False); no gradients needed for inference.
    with torch.inference_mode():
        generation = model.generate(**model_inputs, max_new_tokens=1024, do_sample=False)

    # Slice off the prompt tokens so only the newly generated reply remains.
    generation = generation[0][input_len:]
    return processor.decode(generation, skip_special_tokens=True)
37
+
38
# Wire the model up to a simple two-input Gradio UI and start serving it.
image_input = gr.Image(type="pil", label="Upload Image")
text_input = gr.Textbox(
    lines=2,
    placeholder="Enter your question or starting input here",
    label="Starting Input",
)

interface = gr.Interface(
    fn=generate_conversational_response,
    inputs=[image_input, text_input],
    outputs="text",
    title="Image-Based Conversational AI",
    description=(
        "Upload an image from your local system and provide a starting input. "
        "The model will generate a caption and respond to your query based on the image."
    ),
)

interface.launch()
52
+
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ gradio
3
+ pillow
4
+ transformers