Daemontatox committed on
Commit
8a7082b
1 Parent(s): b079eea

Update app.py

Files changed (1)
  1. app.py +61 -111
app.py CHANGED
@@ -1,150 +1,101 @@
- from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
  from PIL import Image
  import torch
  from threading import Thread
  import gradio as gr
  from gradio import FileData
  import time
  import spaces
- from typing import Dict, List, Union
-
- # Utility function to process vision information
- def process_vision_info(messages):
-     image_inputs, video_inputs = [], []
-     for msg in messages:
-         if msg["role"] == "user":
-             for content in msg["content"]:
-                 if content["type"] == "image":
-                     if isinstance(content["image"], str):
-                         image = Image.open(content["image"]).convert('RGB')
-                     else:
-                         image = content["image"].convert('RGB')
-                     image_inputs.append(image)
-                 elif content["type"] == "video":
-                     video_inputs.append(content["video"])
-     return image_inputs, video_inputs
-
- SYSTEM_PROMPT = """You are a Vision Language Model specialized in visual document analysis. Your task is to analyze visual data and accurately answer user queries using a Chain-of-Thought (COT) approach. Self-reflection and error correction are crucial.

  **Reasoning Process:**
- 1. **Initial Reasoning:**
-    * Use `<Thinking>` to describe your initial understanding, identify relevant sections, and generate a preliminary answer.
- 2. **Reflection and Error Check:**
-    * Use `<Reflection>` to critically examine your initial reasoning: section relevance, data accuracy, and alternative interpretations.
- 3. **Refinement and Correction:**
-    * Use `<Correction>` to detail any corrections to your approach and why. If no corrections needed, state "No correction needed".
- 4. **Final Answer:**
-    * Present your final answer with clear reasoning steps and synthesis.

- Always perform self-reflection and provide accurate, succinct answers."""

- @spaces.GPU
- def bot_streaming(message: Dict[str, Union[str, List[FileData]]],
-                   history: List,
-                   max_new_tokens: int = 4048) -> str:
-     # Initialize model and processor if not already done
-     if not hasattr(bot_streaming, "model"):
-         bot_streaming.model = Qwen2VLForConditionalGeneration.from_pretrained(
-             "Qwen/Qwen2-VL-7B-Instruct",
-             torch_dtype=torch.bfloat16,
-             device_map="auto"
-         )
-         bot_streaming.processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

      txt = message["text"]
      messages = [{"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]}]
      images = []

-     # Process history and build message chain
-     for i, msg in enumerate(history):
          if isinstance(msg[0], tuple):
-             # Handle image messages
-             messages.append({
-                 "role": "user",
-                 "content": [
-                     {"type": "text", "text": history[i+1][0]},
-                     {"type": "image"}
-                 ]
-             })
-             messages.append({
-                 "role": "assistant",
-                 "content": [{"type": "text", "text": history[i+1][1]}]
-             })
              images.append(Image.open(msg[0][0]).convert("RGB"))
          elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
-             continue
          elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
-             messages.append({
-                 "role": "user",
-                 "content": [{"type": "text", "text": msg[0]}]
-             })
-             messages.append({
-                 "role": "assistant",
-                 "content": [{"type": "text", "text": msg[1]}]
-             })
-
-     # Handle current message with possible image
      if len(message["files"]) == 1:
          if isinstance(message["files"][0], str):
              image = Image.open(message["files"][0]).convert("RGB")
          else:
              image = Image.open(message["files"][0]["path"]).convert("RGB")
          images.append(image)
-         messages.append({
-             "role": "user",
-             "content": [{"type": "text", "text": txt}, {"type": "image"}]
-         })
      else:
-         messages.append({
-             "role": "user",
-             "content": [{"type": "text", "text": txt}]
-         })
-
-     # Process input
-     text = bot_streaming.processor.apply_chat_template(
-         messages,
-         tokenize=False,
-         add_generation_prompt=True
-     )

-     image_inputs, video_inputs = process_vision_info(messages)
-     inputs = bot_streaming.processor(
-         text=[text],
-         images=image_inputs if images else None,
-         videos=video_inputs,
-         padding=True,
-         return_tensors="pt"
-     ).to("cuda")
-
-     # Setup streaming generation
-     streamer = TextIteratorStreamer(bot_streaming.processor, skip_special_tokens=True, skip_prompt=True)
-     generation_kwargs = dict(
-         **inputs,
-         streamer=streamer,
-         max_new_tokens=max_new_tokens,
-         do_sample=True,
-         temperature=0.7,
-         top_p=0.9
-     )

-     # Start generation in separate thread
-     thread = Thread(target=bot_streaming.model.generate, kwargs=generation_kwargs)
      thread.start()
-
-     # Stream output
      buffer = ""
      for new_text in streamer:
          buffer += new_text
          time.sleep(0.01)
          yield buffer

- # Create Gradio interface
  demo = gr.ChatInterface(
      fn=bot_streaming,
-     title="Qwen Visual Language Assistant",
      examples=[
-         [{"text": "Describe what you see in this image.", "files":["./examples/sample.jpg"]}, 200],
-         [{"text": "What are the key elements in this document?", "files":["./examples/doc.png"]}, 250],
      ],
      textbox=gr.MultimodalTextbox(),
      additional_inputs=[
@@ -157,11 +108,10 @@ demo = gr.ChatInterface(
          )
      ],
      cache_examples=False,
-     description="Upload an image and ask questions about it. The model will provide detailed analysis with chain-of-thought reasoning.",
      stop_btn="Stop Generation",
      fill_height=True,
      multimodal=True
  )

- if __name__ == "__main__":
-     demo.launch(debug=True)
+ from transformers import MllamaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
  from PIL import Image
+ import requests
  import torch
  from threading import Thread
  import gradio as gr
  from gradio import FileData
  import time
  import spaces
+
+ ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+ model = MllamaForConditionalGeneration.from_pretrained(ckpt,
+     torch_dtype=torch.bfloat16).to("cuda")
+ processor = AutoProcessor.from_pretrained(ckpt)
+
+ SYSTEM_PROMPT = """ You are a Vision Language Model specialized in visual document analysis. Your task is to analyze visual data and accurately answer user queries using a Chain-of-Thought (COT) approach. Self-reflection and error correction are crucial.

  **Reasoning Process:**

+ 1. **Initial Reasoning:**
+    * Use `<Thinking>` to describe your initial understanding, identify relevant sections, and generate a preliminary answer.

+ 2. **Reflection and Error Check:**
+    * Use `<Reflection>` to critically examine your initial reasoning: section relevance, data accuracy, and alternative interpretations. Identify any potential errors.
+
+ 3. **Refinement and Correction:**
+    * Use `<Correction>` to detail any corrections to your approach and why. Refine your answer. If no corrections needed, state "No correction needed".
+
+ 4. **Final Answer:**
+    * Present your final answer in this format:
+      **Reasoning Steps:**
+      1. **Identification:** Briefly identify relevant document sections.
+      2. **Extraction:** State extracted visual/textual features.
+      3. **Synthesis:** Explain how extracted data led to the answer.
+      **Answer:** [Your detailed, accurate answer here]
+
+ **Requirements:**
+
+ * Use the COT structure and tags (`<Thinking>`, `<Reflection>`, `<Correction>`).
+ * Provide accurate, succinct answers.
+ * Always perform self-reflection and error correction.
+ * No corrections need to be clearly indicated"""

+ @spaces.GPU
+ def bot_streaming(message, history, max_new_tokens=4048):
      txt = message["text"]
      messages = [{"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]}]
      images = []

+     for i, msg in enumerate(history):
          if isinstance(msg[0], tuple):
+             messages.append({"role": "user", "content": [{"type": "text", "text": history[i+1][0]}, {"type": "image"}]})
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": history[i+1][1]}]})
              images.append(Image.open(msg[0][0]).convert("RGB"))
          elif isinstance(history[i-1], tuple) and isinstance(msg[0], str):
+             pass
          elif isinstance(history[i-1][0], str) and isinstance(msg[0], str):
+             messages.append({"role": "user", "content": [{"type": "text", "text": msg[0]}]})
+             messages.append({"role": "assistant", "content": [{"type": "text", "text": msg[1]}]})
+
      if len(message["files"]) == 1:
          if isinstance(message["files"][0], str):
              image = Image.open(message["files"][0]).convert("RGB")
          else:
              image = Image.open(message["files"][0]["path"]).convert("RGB")
          images.append(image)
+         messages.append({"role": "user", "content": [{"type": "text", "text": txt}, {"type": "image"}]})
      else:
+         messages.append({"role": "user", "content": [{"type": "text", "text": txt}]})
+
+     texts = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+     if images == []:
+         inputs = processor(text=texts, return_tensors="pt").to("cuda")
+     else:
+         inputs = processor(text=texts, images=images, return_tensors="pt").to("cuda")

+     streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
+     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_new_tokens)

+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
      thread.start()
      buffer = ""
+
      for new_text in streamer:
          buffer += new_text
          time.sleep(0.01)
          yield buffer

  demo = gr.ChatInterface(
      fn=bot_streaming,
+     title="Overthinking Llama",
      examples=[
+         [{"text": "Which era does this piece belong to? Give details about the era.", "files":["./examples/rococo.jpg"]}, 200],
+         [{"text": "Where do the droughts happen according to this diagram?", "files":["./examples/weather_events.png"]}, 250],
+         [{"text": "What happens when you take out white cat from this chain?", "files":["./examples/ai2d_test.jpg"]}, 250],
+         [{"text": "How long does it take from invoice date to due date? Be short and concise.", "files":["./examples/invoice.png"]}, 250],
+         [{"text": "Where to find this monument? Can you give me other recommendations around the area?", "files":["./examples/wat_arun.jpg"]}, 250],
      ],
      textbox=gr.MultimodalTextbox(),
      additional_inputs=[
@@ -157,11 +108,10 @@ demo = gr.ChatInterface(
          )
      ],
      cache_examples=False,
+     description="Upload an invoice or timesheet , Ask a question and let the model overthink the Answer",
      stop_btn="Stop Generation",
      fill_height=True,
      multimodal=True
  )

+ demo.launch(debug=True)
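For context, the streaming pattern the updated app.py relies on can be exercised outside Gradio. The sketch below is a minimal, hedged example of the same flow with MllamaForConditionalGeneration: render the chat messages with the processor's chat template, run model.generate on a background thread, and read decoded text from a TextIteratorStreamer. The image path and question are placeholders, and the streamer is fed processor.tokenizer here; this is an illustration of the standard transformers streaming API, not code from the commit.

    # Minimal sketch: stream a Llama-3.2-Vision reply for one image + one question.
    # Assumes the same checkpoint as app.py; image path and prompt are placeholders.
    from threading import Thread

    import torch
    from PIL import Image
    from transformers import AutoProcessor, MllamaForConditionalGeneration, TextIteratorStreamer

    ckpt = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    model = MllamaForConditionalGeneration.from_pretrained(ckpt, torch_dtype=torch.bfloat16).to("cuda")
    processor = AutoProcessor.from_pretrained(ckpt)

    image = Image.open("./examples/invoice.png").convert("RGB")  # placeholder image
    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "How many days between invoice date and due date?"},
        ]},
    ]

    # apply_chat_template renders the messages into the model's prompt format;
    # the processor then pairs that prompt with the PIL image and returns tensors.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt").to("cuda")

    # TextIteratorStreamer yields decoded text as generate() produces tokens,
    # so generation runs on a worker thread while the main thread consumes output.
    streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True, skip_prompt=True)
    thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=256))
    thread.start()

    for chunk in streamer:
        print(chunk, end="", flush=True)
    thread.join()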