SZhanZ committed
Commit d1798f9 · 1 Parent(s): 15f1882
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ examples/image1.jpg filter=lfs diff=lfs merge=lfs -text
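(The added line is what running git lfs track "examples/image1.jpg" would typically append, so the new example image is stored via Git LFS rather than committed directly.)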
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
README.md CHANGED
File without changes
app.py ADDED
@@ -0,0 +1,104 @@
+ import re
+ import torch
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from PIL import Image, ImageDraw
+
+ def draw_bbox(image, bbox):
+     x1, y1, x2, y2 = bbox
+     draw = ImageDraw.Draw(image)
+     draw.rectangle((x1, y1, x2, y2), outline="red", width=5)
+     return image
+
+ def extract_bbox_answer(content):
+     bbox_pattern = r'\{.*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)]\s*.*\}'
+     bbox_match = re.search(bbox_pattern, content)
+     if bbox_match:
+         bbox = [int(bbox_match.group(1)), int(bbox_match.group(2)), int(bbox_match.group(3)), int(bbox_match.group(4))]
+         return bbox
+     return [0, 0, 0, 0]
+
+ def process_image_and_text(image, text):
+     """Process image and text input, return thinking process and bbox"""
+     question = f"Please provide the bounding box coordinate of the region this sentence describes: {text}."
+     QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
+
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image"},
+                 {"type": "text", "text": QUESTION_TEMPLATE.format(Question=question)},
+             ],
+         }
+     ]
+
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+
+     inputs = processor(
+         text=[text],
+         images=image,
+         return_tensors="pt",
+         padding=True,
+         padding_side="left",
+         add_special_tokens=False,
+     )
+
+     inputs = inputs.to("cuda")
+
+     with torch.no_grad():
+         generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
+         generated_ids_trimmed = [
+             out_ids[len(inputs.input_ids[0]):] for out_ids in generated_ids
+         ]
+
+         output_text = processor.batch_decode(
+             generated_ids_trimmed, skip_special_tokens=True
+         )[0]
+     print("output_text: ", output_text)
+
+     # Extract thinking process
+     think_match = re.search(r'<think>(.*?)</think>', output_text, re.DOTALL)
+     thinking_process = think_match.group(1).strip() if think_match else "No thinking process found"
+
+     # Get bbox and draw
+     bbox = extract_bbox_answer(output_text)
+
+     # Draw bbox on the image
+     result_image = image.copy()
+     result_image = draw_bbox(result_image, bbox)
+
+     return thinking_process, result_image
+
+ if __name__ == "__main__":
+     import gradio as gr
+
+     # model_path = "/data/shz/project/vlm-r1/VLM-R1/output/Qwen2.5-VL-3B-GRPO-REC/checkpoint-500"
+     model_path = "SZhanZ/Qwen2.5VL-VLM-R1-REC-step500"
+     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, device_map="cuda:0")
+     processor = AutoProcessor.from_pretrained(model_path)
+
+     def gradio_interface(image, text):
+         thinking, result_image = process_image_and_text(image, text)
+         return thinking, result_image
+
+     demo = gr.Interface(
+         fn=gradio_interface,
+         inputs=[
+             gr.Image(type="pil", label="Input Image"),
+             gr.Textbox(label="Description Text")
+         ],
+         outputs=[
+             gr.Textbox(label="Thinking Process"),
+             gr.Image(type="pil", label="Result with Bbox")
+         ],
+         title="Visual Referring Expression Demo",
+         description="Upload an image and enter a description; the system will return the thinking process and the region annotation",
+         examples=[
+             ["examples/image1.jpg", "food with the highest protein"],
+             ["examples/image2.jpg", "the cheapest laptop"],
+         ]
+     )
+
+     demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
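Note: extract_bbox_answer above assumes the model emits a JSON-style answer containing an [x1, y1, x2, y2] list. A minimal sketch of that parsing step, using a hypothetical reply (the real checkpoint's wording may differ):

import re

# Hypothetical model output; only the {...[x1, y1, x2, y2]...} part matters to the regex.
sample = '<think>The burger has the most protein.</think><answer>{"bbox": [120, 45, 380, 210]}</answer>'
bbox_pattern = r'\{.*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)]\s*.*\}'
match = re.search(bbox_pattern, sample)
print([int(g) for g in match.groups()] if match else [0, 0, 0, 0])  # -> [120, 45, 380, 210]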
examples/image1.jpg ADDED
Git LFS Details
  • SHA256: e779913142b5db662be50e6e5e8d9b598913dc3a1c2c27abfbbd1dd44630cdd9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.24 MB
examples/image2.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ torch>=2.0.0
+ git+https://github.com/huggingface/transformers
+ Pillow>=10.0.0
+ httpx[socks]
+ accelerate>=0.26.0
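Note: transformers is installed from the GitHub main branch rather than a PyPI release, presumably because Qwen2.5-VL support may not yet be in a stable release; pip install -r requirements.txt will build it from source.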