shouryap committed
Commit 7d7b091 · 1 Parent(s): 9b4a74d
Files changed (3)
  1. README.md +6 -5
  2. app.py +90 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,12 +1,13 @@
 ---
-title: Groundingdi
-emoji: 📊
-colorFrom: yellow
-colorTo: red
+title: GroundingDINO ⚔ OWL
+emoji: 🦖🦉
+colorFrom: red
+colorTo: indigo
 sdk: gradio
-sdk_version: 5.9.1
+sdk_version: 4.26.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,90 @@
+import spaces
+import torch
+import gradio as gr
+from transformers import Owlv2Processor, Owlv2ForObjectDetection, AutoProcessor, AutoModelForZeroShotObjectDetection
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+owl_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
+owl_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(device)
+
+dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
+dino_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base").to(device)
+
+@spaces.GPU
+def infer(img, text_queries, score_threshold, model):
+    if model == "dino":
+        # Grounding DINO expects a single prompt of period-separated phrases.
+        queries = ""
+        for query in text_queries:
+            queries += f"{query}. "
+
+        height, width = img.shape[:2]
+        target_sizes = [(height, width)]
+        inputs = dino_processor(text=queries, images=img, return_tensors="pt").to(device)
+
+        with torch.no_grad():
+            outputs = dino_model(**inputs)
+            outputs.logits = outputs.logits.cpu()
+            outputs.pred_boxes = outputs.pred_boxes.cpu()
+            results = dino_processor.post_process_grounded_object_detection(outputs=outputs, input_ids=inputs.input_ids,
+                                                                            box_threshold=score_threshold,
+                                                                            target_sizes=target_sizes)
+    elif model == "owl":
+        # OWLv2 pads the input to a square, so post-process against the padded size.
+        size = max(img.shape[:2])
+        target_sizes = torch.Tensor([[size, size]])
+        inputs = owl_processor(text=text_queries, images=img, return_tensors="pt").to(device)
+
+        with torch.no_grad():
+            outputs = owl_model(**inputs)
+            outputs.logits = outputs.logits.cpu()
+            outputs.pred_boxes = outputs.pred_boxes.cpu()
+            results = owl_processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes)
+
+    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
+    result_labels = []
+    box_coordinates = []  # box coordinates of the detections that pass the threshold
+
+    for box, score, label in zip(boxes, scores, labels):
+        box = [int(i) for i in box.tolist()]
+        if score < score_threshold:
+            continue
+        if model == "owl":
+            # OWLv2 returns label indices; map them back to the query strings.
+            label = text_queries[label.cpu().item()]
+            result_labels.append((box, label))
+            box_coordinates.append(box)
+        elif model == "dino":
+            if label != "":
+                result_labels.append((box, label))
+                box_coordinates.append(box)
+
+    return result_labels, box_coordinates
+
+def query_image(img, text_queries, owl_threshold, dino_threshold):
+    text_queries = [query.strip() for query in text_queries.split(",")]
+    owl_output, owl_box_coords = infer(img, text_queries, owl_threshold, "owl")
+    dino_output, dino_box_coords = infer(img, text_queries, dino_threshold, "dino")
+
+    # gr.AnnotatedImage expects an (image, annotations) pair; the box lists go to the Textboxes.
+    return (img, owl_output), (img, dino_output), owl_box_coords, dino_box_coords
+
+owl_threshold = gr.Slider(0, 1, value=0.16, label="OWL Threshold")
+dino_threshold = gr.Slider(0, 1, value=0.12, label="Grounding DINO Threshold")
+owl_output = gr.AnnotatedImage(label="OWL Output")
+dino_output = gr.AnnotatedImage(label="Grounding DINO Output")
+
+# Box coordinate displays for both models
+owl_boxes = gr.Textbox(label="OWL Box Coordinates", interactive=False)
+dino_boxes = gr.Textbox(label="Grounding DINO Box Coordinates", interactive=False)
+
+demo = gr.Interface(
+    query_image,
+    inputs=[gr.Image(label="Input Image"), gr.Textbox(label="Candidate Labels"), owl_threshold, dino_threshold],
+    outputs=[owl_output, dino_output, owl_boxes, dino_boxes],
+    title="OWLv2 ⚔ Grounding DINO",
+    description="Compare two state-of-the-art zero-shot object detection models, [OWLv2](https://huggingface.co/google/owlv2-base-patch16) and [Grounding DINO](https://huggingface.co/IDEA-Research/grounding-dino-base), in this Space. Upload an image and enter the objects you want to find, separated by commas. Play with the thresholds to filter out low-confidence predictions from each model."
+)
+
+demo.launch(debug=True, share=True)
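
For reference, a minimal sketch of exercising `query_image` outside the Gradio UI (not part of the commit). It assumes the definitions above are available in the session with `demo.launch()` skipped, and that `example.jpg` is a hypothetical local image; `gr.Image` passes the function a numpy array, so the sketch mimics that with Pillow and numpy.

```python
import numpy as np
from PIL import Image

# Hypothetical test image; gr.Image would normally supply this numpy array.
img = np.array(Image.open("example.jpg").convert("RGB"))

# Same calling convention as the Gradio Interface: image, comma-separated labels, thresholds.
(owl_img, owl_annots), (dino_img, dino_annots), owl_coords, dino_coords = query_image(
    img, "a cat, a remote control", 0.16, 0.12
)

# Each annotation is ([x_min, y_min, x_max, y_max], label); the coordinate lists mirror them.
print(owl_annots)
print(dino_coords)
```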
requirements.txt ADDED
@@ -0,0 +1,3 @@
+spaces
+git+https://github.com/huggingface/transformers.git
+scipy