Files changed:
- README.md +6 -5
- app.py +90 -0
- requirements.txt +3 -0
README.md
CHANGED
@@ -1,12 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: GroundingDINO vs OWL
+emoji: 🦉🦖
+colorFrom: red
+colorTo: indigo
 sdk: gradio
-sdk_version:
+sdk_version: 4.26.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,90 @@
+import spaces
+from transformers import Owlv2Processor, Owlv2ForObjectDetection, AutoProcessor, AutoModelForZeroShotObjectDetection
+import torch
+import gradio as gr
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+owl_model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(device)
+owl_processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
+
+dino_processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
+dino_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base").to(device)
+
+@spaces.GPU
+def infer(img, text_queries, score_threshold, model):
+    if model == "dino":
+        # Grounding DINO expects a single string of period-separated phrases
+        queries = ""
+        for query in text_queries:
+            queries += f"{query}. "
+
+        height, width = img.shape[:2]  # numpy images are (height, width, channels)
+        target_sizes = [(height, width)]
+        inputs = dino_processor(text=queries, images=img, return_tensors="pt").to(device)
+
+        with torch.no_grad():
+            outputs = dino_model(**inputs)
+        outputs.logits = outputs.logits.cpu()
+        outputs.pred_boxes = outputs.pred_boxes.cpu()
+        results = dino_processor.post_process_grounded_object_detection(outputs=outputs, input_ids=inputs.input_ids,
+                                                                        box_threshold=score_threshold,
+                                                                        target_sizes=target_sizes)
+    elif model == "owl":
+        # OWLv2 pads inputs to a square, so post-process against the longer edge
+        size = max(img.shape[:2])
+        target_sizes = torch.Tensor([[size, size]])
+        inputs = owl_processor(text=text_queries, images=img, return_tensors="pt").to(device)
+
+        with torch.no_grad():
+            outputs = owl_model(**inputs)
+        outputs.logits = outputs.logits.cpu()
+        outputs.pred_boxes = outputs.pred_boxes.cpu()
+        results = owl_processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes)
+
+    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
+    result_labels = []
+    box_coordinates = []  # coordinates of the boxes that pass the threshold
+
+    for box, score, label in zip(boxes, scores, labels):
+        box = [int(i) for i in box.tolist()]
+        if score < score_threshold:
+            continue
+        if model == "owl":
+            label = text_queries[label.cpu().item()]  # OWL returns indices into the query list
+            result_labels.append((box, label))
+            box_coordinates.append(box)
+        elif model == "dino":
+            if label != "":  # Grounding DINO returns label strings; "" means no phrase matched
+                result_labels.append((box, label))
+                box_coordinates.append(box)
+
+    return result_labels, box_coordinates
+
+def query_image(img, text_queries, owl_threshold, dino_threshold):
+    text_queries = [query.strip() for query in text_queries.split(",")]
+    owl_output, owl_box_coords = infer(img, text_queries, owl_threshold, "owl")
+    dino_output, dino_box_coords = infer(img, text_queries, dino_threshold, "dino")
+
+    # gr.AnnotatedImage expects (image, [(box, label), ...]); raw coordinates go to the textboxes
+    return (img, owl_output), (img, dino_output), str(owl_box_coords), str(dino_box_coords)
+
+
+owl_threshold = gr.Slider(0, 1, value=0.16, label="OWL Threshold")
+dino_threshold = gr.Slider(0, 1, value=0.12, label="Grounding DINO Threshold")
+owl_output = gr.AnnotatedImage(label="OWL Output")
+dino_output = gr.AnnotatedImage(label="Grounding DINO Output")
+
+# Box-coordinate displays for both models
+owl_boxes = gr.Textbox(label="OWL Box Coordinates", interactive=False)
+dino_boxes = gr.Textbox(label="Grounding DINO Box Coordinates", interactive=False)
+
+demo = gr.Interface(
+    query_image,
+    inputs=[gr.Image(label="Input Image"), gr.Textbox(label="Candidate Labels"), owl_threshold, dino_threshold],
+    outputs=[owl_output, dino_output, owl_boxes, dino_boxes],
+    title="OWLv2 vs Grounding DINO",
+    description="Compare two state-of-the-art zero-shot object detection models, [OWLv2](https://huggingface.co/google/owlv2-base-patch16) and [Grounding DINO](https://huggingface.co/IDEA-Research/grounding-dino-base), in this Space. Upload an image and enter the objects you want to find as comma-separated labels, or try one of the examples. Adjust each model's threshold to filter out low-confidence predictions."
+)
+
+demo.launch(debug=True, share=True)
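For reference, here is a minimal standalone sketch of the OWLv2 path in infer() above, runnable outside Gradio. It is not part of the commit; the image path and queries are illustrative, and the 0.16 threshold simply mirrors the slider default.

    import torch
    from PIL import Image
    from transformers import Owlv2Processor, Owlv2ForObjectDetection

    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
    model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble").to(device)

    image = Image.open("example.jpg")  # illustrative path
    queries = ["a cat", "a dog"]       # illustrative queries

    inputs = processor(text=queries, images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # OWLv2 pads the input to a square, so size the target to the longer edge
    size = max(image.size)
    results = processor.post_process_object_detection(
        outputs=outputs, threshold=0.16, target_sizes=torch.tensor([[size, size]])
    )[0]
    for box, score, label in zip(results["boxes"], results["scores"], results["labels"]):
        print(queries[label.item()], round(score.item(), 3), [int(v) for v in box.tolist()])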
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+spaces
+git+https://github.com/huggingface/transformers.git
+scipy
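Note that transformers is installed from git rather than a tagged release, presumably because Grounding DINO support had not yet shipped in a stable version when this commit was made; gradio is provided by the Spaces runtime via the sdk fields in the README, and torch is presumably supplied by the base image. A quick sanity check, as a sketch, that the installed build exposes everything app.py imports:

    import transformers
    # These are exactly the imports app.py needs; if any of them fail,
    # the installed transformers build is too old for Grounding DINO.
    from transformers import (
        AutoModelForZeroShotObjectDetection,
        AutoProcessor,
        Owlv2ForObjectDetection,
        Owlv2Processor,
    )
    print(transformers.__version__)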