kellyxiaowei adirik committed on
Commit
7272bf3
·
0 Parent(s):

Duplicate from adirik/OWL-ViT

Browse files

Co-authored-by: Alara Dirik <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.npy filter=lfs diff=lfs merge=lfs -text
13
+ *.npz filter=lfs diff=lfs merge=lfs -text
14
+ *.onnx filter=lfs diff=lfs merge=lfs -text
15
+ *.ot filter=lfs diff=lfs merge=lfs -text
16
+ *.parquet filter=lfs diff=lfs merge=lfs -text
17
+ *.pickle filter=lfs diff=lfs merge=lfs -text
18
+ *.pkl filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pt filter=lfs diff=lfs merge=lfs -text
21
+ *.pth filter=lfs diff=lfs merge=lfs -text
22
+ *.rar filter=lfs diff=lfs merge=lfs -text
23
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
24
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
25
+ *.tflite filter=lfs diff=lfs merge=lfs -text
26
+ *.tgz filter=lfs diff=lfs merge=lfs -text
27
+ *.wasm filter=lfs diff=lfs merge=lfs -text
28
+ *.xz filter=lfs diff=lfs merge=lfs -text
29
+ *.zip filter=lfs diff=lfs merge=lfs -text
30
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
31
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
32
+ assets/Helvatica.ttc filter=lfs diff=lfs merge=lfs -text
33
+ assets/Helvatica.ttf filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: OWL-ViT Demo
3
+ emoji: 🔥
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 3.1.3
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: adirik/OWL-ViT
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import cv2
3
+ import gradio as gr
4
+ import numpy as np
5
+ from transformers import OwlViTProcessor, OwlViTForObjectDetection
6
+
7
+
8
# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The detection model and its processor are loaded from the same checkpoint.
checkpoint = "google/owlvit-base-patch32"
model = OwlViTForObjectDetection.from_pretrained(checkpoint).to(device)
model.eval()
processor = OwlViTProcessor.from_pretrained(checkpoint)
17
+
18
+
19
def query_image(img, text_queries, score_threshold):
    """Detect the queried objects in an image and draw the matches onto it.

    Args:
        img: Image as a NumPy array whose first two dimensions are read as
            (height, width). OpenCV draws the boxes and labels onto it.
        text_queries: Comma separated text descriptions of the objects to
            look for, e.g. ``"coffee mug, spoon, plate"``.
        score_threshold: Minimum score a detection needs in order to be drawn.

    Returns:
        The image with a box and the matching query text drawn for every
        detection whose score is at least ``score_threshold``.
    """
    # Strip the whitespace that follows each comma so that both the model
    # input and the drawn label contain the clean query text.
    text_queries = [query.strip() for query in text_queries.split(",")]

    img_height = img.shape[0]
    target_sizes = torch.Tensor([img.shape[:2]])
    inputs = processor(text=text_queries, images=img, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    # Move the predictions to the CPU, where target_sizes lives, before
    # post-processing them.
    # NOTE(review): newer transformers releases expose
    # post_process_object_detection as the replacement for post_process -
    # confirm against the installed version before switching.
    outputs.logits = outputs.logits.cpu()
    outputs.pred_boxes = outputs.pred_boxes.cpu()
    results = processor.post_process(outputs=outputs, target_sizes=target_sizes)
    boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]

    font = cv2.FONT_HERSHEY_SIMPLEX
    color = (255, 0, 0)

    for box, score, label in zip(boxes, scores, labels):
        if score >= score_threshold:
            left, top, right, bottom = (int(coord) for coord in box.tolist())
            img = cv2.rectangle(img, (left, top), (right, bottom), color, 5)

            # Write the label below the box, unless that would run past the
            # bottom edge of this image; then place it just inside the box.
            if bottom + 25 > img_height:
                label_y = bottom - 10
            else:
                label_y = bottom + 25

            img = cv2.putText(
                img, text_queries[label], (left, label_y), font, 1, color, 2, cv2.LINE_AA
            )
    return img
50
+
51
+
52
# Text shown under the demo title; it mixes HTML links with Markdown emphasis.
description = """
Gradio demo for <a href="https://huggingface.co/docs/transformers/main/en/model_doc/owlvit">OWL-ViT</a>,
introduced in <a href="https://arxiv.org/abs/2205.06230">Simple Open-Vocabulary Object Detection
with Vision Transformers</a>.
\n\nYou can use OWL-ViT to query images with text descriptions of any object.
To use it, simply upload an image and enter comma separated text descriptions of objects you want to query the image for. You
can also use the score threshold slider to set a threshold to filter out low probability predictions.

\n\nOWL-ViT is trained on text templates,
hence you can get better predictions by querying the image with text templates used in training the original model: *"photo of a star-spangled banner"*,
*"image of a shoe"*. Refer to the <a href="https://arxiv.org/abs/2103.00020">CLIP</a> paper to see the full list of text templates used to augment the training data.
\n\n<a href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb">Colab demo</a>
"""

# One component per query_image argument: image, text queries, score threshold.
input_components = [gr.Image(), "text", gr.Slider(0, 1, value=0.1)]

# Each example row supplies (image path, comma separated queries, threshold).
example_rows = [
    ["assets/astronaut.png", "human face, rocket, star-spangled banner, nasa badge", 0.11],
    ["assets/coffee.png", "coffee mug, spoon, plate", 0.1],
    ["assets/butterflies.jpeg", "orange butterfly", 0.3],
]

demo = gr.Interface(
    query_image,
    inputs=input_components,
    outputs="image",
    title="Zero-Shot Object Detection with OWL-ViT",
    description=description,
    examples=example_rows,
)
demo.launch()
assets/.DS_Store ADDED
Binary file (6.15 kB). View file
 
assets/Helvetica.ttf ADDED
Binary file (318 kB). View file
 
assets/astronaut.png ADDED
assets/butterflies.jpeg ADDED
assets/coffee.png ADDED
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # pip install -r requirements.txt
2
+
3
+ numpy>=1.18.5
4
+ torch>=1.7.0
5
+ torchvision>=0.8.1
6
+ git+https://github.com/huggingface/transformers.git
7
+ opencv-python