kevin159 committed
Commit: f3fd87b
Parent: ead9494

Upload 4 files

Files changed (4):
  1. app.py (+72 -0)
  2. inference.py (+146 -0)
  3. requirements.txt (+4 -0)
  4. utils.py (+237 -0)
app.py ADDED
@@ -0,0 +1,72 @@
+import gradio as gr
+import cv2
+from huggingface_hub import hf_hub_download
+from gradio_webrtc import WebRTC
+from twilio.rest import Client
+import os
+from inference import YOLOv10
+
+model_file = hf_hub_download(
+    repo_id="onnx-community/yolov10n", filename="onnx/model.onnx"
+)
+
+model = YOLOv10(model_file)
+
+account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
+auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
+
+if account_sid and auth_token:
+    client = Client(account_sid, auth_token)
+
+    token = client.tokens.create()
+
+    rtc_configuration = {
+        "iceServers": token.ice_servers,
+        "iceTransportPolicy": "relay",
+    }
+else:
+    rtc_configuration = None
+
+
+def detection(image, conf_threshold=0.3):
+    image = cv2.resize(image, (model.input_width, model.input_height))
+    new_image = model.detect_objects(image, conf_threshold)
+    return cv2.resize(new_image, (500, 500))
+
+
+css = """.my-group {max-width: 600px !important; max-height: 600px !important;}
+.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}"""
+
+
+with gr.Blocks(css=css) as demo:
+    gr.HTML(
+        """
+        <h1 style='text-align: center'>
+        YOLOv10 Webcam Stream (Powered by WebRTC ⚡️)
+        </h1>
+        """
+    )
+    gr.HTML(
+        """
+        <h3 style='text-align: center'>
+        <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>
+        </h3>
+        """
+    )
+    with gr.Column(elem_classes=["my-column"]):
+        with gr.Group(elem_classes=["my-group"]):
+            image = WebRTC(label="Stream", rtc_configuration=rtc_configuration)
+            conf_threshold = gr.Slider(
+                label="Confidence Threshold",
+                minimum=0.0,
+                maximum=1.0,
+                step=0.05,
+                value=0.30,
+            )
+
+        image.stream(
+            fn=detection, inputs=[image, conf_threshold], outputs=[image], time_limit=10
+        )
+
+if __name__ == "__main__":
+    demo.launch(share=True)
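
Note: the `detection` callback above can be exercised without a webcam by feeding it a single frame; a minimal sketch, assuming a local test image at the hypothetical path "test.jpg" (importing `app` builds the demo and downloads the model weights, but does not launch the server):

    import cv2
    from app import detection

    frame = cv2.imread("test.jpg")  # hypothetical test image; BGR, like a webcam frame
    annotated = detection(frame, conf_threshold=0.3)
    cv2.imwrite("annotated.jpg", annotated)  # `detection` resizes its output to 500x500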
inference.py ADDED
@@ -0,0 +1,146 @@
+import time
+import cv2
+import numpy as np
+import onnxruntime
+
+from utils import draw_detections
+
+
+class YOLOv10:
+    def __init__(self, path):
+        # Initialize model
+        self.initialize_model(path)
+
+    def __call__(self, image):
+        return self.detect_objects(image)
+
+    def initialize_model(self, path):
+        self.session = onnxruntime.InferenceSession(
+            path, providers=onnxruntime.get_available_providers()
+        )
+        # Get model info
+        self.get_input_details()
+        self.get_output_details()
+
+    def detect_objects(self, image, conf_threshold=0.3):
+        input_tensor = self.prepare_input(image)
+
+        # Perform inference on the image
+        new_image = self.inference(image, input_tensor, conf_threshold)
+
+        return new_image
+
+    def prepare_input(self, image):
+        self.img_height, self.img_width = image.shape[:2]
+
+        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        # Resize input image
+        input_img = cv2.resize(input_img, (self.input_width, self.input_height))
+
+        # Scale input pixel values to 0 to 1
+        input_img = input_img / 255.0
+        input_img = input_img.transpose(2, 0, 1)
+        input_tensor = input_img[np.newaxis, :, :, :].astype(np.float32)
+
+        return input_tensor
+
+    def inference(self, image, input_tensor, conf_threshold=0.3):
+        start = time.perf_counter()
+        outputs = self.session.run(
+            self.output_names, {self.input_names[0]: input_tensor}
+        )
+
+        print(f"Inference time: {(time.perf_counter() - start)*1000:.2f} ms")
+        boxes, scores, class_ids = self.process_output(outputs, conf_threshold)
+        return self.draw_detections(image, boxes, scores, class_ids)
+
+    def process_output(self, output, conf_threshold=0.3):
+        predictions = np.squeeze(output[0])
+
+        # Filter out object confidence scores below threshold
+        scores = predictions[:, 4]
+        predictions = predictions[scores > conf_threshold, :]
+        scores = scores[scores > conf_threshold]
+
+        if len(scores) == 0:
+            return [], [], []
+
+        # Get the class with the highest confidence
+        class_ids = predictions[:, 5].astype(int)
+
+        # Get bounding boxes for each object
+        boxes = self.extract_boxes(predictions)
+
+        return boxes, scores, class_ids
+
+    def extract_boxes(self, predictions):
+        # Extract boxes from predictions
+        boxes = predictions[:, :4]
+
+        # Scale boxes to original image dimensions
+        boxes = self.rescale_boxes(boxes)
+
+        # YOLOv10 outputs are already in xyxy format, so no conversion is needed
+        # boxes = xywh2xyxy(boxes)
+
+        return boxes
+
+    def rescale_boxes(self, boxes):
+        # Rescale boxes to original image dimensions
+        input_shape = np.array(
+            [self.input_width, self.input_height, self.input_width, self.input_height]
+        )
+        boxes = np.divide(boxes, input_shape, dtype=np.float32)
+        boxes *= np.array(
+            [self.img_width, self.img_height, self.img_width, self.img_height]
+        )
+        return boxes
+
+    def draw_detections(self, image, boxes, scores, class_ids, mask_alpha=0.4):
+        return draw_detections(image, boxes, scores, class_ids, mask_alpha)
+
+    def get_input_details(self):
+        model_inputs = self.session.get_inputs()
+        self.input_names = [model_inputs[i].name for i in range(len(model_inputs))]
+
+        self.input_shape = model_inputs[0].shape
+        self.input_height = self.input_shape[2]
+        self.input_width = self.input_shape[3]
+
+    def get_output_details(self):
+        model_outputs = self.session.get_outputs()
+        self.output_names = [model_outputs[i].name for i in range(len(model_outputs))]
+
+
+if __name__ == "__main__":
+    import requests
+    import tempfile
+    from huggingface_hub import hf_hub_download
+
+    model_file = hf_hub_download(
+        repo_id="onnx-community/yolov10s", filename="onnx/model.onnx"
+    )
+
+    yolov10_detector = YOLOv10(model_file)
+
+    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as f:
+        f.write(
+            requests.get(
+                "https://live.staticflickr.com/13/19041780_d6fd803de0_3k.jpg"
+            ).content
+        )
+        f.seek(0)
+        img = cv2.imread(f.name)
+
+    # Detect objects
+    combined_image = yolov10_detector.detect_objects(img)
+
+    # Show detections
+    cv2.namedWindow("Output", cv2.WINDOW_NORMAL)
+    cv2.imshow("Output", combined_image)
+    cv2.waitKey(0)
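
Note: the `__main__` demo above ends with `cv2.imshow`, which needs a display and fails in headless environments such as a Space container. Saving the annotated frame is a display-free alternative (a sketch reusing the same `combined_image`):

    # Headless alternative to the imshow/waitKey block above
    cv2.imwrite("detected_objects.jpg", combined_image)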
requirements.txt ADDED
@@ -0,0 +1,4 @@
+opencv-python
+twilio
+gradio-webrtc==0.0.4
+onnxruntime-gpu
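
Note: `onnxruntime-gpu` only accelerates inference when a compatible CUDA setup is present; since `YOLOv10.initialize_model` passes `onnxruntime.get_available_providers()` straight to the session, the model silently falls back to CPU otherwise. A quick sanity check (a sketch):

    import onnxruntime

    # On a working GPU setup, "CUDAExecutionProvider" should appear
    # ahead of "CPUExecutionProvider" in this list.
    print(onnxruntime.get_available_providers())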
utils.py ADDED
@@ -0,0 +1,237 @@
+import numpy as np
+import cv2
+
+class_names = [
+    "person",
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "backpack",
+    "umbrella",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "dining table",
+    "toilet",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush",
+]
+
+# Create one random color per class (3 values in [0, 255])
+rng = np.random.default_rng(3)
+colors = rng.uniform(0, 255, size=(len(class_names), 3))
+
+
+def nms(boxes, scores, iou_threshold):
+    # Sort by score
+    sorted_indices = np.argsort(scores)[::-1]
+
+    keep_boxes = []
+    while sorted_indices.size > 0:
+        # Pick the box with the highest remaining score
+        box_id = sorted_indices[0]
+        keep_boxes.append(box_id)
+
+        # Compute IoU of the picked box with the rest
+        ious = compute_iou(boxes[box_id, :], boxes[sorted_indices[1:], :])
+
+        # Remove boxes with IoU over the threshold
+        keep_indices = np.where(ious < iou_threshold)[0]
+        sorted_indices = sorted_indices[keep_indices + 1]
+
+    return keep_boxes
+
+
+def multiclass_nms(boxes, scores, class_ids, iou_threshold):
+    unique_class_ids = np.unique(class_ids)
+
+    keep_boxes = []
+    for class_id in unique_class_ids:
+        class_indices = np.where(class_ids == class_id)[0]
+        class_boxes = boxes[class_indices, :]
+        class_scores = scores[class_indices]
+
+        class_keep_boxes = nms(class_boxes, class_scores, iou_threshold)
+        keep_boxes.extend(class_indices[class_keep_boxes])
+
+    return keep_boxes
+
+
+def compute_iou(box, boxes):
+    # Compute xmin, ymin, xmax, ymax of the intersection
+    xmin = np.maximum(box[0], boxes[:, 0])
+    ymin = np.maximum(box[1], boxes[:, 1])
+    xmax = np.minimum(box[2], boxes[:, 2])
+    ymax = np.minimum(box[3], boxes[:, 3])
+
+    # Compute intersection area
+    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)
+
+    # Compute union area
+    box_area = (box[2] - box[0]) * (box[3] - box[1])
+    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+    union_area = box_area + boxes_area - intersection_area
+
+    # Compute IoU
+    iou = intersection_area / union_area
+
+    return iou
+
+
+def xywh2xyxy(x):
+    # Convert bounding box (x, y, w, h) to bounding box (x1, y1, x2, y2)
+    y = np.copy(x)
+    y[..., 0] = x[..., 0] - x[..., 2] / 2
+    y[..., 1] = x[..., 1] - x[..., 3] / 2
+    y[..., 2] = x[..., 0] + x[..., 2] / 2
+    y[..., 3] = x[..., 1] + x[..., 3] / 2
+    return y
+
+
+def draw_detections(image, boxes, scores, class_ids, mask_alpha=0.3):
+    det_img = image.copy()
+
+    img_height, img_width = image.shape[:2]
+    font_size = min([img_height, img_width]) * 0.0006
+    text_thickness = int(min([img_height, img_width]) * 0.001)
+
+    # det_img = draw_masks(det_img, boxes, class_ids, mask_alpha)
+
+    # Draw bounding boxes and labels of detections
+    for class_id, box, score in zip(class_ids, boxes, scores):
+        color = colors[class_id]
+
+        draw_box(det_img, box, color)
+
+        label = class_names[class_id]
+        caption = f"{label} {int(score * 100)}%"
+        draw_text(det_img, caption, box, color, font_size, text_thickness)
+
+    return det_img
+
+
+def draw_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    thickness: int = 2,
+) -> np.ndarray:
+    x1, y1, x2, y2 = box.astype(int)
+    return cv2.rectangle(image, (x1, y1), (x2, y2), color, thickness)
+
+
+def draw_text(
+    image: np.ndarray,
+    text: str,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    font_size: float = 0.001,
+    text_thickness: int = 2,
+) -> np.ndarray:
+    x1, y1, x2, y2 = box.astype(int)
+    (tw, th), _ = cv2.getTextSize(
+        text=text,
+        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
+        fontScale=font_size,
+        thickness=text_thickness,
+    )
+    th = int(th * 1.2)
+
+    cv2.rectangle(image, (x1, y1), (x1 + tw, y1 - th), color, -1)
+
+    return cv2.putText(
+        image,
+        text,
+        (x1, y1),
+        cv2.FONT_HERSHEY_SIMPLEX,
+        font_size,
+        (255, 255, 255),
+        text_thickness,
+        cv2.LINE_AA,
+    )
+
+
+def draw_masks(
+    image: np.ndarray, boxes: np.ndarray, classes: np.ndarray, mask_alpha: float = 0.3
+) -> np.ndarray:
+    mask_img = image.copy()
+
+    # Draw a filled rectangle per detection in the mask image
+    for box, class_id in zip(boxes, classes):
+        color = colors[class_id]
+
+        x1, y1, x2, y2 = box.astype(int)
+
+        cv2.rectangle(mask_img, (x1, y1), (x2, y2), color, -1)
+
+    # Blend the filled mask image with the original
+    return cv2.addWeighted(mask_img, mask_alpha, image, 1 - mask_alpha, 0)
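
Note: a small worked example of the greedy IoU filtering that `nms` performs, using toy xyxy boxes (a sketch; values are chosen so the first two boxes overlap heavily and the third is disjoint):

    import numpy as np
    from utils import compute_iou, nms

    boxes = np.array([[0, 0, 10, 10], [1, 1, 11, 11], [20, 20, 30, 30]], dtype=np.float32)
    scores = np.array([0.9, 0.8, 0.7])

    print(compute_iou(boxes[0], boxes[1:]))       # ~[0.68, 0.0]
    print(nms(boxes, scores, iou_threshold=0.5))  # [0, 2]: box 1 is suppressed by box 0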