Commit b202543
Parent(s): 1cd092f

initial commit with working code (local)

Files changed:
- .gitignore +3 -0
- README.md +1 -0
- app.py +354 -0
- cfg/openimages.names +601 -0
- cfg/yolov3-openimages.cfg +789 -0
- darknet.py +322 -0
- detect.py +161 -0
- requirements.txt +4 -0
- utils.py +237 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
.ipynb_checkpoints
__pycache__
desktop.ini
README.md CHANGED
@@ -11,3 +11,4 @@ license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
app.py ADDED
@@ -0,0 +1,354 @@
# Facial Recognition with Emotion / Sentiment Detector
#
# This is a custom, hard-coded version of darknet with
# YOLOv3 implementation for the openimages database. This
# was written to test the viability of implementing YOLO
# for face detection followed by emotion / sentiment
# analysis.
#
# Configuration, weights and data are hardcoded.
# This version takes any image, detects faces,
# and then runs emotion / sentiment analysis.
#
# Author    : Saikiran Tharimena
# Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
# Project   : Emotion / Sentiment Detection from news images
# Date      : 12 September 2022
# Version   : v0.1
#
# (C) Schibsted ASA

# Libraries
import torch
from utils import *
import gradio as gr
from numpy import array
from darknet import Darknet
from torch.autograd import Variable
from torch.cuda import is_available as check_cuda
from PIL.ImageOps import grayscale
from fastai.vision.all import PILImage, load_learner

################## DARKNET ##################
# Parameters
batch_size = 1
confidence = 0.25
nms_thresh = 0.30
run_cuda = False

# CFG files
cfg      = 'cfg/yolov3-openimages.cfg'
clsnames = 'cfg/openimages.names'
weights  = 'cfg/yolov3-openimages.weights'

# Load classes
classes = load_classes(clsnames)
num_classes = len(classes)

# Set up the neural network
print('Load Network')
model = Darknet(cfg)

print('Load Weights')
model.load_weights(weights)

print('Successfully loaded Network')

# Check CUDA
if run_cuda:
    CUDA = check_cuda()
else:
    CUDA = False

# Input dimension
inp_dim = int(model.net_info["height"])

# Put the model on the GPU
if CUDA:
    model.cuda()

# Set the model in evaluation mode
model.eval()


def get_detections(x):
    c1 = [int(y) for y in x[1:3]]
    c2 = [int(y) for y in x[3:5]]

    det_class = int(x[-1])
    label = "{0}".format(classes[det_class])

    return (label, tuple(c1 + c2))


# Face detector
def detector(image):
    # Just lazy to update this
    imlist = [image]
    loaded_ims = [image]

    im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))]))
    im_dim_list = [(x.shape[1], x.shape[0]) for x in loaded_ims]
    im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)

    leftover = 0
    if (len(im_dim_list) % batch_size):
        leftover = 1

    if batch_size != 1:
        num_batches = len(imlist) // batch_size + leftover
        im_batches = [torch.cat((im_batches[i*batch_size : min((i + 1)*batch_size,
                                 len(im_batches))])) for i in range(num_batches)]

    write = 0
    if CUDA:
        im_dim_list = im_dim_list.cuda()

    for i, batch in enumerate(im_batches):
        # Load the image
        if CUDA:
            batch = batch.cuda()
        with torch.no_grad():
            prediction = model(Variable(batch), CUDA)

        prediction = write_results(prediction, confidence, num_classes, nms_conf=nms_thresh)

        if type(prediction) == int:
            for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
                im_id = i*batch_size + im_num
            continue

        prediction[:, 0] += i*batch_size  # transform the attribute from index in batch to index in imlist

        if not write:  # If we haven't initialised output
            output = prediction
            write = 1
        else:
            output = torch.cat((output, prediction))

        for im_num, image in enumerate(imlist[i*batch_size: min((i + 1)*batch_size, len(imlist))]):
            im_id = i * batch_size + im_num
            objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]

        if CUDA:
            torch.cuda.synchronize()

    try:
        output
    except NameError:
        return None

    im_dim_list = torch.index_select(im_dim_list, 0, output[:, 0].long())

    scaling_factor = torch.min(608/im_dim_list, 1)[0].view(-1, 1)

    output[:, [1, 3]] -= (inp_dim - scaling_factor*im_dim_list[:, 0].view(-1, 1))/2
    output[:, [2, 4]] -= (inp_dim - scaling_factor*im_dim_list[:, 1].view(-1, 1))/2

    output[:, 1:5] /= scaling_factor

    for i in range(output.shape[0]):
        output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, im_dim_list[i, 0])
        output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, im_dim_list[i, 1])

    detections = list(map(get_detections, output))

    if CUDA:
        torch.cuda.empty_cache()

    return loaded_ims[0], detections
#############################################


# Emotion
learn_emotion = load_learner('models/emotions_vgg19.pkl')
learn_emotion_labels = learn_emotion.dls.vocab

# Sentiment
learn_sentiment = load_learner('models/sentiment_vgg19.pkl')
learn_sentiment_labels = learn_sentiment.dls.vocab


def crop_images(img, bbox):
    "Here image should be an image object from PILImage.create"

    # Coordinates of face in cv2 format
    xmin, ymin, xmax, ymax = bbox[1]

    # Resize and crop face
    return img.crop((xmin, ymin, xmax, ymax))


def detect_person_face(img, detections):
    '''This function is called from within detect_face.
    If only a person is detected, then this will crop the
    image and then try to detect a face again.'''

    faces = []

    # Loop through people
    for detection in detections:

        # Get cropped image of person
        temp = crop_images(img, detection)

        # Run detector again
        _, detect = detector(array(temp)[..., :3])

        # Check for human faces
        human_face = [idx for idx, val in enumerate(detect) if val[0] == 'Human face']

        if len(human_face) == 0:
            continue

        # Force it to take only 1 face per person:
        # crop the face and append it to the list
        faces.append(crop_images(temp, detect[human_face[0]]))

    return faces


def detect_face(img):

    _, detections = detector(array(img)[..., :3])

    # Check for human faces
    human_face = [idx for idx, val in enumerate(detections) if val[0] == 'Human face']

    if len(human_face) == 0:
        human_face = [idx for idx, val in enumerate(detections) if val[0] == 'Person']

        if len(human_face) == 0:
            return None
        else:
            # Only get human face detections
            faces = detect_person_face(img, [detections[idx] for idx in human_face])

    else:
        # Only get human face detections
        faces = []

        for idx in human_face:
            faces.append(crop_images(img, detections[idx]))

    return faces


# Predict
def predict(img):

    img = PILImage.create(img)

    # Detect faces
    faces = detect_face(img)

    output = []

    if faces is None or len(faces) == 0:  # detect_face returns None when nothing is found

        img = img.resize((48, 48))

        pred_emotion, pred_emotion_idx, probs_emotion = learn_emotion.predict(array(grayscale(img)))

        pred_sentiment, pred_sentiment_idx, probs_sentiment = learn_sentiment.predict(array(grayscale(img)))

        emotions = {learn_emotion_labels[i]: float(probs_emotion[i]) for i in range(len(learn_emotion_labels))}
        sentiments = {learn_sentiment_labels[i]: float(probs_sentiment[i]) for i in range(len(learn_sentiment_labels))}

        output = [img.resize((48, 48)), emotions, sentiments, None, None, None, None, None, None]

    else:  # Max 3 for now
        for face in faces[:3]:

            img = face.resize((48, 48))

            pred_emotion, pred_emotion_idx, probs_emotion = learn_emotion.predict(array(grayscale(img)))

            pred_sentiment, pred_sentiment_idx, probs_sentiment = learn_sentiment.predict(array(grayscale(img)))

            emotions = {learn_emotion_labels[i]: float(probs_emotion[i]) for i in range(len(learn_emotion_labels))}
            sentiments = {learn_sentiment_labels[i]: float(probs_sentiment[i]) for i in range(len(learn_sentiment_labels))}

            output.append(img)
            output.append(emotions)
            output.append(sentiments)

        # Pad to the fixed 9 Gradio output slots by repeating the last face
        temp = output[-3:]
        while len(output) < 9:
            output = output + temp

    return output


# Gradio
title = 'Face Recognition with Emotion and Sentiment Detector'

description = gr.Markdown(
    """Ever wondered what a person might be feeling looking at their picture?
    Well, now you can! Try this fun app. Just upload a facial image in JPG or
    PNG format. Voila! You can now see what they might have felt when the picture
    was taken.

    This is an updated version of Facial Expression Classifier:
    https://huggingface.co/spaces/schibsted/facial_expression_classifier
    """).value

article = gr.Markdown(
    """**DISCLAIMER:** This model does not reveal the actual emotional state of a person. Use and
    interpret results at your own risk! It was built as a demo for an AI course. Sample images
    were downloaded from VG & Aftenposten news webpages. Copyrights belong to the respective
    brands. All rights reserved.

    **PREMISE:** The idea is to determine the overall sentiment of a news site on a daily basis
    based on its pictures. We are restricting pictures to only include close-up facial
    images.

    **DATA:** The FER2013 dataset consists of 48x48 pixel grayscale images of faces. There are 28,709
    images in the training set and 3,589 images in the test set. However, for this demo all
    pictures were combined into a single dataset and an 80:20 split was used for training. Images
    are assigned one of 7 emotions: Angry, Disgust, Fear, Happy, Sad, Surprise, and Neutral.
    In addition to these 7 classes, images were re-classified into 3 sentiment categories based
    on the emotions:

    Positive (Happy, Surprise)

    Negative (Angry, Disgust, Fear, Sad)

    Neutral (Neutral)

    The FER2013 (preliminary version) dataset can be downloaded at:
    https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data

    **EMOTION / SENTIMENT MODEL:** VGG19 was used as the base model and trained on the FER2013 dataset. The model was
    trained using PyTorch and fastai. Two models were trained, one for detecting emotion and the other
    for detecting sentiment. Although this could have been done with just one model, two
    models were trained here for the demo.

    **FACE DETECTOR:** Darknet with the YOLOv3 architecture was used for face detection. Reach out to me for full details.
    In short, any image is first sent through darknet. Each face detected in the picture is passed through the
    emotion/sentiment models. If a person is detected rather than a face, the image is cropped and run through the
    face detector again; if a face is then found, it is passed through the emotion/sentiment models. In case no face is
    detected in an image, the entire image is evaluated to generate some score. This is done because I couldn't
    figure out how to pipe None/blank output to Gradio.Interface(). There may be an option through Gradio.Blocks(), but I was
    too lazy to go through that at this stage. In addition, the output is restricted to only 3 faces per picture.
    """).value

enable_queue = True

examples = ['happy1.jpg', 'happy2.jpg', 'angry1.png', 'angry2.jpg', 'neutral1.jpg', 'neutral2.jpg']

gr.Interface(fn=predict,
             inputs=gr.Image(),
             outputs=[gr.Image(shape=(24, 24), label='Person 1'),
                      gr.Label(label='Emotion - Person 1'),
                      gr.Label(label='Sentiment - Person 1'),
                      gr.Image(shape=(24, 24), label='Person 2'),
                      gr.Label(label='Emotion - Person 2'),
                      gr.Label(label='Sentiment - Person 2'),
                      gr.Image(shape=(24, 24), label='Person 3'),
                      gr.Label(label='Emotion - Person 3'),
                      gr.Label(label='Sentiment - Person 3')],  # gr.Label(),
             title=title,
             examples=examples,
             description=description,
             article=article,
             allow_flagging='never').launch(enable_queue=enable_queue)
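
Note on the fixed output contract (not part of the commit): predict() always returns exactly 9 items, i.e. 3 slots of (face image, emotion probabilities, sentiment probabilities), because gr.Interface needs a static output list. A minimal local smoke test, assuming the models/*.pkl weights and the bundled example images are available:

    out = predict('happy1.jpg')
    assert len(out) == 9  # 3 persons x (image, emotions dict, sentiments dict)
    for i in range(0, 9, 3):
        face, emotions, sentiments = out[i:i + 3]
        if emotions is not None:  # unused slots hold None or repeat the last face
            print(max(emotions, key=emotions.get), max(sentiments, key=sentiments.get))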
cfg/openimages.names ADDED
@@ -0,0 +1,601 @@
Tortoise
Container
Magpie
Sea turtle
Football
Ambulance
Ladder
Toothbrush
Syringe
Sink
Toy
Organ
Cassette deck
Apple
Human eye
Cosmetics
Paddle
Snowman
Beer
Chopsticks
Human beard
Bird
Parking meter
Traffic light
Croissant
Cucumber
Radish
Towel
Doll
Skull
Washing machine
Glove
Tick
Belt
Sunglasses
Banjo
Cart
Ball
Backpack
Bicycle
Home appliance
Centipede
Boat
Surfboard
Boot
Headphones
Hot dog
Shorts
Fast food
Bus
Boy
Screwdriver
Bicycle wheel
Barge
Laptop
Miniskirt
Drill
Dress
Bear
Waffle
Pancake
Brown bear
Woodpecker
Blue jay
Pretzel
Bagel
Tower
Teapot
Person
Bow and arrow
Swimwear
Beehive
Brassiere
Bee
Bat
Starfish
Popcorn
Burrito
Chainsaw
Balloon
Wrench
Tent
Vehicle registration plate
Lantern
Toaster
Flashlight
Billboard
Tiara
Limousine
Necklace
Carnivore
Scissors
Stairs
Computer keyboard
Printer
Traffic sign
Chair
Shirt
Poster
Cheese
Sock
Fire hydrant
Land vehicle
Earrings
Tie
Watercraft
Cabinetry
Suitcase
Muffin
Bidet
Snack
Snowmobile
Clock
Medical equipment
Cattle
Cello
Jet ski
Camel
Coat
Suit
Desk
Cat
Bronze sculpture
Juice
Gondola
Beetle
Cannon
Computer mouse
Cookie
Office building
Fountain
Coin
Calculator
Cocktail
Computer monitor
Box
Stapler
Christmas tree
Cowboy hat
Hiking equipment
Studio couch
Drum
Dessert
Wine rack
Drink
Zucchini
Ladle
Human mouth
Dairy
Dice
Oven
Dinosaur
Ratchet
Couch
Cricket ball
Winter melon
Spatula
Whiteboard
Pencil sharpener
Door
Hat
Shower
Eraser
Fedora
Guacamole
Dagger
Scarf
Dolphin
Sombrero
Tin can
Mug
Tap
Harbor seal
Stretcher
Can opener
Goggles
Human body
Roller skates
Coffee cup
Cutting board
Blender
Plumbing fixture
Stop sign
Office supplies
Volleyball
Vase
Slow cooker
Wardrobe
Coffee
Whisk
Paper towel
Personal care
Food
Sun hat
Tree house
Flying disc
Skirt
Gas stove
Salt and pepper shakers
Mechanical fan
Face powder
Fax
Fruit
French fries
Nightstand
Barrel
Kite
Tart
Treadmill
Fox
Flag
Horn
Window blind
Human foot
Golf cart
Jacket
Egg
Street light
Guitar
Pillow
Human leg
Isopod
Grape
Human ear
Power plugs and sockets
Panda
Giraffe
Woman
Door handle
Rhinoceros
Bathtub
Goldfish
Houseplant
Goat
Baseball bat
Baseball glove
Mixing bowl
Marine invertebrates
Kitchen utensil
Light switch
House
Horse
Stationary bicycle
Hammer
Ceiling fan
Sofa bed
Adhesive tape
Harp
Sandal
Bicycle helmet
Saucer
Harpsichord
Human hair
Heater
Harmonica
Hamster
Curtain
Bed
Kettle
Fireplace
Scale
Drinking straw
Insect
Hair dryer
Kitchenware
Indoor rower
Invertebrate
Food processor
Bookcase
Refrigerator
Wood-burning stove
Punching bag
Common fig
Cocktail shaker
Jaguar
Golf ball
Fashion accessory
Alarm clock
Filing cabinet
Artichoke
Table
Tableware
Kangaroo
Koala
Knife
Bottle
Bottle opener
Lynx
Lavender
Lighthouse
Dumbbell
Human head
Bowl
Humidifier
Porch
Lizard
Billiard table
Mammal
Mouse
Motorcycle
Musical instrument
Swim cap
Frying pan
Snowplow
Bathroom cabinet
Missile
Bust
Man
Waffle iron
Milk
Ring binder
Plate
Mobile phone
Baked goods
Mushroom
Crutch
Pitcher
Mirror
Lifejacket
Table tennis racket
Pencil case
Musical keyboard
Scoreboard
Briefcase
Kitchen knife
Nail
Tennis ball
Plastic bag
Oboe
Chest of drawers
Ostrich
Piano
Girl
Plant
Potato
Hair spray
Sports equipment
Pasta
Penguin
Pumpkin
Pear
Infant bed
Polar bear
Mixer
Cupboard
Jacuzzi
Pizza
Digital clock
Pig
Reptile
Rifle
Lipstick
Skateboard
Raven
High heels
Red panda
Rose
Rabbit
Sculpture
Saxophone
Shotgun
Seafood
Submarine sandwich
Snowboard
Sword
Picture frame
Sushi
Loveseat
Ski
Squirrel
Tripod
Stethoscope
Submarine
Scorpion
Segway
Training bench
Snake
Coffee table
Skyscraper
Sheep
Television
Trombone
Tea
Tank
Taco
Telephone
Torch
Tiger
Strawberry
Trumpet
Tree
Tomato
Train
Tool
Picnic basket
Cooking spray
Trousers
Bowling equipment
Football helmet
Truck
Measuring cup
Coffeemaker
Violin
Vehicle
Handbag
Paper cutter
Wine
Weapon
Wheel
Worm
Wok
Whale
Zebra
Auto part
Jug
Pizza cutter
Cream
Monkey
Lion
Bread
Platter
Chicken
Eagle
Helicopter
Owl
Duck
Turtle
Hippopotamus
Crocodile
Toilet
Toilet paper
Squid
Clothing
Footwear
Lemon
Spider
Deer
Frog
Banana
Rocket
Wine glass
Countertop
Tablet computer
Waste container
Swimming pool
Dog
Book
Elephant
Shark
Candle
Leopard
Axe
Hand dryer
Soap dispenser
Porcupine
Flower
Canary
Cheetah
Palm tree
Hamburger
Maple
Building
Fish
Lobster
Asparagus
Furniture
Hedgehog
Airplane
Spoon
Otter
Bull
Oyster
Horizontal bar
Convenience store
Bomb
Bench
Ice cream
Caterpillar
Butterfly
Parachute
Orange
Antelope
Beaker
Moths and butterflies
Window
Closet
Castle
Jellyfish
Goose
Mule
Swan
Peach
Coconut
Seat belt
Raccoon
Chisel
Fork
Lamp
Camera
Squash
Racket
Human face
Human arm
Vegetable
Diaper
Unicycle
Falcon
Chime
Snail
Shellfish
Cabbage
Carrot
Mango
Jeans
Flowerpot
Pineapple
Drawer
Stool
Envelope
Cake
Dragonfly
Sunflower
Microwave oven
Honeycomb
Marine mammal
Sea lion
Ladybug
Shelf
Watch
Candy
Salad
Parrot
Handgun
Sparrow
Van
Grinder
Spice rack
Light bulb
Corded phone
Sports uniform
Tennis racket
Wall clock
Serving tray
Kitchen & dining room table
Dog bed
Cake stand
Cat furniture
Bathroom accessory
Facial tissue holder
Pressure cooker
Kitchen appliance
Tire
Ruler
Luggage and bags
Microphone
Broccoli
Umbrella
Pastry
Grapefruit
Band-aid
Animal
Bell pepper
Turkey
Lily
Pomegranate
Doughnut
Glasses
Human nose
Pen
Ant
Car
Aircraft
Human hand
Skunk
Teddy bear
Watermelon
Cantaloupe
Dishwasher
Flute
Balance beam
Sandwich
Shrimp
Sewing machine
Binoculars
Rays and skates
Ipod
Accordion
Willow
Crab
Crown
Seahorse
Perfume
Alpaca
Taxi
Canoe
Remote control
Wheelchair
Rugby ball
Armadillo
Maracas
Helmet
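
utils.py (+237 lines) is part of this commit but its diff is not shown here, so the load_classes() that app.py calls on this file is not visible. All it has to do is turn the 601 names above into a list whose index matches the class id emitted by the detector; a minimal sketch under that assumption (the real utils.py may differ):

    def load_classes(namesfile):
        # One class name per line; list index == class id predicted by the YOLO head.
        with open(namesfile) as fp:
            return [line.strip() for line in fp if line.strip()]

    classes = load_classes('cfg/openimages.names')
    assert len(classes) == 601
    assert classes[501] == 'Human face'  # entry 502 in the list above, zero-indexed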
cfg/yolov3-openimages.cfg ADDED
@@ -0,0 +1,789 @@
[net]
# Testing
batch=1
subdivisions=1
# Training
batch=64
subdivisions=16
width=608
height=608
channels=3
momentum=0.9
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1

learning_rate=0.001
burn_in=5000
max_batches = 500200
policy=steps
steps=400000,450000
scales=.1,.1

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

# Downsample

[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=512
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

# Downsample

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=2
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

[shortcut]
from=-3
activation=linear

######################

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=1024
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=1818
activation=linear

[yolo]
mask = 6,7,8
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=601
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1

[route]
layers = -4

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 61

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=512
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=1818
activation=linear

[yolo]
mask = 3,4,5
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=601
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1

[route]
layers = -4

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[upsample]
stride=2

[route]
layers = -1, 36

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
size=3
stride=1
pad=1
filters=256
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=1818
activation=linear

[yolo]
mask = 0,1,2
anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
classes=601
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1
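
A detail worth spelling out: filters=1818 on each convolutional layer feeding a [yolo] block is derived, not arbitrary. Each scale predicts 3 of the 9 anchors (its mask), and every anchor predicts 4 box coordinates, 1 objectness score, and 601 class scores:

    anchors_per_scale = 3            # length of each mask, e.g. 6,7,8
    num_classes = 601                # classes=601 above
    filters = anchors_per_scale * (4 + 1 + num_classes)
    assert filters == 1818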
darknet.py ADDED
@@ -0,0 +1,322 @@
# PyTorch implementation of Darknet
# This is a custom, hard-coded version of darknet with
# YOLOv3 implementation for the openimages database. This
# was written to test the viability of implementing YOLO
# for face detection followed by emotion / sentiment
# analysis.
#
# Configuration, weights and data are hardcoded.
# Additional options include the ability to create a
# subset of data with faces extracted for labelling.
#
# Author    : Saikiran Tharimena
# Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
# Project   : Emotion / Sentiment Detection from news images
# Date      : 12 September 2022
# Version   : v0.1
#
# (C) Schibsted ASA

# Libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from utils import *


def parse_cfg(cfgfile):
    """
    Takes a configuration file

    Returns a list of blocks. Each block describes a block in the neural
    network to be built. A block is represented as a dictionary in the list
    """

    file = open(cfgfile, 'r')
    lines = file.read().split('\n')               # store the lines in a list
    lines = [x for x in lines if len(x) > 0]      # get rid of the empty lines
    lines = [x for x in lines if x[0] != '#']     # get rid of comments
    lines = [x.rstrip().lstrip() for x in lines]  # get rid of fringe whitespace

    block = {}
    blocks = []

    for line in lines:
        if line[0] == "[":            # This marks the start of a new block
            if len(block) != 0:       # If block is not empty, it stores values of the previous block.
                blocks.append(block)  # add it to the blocks list
                block = {}            # re-init the block
            block["type"] = line[1:-1].rstrip()
        else:
            key, value = line.split("=")
            block[key.rstrip()] = value.lstrip()
    blocks.append(block)

    return blocks


class EmptyLayer(nn.Module):
    def __init__(self):
        super(EmptyLayer, self).__init__()


class DetectionLayer(nn.Module):
    def __init__(self, anchors):
        super(DetectionLayer, self).__init__()
        self.anchors = anchors


def create_modules(blocks):
    net_info = blocks[0]  # Captures the information about the input and pre-processing
    module_list = nn.ModuleList()
    prev_filters = 3
    output_filters = []

    for index, x in enumerate(blocks[1:]):
        module = nn.Sequential()

        # Check the type of block,
        # create a new module for the block,
        # append to module_list

        # If it's a convolutional layer
        if (x["type"] == "convolutional"):
            # Get the info about the layer
            activation = x["activation"]
            try:
                batch_normalize = int(x["batch_normalize"])
                bias = False
            except:
                batch_normalize = 0
                bias = True

            filters = int(x["filters"])
            padding = int(x["pad"])
            kernel_size = int(x["size"])
            stride = int(x["stride"])

            if padding:
                pad = (kernel_size - 1) // 2
            else:
                pad = 0

            # Add the convolutional layer
            conv = nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=bias)
            module.add_module("conv_{0}".format(index), conv)

            # Add the Batch Norm layer
            if batch_normalize:
                bn = nn.BatchNorm2d(filters)
                module.add_module("batch_norm_{0}".format(index), bn)

            # Check the activation.
            # It is either Linear or a Leaky ReLU for YOLO
            if activation == "leaky":
                activn = nn.LeakyReLU(0.1, inplace=True)
                module.add_module("leaky_{0}".format(index), activn)

        # If it's an upsampling layer
        elif (x["type"] == "upsample"):
            stride = int(x["stride"])
            upsample = nn.Upsample(scale_factor=2, mode="nearest")
            module.add_module("upsample_{}".format(index), upsample)

        # If it is a route layer
        elif (x["type"] == "route"):
            x["layers"] = x["layers"].split(',')
            # Start of a route
            start = int(x["layers"][0])
            # End, if there exists one
            try:
                end = int(x["layers"][1])
            except:
                end = 0
            # Positive annotation
            if start > 0:
                start = start - index
            if end > 0:
                end = end - index
            route = EmptyLayer()
            module.add_module("route_{0}".format(index), route)
            if end < 0:
                filters = output_filters[index + start] + output_filters[index + end]
            else:
                filters = output_filters[index + start]

        # Shortcut corresponds to skip connection
        elif x["type"] == "shortcut":
            shortcut = EmptyLayer()
            module.add_module("shortcut_{}".format(index), shortcut)

        # Yolo is the detection layer
        elif x["type"] == "yolo":
            mask = x["mask"].split(",")
            mask = [int(x) for x in mask]

            anchors = x["anchors"].split(",")
            anchors = [int(a) for a in anchors]
            anchors = [(anchors[i], anchors[i+1]) for i in range(0, len(anchors), 2)]
            anchors = [anchors[i] for i in mask]

            detection = DetectionLayer(anchors)
            module.add_module("Detection_{}".format(index), detection)

        module_list.append(module)
        prev_filters = filters
        output_filters.append(filters)

    return (net_info, module_list)


class Darknet(nn.Module):
    def __init__(self, cfgfile):
        super(Darknet, self).__init__()
        self.blocks = parse_cfg(cfgfile)
        self.net_info, self.module_list = create_modules(self.blocks)

    def forward(self, x, CUDA):
        modules = self.blocks[1:]
        outputs = {}  # We cache the outputs for the route layer

        write = 0
        for i, module in enumerate(modules):
            module_type = (module["type"])

            if module_type == "convolutional" or module_type == "upsample":
                x = self.module_list[i](x)

            elif module_type == "route":
                layers = module["layers"]
                layers = [int(a) for a in layers]

                if (layers[0]) > 0:
                    layers[0] = layers[0] - i

                if len(layers) == 1:
                    x = outputs[i + (layers[0])]

                else:
                    if (layers[1]) > 0:
                        layers[1] = layers[1] - i

                    map1 = outputs[i + layers[0]]
                    map2 = outputs[i + layers[1]]
                    x = torch.cat((map1, map2), 1)

            elif module_type == "shortcut":
                from_ = int(module["from"])
                x = outputs[i-1] + outputs[i+from_]

            elif module_type == 'yolo':
                anchors = self.module_list[i][0].anchors
                # Get the input dimensions
                inp_dim = int(self.net_info["height"])

                # Get the number of classes
                num_classes = int(module["classes"])

                # Transform
                x = x.data
                x = predict_transform(x, inp_dim, anchors, num_classes, CUDA)
                if not write:  # if no collector has been initialised
                    detections = x
                    write = 1

                else:
                    detections = torch.cat((detections, x), 1)

            outputs[i] = x

        return detections

    def load_weights(self, weightfile):
        # Open the weights file
        fp = open(weightfile, "rb")

        # The first 5 values are header information
        # 1. Major version number
        # 2. Minor version number
        # 3. Subversion number
        # 4,5. Images seen by the network (during training)
        header = np.fromfile(fp, dtype=np.int32, count=5)
        self.header = torch.from_numpy(header)
        self.seen = self.header[3]

        weights = np.fromfile(fp, dtype=np.float32)

        ptr = 0
        for i in range(len(self.module_list)):
            module_type = self.blocks[i + 1]["type"]

            # If module_type is convolutional, load weights;
            # otherwise ignore.

            if module_type == "convolutional":
                model = self.module_list[i]
                try:
                    batch_normalize = int(self.blocks[i+1]["batch_normalize"])
                except:
                    batch_normalize = 0

                conv = model[0]

                if (batch_normalize):
                    bn = model[1]

                    # Get the number of weights of the Batch Norm layer
                    num_bn_biases = bn.bias.numel()

                    # Load the weights
                    bn_biases = torch.from_numpy(weights[ptr:ptr + num_bn_biases])
                    ptr += num_bn_biases

                    bn_weights = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr += num_bn_biases

                    bn_running_mean = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr += num_bn_biases

                    bn_running_var = torch.from_numpy(weights[ptr: ptr + num_bn_biases])
                    ptr += num_bn_biases

                    # Cast the loaded weights into the dims of the model weights.
                    bn_biases = bn_biases.view_as(bn.bias.data)
                    bn_weights = bn_weights.view_as(bn.weight.data)
                    bn_running_mean = bn_running_mean.view_as(bn.running_mean)

[listing truncated here by the diff viewer; darknet.py adds 322 lines in total]
|
292 |
+
bn_running_var = bn_running_var.view_as(bn.running_var)
|
293 |
+
|
294 |
+
#Copy the data to model
|
295 |
+
bn.bias.data.copy_(bn_biases)
|
296 |
+
bn.weight.data.copy_(bn_weights)
|
297 |
+
bn.running_mean.copy_(bn_running_mean)
|
298 |
+
bn.running_var.copy_(bn_running_var)
|
299 |
+
|
300 |
+
else:
|
301 |
+
#Number of biases
|
302 |
+
num_biases = conv.bias.numel()
|
303 |
+
|
304 |
+
#Load the weights
|
305 |
+
conv_biases = torch.from_numpy(weights[ptr: ptr + num_biases])
|
306 |
+
ptr = ptr + num_biases
|
307 |
+
|
308 |
+
#reshape the loaded weights according to the dims of the model weights
|
309 |
+
conv_biases = conv_biases.view_as(conv.bias.data)
|
310 |
+
|
311 |
+
#Finally copy the data
|
312 |
+
conv.bias.data.copy_(conv_biases)
|
313 |
+
|
314 |
+
#Let us load the weights for the Convolutional layers
|
315 |
+
num_weights = conv.weight.numel()
|
316 |
+
|
317 |
+
#Do the same as above for weights
|
318 |
+
conv_weights = torch.from_numpy(weights[ptr:ptr+num_weights])
|
319 |
+
ptr = ptr + num_weights
|
320 |
+
|
321 |
+
conv_weights = conv_weights.view_as(conv.weight.data)
|
322 |
+
conv.weight.data.copy_(conv_weights)
|
detect.py
ADDED
@@ -0,0 +1,161 @@
# PyTorch implementation of Darknet
# This is a custom, hard-coded version of darknet with
# YOLOv3 implementation for openimages database. This
# was written to test viability of implementing YOLO
# for face detection followed by emotion / sentiment
# analysis.
#
# Configuration, weights and data are hardcoded.
# Additional options include the ability to create a
# subset of data with faces extracted for labelling.
#
# Author : Saikiran Tharimena
# Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
# Project : Emotion / Sentiment Detection from news images
# Date : 12 September 2022
# Version : v0.1
#
# (C) Schibsted ASA

# Libraries
import os
import cv2
import torch
import numpy as np
from utils import *
from darknet import Darknet
from torch.autograd import Variable
from torch.cuda import is_available as check_cuda

# Parameters
batch_size = 1
confidence = 0.25
nms_thresh = 0.30
run_cuda = False

# CFG files
cwd = os.path.dirname(__file__)
cfg = cwd + '/cfg/yolov3-openimages.cfg'
data = cwd + '/cfg/openimages.data'
clsnames = cwd + '/cfg/openimages.names'
weights = cwd + '/cfg/yolov3-openimages.weights'

# Load classes
num_classes = 601
classes = load_classes(clsnames)

# Set up the neural network
print('Load Network')
model = Darknet(cfg)

print('Load Weights')
model.load_weights(weights)

print('Successfully loaded Network')

# Check CUDA
if run_cuda:
    CUDA = check_cuda()
else:
    CUDA = False

# Input dimension
inp_dim = int(model.net_info["height"])

# Put the model on the GPU
if CUDA:
    model.cuda()

# Set the model in evaluation mode
model.eval()

# Face detector
def detect_face(image):
    # Wrap the single image path in a list so the batch code below works unchanged
    imlist = [image]

    loaded_ims = [cv2.imread(x) for x in imlist]

    im_batches = list(map(prep_image, loaded_ims, [inp_dim for x in range(len(imlist))]))
    im_dim_list = [(x.shape[1], x.shape[0]) for x in loaded_ims]
    im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)

    leftover = 0
    if (len(im_dim_list) % batch_size):
        leftover = 1

    if batch_size != 1:
        num_batches = len(imlist) // batch_size + leftover
        im_batches = [torch.cat((im_batches[i * batch_size: min((i + 1) * batch_size,
                      len(im_batches))])) for i in range(num_batches)]

    write = 0
    if CUDA:
        im_dim_list = im_dim_list.cuda()

    for i, batch in enumerate(im_batches):
        # Run the batch through the network
        if CUDA:
            batch = batch.cuda()
        with torch.no_grad():
            prediction = model(Variable(batch), CUDA)

        prediction = write_results(prediction, confidence, num_classes, nms_conf=nms_thresh)

        # write_results returns the int 0 when the batch produced no detections
        if type(prediction) == int:
            continue

        # Transform the attribute from index in batch to index in imlist
        prediction[:, 0] += i * batch_size

        if not write:  # If we haven't initialised output
            output = prediction
            write = 1
        else:
            output = torch.cat((output, prediction))

        if CUDA:
            torch.cuda.synchronize()

    # No detections at all in any batch
    try:
        output
    except NameError:
        return None

    im_dim_list = torch.index_select(im_dim_list, 0, output[:, 0].long())

    scaling_factor = torch.min(inp_dim / im_dim_list, 1)[0].view(-1, 1)

    # Undo the letterbox padding and scaling
    output[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
    output[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2

    output[:, 1:5] /= scaling_factor

    # Clip boxes to the image boundaries
    for i in range(output.shape[0]):
        output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, im_dim_list[i, 0])
        output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, im_dim_list[i, 1])

    def get_detections(x, results):
        c1 = [int(y) for y in x[1:3]]
        c2 = [int(y) for y in x[3:5]]

        det_class = int(x[-1])
        label = "{0}".format(classes[det_class])

        return (label, tuple(c1 + c2))

    detections = list(map(lambda x: get_detections(x, loaded_ims), output))

    if CUDA:
        torch.cuda.empty_cache()

    return loaded_ims[0], detections
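A hypothetical call to the detector above (the image path is made up): detect_face takes a path, since it calls cv2.imread internally, and returns the loaded BGR image together with (label, (x1, y1, x2, y2)) tuples, or None when nothing passes the confidence threshold.

from detect import detect_face

result = detect_face('samples/news_photo.jpg')  # hypothetical input image
if result is None:
    print('No detections')
else:
    image, detections = result
    for label, (x1, y1, x2, y2) in detections:
        print(label, (x1, y1, x2, y2))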
requirements.txt
ADDED
@@ -0,0 +1,4 @@
torch
fastai
numpy
opencv-python
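Assuming a standard pip workflow (not part of this commit), the four dependencies above would be installed with:

pip install -r requirements.txt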
utils.py
ADDED
@@ -0,0 +1,237 @@
# PyTorch implementation of Darknet
# This is a custom, hard-coded version of darknet with
# YOLOv3 implementation for openimages database. This
# was written to test viability of implementing YOLO
# for face detection followed by emotion / sentiment
# analysis.
#
# Configuration, weights and data are hardcoded.
# Additional options include the ability to create a
# subset of data with faces extracted for labelling.
#
# Author : Saikiran Tharimena
# Co-Authors: Kjetil Marinius Sjulsen, Juan Carlos Calvet Lopez
# Project : Emotion / Sentiment Detection from news images
# Date : 12 September 2022
# Version : v0.1
#
# (C) Schibsted ASA

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import cv2


def unique(tensor):
    tensor_np = tensor.cpu().numpy()
    unique_np = np.unique(tensor_np)
    unique_tensor = torch.from_numpy(unique_np)

    tensor_res = tensor.new(unique_tensor.shape)
    tensor_res.copy_(unique_tensor)
    return tensor_res


def bbox_iou(box1, box2):
    """
    Returns the IoU of two bounding boxes
    """
    #Get the coordinates of the bounding boxes
    b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
    b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]

    #Get the coordinates of the intersection rectangle
    inter_rect_x1 = torch.max(b1_x1, b2_x1)
    inter_rect_y1 = torch.max(b1_y1, b2_y1)
    inter_rect_x2 = torch.min(b1_x2, b2_x2)
    inter_rect_y2 = torch.min(b1_y2, b2_y2)

    #Intersection area
    inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp(inter_rect_y2 - inter_rect_y1 + 1, min=0)

    #Union area
    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

    iou = inter_area / (b1_area + b2_area - inter_area)

    return iou


def predict_transform(prediction, inp_dim, anchors, num_classes, CUDA=True):
    batch_size = prediction.size(0)
    stride = inp_dim // prediction.size(2)
    grid_size = inp_dim // stride
    bbox_attrs = 5 + num_classes
    num_anchors = len(anchors)

    prediction = prediction.view(batch_size, bbox_attrs * num_anchors, grid_size * grid_size)
    prediction = prediction.transpose(1, 2).contiguous()
    prediction = prediction.view(batch_size, grid_size * grid_size * num_anchors, bbox_attrs)
    anchors = [(a[0] / stride, a[1] / stride) for a in anchors]

    #Sigmoid the centre_x, centre_y and the object confidence
    prediction[:, :, 0] = torch.sigmoid(prediction[:, :, 0])
    prediction[:, :, 1] = torch.sigmoid(prediction[:, :, 1])
    prediction[:, :, 4] = torch.sigmoid(prediction[:, :, 4])

    #Add the centre offsets
    grid = np.arange(grid_size)
    a, b = np.meshgrid(grid, grid)

    x_offset = torch.FloatTensor(a).view(-1, 1)
    y_offset = torch.FloatTensor(b).view(-1, 1)

    if CUDA:
        x_offset = x_offset.cuda()
        y_offset = y_offset.cuda()

    x_y_offset = torch.cat((x_offset, y_offset), 1).repeat(1, num_anchors).view(-1, 2).unsqueeze(0)

    prediction[:, :, :2] += x_y_offset

    #Log-space transform of the height and width
    anchors = torch.FloatTensor(anchors)

    if CUDA:
        anchors = anchors.cuda()

    anchors = anchors.repeat(grid_size * grid_size, 1).unsqueeze(0)
    prediction[:, :, 2:4] = torch.exp(prediction[:, :, 2:4]) * anchors

    #Sigmoid the class scores
    prediction[:, :, 5: 5 + num_classes] = torch.sigmoid(prediction[:, :, 5: 5 + num_classes])

    #Resize the detection map to the size of the input image
    prediction[:, :, :4] *= stride

    return prediction


def write_results(prediction, confidence, num_classes, nms_conf=0.4):
    #Zero out boxes below the objectness threshold
    conf_mask = (prediction[:, :, 4] > confidence).float().unsqueeze(2)
    prediction = prediction * conf_mask

    #Convert (centre x, centre y, w, h) to corner coordinates
    box_corner = prediction.new(prediction.shape)
    box_corner[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2] / 2)
    box_corner[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3] / 2)
    box_corner[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2] / 2)
    box_corner[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3] / 2)
    prediction[:, :, :4] = box_corner[:, :, :4]

    batch_size = prediction.size(0)

    write = False

    for ind in range(batch_size):
        image_pred = prediction[ind]  #image tensor
        #Confidence thresholding followed by NMS

        max_conf, max_conf_score = torch.max(image_pred[:, 5:5 + num_classes], 1)
        max_conf = max_conf.float().unsqueeze(1)
        max_conf_score = max_conf_score.float().unsqueeze(1)
        seq = (image_pred[:, :5], max_conf, max_conf_score)
        image_pred = torch.cat(seq, 1)

        non_zero_ind = (torch.nonzero(image_pred[:, 4]))
        try:
            image_pred_ = image_pred[non_zero_ind.squeeze(), :].view(-1, 7)
        except:
            continue

        if image_pred_.shape[0] == 0:
            continue

        #Get the various classes detected in the image
        img_classes = unique(image_pred_[:, -1])  # -1 index holds the class index

        for cls in img_classes:
            #Perform NMS

            #Get the detections with one particular class
            cls_mask = image_pred_ * (image_pred_[:, -1] == cls).float().unsqueeze(1)
            class_mask_ind = torch.nonzero(cls_mask[:, -2]).squeeze()
            image_pred_class = image_pred_[class_mask_ind].view(-1, 7)

            #Sort the detections such that the entry with the maximum objectness
            #confidence is at the top
            conf_sort_index = torch.sort(image_pred_class[:, 4], descending=True)[1]
            image_pred_class = image_pred_class[conf_sort_index]
            idx = image_pred_class.size(0)  #Number of detections

            for i in range(idx):
                #Get the IoUs of all boxes that come after the one we are
                #looking at in the loop
                try:
                    ious = bbox_iou(image_pred_class[i].unsqueeze(0), image_pred_class[i + 1:])
                except ValueError:
                    break
                except IndexError:
                    break

                #Zero out all the detections that have IoU > threshold
                iou_mask = (ious < nms_conf).float().unsqueeze(1)
                image_pred_class[i + 1:] *= iou_mask

                #Remove the zeroed-out entries
                non_zero_ind = torch.nonzero(image_pred_class[:, 4]).squeeze()
                image_pred_class = image_pred_class[non_zero_ind].view(-1, 7)

            #Repeat the batch_id for as many detections of the class cls in the image
            batch_ind = image_pred_class.new(image_pred_class.size(0), 1).fill_(ind)
            seq = batch_ind, image_pred_class

            if not write:
                output = torch.cat(seq, 1)
                write = True
            else:
                out = torch.cat(seq, 1)
                output = torch.cat((output, out))

    try:
        return output
    except NameError:
        #No detection survived thresholding in any image of the batch
        return 0


def letterbox_image(img, inp_dim):
    '''Resize image with unchanged aspect ratio using padding'''
    img_w, img_h = img.shape[1], img.shape[0]
    w, h = inp_dim
    new_w = int(img_w * min(w / img_w, h / img_h))
    new_h = int(img_h * min(w / img_w, h / img_h))
    resized_image = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_CUBIC)

    #Grey canvas, with the resized image pasted into the centre
    canvas = np.full((inp_dim[1], inp_dim[0], 3), 128)

    canvas[(h - new_h) // 2:(h - new_h) // 2 + new_h, (w - new_w) // 2:(w - new_w) // 2 + new_w, :] = resized_image

    return canvas


def prep_image(img, inp_dim):
    """
    Prepare image for inputting to the neural network.

    Returns a Variable
    """
    img = (letterbox_image(img, (inp_dim, inp_dim)))
    img = img[:, :, ::-1].transpose((2, 0, 1)).copy()  #BGR -> RGB, HWC -> CHW
    img = torch.from_numpy(img).float().div(255.0).unsqueeze(0)
    return img


def load_classes(namesfile):
    fp = open(namesfile, "r")
    names = fp.read().split("\n")[:-1]
    fp.close()
    return names
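As a quick, self-contained sanity check of the helpers above (all values below are made up for illustration):

import torch
import numpy as np
from utils import bbox_iou, letterbox_image

# Two boxes in corner format (x1, y1, x2, y2); the second is shifted by 5 px
box1 = torch.tensor([[10.0, 10.0, 20.0, 20.0]])
box2 = torch.tensor([[15.0, 15.0, 25.0, 25.0]])
print(bbox_iou(box1, box2))  # partial overlap, so an IoU strictly between 0 and 1

# Letterbox a fake 300x400 image onto a 608x608 grey canvas
img = np.zeros((300, 400, 3), dtype=np.uint8)
canvas = letterbox_image(img, (608, 608))
print(canvas.shape)  # (608, 608, 3)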