import torch |
from utils import * |
import gradio as gr |
from numpy import array |
from darknet import Darknet |
from torch.autograd import Variable |
from torch.cuda import is_available as check_cuda |
from PIL.ImageOps import grayscale |
from fastai.vision.all import PILImage, load_learner |
# --- Detector settings -------------------------------------------------------
batch_size = 1        # images per forward pass (detector() feeds one image)
confidence = 0.25     # confidence threshold handed to write_results
nms_thresh = 0.30     # passed to write_results as nms_conf
run_cuda = False      # set True to use the GPU when one is available

# --- Model files -------------------------------------------------------------
cfg = 'cfg/yolov3-openimages.cfg'
clsnames = 'cfg/openimages.names'
weights = 'cfg/yolov3-openimages.weights'

# Class labels; a detection's class index is used to index this sequence.
classes = load_classes(clsnames)
num_classes = len(classes)

# --- Build the network and load its weights ----------------------------------
print('Load Network')
model = Darknet(cfg)
print('Load Weights')
model.load_weights(weights)
print('Successfully loaded Network')

# The GPU is used only when it is both requested and actually available.
CUDA = check_cuda() if run_cuda else False

# Network input size, read from the "height" entry of the parsed cfg.
inp_dim = int(model.net_info["height"])

if CUDA:
    model.cuda()
model.eval()
def get_detections(x):
    """Turn one detection row into ``(label, (x1, y1, x2, y2))``.

    Positions 1 to 4 of ``x`` are truncated to ints and returned as the box;
    the last position is the class index looked up in ``classes``.
    """
    box = tuple(int(coord) for coord in x[1:5])
    class_index = int(x[-1])
    return ("{0}".format(classes[class_index]), box)
def detector(image):
    """Run the YOLOv3 network on one image and return its labelled boxes.

    Args:
        image: Image array where ``image.shape[0]`` is the height and
            ``image.shape[1]`` the width (callers in this file pass
            ``array(pil_image)[..., :3]``). It is handed to ``prep_image``
            unchanged.

    Returns:
        A tuple ``(image, detections)``. ``image`` is the input object itself
        and ``detections`` is a list of ``(label, (x1, y1, x2, y2))`` tuples
        expressed in the coordinates of the input image. The list is empty
        when the network reports nothing.
    """
    imlist = [image]
    im_batches = [prep_image(im, inp_dim) for im in imlist]

    # One (width, height, width, height) row per image, so the columns line
    # up with the (x1, y1, x2, y2) columns of the detection rows.
    im_dim_list = torch.FloatTensor(
        [(im.shape[1], im.shape[0]) for im in imlist]
    ).repeat(1, 2)

    if batch_size != 1:
        # Ceiling division: a trailing partial batch still gets its own slot.
        num_batches = -(-len(imlist) // batch_size)
        im_batches = [
            torch.cat(im_batches[i * batch_size:(i + 1) * batch_size])
            for i in range(num_batches)
        ]

    if CUDA:
        im_dim_list = im_dim_list.cuda()

    output = None  # accumulated detection rows; stays None if nothing is found
    for i, batch in enumerate(im_batches):
        if CUDA:
            batch = batch.cuda()
        with torch.no_grad():
            prediction = model(batch, CUDA)
        prediction = write_results(
            prediction, confidence, num_classes, nms_conf=nms_thresh
        )

        # An int (rather than a tensor) result is treated as "nothing detected
        # in this batch" -- presumably write_results' sentinel; see utils.
        if isinstance(prediction, int):
            continue

        # Column 0 is the image index within the batch; make it global.
        prediction[:, 0] += i * batch_size
        output = prediction if output is None else torch.cat((output, prediction))

        if CUDA:
            torch.cuda.synchronize()

    if output is None:
        return image, []

    # Pick, for every detection row, the dimensions of the image it belongs to.
    im_dim_list = torch.index_select(im_dim_list, 0, output[:, 0].long())

    # Map boxes from network-input coordinates back to the original image.
    # The scale must be derived from inp_dim (the size the network actually
    # ran at), not from a hard-coded constant. The offset removal assumes
    # prep_image pads the resized image symmetrically -- confirm in utils.
    scaling_factor = torch.min(inp_dim / im_dim_list, 1)[0].view(-1, 1)
    output[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
    output[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
    output[:, 1:5] /= scaling_factor

    # Keep every box inside its image.
    for row in range(output.shape[0]):
        output[row, [1, 3]] = torch.clamp(output[row, [1, 3]], 0.0, im_dim_list[row, 0])
        output[row, [2, 4]] = torch.clamp(output[row, [2, 4]], 0.0, im_dim_list[row, 1])

    detections = [get_detections(row) for row in output]

    if CUDA:
        torch.cuda.empty_cache()
    return image, detections
# Exported fastai learners: one predicts an emotion, the other a sentiment.
learn_emotion = load_learner('models/emotions_vgg19.pkl')
learn_sentiment = load_learner('models/sentiment_vgg19.pkl')

# Class vocabularies, used to name the probabilities each learner returns.
learn_emotion_labels = learn_emotion.dls.vocab
learn_sentiment_labels = learn_sentiment.dls.vocab
def crop_images(img, bbox):
    """Crop ``img`` to the box stored in a detection.

    ``img`` is expected to be an image object such as the one returned by
    ``PILImage.create``; ``bbox`` is a ``(label, (xmin, ymin, xmax, ymax))``
    detection, of which only the second element is used.
    """
    left, top, right, bottom = bbox[1]
    region = (left, top, right, bottom)
    return img.crop(region)
def detect_person_face(img, detections):
    """Look for a face inside each detected person.

    Called from ``detect_face`` when only people (no faces) were found.
    Every detection is cropped out of ``img`` and run through the detector
    again; the first 'Human face' found in a crop is cropped out of that
    crop and collected. Crops without a face are skipped.
    """
    faces = []
    for person in detections:
        person_crop = crop_images(img, person)
        _, inner = detector(array(person_crop)[..., :3])
        first_face = next((hit for hit in inner if hit[0] == 'Human face'), None)
        if first_face is None:
            continue
        faces.append(crop_images(person_crop, first_face))
    return faces
def detect_face(img):
    """Return the list of face crops found in ``img``.

    Faces reported directly by the detector win. If there are none, each
    detected 'Person' is searched for a face instead. An empty list means
    neither a face nor a person was detected.
    """
    _, found = detector(array(img)[..., :3])

    face_hits = [hit for hit in found if hit[0] == 'Human face']
    if face_hits:
        return [crop_images(img, hit) for hit in face_hits]

    person_hits = [hit for hit in found if hit[0] == 'Person']
    if not person_hits:
        return []
    return detect_person_face(img, person_hits)
def _label_scores(learner, labels, img):
    """Map each label to the probability ``learner`` assigns it for ``img``."""
    _, _, probs = learner.predict(array(grayscale(img)))
    return {labels[i]: float(probs[i]) for i in range(len(labels))}


def _score_thumbnail(thumb):
    """Return ``[thumb, emotion scores, sentiment scores]`` for a 48x48 image."""
    emotions = _label_scores(learn_emotion, learn_emotion_labels, thumb)
    sentiments = _label_scores(learn_sentiment, learn_sentiment_labels, thumb)
    return [thumb, emotions, sentiments]


def predict(img):
    """Score the faces in ``img`` for the Gradio interface.

    Returns a list of nine items: three consecutive groups of
    ``(48x48 image, emotion scores, sentiment scores)``. At most three faces
    are reported; when fewer are found the last group is repeated to fill
    the remaining slots. When no face is found the whole image is scored
    and that single group is returned three times.
    """
    img = PILImage.create(img)
    faces = detect_face(img)

    if len(faces) == 0:
        return _score_thumbnail(img.resize((48, 48))) * 3

    output = []
    for face in faces[:3]:
        output.extend(_score_thumbnail(face.resize((48, 48))))

    # Pad with copies of the last scored face until all nine slots are filled.
    filler = output[-3:]
    while len(output) < 9:
        output = output + filler
    return output
# Heading shown at the top of the page.
title = 'Face Recognition with Emotion and Sentiment Detector'

# Markdown source for the blurb under the title.
_DESCRIPTION_MD = """Ever wondered what a person might be feeling looking at their picture?
Well, now you can! Try this fun app. Just upload a facial image in JPG or
PNG format. Voila! you can now see what they might have felt when the picture
was taken.
This is an updated version of Facial Expression Classifier:
https://huggingface.co/spaces/schibsted/facial_expression_classifier
"""

# Markdown source for the long-form notes shown below the interface.
_ARTICLE_MD = """**DISCLAIMER:** This model does not reveal the actual emotional state of a person. Use and
interpret results at your own risk! It was built as a demo for AI course. Samples images
were downloaded from VG & AftenPosten news webpages. Copyrights belong to respective
brands. All rights reserved.
**PREMISE:** The idea is to determine an overall sentiment of a news site on a daily basis
based on the pictures. We are restricting pictures to only include close-up facial
images.
**DATA:** FER2013 dataset consists of 48x48 pixel grayscale images of faces. There are 28,709
images in the training set and 3,589 images in the test set. However, for this demo all
pictures were combined into a single dataset and 80:20 split was used for training. Images
are assigned one of the 7 emotions: Angry, Disgust, Fear, Happy, Sad, Surprise, and Neutral.
In addition to these 7 classes, images were re-classified into 3 sentiment categories based
on emotions:
Positive (Happy, Surprise)
Negative (Angry, Disgust, Fear, Sad)
Neutral (Neutral)
FER2013 (preliminary version) dataset can be downloaded at:
https://www.kaggle.com/c/challenges-in-representation-learning-facial-expression-recognition-challenge/data
**EMOTION / SENTIMENT MODEL:** VGG19 was used as the base model and trained on FER2013 dataset. Model was trained
using PyTorch and FastAI. Two models were trained, one for detecting emotion and the other
for detecting sentiment. Although, this could have been done with just one model, here two
models were trained for the demo.
**FACE DETECTOR:** Darknet with YOLOv3 architecture was used for face detection. Reach out to me for full details.
In short, any image is first sent through darknet. If face is detected, then it is passed through emotion/sentiment
model for each face in the picture. If a person is detected rather than a face, the image is cropped and run through
face detector again. If a face is detected, then it is passed through emotion/sentiment model. In case face is not
detected in an image, then the entire image is evaluated to generate some score. This is done because, I couldn't
figure out how to pipe None/blank output to Gradio.Interface(). There maybe option through Gradio.Blocks() but was
too lazy to go through that at this stage. In addition, the output is restricted to only 3 faces in a picture.
"""

# Both texts go through a gr.Markdown component and its ``.value`` is what
# gets handed to the interface, rather than the raw source strings.
description = gr.Markdown(_DESCRIPTION_MD).value
article = gr.Markdown(_ARTICLE_MD).value

# Passed to launch() when the interface is started.
enable_queue = True

# Sample images offered as clickable examples.
examples = [
    'happy1.jpg',
    'happy2.jpg',
    'angry1.png',
    'angry2.jpg',
    'neutral1.jpg',
    'neutral2.jpg',
]
# Single image input; created before the outputs, as in the original call.
_input_component = gr.Image()

# One (face thumbnail, emotion scores, sentiment scores) group per reported
# person, in the same order as the nine values returned by predict().
_output_components = [
    component
    for person in (1, 2, 3)
    for component in (
        gr.Image(shape=(48, 48), label=f'Person {person}'),
        gr.Label(label=f'Emotion - Person {person}'),
        gr.Label(label=f'Sentiment - Person {person}'),
    )
]

_interface = gr.Interface(
    fn=predict,
    inputs=_input_component,
    outputs=_output_components,
    title=title,
    examples=examples,
    description=description,
    article=article,
    allow_flagging='never',
)
_interface.launch(enable_queue=enable_queue)