import torch
from transformers import AutoFeatureExtractor, AutoModelForImageClassification
from einops import rearrange
import gradio
import call_labels  # local helper module bundled with this Space; returns the id-to-label mapping

# define the feature extractor
extractor = AutoFeatureExtractor.from_pretrained("vincentclaes/mit-indoor-scenes")

# define the pretrained model
model = AutoModelForImageClassification.from_pretrained("vincentclaes/mit-indoor-scenes")

# retrieve the 67 class labels of the MIT Indoor Scenes dataset (https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019)
labels = call_labels.call_labels()
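# (note: model.config.id2label exposes the same id-to-label mapping straight from
# the Hugging Face checkpoint and could replace call_labels, assuming the two
# mappings agree; the local helper is kept here as in the original)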

# put the model in evaluation mode so inference does not use training-only behaviour such as dropout
model.eval()

# define the function used for model inference
def classify(image):
  # disable gradient calculation
  with torch.no_grad():
    # preprocess the input image into the tensor format the model expects
    inputs = extractor(images=image, return_tensors='pt')
    # keep only the logits from the model output
    outputs = model(**inputs).logits
    # remove the batch dimension
    outputs = rearrange(outputs, '1 j -> j')
    # apply softmax over the class dimension to convert the logits into probabilities
    outputs = torch.nn.functional.softmax(outputs, dim=-1)
    # move the tensor to the CPU and convert it to a numpy array
    outputs = outputs.cpu().numpy()
    # return a dictionary mapping each class label to its predicted probability
    return {labels[str(i)]: float(outputs[i]) for i in range(len(labels))}
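
# quick local sanity check (a hypothetical usage sketch, left commented out so it
# does not run on Space startup; 'bedroom.jpg' is one of the bundled example images):
# from PIL import Image
# print(classify(Image.open('bedroom.jpg')))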

# define the gradio interface (this uses the legacy gradio 2.x inputs/outputs API)
gradio.Interface(fn=classify,
                 inputs=gradio.inputs.Image(shape=(224,224),
                                            image_mode='RGB',
                                            source='upload',
                                            tool='editor',
                                            type='pil',
                                            label=None,
                                            optional=False),
                 outputs=gradio.outputs.Label(num_top_classes=5,
                                              type='auto'),
                 theme='grass',
                 examples=[['bedroom.jpg'],
                           ['bathroom_AS.jpg'],
                           ['samsung_room.jpg']],
                 live=True,
                 layout='horizontal',
                 title='Indoor Scene Recognition',
                 description='A smart and easy-to-use indoor scene classifier. Start by uploading an input image. The outputs are the top five indoor scene classes that best fit your input image.',
                 interpretation='default',
                 article='''<h2><b>Additional Information</b></h2><p style='text-align: justify'>This indoor scene classifier employs <b><a href='https://huggingface.co/google/vit-base-patch16-224-in21k' target='_blank'>google/vit-base-patch16-224-in21k</a></b>, a <b>Vision Transformer (ViT)</b> model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at a resolution of 224x224, first introduced in the paper <b><a href='https://arxiv.org/abs/2010.11929' target='_blank'>An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</a></b> by Dosovitskiy et al. The original GitHub repository of the Vision Transformer can be found at <b><a href='https://github.com/google-research/vision_transformer' target='_blank'>this link</a></b>. The model was fine-tuned on the <b><a href='https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019' target='_blank'>MIT Indoor Scenes</a></b> dataset from Kaggle. The fine-tuned model is available on Hugging Face at <b><a href='https://huggingface.co/vincentclaes/mit-indoor-scenes' target='_blank'>this link</a></b>.</p>''',
                 allow_flagging='never').launch()