Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,95 +1,56 @@
|
|
1 |
-
from transformers import AutoFeatureExtractor, AutoModelForImageClassification
|
2 |
-
import gradio
|
3 |
import torch
|
|
|
4 |
from einops import rearrange
|
5 |
-
import
|
|
|
6 |
|
|
|
7 |
extractor = AutoFeatureExtractor.from_pretrained("vincentclaes/mit-indoor-scenes")
|
|
|
|
|
8 |
model = AutoModelForImageClassification.from_pretrained("vincentclaes/mit-indoor-scenes")
|
9 |
|
10 |
-
labels
|
11 |
-
|
12 |
-
"1": "artstudio",
|
13 |
-
"2": "auditorium",
|
14 |
-
"3": "bakery",
|
15 |
-
"4": "bar",
|
16 |
-
"5": "bathroom",
|
17 |
-
"6": "bedroom",
|
18 |
-
"7": "bookstore",
|
19 |
-
"8": "bowling",
|
20 |
-
"9": "buffet",
|
21 |
-
"10": "casino",
|
22 |
-
"11": "children_room",
|
23 |
-
"12": "church_inside",
|
24 |
-
"13": "classroom",
|
25 |
-
"14": "cloister",
|
26 |
-
"15": "closet",
|
27 |
-
"16": "clothingstore",
|
28 |
-
"17": "computerroom",
|
29 |
-
"18": "concert_hall",
|
30 |
-
"19": "corridor",
|
31 |
-
"20": "deli",
|
32 |
-
"21": "dentaloffice",
|
33 |
-
"22": "dining_room",
|
34 |
-
"23": "elevator",
|
35 |
-
"24": "fastfood_restaurant",
|
36 |
-
"25": "florist",
|
37 |
-
"26": "gameroom",
|
38 |
-
"27": "garage",
|
39 |
-
"28": "greenhouse",
|
40 |
-
"29": "grocerystore",
|
41 |
-
"30": "gym",
|
42 |
-
"31": "hairsalon",
|
43 |
-
"32": "hospitalroom",
|
44 |
-
"33": "inside_bus",
|
45 |
-
"34": "inside_subway",
|
46 |
-
"35": "jewelleryshop",
|
47 |
-
"36": "kindergarden",
|
48 |
-
"37": "kitchen",
|
49 |
-
"38": "laboratorywet",
|
50 |
-
"39": "laundromat",
|
51 |
-
"40": "library",
|
52 |
-
"41": "livingroom",
|
53 |
-
"42": "lobby",
|
54 |
-
"43": "locker_room",
|
55 |
-
"44": "mall",
|
56 |
-
"45": "meeting_room",
|
57 |
-
"46": "movietheater",
|
58 |
-
"47": "museum",
|
59 |
-
"48": "nursery",
|
60 |
-
"49": "office",
|
61 |
-
"50": "operating_room",
|
62 |
-
"51": "pantry",
|
63 |
-
"52": "poolinside",
|
64 |
-
"53": "prisoncell",
|
65 |
-
"54": "restaurant",
|
66 |
-
"55": "restaurant_kitchen",
|
67 |
-
"56": "shoeshop",
|
68 |
-
"57": "stairscase",
|
69 |
-
"58": "studiomusic",
|
70 |
-
"59": "subway",
|
71 |
-
"60": "toystore",
|
72 |
-
"61": "trainstation",
|
73 |
-
"62": "tv_studio",
|
74 |
-
"63": "videostore",
|
75 |
-
"64": "waitingroom",
|
76 |
-
"65": "warehouse",
|
77 |
-
"66": "winecellar"
|
78 |
-
}
|
79 |
|
|
|
80 |
model.eval()
|
81 |
|
|
|
82 |
def classify(image):
    """Return the predicted probability for every indoor-scene label.

    Args:
        image: A single image in any form accepted by ``extractor``.

    Returns:
        dict: Maps each label name in ``labels`` to its softmax probability
        as a Python ``float``.
    """
    # inference only, so gradient tracking is switched off
    with torch.no_grad():
        inputs = extractor(images=image, return_tensors='pt')
        logits = model(**inputs).logits
        # drop the batch axis; the pattern only matches a batch of exactly one image
        logits = rearrange(logits, '1 j->j')
        # dim is passed explicitly: omitting it relies on deprecated implicit
        # dimension selection and emits a warning on every call
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        probabilities = probabilities.cpu().numpy()
    # labels is keyed by the class index rendered as a string
    return {labels[str(i)]: float(probabilities[i]) for i in range(len(labels))}
|
90 |
|
|
|
91 |
gradio.Interface(fn=classify,
|
92 |
-
inputs=gradio.inputs.Image(shape=(224,224),
|
93 |
-
|
94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
allow_flagging='never').launch()
|
|
|
|
|
|
|
1 |
import torch
|
2 |
+
from transformers import AutoFeatureExtractor, AutoModelForImageClassification
|
3 |
from einops import rearrange
|
4 |
+
import gradio
|
5 |
+
import call_labels
|
6 |
|
7 |
+
# define the feature extractor
|
8 |
extractor = AutoFeatureExtractor.from_pretrained("vincentclaes/mit-indoor-scenes")
|
9 |
+
|
10 |
+
# define the pretrained model
|
11 |
model = AutoModelForImageClassification.from_pretrained("vincentclaes/mit-indoor-scenes")
|
12 |
|
13 |
+
# retrieve the labels provided from MIT Indoor Scenes dataset (https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019)
|
14 |
+
labels = call_labels.call_labels()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
+
# switch the model to evaluation mode (turns off training-only behavior such as dropout); gradient tracking is disabled separately inside classify()
|
17 |
model.eval()
|
18 |
|
19 |
+
# define the function used for model inference
|
20 |
# define the function used for model inference
def classify(image):
    """Return the predicted probability for every indoor-scene label.

    Args:
        image: A single image in any form accepted by ``extractor``.

    Returns:
        dict: Maps each label name in ``labels`` to its softmax probability
        as a Python ``float``.
    """
    # disable gradient calculation (inference only)
    with torch.no_grad():
        # preprocess the image into PyTorch tensors for the model
        inputs = extractor(images=image, return_tensors='pt')
        # keep only the raw class scores from the model output
        logits = model(**inputs).logits
        # remove the batch dimension; the pattern only matches a batch of exactly one image
        logits = rearrange(logits, '1 j->j')
        # convert the logits into probabilities; dim is passed explicitly because
        # omitting it relies on deprecated implicit dimension selection and
        # emits a warning on every call
        probabilities = torch.nn.functional.softmax(logits, dim=-1)
        # convert the tensor to a numpy array
        probabilities = probabilities.cpu().numpy()
    # map every label name to its probability; labels is keyed by the class
    # index rendered as a string
    return {labels[str(i)]: float(probabilities[i]) for i in range(len(labels))}
|
35 |
|
36 |
+
# define the gradio interface
|
37 |
# NOTE(review): gradio.inputs / gradio.outputs are legacy namespaces; confirm
# that the Gradio version installed for this Space still provides them.

# input component: an uploaded RGB image, configured with type='pil'
image_input = gradio.inputs.Image(
    shape=(224, 224),
    image_mode='RGB',
    source='upload',
    tool='editor',
    type='pil',
    label=None,
    optional=False,
)

# output component: a label widget configured to show the top five classes
label_output = gradio.outputs.Label(num_top_classes=5, type='auto')

# example images offered in the interface (paths as given to gradio)
example_images = [
    ['bedroom.jpg'],
    ['bathroom_AS.jpg'],
    ['samsung_room.jpg'],
]

# assemble the interface around classify() and start the app
demo = gradio.Interface(
    fn=classify,
    inputs=image_input,
    outputs=label_output,
    theme='dark-huggingface',
    examples=example_images,
    live=True,
    title='Indoor Scene Recognition',
    description='An indoor scene classifier. Start by uploading an input image. The outputs are the top five indoor scene classes that best fit your input image.',
    interpretation='default',
    article='''<h2><b>Additional Information</b></h2><p style='text-align: justify'>This indoor scene classifier employs the <b>google/vit-base-patch16-224-in21k</b>, a <b>Visual Transformer (ViT)</b> model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224 introduced in the paper <b><a href='https://arxiv.org/abs/2010.11929' target='_blank'>An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</a></b> by Dosovitskiy et al. The original GitHub repository of the Visual Transformer is found in <b><a href='https://github.com/google-research/vision_transformer' target='_blank'>this link</a></b>. This model was fine-tuned on the <b><a href='https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019' target='_blank'>MIT Indoor Scenes</a></b> from Kaggle. The source model is found in <b><a href='https://huggingface.co/vincentclaes/mit-indoor-scenes' target='_blank'>this link</a></b>.</p>''',
    allow_flagging='never',
)
demo.launch()
|