Spaces:

egmaminta
/

indoor-scene-recognition-to-speech

Runtime error

App Files Files Community

egmaminta committed on Mar 23, 2022

Commit

b9008f4

1 Parent(s): be555d2

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -75

app.py CHANGED Viewed

@@ -1,95 +1,56 @@
-from transformers import AutoFeatureExtractor, AutoModelForImageClassification
-import gradio
 import torch
 from einops import rearrange
-import numpy
 extractor = AutoFeatureExtractor.from_pretrained("vincentclaes/mit-indoor-scenes")
 model = AutoModelForImageClassification.from_pretrained("vincentclaes/mit-indoor-scenes")
-labels = {
-    "0": "airport_inside",
-    "1": "artstudio",
-    "2": "auditorium",
-    "3": "bakery",
-    "4": "bar",
-    "5": "bathroom",
-    "6": "bedroom",
-    "7": "bookstore",
-    "8": "bowling",
-    "9": "buffet",
-    "10": "casino",
-    "11": "children_room",
-    "12": "church_inside",
-    "13": "classroom",
-    "14": "cloister",
-    "15": "closet",
-    "16": "clothingstore",
-    "17": "computerroom",
-    "18": "concert_hall",
-    "19": "corridor",
-    "20": "deli",
-    "21": "dentaloffice",
-    "22": "dining_room",
-    "23": "elevator",
-    "24": "fastfood_restaurant",
-    "25": "florist",
-    "26": "gameroom",
-    "27": "garage",
-    "28": "greenhouse",
-    "29": "grocerystore",
-    "30": "gym",
-    "31": "hairsalon",
-    "32": "hospitalroom",
-    "33": "inside_bus",
-    "34": "inside_subway",
-    "35": "jewelleryshop",
-    "36": "kindergarden",
-    "37": "kitchen",
-    "38": "laboratorywet",
-    "39": "laundromat",
-    "40": "library",
-    "41": "livingroom",
-    "42": "lobby",
-    "43": "locker_room",
-    "44": "mall",
-    "45": "meeting_room",
-    "46": "movietheater",
-    "47": "museum",
-    "48": "nursery",
-    "49": "office",
-    "50": "operating_room",
-    "51": "pantry",
-    "52": "poolinside",
-    "53": "prisoncell",
-    "54": "restaurant",
-    "55": "restaurant_kitchen",
-    "56": "shoeshop",
-    "57": "stairscase",
-    "58": "studiomusic",
-    "59": "subway",
-    "60": "toystore",
-    "61": "trainstation",
-    "62": "tv_studio",
-    "63": "videostore",
-    "64": "waitingroom",
-    "65": "warehouse",
-    "66": "winecellar"
-  }
 model.eval()
 def classify(image):
   with torch.no_grad():
     inputs = extractor(images=image, return_tensors='pt')
     outputs = model(**inputs).logits
     outputs = rearrange(outputs, '1 j->j')
     outputs = torch.nn.functional.softmax(outputs)
     outputs = outputs.cpu().numpy()
     return {labels[str(i)]: float(outputs[i]) for i in range(len(labels))}
 gradio.Interface(fn=classify,
-                 inputs=gradio.inputs.Image(shape=(224,224), image_mode='RGB', source='upload', tool='editor', type='pil', label=None, optional=False),
-                 outputs=gradio.outputs.Label(num_top_classes=5, type='confidences'),
-                 theme='huggingface',
                  allow_flagging='never').launch()

 import torch
+from transformers import AutoFeatureExtractor, AutoModelForImageClassification
 from einops import rearrange
+import gradio
+import call_labels
+# define the feature extractor
 extractor = AutoFeatureExtractor.from_pretrained("vincentclaes/mit-indoor-scenes")
+# define the pretrained model
 model = AutoModelForImageClassification.from_pretrained("vincentclaes/mit-indoor-scenes")
+# retrieve the labels provided by the MIT Indoor Scenes dataset (https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019)
+labels = call_labels.call_labels()
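+# call model.eval() to switch the model to evaluation mode (training-only behavior such as dropout is turned off); this does not by itself stop gradient tracking, which is handled by torch.no_grad() inside classify()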
 model.eval()
+# define the function used for model inference
 def classify(image):
+  # disable gradient calculation
   with torch.no_grad():
+    # preprocess the input image with the feature extractor, returning PyTorch tensors
     inputs = extractor(images=image, return_tensors='pt')
+    # run the model and keep only the logits attribute of its output (object: SequenceClassifierOutput)
     outputs = model(**inputs).logits
+    # remove the batch dimension
     outputs = rearrange(outputs, '1 j->j')
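+    # use the softmax function to convert the logits into probabilities (NOTE: no dim is passed, so PyTorch falls back to its deprecated implicit dimension, which is dim=0 for this 1-D tensor, and emits a warning; passing dim=0 explicitly would be clearer)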
     outputs = torch.nn.functional.softmax(outputs)
+    # move the tensor to the CPU and convert it to a numpy array
     outputs = outputs.cpu().numpy()
+    # return a dictionary mapping each label name to its corresponding probability
     return {labels[str(i)]: float(outputs[i]) for i in range(len(labels))}
+# define the gradio interface
 gradio.Interface(fn=classify,
+                 inputs=gradio.inputs.Image(shape=(224,224),
+                                            image_mode='RGB',
+                                            source='upload',
+                                            tool='editor',
+                                            type='pil',
+                                            label=None,
+                                            optional=False),
+                 outputs=gradio.outputs.Label(num_top_classes=5,
+                                              type='auto'),
+                 theme='dark-huggingface',
+                 examples=[['bedroom.jpg'],
+                           ['bathroom_AS.jpg'],
+                           ['samsung_room.jpg']],
+                 live=True,
+                 title='Indoor Scene Recognition',
+                 description='An indoor scene classifier. Start by uploading an input image. The outputs are the top five indoor scene classes that best fit your input image.',
+                 interpretation='default',
+                 article='''<h2><b>Additional Information</b></h2><p style='text-align: justify'>This indoor scene classifier employs the <b>google/vit-base-patch16-224-in21k</b>, a <b>Visual Transformer (ViT)</b> model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224 introduced in the paper <b><a href='https://arxiv.org/abs/2010.11929' target='_blank'>An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</a></b> by Dosovitskiy et al. The original GitHub repository of the Visual Transformer is found in <b><a href='https://github.com/google-research/vision_transformer' target='_blank'>this link</a></b>. This model was fine-tuned on the <b><a href='https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019' target='_blank'>MIT Indoor Scenes</a></b> from Kaggle. The source model is found in <b><a href='https://huggingface.co/vincentclaes/mit-indoor-scenes' target='_blank'>this link</a></b>.</p>''',
                  allow_flagging='never').launch()