import torch
from transformers import AutoFeatureExtractor, AutoModelForImageClassification
from einops import rearrange
import gradio
import call_labels
# define the feature extractor
extractor = AutoFeatureExtractor.from_pretrained("vincentclaes/mit-indoor-scenes")
# define the pretrained model
model = AutoModelForImageClassification.from_pretrained("vincentclaes/mit-indoor-scenes")
# retrieve the labels provided by the MIT Indoor Scenes dataset (https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019)
labels = call_labels.call_labels()
# put the model in evaluation mode; this disables dropout and other training-time behaviour (it does not freeze the weights)
model.eval()
# define the function used for model inference
def classify(image):
    # disable gradient calculation
    with torch.no_grad():
        # preprocess the input image (resize and normalize) into a batched tensor
        inputs = extractor(images=image, return_tensors='pt')
        # forward pass; keep only the logits from the model output
        outputs = model(**inputs).logits
        # remove the batch dimension: (1, num_classes) -> (num_classes,)
        outputs = rearrange(outputs, '1 j -> j')
        # convert the logits into probabilities (the softmax dimension must be given explicitly)
        outputs = torch.nn.functional.softmax(outputs, dim=-1)
        # move to the CPU and convert from a tensor to a numpy array
        outputs = outputs.cpu().numpy()
    # return a mapping from each class label to its predicted probability
    return {labels[str(i)]: float(outputs[i]) for i in range(len(labels))}
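
# A quick way to sanity-check classify() outside the UI (a minimal sketch; it
# assumes the bundled example image 'bedroom.jpg' listed below is present):
#
#   from PIL import Image
#   preds = classify(Image.open('bedroom.jpg'))
#   print(sorted(preds.items(), key=lambda kv: kv[1], reverse=True)[:5])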
# define the gradio interface (updated to the Gradio 4.x API: the old
# gradio.inputs/gradio.outputs modules and the shape/source/tool/optional/
# theme/layout/interpretation keyword arguments have been removed upstream)
gradio.Interface(fn=classify,
                 inputs=gradio.Image(type='pil',
                                     image_mode='RGB',
                                     label='Input image'),
                 outputs=gradio.Label(num_top_classes=5),
                 examples=[['bedroom.jpg'],
                           ['bathroom_AS.jpg'],
                           ['samsung_room.jpg']],
                 live=True,
                 title='Indoor Scene Recognition',
                 description='An easy-to-use indoor scene classifier. Start by uploading an input image. The output is the top five indoor scene classes that best fit your input image.',
                 article='''<h2><b>Additional Information</b></h2><p style='text-align: justify'>This indoor scene classifier employs <b><a href='https://huggingface.co/google/vit-base-patch16-224-in21k' target='_blank'>google/vit-base-patch16-224-in21k</a></b>, a <b>Vision Transformer (ViT)</b> model pre-trained on ImageNet-21k (14 million images, 21,843 classes) at a resolution of 224x224. It was first introduced in the paper <b><a href='https://arxiv.org/abs/2010.11929' target='_blank'>An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</a></b> by Dosovitskiy et al.; the original GitHub repository is available at <b><a href='https://github.com/google-research/vision_transformer' target='_blank'>this link</a></b>. The model was fine-tuned on the <b><a href='https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019' target='_blank'>MIT Indoor Scenes</a></b> dataset from Kaggle; the fine-tuned checkpoint on Hugging Face is available at <b><a href='https://huggingface.co/vincentclaes/mit-indoor-scenes' target='_blank'>this link</a></b>.</p>''',
                 allow_flagging='never').launch()
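
# The app imports a local call_labels module whose source is not shown above.
# A minimal sketch of what it presumably provides: an id -> label mapping keyed
# by string ids, to match the labels[str(i)] lookup in classify(). The real
# module may load the names from a bundled file instead, but the fine-tuned
# model's config carries the same mapping:
#
#   # call_labels.py (hedged sketch, not the original module)
#   from transformers import AutoConfig
#
#   def call_labels():
#       config = AutoConfig.from_pretrained("vincentclaes/mit-indoor-scenes")
#       # config.id2label maps integer ids to the 67 MIT Indoor Scenes class names
#       return {str(i): name for i, name in config.id2label.items()}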