egmaminta committed
Commit 8eaf54e
1 Parent(s): da92d66

Update app.py

Files changed (1)
  1. app.py +88 -43
app.py CHANGED
@@ -2,60 +2,105 @@ import torch
  from transformers import AutoFeatureExtractor, AutoModelForImageClassification
  from einops import rearrange
  import gradio
  import call_labels

  # define the feature extractor
- extractor = AutoFeatureExtractor.from_pretrained("vincentclaes/mit-indoor-scenes")

- # define the pretrained model
- model = AutoModelForImageClassification.from_pretrained("vincentclaes/mit-indoor-scenes")

- # retrieve the labels provided from MIT Indoor Scenes dataset (https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019)
- labels = call_labels.call_labels()

- # call model.eval() to assert that we are evaluating the model and not updating the weights
- model.eval()
-
- # define the function used for model inference
- def classify(image):
-     # disable gradient calculation
      with torch.no_grad():
          # extract features from the image input
-         inputs = extractor(images=image, return_tensors='pt')
          # call the logits parameter only (object: SequenceClassifierOutput)
-         outputs = model(**inputs).logits
          # remove the batch dimension
          outputs = rearrange(outputs, '1 j->j')
          # use the softmax function to convert the logits into probabilities
-         outputs = torch.nn.functional.softmax(outputs)
          # convert the data type from tensor to a numpy array
          outputs = outputs.cpu().numpy()
-         # returns a key-value pair of the id labels and its corresponding probabilities
-         return {labels[str(i)]: float(outputs[i]) for i in range(len(labels))}
-
- # define the gradio interface
- gradio.Interface(fn=classify,
-                  inputs=gradio.inputs.Image(shape=(224,224),
-                                             image_mode='RGB',
-                                             source='upload',
-                                             tool='editor',
-                                             type='pil',
-                                             label=None,
-                                             optional=False),
-                  outputs=gradio.outputs.Label(num_top_classes=5,
-                                               type='auto'),
-                  theme='grass',
-                  examples=[['bedroom.jpg'],
-                            ['bathroom.jpg'],
-                            ['samsung_room.jpg']],
-                  live=True,
-                  layout='horizontal',
-                  title='Indoor Scene Recognition',
-                  description='A smart and easy-to-use indoor scene classifier. Start by uploading an input image of an indoor scene. The outputs are the top five indoor scene classes that best describe your input image.',
-                  article='''<h2>Additional Information</h2><p style='text-align: justify'>This indoor scene classifier employs the <b><a href='https://huggingface.co/google/vit-base-patch16-224-in21k' target='_blank'>google/vit-base-patch16-224-in21k</a></b>, a <b>Vision Transformer (ViT)</b> model pre-trained on the <b><a href='https://github.com/Alibaba-MIIL/ImageNet21K' target='_blank'>ImageNet-21k</a></b> (14 million images, 21,843 classes) at a resolution of 224 pixels by 224 pixels and was first introduced in the paper <b><a href='https://arxiv.org/abs/2010.11929' target='_blank'>An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</a></b> by Dosovitskiy et al. It was then fine-tuned on the <b><a href='https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019' target='_blank'>MIT Indoor Scenes</a></b> data set from Kaggle. The source model used in this space is from <b><a href='https://huggingface.co/vincentclaes/mit-indoor-scenes' target='_blank'>vincentclaes/mit-indoor-scenes</a></b>.</p>
-                  <p style='text-align: justify'>For further research on the Vision Transformer, the original GitHub repository is found in <b><a href='https://github.com/google-research/vision_transformer' target='_blank'>this link</a></b>.</p>
-                  <h2>Disclaimer</h2>
-                  <p style='text-align: justify'>The team releasing the Vision Transformer did not write a model card for it via Hugging Face. Hence, the Vision Transformer model card released in the Hugging Face Models library has been written by the Hugging Face team.</p>
-                  <h2>Limitations</h2>
-                  <p style='text-align: justify'>The model was trained only on 67 classes (indoor scenes). Hence, the model should perform better if the input indoor scene image belongs to one of the target classes it was trained on.</p>''',
-                  allow_flagging='never').launch()
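The helper module `call_labels` is not part of this diff. Both the removed `classify` and the new `ViT_inference` index into `labels` with string keys (`labels[str(i)]`), so `call_labels.call_labels()` presumably returns a dictionary mapping class-index strings to the 67 MIT Indoor Scenes label names. As a hedged aside (not code from this repository), a map of the same shape can usually be built directly from the fine-tuned model's config:

# Minimal sketch, not part of this commit: one way to obtain the kind of label map
# that call_labels.call_labels() presumably returns (a dict keyed by string class
# indices, judging from the labels[str(i)] lookups in app.py).
from transformers import AutoModelForImageClassification

vit = AutoModelForImageClassification.from_pretrained('vincentclaes/mit-indoor-scenes')
# config.id2label maps class ids to class names; re-key with strings so the
# lookups match the labels[str(i)] pattern used in app.py.
labels = {str(i): name for i, name in vit.config.id2label.items()}
print(len(labels))      # expected: 67 MIT Indoor Scenes classes
print(labels.get('0'))  # one of the indoor scene class names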
  from transformers import AutoFeatureExtractor, AutoModelForImageClassification
  from einops import rearrange
  import gradio
+ import scipy.io.wavfile
+ import time
+ from espnet2.bin.tts_inference import Text2Speech
+ from espnet2.utils.types import str_or_none
  import call_labels

  # define the feature extractor
+ ViT_extractor = AutoFeatureExtractor.from_pretrained('vincentclaes/mit-indoor-scenes')

+ # define the pretrained ViT model
+ ViT_model = AutoModelForImageClassification.from_pretrained('vincentclaes/mit-indoor-scenes')

+ # call eval() to put the model in evaluation mode (this changes the forward() behaviour of modules such as dropout)
+ ViT_model.eval()

+ # define the function used for the ViT model inference
+ def ViT_inference(image):
+     # disable gradient calculation/backpropagation
      with torch.no_grad():
          # extract features from the image input
+         inputs = ViT_extractor(images=image, return_tensors='pt')
          # call the logits parameter only (object: SequenceClassifierOutput)
+         outputs = ViT_model(**inputs).logits
          # remove the batch dimension
          outputs = rearrange(outputs, '1 j->j')
          # use the softmax function to convert the logits into probabilities
+         outputs = torch.nn.functional.softmax(outputs, dim=0)
          # convert the data type from tensor to a numpy array
          outputs = outputs.cpu().numpy()
+         # return a key-value pair composed of the id labels and their corresponding probabilities
+         # return {labels[str(i)]: float(outputs[i]) for i in range(len(labels))}  # (uncomment this for debugging purposes only)
+         # define a dictionary mapping the id labels to their corresponding probabilities
+         logit_dict = {labels[str(i)]: float(outputs[i]) for i in range(len(labels))}
+         # retrieve the label with the maximum probability
+         max_key = max(logit_dict, key=logit_dict.get)
+         # format it as a string/text and pass it to a variable
+         tts_input = 'In front of you is the {}.'.format(str(max_key))
+         # return the text used as input to the text-to-speech model
+         return tts_input
+
+ # define the ViT gradio interface
+ ViT_interface = gradio.Interface(fn=ViT_inference,
+                                  inputs=gradio.inputs.Image(shape=(224,224),
+                                                             image_mode='RGB',
+                                                             source='upload',
+                                                             tool='editor',
+                                                             type='pil',
+                                                             label='Input: Indoor Scene Image'),
+                                  outputs='text')
+
+ # define the pretrained TTS model
+ TTS_model = Text2Speech.from_pretrained(
+     # call on the trained model
+     model_tag=str_or_none('kan-bayashi/ljspeech_vits'),
+     # set the vocoder
+     vocoder_tag=str_or_none('none'),
+     # set the device it should use
+     device='cpu',
+     # only for Tacotron 2 & Transformer
+     threshold=0.5,
+     # only for Tacotron 2
+     minlenratio=0.0,
+     maxlenratio=10.0,
+     use_att_constraint=False,
+     backward_window=1,
+     forward_window=3,
+     # only for FastSpeech & FastSpeech2 & VITS
+     speed_control_alpha=1.0,
+     # only for VITS
+     noise_scale=0.333,
+     noise_scale_dur=0.333,
+ )
+
+ # define the function used for the TTS model inference
+ def TTS_inference(text):
+     with torch.no_grad():
+         wav = TTS_model(text)['wav']
+     scipy.io.wavfile.write('out.wav', TTS_model.fs, wav.view(-1).cpu().numpy())
+     return 'out.wav'
+
+ # define the TTS gradio interface
+ TTS_interface = gradio.Interface(fn=TTS_inference,
+                                  inputs='text',
+                                  outputs=gradio.outputs.Audio(type='file', label='Output: Audio'))
+
+ # combine the two interfaces using gradio.mix.Series (the ViT text output feeds the TTS input)
+ img2speech = gradio.mix.Series(ViT_interface, TTS_interface,
+                                theme='grass',
+                                live=True,
+                                layout='horizontal',
+                                title='''Hearing What's In Front of You: Indoor Scene Recognition-to-Speech''',
+                                description='Designed for blind and visually impaired people: a smart and easy-to-use indoor scene classifier-to-speech. Start by uploading an input image of an indoor scene. The output is an audio file that describes what is in front of you.',
+                                article='''<h2>Additional Information</h2><p style='text-align: justify'>This indoor scene classifier employs the <b><a href='https://huggingface.co/google/vit-base-patch16-224-in21k' target='_blank'>google/vit-base-patch16-224-in21k</a></b>, a <b>Vision Transformer (ViT)</b> model pre-trained on the <b><a href='https://github.com/Alibaba-MIIL/ImageNet21K' target='_blank'>ImageNet-21k</a></b> (14 million images, 21,843 classes) at a resolution of 224 pixels by 224 pixels and was first introduced in the paper <b><a href='https://arxiv.org/abs/2010.11929' target='_blank'>An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</a></b> by Dosovitskiy et al. It was then fine-tuned on the <b><a href='https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019' target='_blank'>MIT Indoor Scenes</a></b> data set from Kaggle. The source model used in this space is from <b><a href='https://huggingface.co/vincentclaes/mit-indoor-scenes' target='_blank'>vincentclaes/mit-indoor-scenes</a></b>.</p>
+                                <p style='text-align: justify'>For further research on the Vision Transformer, the original GitHub repository is found in <b><a href='https://github.com/google-research/vision_transformer' target='_blank'>this link</a></b>.</p>
+                                <p style='text-align: justify'>The Text-to-Speech model is from <b><a href='https://huggingface.co/espnet/kan-bayashi_ljspeech_vits' target='_blank'>espnet/kan-bayashi_ljspeech_vits</a></b>. It was imported from <b><a href='https://zenodo.org/record/5443814/' target='_blank'>this link</a></b> and was trained using the ljspeech/tts1 recipe in <b><a href='https://github.com/espnet/espnet/' target='_blank'>ESPnet: end-to-end speech processing toolkit</a></b>. The published work referenced by this model is found in <b><a href='https://arxiv.org/pdf/1804.00015.pdf' target='_blank'>this link</a></b>.</p>
+                                <h2>Disclaimer</h2>
+                                <p style='text-align: justify'>The team releasing the Vision Transformer did not write a model card for it via Hugging Face. Hence, the Vision Transformer model card released in the Hugging Face Models library has been written by the Hugging Face team.</p>
+                                <h2>Limitations</h2>
+                                <p style='text-align: justify'>The model was trained only on 67 classes (indoor scenes). Hence, the model should perform better if the input indoor scene image belongs to one of the target classes it was trained on. For demonstration purposes, the speech output is currently in English, but the approach can be adapted to other major languages.</p>
+                                <h2>Credits</h2>
+                                <p style='text-align: justify'>I would like to express my gratitude to <b><a href='https://github.com/vincentclaes' target='_blank'>Vincent Claes</a></b> and <b><a href='https://github.com/siddhu001' target='_blank'>Siddhant Arora</a></b> for uploading the ViT and TTS models to the Hugging Face Models library, respectively, purely for academic and research purposes. All credit goes to these two brilliant people.''',
+                                allow_flagging='never').launch()
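For readers who want to exercise the new pipeline outside the Gradio UI, the sketch below chains the two inference functions directly. It is not part of the commit and rests on a few assumptions: `ViT_inference`, `TTS_inference`, and a `labels` dictionary (presumably still produced by `call_labels.call_labels()`, since that line does not reappear in the added hunk) are in scope, and an example image such as `bedroom.jpg` from the previous version's examples list is present in the Space.

# Minimal smoke test, not from the commit: run the image-to-speech chain without Gradio.
# Assumes ViT_inference, TTS_inference and labels are defined as in app.py, and that an
# example image file named 'bedroom.jpg' exists alongside the script.
from PIL import Image

image = Image.open('bedroom.jpg').convert('RGB')  # the feature extractor handles resizing
caption = ViT_inference(image)                    # e.g. 'In front of you is the bedroom.'
audio_path = TTS_inference(caption)               # synthesizes speech and writes 'out.wav'
print(caption, '->', audio_path)

When the Space itself runs, `gradio.mix.Series` performs the same hand-off: the text returned by `ViT_interface` is fed as the input to `TTS_interface`, so the published demo exposes the whole image-to-audio chain behind a single upload box.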