egmaminta committed
Commit: 6ee66b6
1 Parent(s): 241c56a

Update app.py

Files changed (1):
  app.py (+2, -2)
app.py CHANGED
@@ -104,11 +104,11 @@ img2speech = gradio.mix.Series(ViT_interface,
  description='For the blind and visually-impaired people. A smart and easy-to-use indoor scene classifier-to-speech. Start by uploading an input image of an indoor scene. The output is an audio file saying what you are externally facing in front of.',
  article='''<h2>Additional Information</h2><p style='text-align: justify'>This indoor scene classifier employs the <b><a href='https://huggingface.co/google/vit-base-patch16-224-in21k' target='_blank'>google/vit-base-patch16-224-in21k</a></b>, a <b>Vision Transformer (ViT)</b> model pre-trained on the <b><a href='https://github.com/Alibaba-MIIL/ImageNet21K' target='_blank'>ImageNet-21k</a></b> (14 million images, 21,843 classes) at a resolution of 224 pixels by 224 pixels and was first introduced in the paper <b><a href='https://arxiv.org/abs/2010.11929' target='_blank'>An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</a></b> by Dosovitskiy et al. It was then fine-tuned on the <b><a href='https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019' target='_blank'>MIT Indoor Scenes</a></b> data set from Kaggle. The source model used in this space is from <b><a href='https://huggingface.co/vincentclaes/mit-indoor-scenes' target='_blank'>vincentclaes/mit-indoor-scenes</a></b>.</p>
  <p style='text-align: justify'>For further research on the Vision Transformer, the original GitHub repository is found in <b><a href='https://github.com/google-research/vision_transformer' target='_blank'>this link</a></b>.</p>
- <p style='text-align: justify'>The Text-to-Speech model is from <b><a href='https://huggingface.co/espnet/kan-bayashi_ljspeech_vits' target='_blank'>espnet/kan-bayashi_ljspeech_vits</a></b>. It was imported from <b><a href='https://zenodo.org/record/5443814/' target='_blank'>this link</a></b> and was trained using the ljspeech/tts1 recipe in <b><a href='https://github.com/espnet/espnet/' target='_blank'>ESPnet: end-to-end speech processing toolkit</a></b>. The published work being referenced by this model is found in <b><a href='https://arxiv.org/pdf/1804.00015.pdf' target='_blank'>this link</a></b>.</p>
+ <p style='text-align: justify'>The Text-to-Speech model is from <b><a href='https://huggingface.co/espnet/kan-bayashi_ljspeech_vits' target='_blank'>espnet/kan-bayashi_ljspeech_vits</a></b>. It was imported from <b><a href='https://zenodo.org/record/5443814/' target='_blank'>this link</a></b>, which was uploaded by <b><a href='https://github.com/kan-bayashi' target='_blank'>Tomoki Hayashi</a></b>, and was trained using the ljspeech/tts1 recipe in <b><a href='https://github.com/espnet/espnet/' target='_blank'>ESPnet: end-to-end speech processing toolkit</a></b>. The published work being referenced by this model is found in <b><a href='https://arxiv.org/pdf/1804.00015.pdf' target='_blank'>this link</a></b>.</p>
  <h2>Disclaimer</h2>
  <p style='text-align: justify'>The team releasing the Vision Transformer did not write a model card for it via Hugging Face. Hence, the Vision Transformer model card released in the Hugging Face Models library has been written by the Hugging Face team.</p>
  <h2>Limitations</h2>
  <p style='text-align: justify'>The model was trained only on 67 classes (indoor scenes). Hence, the model should perform better if the input indoor scene image belongs to one of the target classes it was trained on. For demonstration purposes, it temporarily accommodates English as its language but it is flexible and versatile to other common major languages.</p>
  <h2>Credits</h2>
- <p style='text-align: justify'>I would like to express my gratitude to <b><a href='https://github.com/vincentclaes' target='_blank'>Vincent Claes</a></b> and <b><a href='https://github.com/kan-bayashi' target='_blank'>Tomoki Hayashi</a></b> for uploading the ViT and TTS models in the Hugging Face Models library, respectively, purely for academic and research purposes. All credits go to these two brilliant people.''',
+ <p style='text-align: justify'>I would like to express my gratitude to <b><a href='https://github.com/vincentclaes' target='_blank'>Vincent Claes</a></b> and <b><a href='https://github.com/siddhu001' target='_blank'>Siddhant Arora</a></b> for uploading the ViT and TTS models in the Hugging Face Models library, respectively, purely for academic and research purposes. All credits go to these two brilliant people.''',
  allow_flagging='never').launch()
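
For context, the hunk header above shows that app.py chains its models with gradio.mix.Series(ViT_interface, ...). Below is a minimal sketch of how such a classifier-to-speech pipeline might be assembled with that legacy (Gradio 2.x-era) API. The gradio.Interface.load calls and the TTS_interface name are illustrative assumptions; only ViT_interface, the description/article/allow_flagging keyword arguments, and .launch() appear in the diff itself.

# A minimal sketch, assuming the legacy Gradio 2.x mix.Series API referenced
# in the hunk header. The model-loading calls and the TTS_interface name are
# illustrative assumptions, not the Space's verified source.
import gradio

# Hypothetical: load the fine-tuned indoor-scene classifier and the ESPnet
# text-to-speech model from the Hugging Face Hub.
ViT_interface = gradio.Interface.load('huggingface/vincentclaes/mit-indoor-scenes')
TTS_interface = gradio.Interface.load('huggingface/espnet/kan-bayashi_ljspeech_vits')

# Series feeds the classifier's predicted label into the TTS model, so the
# final output is an audio clip naming the detected indoor scene.
img2speech = gradio.mix.Series(ViT_interface,
                               TTS_interface,
                               description='...',  # full strings shown in the diff above
                               article='...',
                               allow_flagging='never')

img2speech.launch()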