egmaminta committed
Commit 03a31ab · 1 Parent(s): 6ee66b6

Update app.py

Files changed (1): app.py (+1, -1)
app.py CHANGED
@@ -104,7 +104,7 @@ img2speech = gradio.mix.Series(ViT_interface,
  description='For the blind and visually-impaired people. A smart and easy-to-use indoor scene classifier-to-speech. Start by uploading an input image of an indoor scene. The output is an audio file saying what you are externally facing in front of.',
  article='''<h2>Additional Information</h2><p style='text-align: justify'>This indoor scene classifier employs the <b><a href='https://huggingface.co/google/vit-base-patch16-224-in21k' target='_blank'>google/vit-base-patch16-224-in21k</a></b>, a <b>Vision Transformer (ViT)</b> model pre-trained on the <b><a href='https://github.com/Alibaba-MIIL/ImageNet21K' target='_blank'>ImageNet-21k</a></b> (14 million images, 21,843 classes) at a resolution of 224 pixels by 224 pixels and was first introduced in the paper <b><a href='https://arxiv.org/abs/2010.11929' target='_blank'>An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale</a></b> by Dosovitskiy et al. It was then fine-tuned on the <b><a href='https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019' target='_blank'>MIT Indoor Scenes</a></b> data set from Kaggle. The source model used in this space is from <b><a href='https://huggingface.co/vincentclaes/mit-indoor-scenes' target='_blank'>vincentclaes/mit-indoor-scenes</a></b>.</p>
  <p style='text-align: justify'>For further research on the Vision Transformer, the original GitHub repository is found in <b><a href='https://github.com/google-research/vision_transformer' target='_blank'>this link</a></b>.</p>
- <p style='text-align: justify'>The Text-to-Speech model is from <b><a href='https://huggingface.co/espnet/kan-bayashi_ljspeech_vits' target='_blank'>espnet/kan-bayashi_ljspeech_vits</a></b>. It was imported from <b><a href='https://zenodo.org/record/5443814/' target='_blank'>this link</a></b>, which was uploaded by <b><a href='https://github.com/kan-bayashi' target='_blank'>Tomoki Hayashi</a></b>, and was trained using the ljspeech/tts1 recipe in <b><a href='https://github.com/espnet/espnet/' target='_blank'>ESPnet: end-to-end speech processing toolkit</a></b>. The published work being referenced by this model is found in <b><a href='https://arxiv.org/pdf/1804.00015.pdf' target='_blank'>this link</a></b>.</p>
+ <p style='text-align: justify'>The Text-to-Speech model is from <b><a href='https://huggingface.co/espnet/kan-bayashi_ljspeech_vits' target='_blank'>espnet/kan-bayashi_ljspeech_vits</a></b>. It was imported from <b><a href='https://zenodo.org/record/5443814/' target='_blank'>this link</a></b> uploaded by <b><a href='https://github.com/kan-bayashi' target='_blank'>Tomoki Hayashi</a></b>, and was trained using the ljspeech/tts1 recipe in <b><a href='https://github.com/espnet/espnet/' target='_blank'>ESPnet: end-to-end speech processing toolkit</a></b>. The published work being referenced by this model is found in <b><a href='https://arxiv.org/pdf/1804.00015.pdf' target='_blank'>this link</a></b>.</p>
  <h2>Disclaimer</h2>
  <p style='text-align: justify'>The team releasing the Vision Transformer did not write a model card for it via Hugging Face. Hence, the Vision Transformer model card released in the Hugging Face Models library has been written by the Hugging Face team.</p>
  <h2>Limitations</h2>
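The hunk only shows the keyword arguments passed to gradio.mix.Series; the code that builds the two interfaces is not part of this commit. For context, here is a minimal sketch of the pipeline being edited, assuming the Gradio 2.x mix API and that both models are loaded from the Hub with Interface.load. The repo IDs are taken from the article text above; the loading calls, the tts_interface name, and the title string are assumptions, not code from app.py.

```python
# Minimal sketch (assumed): chain an indoor-scene classifier into a TTS model
# with the Gradio 2.x mix API, mirroring the gradio.mix.Series call in the hunk.
import gradio
import gradio.mix

# Load both models from the Hugging Face Hub as Gradio interfaces.
# The repo IDs come from the article text; the exact loading code in app.py
# is not shown in this hunk, so these calls are assumptions.
ViT_interface = gradio.Interface.load("huggingface/vincentclaes/mit-indoor-scenes")
tts_interface = gradio.Interface.load("huggingface/espnet/kan-bayashi_ljspeech_vits")

# Series pipes the output of the first interface into the second:
# image -> predicted indoor-scene label -> synthesized speech.
img2speech = gradio.mix.Series(
    ViT_interface,
    tts_interface,
    title="Indoor Scene Classifier-to-Speech",  # hypothetical title
    description="...",  # the description string shown in the hunk above
    article="...",      # the article HTML edited by this commit
)

if __name__ == "__main__":
    img2speech.launch()
```

Series passes the extra keyword arguments (title, description, article) through to the combined Interface, which is why this commit can edit the article text directly inside the Series call.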