"""Indoor scene recognition-to-speech demo.

Chains two pretrained models behind a single Gradio app: a Vision
Transformer classifies an uploaded indoor-scene image, and a VITS
text-to-speech model reads the predicted scene label aloud.
"""

# required libraries
import time

import gradio
import scipy.io.wavfile
import torch
from einops import rearrange
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none
from transformers import AutoFeatureExtractor, AutoModelForImageClassification

import call_labels

# call the labels
# (data set from: https://www.kaggle.com/itsahmad/indoor-scenes-cvpr-2019)
# The mapping is indexed below with the stringified class index.
labels = call_labels.call_labels()

# define the feature extractor
ViT_extractor = AutoFeatureExtractor.from_pretrained('vincentclaes/mit-indoor-scenes')

# define the pretrained ViT model
ViT_model = AutoModelForImageClassification.from_pretrained('vincentclaes/mit-indoor-scenes')

# switch the module to evaluation mode; it is used for inference only
ViT_model.eval()


def ViT_inference(image):
    """Classify an indoor-scene image and phrase the result as a sentence.

    Args:
        image: The input image, as delivered by the Gradio image component
            (configured below to hand over a PIL image).

    Returns:
        The sentence 'In front of you is the <label>.', where <label> is the
        entry of ``labels`` with the highest predicted probability. This
        sentence is the input of the text-to-speech stage.
    """
    # disable gradient calculation/backpropagation
    with torch.no_grad():
        # extract features from the image input
        inputs = ViT_extractor(images=image, return_tensors='pt')
        # keep only the logits of the classifier output
        logits = ViT_model(**inputs).logits
        # remove the batch dimension (the pattern requires a batch of one)
        logits = rearrange(logits, '1 j->j')
        # convert the logits into probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=0)
        # convert the tensor to a numpy array
        probabilities = probabilities.cpu().numpy()

    # map every label name to its probability
    label_probabilities = {
        labels[str(index)]: float(probabilities[index])
        for index in range(len(labels))
    }
    # retrieve the label with the maximum probability
    best_label = max(label_probabilities, key=label_probabilities.get)
    # format it as the text handed to the text-to-speech model
    return 'In front of you is the {}.'.format(best_label)


# define the ViT gradio interface
ViT_interface = gradio.Interface(
    fn=ViT_inference,
    inputs=gradio.inputs.Image(
        shape=(224, 224),
        image_mode='RGB',
        source='upload',
        tool='editor',
        type='pil',
        label='Input: Indoor Scene Image',
    ),
    outputs='text',
)

# define the pretrained TTS model
TTS_model = Text2Speech.from_pretrained(
    # call on the trained model
    model_tag=str_or_none('kan-bayashi/ljspeech_vits'),
    # set the vocoder
    vocoder_tag=str_or_none('none'),
    # set the device it should use
    device='cpu',
    # only for Tacotron 2 & Transformer
    threshold=0.5,
    # only for Tacotron 2
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # only for FastSpeech & FastSpeech2 & VITS
    speed_control_alpha=1.0,
    # only for VITS
    noise_scale=0.333,
    noise_scale_dur=0.333,
)


def TTS_inference(text):
    """Synthesize speech for ``text`` and save it as a WAV file.

    Args:
        text: The sentence to be spoken.

    Returns:
        The path of the written audio file, always 'out.wav'. The file is
        overwritten on every call.
    """
    with torch.no_grad():
        wav = TTS_model(text)['wav']
        # flatten the waveform and write it at the model's sampling rate
        scipy.io.wavfile.write('out.wav', TTS_model.fs, wav.view(-1).cpu().numpy())
    return 'out.wav'


# define the TTS gradio interface
TTS_interface = gradio.Interface(
    fn=TTS_inference,
    inputs='text',
    outputs=gradio.outputs.Audio(type='file', label='Output: Audio'),
)

# texts displayed on the app page
APP_TITLE = '''Hearing What's In Front of You: Indoor Scene Recognition-to-Speech'''

APP_DESCRIPTION = 'For the blind and visually-impaired people. A smart and easy-to-use indoor scene classifier-to-speech. Start by uploading an input image of an indoor scene. The output is an audio file saying what you are externally facing in front of.'

APP_ARTICLE = '''
This indoor scene classifier employs the google/vit-base-patch16-224-in21k, a Vision Transformer (ViT) model pre-trained on the ImageNet-21k (14 million images, 21,843 classes) at a resolution of 224 pixels by 224 pixels and was first introduced in the paper An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale by Dosovitskiy et al. It was then fine-tuned on the MIT Indoor Scenes data set from Kaggle. The source model used in this space is from vincentclaes/mit-indoor-scenes.
For further research on the Vision Transformer, the original GitHub repository is found in this link.
The Text-to-Speech model is from espnet/kan-bayashi_ljspeech_vits. It was imported from this link and was trained using the ljspeech/tts1 recipe in ESPnet: end-to-end speech processing toolkit. The published work being referenced by this model is found in this link.
The team releasing the Vision Transformer did not write a model card for it via Hugging Face. Hence, the Vision Transformer model card released in the Hugging Face Models library has been written by the Hugging Face team.
The model was trained only on 67 classes (indoor scenes). Hence, the model should perform better if the input indoor scene image belongs to one of the target classes it was trained on. For demonstration purposes, it temporarily accommodates English as its language but it is flexible and versatile to other common major languages.
I would like to express my gratitude to Vincent Claes and Siddhant Arora for uploading the ViT and TTS models in the Hugging Face Models library, respectively, purely for academic and research purposes. All credits go to these two brilliant people.'''

# Combine the two models using the gradio.mix.Series: the text produced by
# the ViT interface becomes the input of the TTS interface.
img2speech = gradio.mix.Series(
    ViT_interface,
    TTS_interface,
    theme='grass',
    # boolean flag (was the string 'True')
    live=True,
    examples=[['bathroom.jpg'], ['bedroom.jpg'], ['samsung_room.jpg']],
    layout='horizontal',
    title=APP_TITLE,
    description=APP_DESCRIPTION,
    article=APP_ARTICLE,
    allow_flagging='never',
)

# launch separately so that ``img2speech`` keeps referring to the interface
# rather than to the value returned by ``launch()``
img2speech.launch()