# -*- coding: utf-8 -*-
"""Image Captioning with ViT+GPT2

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1P3O0gO5AUqSmM8rE9dxy2tXJ-9jkhxHz
"""

#! pip install transformers -q

#! pip install gradio -q
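
# Note (an assumption, not part of the original notebook): the interface below
# uses the legacy gr.inputs / gr.outputs API, so on a fresh environment pinning
# Gradio below 4.0 (e.g. `pip install "gradio<4"`) is the safer choice; torch
# is also required and ships with Colab by default.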

import requests
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Load the pretrained encoder-decoder checkpoint: a ViT image encoder paired
# with a GPT-2 text decoder, fine-tuned for captioning on the COCO dataset.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Run on GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Beam-search decoding with short captions.
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

def predict_step(image_paths):
  """Generate one caption per entry; entries may be file paths or PIL images."""
  images = []
  for image_path in image_paths:
    # Accept an already-loaded PIL image as well as a path on disk.
    i_image = image_path if isinstance(image_path, Image.Image) else Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")
    images.append(i_image)

  # Resize and normalize the batch into the tensor layout ViT expects.
  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds

#predict_step(['/content/drive/MyDrive/caption generator/horses.png'])
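
# A minimal smoke test, assuming network access: fetch a test image with the
# `requests` import above and caption it. The COCO URL below is an assumption
# used purely for illustration, not part of the original notebook.
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
# print(predict_step([image]))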


import gradio as gr

inputs = [
    gr.inputs.Image(type="pil", label="Original Image")
]

outputs = [
    gr.outputs.Textbox(label="Caption")
]

title = "Image Captioning using ViT + GPT2"
description = "ViT and GPT2 are used to generate a caption for the uploaded image. The model was trained on the COCO dataset. This image captioning model may have biases that we could not identify during our stress testing, so if you find any bias (gender, race, and so on), please use the `Flag` button to flag the image."
article = "<a href='https://huggingface.co/sachin/vit2distilgpt2'>Model Repo on Hugging Face Model Hub</a>"
examples = [
    ["horses.png"],
    ["persons.png"],
    ["football_player.png"]
]

def caption_image(image):
  # Gradio hands the function a single PIL image (type="pil"), not a list of
  # paths, so wrap it in a one-element batch and unwrap the single caption.
  return predict_step([image])[0]

gr.Interface(
    caption_image,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
    theme="huggingface",
).launch(debug=True, enable_queue=True)
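
# (When run from a notebook, launch(share=True) would additionally serve a
# temporary public URL; it is left off here to match the original behavior.)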