D0k-tor committed on
Commit a1fde91 · 1 Parent(s): 2851b05

Update app.py

Files changed (1)
  1. app.py +67 -32
app.py CHANGED
@@ -1,54 +1,89 @@
-import gradio as gr
-import streamlit as st
+# import gradio as gr
+# import streamlit as st
+# import torch
+# import re
+# from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
+
+# device='cpu'
+# encoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
+# decoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
+# model_checkpoint = "ydshieh/vit-gpt2-coco-eng"
+# feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
+# tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
+# model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)
+
+# def predict(image,max_length=64, num_beams=4):
+#     input_image = Image.open(image)
+#     model.eval()
+#     pixel_values = feature_extractor(images=[input_image], return_tensors="pt").pixel_values
+#     with torch.no_grad():
+#         output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
+#     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+#     preds = [pred.strip() for pred in preds]
+#     return preds[0]
+
+#     # image = image.convert('RGB')
+#     # image = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
+#     # clean_text = lambda x: x.replace('<|endoftext|>','').split('\n')[0]
+#     # caption_ids = model.generate(image, max_length = max_length)[0]
+#     # caption_text = clean_text(tokenizer.decode(caption_ids))
+#     # return caption_text
+
+# # st.title("Image to Text using Lora")
+
+# inputs = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
+# output = gr.outputs.Textbox(type="text",label="Captions")
+# description = "NTT Data Bilbao team"
+# title = "Image to Text using Lora"
+
+# interface = gr.Interface(
+#     fn=predict,
+#     description=description,
+#     inputs = inputs,
+#     theme="grass",
+#     outputs=output,
+#     title=title,
+# )
+# interface.launch(debug=True)
+
 import torch
 import re
+import gradio as gr
 from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

-# def greet(name):
-#     return "Hello " + name + "!!"
-
-# iface = gr.Interface(fn=greet, inputs="text", outputs="text")
-# iface.launch()
-
 device='cpu'
-encoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
-decoder_checkpoint = "ydshieh/vit-gpt2-coco-en"
-model_checkpoint = "ydshieh/vit-gpt2-coco-eng"
+encoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
+decoder_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
+model_checkpoint = "nlpconnect/vit-gpt2-image-captioning"
 feature_extractor = ViTFeatureExtractor.from_pretrained(encoder_checkpoint)
 tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
 model = VisionEncoderDecoderModel.from_pretrained(model_checkpoint).to(device)


 def predict(image,max_length=64, num_beams=4):
-    input_image = Image.open(image)
-    model.eval()
-    pixel_values = feature_extractor(images=[input_image], return_tensors="pt").pixel_values
-    with torch.no_grad():
-        output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences
-    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-    preds = [pred.strip() for pred in preds]
-    return preds[0]
-
-    # image = image.convert('RGB')
-    # image = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
-    # clean_text = lambda x: x.replace('<|endoftext|>','').split('\n')[0]
-    # caption_ids = model.generate(image, max_length = max_length)[0]
-    # caption_text = clean_text(tokenizer.decode(caption_ids))
-    # return caption_text
+    image = image.convert('RGB')
+    image = feature_extractor(image, return_tensors="pt").pixel_values.to(device)
+    clean_text = lambda x: x.replace('<|endoftext|>','').split('\n')[0]
+    caption_ids = model.generate(image, max_length = max_length)[0]
+    caption_text = clean_text(tokenizer.decode(caption_ids))
+    return caption_text
+

-# st.title("Image to Text using Lora")

-inputs = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
-output = gr.outputs.Textbox(type="text",label="Captions")
-description = "NTT Data Bilbao team"
-title = "Image to Text using Lora"
+input = gr.inputs.Image(label="Upload any Image", type = 'pil', optional=True)
+output = gr.outputs.Textbox(type="auto",label="Captions")
+examples = [f"example{i}.jpg" for i in range(1,7)]

+title = "Image Captioning "
+description = "Made by : shreyasdixit.tech"
 interface = gr.Interface(
+
     fn=predict,
     description=description,
-    inputs = inputs,
+    inputs = input,
     theme="grass",
     outputs=output,
+    examples = examples,
     title=title,
 )
 interface.launch(debug=True)
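
For quick sanity-checking outside the Gradio UI, the updated predict can be called directly on a PIL image. A minimal sketch, assuming the model, feature_extractor, tokenizer, and predict defined in app.py are already loaded, and that example1.jpg (one of the bundled example images listed in the diff) sits in the working directory:

# Standalone check of the updated captioning pipeline (sketch only).
from PIL import Image

img = Image.open("example1.jpg")
caption = predict(img)  # default max_length=64; note num_beams is accepted but never passed to generate
print(caption)

Note also that gr.inputs.Image and gr.outputs.Textbox are the legacy Gradio API; recent Gradio releases expose these components as gr.Image and gr.Textbox.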