Spaces · Runtime error
hong_seungbum committed
Commit c7f5de3 · 1 Parent(s): ec2e55c

add application file

Files changed:
- .gitignore +1 -0
- README.md +0 -13
- __pycache__/app.cpython-39.pyc +0 -0
- __pycache__/main.cpython-39.pyc +0 -0
- app.py +42 -0
- libs/__pycache__/model_list.cpython-39.pyc +0 -0
- libs/model_list.py +6 -0
- model_test.py +27 -0
- models/InstructBlip.py +54 -0
- models/__init__.py +8 -0
- models/__pycache__/InstructBlip.cpython-39.pyc +0 -0
- models/__pycache__/__init__.cpython-39.pyc +0 -0
- models/__pycache__/blip.cpython-39.pyc +0 -0
- models/__pycache__/blip2.cpython-39.pyc +0 -0
- models/__pycache__/vit_gpt2.cpython-39.pyc +0 -0
- models/blip.py +31 -0
- models/blip2.py +33 -0
- models/vit_gpt2.py +46 -0
- requirements.txt +91 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+ venv
README.md
CHANGED
@@ -1,13 +0,0 @@
- ---
- title: Compare Image Question Answer
- emoji: π
- colorFrom: gray
- colorTo: blue
- sdk: gradio
- sdk_version: 4.1.2
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/app.cpython-39.pyc
ADDED
Binary file (1.45 kB)

__pycache__/main.cpython-39.pyc
ADDED
Binary file (622 Bytes)
app.py
ADDED
@@ -0,0 +1,42 @@
+ from typing import List
+ import gradio as gr
+ from PIL import Image
+ from models import load_transformers
+ from libs.model_list import model_list
+
+
+
+ def multiple_image_captioning(input_img: Image.Image, question: str) -> List:
+     results = []
+     for model_name, pretrained_paths in model_list.items():
+
+         for pretrained_path in pretrained_paths:
+             try:
+                 process = load_transformers(name=model_name, model_pretrain=pretrained_path)
+
+                 if question == '':
+                     text = process.image_captioning(input_img)
+                 else:
+                     text = process.visual_question_answering(input_img, question)  # forward the user's question to the model's VQA method
+             except Exception as e:
+                 text = str(e)
+
+             results.append(text)
+     return results
+
+
+
+ question_text_Box = gr.Textbox(label="Question")
+
+ outputs = []
+ for model_name, pretrained_paths in model_list.items():
+     for pretrained_path in pretrained_paths:
+         outputs.append(gr.Textbox(label=model_name, info=pretrained_path))
+
+
+ demo = gr.Interface(fn=multiple_image_captioning,
+                     inputs=[gr.Image(type='pil'), question_text_Box],
+                     outputs=outputs,
+                     )
+
+ demo.launch()
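A quick way to sanity-check this handler outside Gradio (not part of the commit, just a sketch; "example.jpg" is a placeholder for any local test image): an empty question takes the captioning branch, any other string takes the VQA branch, and the returned list is ordered the same way as the outputs Textboxes, one entry per (model, checkpoint) pair in model_list.

from PIL import Image

img = Image.open("example.jpg")                                    # placeholder test image
print(multiple_image_captioning(img, ""))                          # empty question -> image_captioning for every checkpoint
print(multiple_image_captioning(img, "How many cats are there?"))  # non-empty question -> visual_question_answering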
libs/__pycache__/model_list.cpython-39.pyc
ADDED
Binary file (402 Bytes)
libs/model_list.py
ADDED
@@ -0,0 +1,6 @@
+ model_list = {'blip2': ["Salesforce/blip2-opt-2.7b", "Salesforce/blip2-flan-t5-xxl"],
+               'blip': ["Salesforce/blip-vqa-base"],
+               'vit_gpt2': ["nlpconnect/vit-gpt2-image-captioning"],
+               'InstructBlip': ["Salesforce/instructblip-vicuna-7b"]
+               }
+
model_test.py
ADDED
@@ -0,0 +1,27 @@
+ from PIL import Image
+ import requests
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+ model = Blip2ForConditionalGeneration.from_pretrained(
+     "Salesforce/blip2-opt-2.7b", device_map={"": 0}, torch_dtype=torch.float16
+ )  # doctest: +IGNORE_RESULT
+
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+
+ generated_ids = model.generate(**inputs)
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+ print(generated_text)
+
+ prompt = "Question: how many cats are there? Answer:"
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)  # same device/dtype handling as above
+
+ generated_ids = model.generate(**inputs)
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+ print(generated_text)
models/InstructBlip.py
ADDED
@@ -0,0 +1,54 @@
+ from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
+ import torch
+ from PIL import Image
+
+
+
+ class InstructBlip:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     def __init__(self, model_pretrain: str = "Salesforce/instructblip-vicuna-7b"):
+         self.model = InstructBlipForConditionalGeneration.from_pretrained(
+             model_pretrain, device_map={"": 0}, torch_dtype=torch.float16)
+         self.processor = InstructBlipProcessor.from_pretrained(model_pretrain)
+
+     def image_captioning(self, image: Image.Image) -> str:
+         prompt = "What are the features of this picture?"
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
+
+         outputs = self.model.generate(
+             **inputs,
+             do_sample=False,
+             num_beams=5,
+             max_length=256,
+             min_length=1,
+             top_p=0.9,
+             repetition_penalty=1.5,
+             length_penalty=1.0,
+             temperature=1,
+         )
+         generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+
+         return generated_text
+
+     def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
+
+         outputs = self.model.generate(
+             **inputs,
+             do_sample=False,
+             num_beams=5,
+             max_length=256,
+             min_length=1,
+             top_p=0.9,
+             repetition_penalty=1.5,
+             length_penalty=1.0,
+             temperature=1,
+         )
+         generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+
+         return generated_text
+
+
+
+
models/__init__.py
ADDED
@@ -0,0 +1,8 @@
+ import importlib
+
+ def load_transformers(name: str, model_pretrain: str):
+     model_module = importlib.import_module(f"models.{name}")
+     model_class = getattr(model_module, name)
+     model_instance = model_class(model_pretrain=model_pretrain)
+
+     return model_instance
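For context, load_transformers is a convention-based loader rather than an explicit registry: each key in libs/model_list.py ("blip", "blip2", "vit_gpt2", "InstructBlip") must match both a module file under models/ and the class name defined in that module, and every class is expected to provide image_captioning(image) and visual_question_answering(image, prompt). A minimal usage sketch with values taken from this commit (some_pil_image is a placeholder for any PIL image):

from models import load_transformers

# "blip2" resolves to models/blip2.py and the class blip2 defined there
process = load_transformers(name="blip2", model_pretrain="Salesforce/blip2-opt-2.7b")
caption = process.image_captioning(some_pil_image)                                  # captioning path
answer = process.visual_question_answering(some_pil_image, "What is shown here?")   # VQA path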
models/__pycache__/InstructBlip.cpython-39.pyc
ADDED
Binary file (1.87 kB)

models/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (470 Bytes)

models/__pycache__/blip.cpython-39.pyc
ADDED
Binary file (1.62 kB)

models/__pycache__/blip2.cpython-39.pyc
ADDED
Binary file (1.62 kB)

models/__pycache__/vit_gpt2.cpython-39.pyc
ADDED
Binary file (1.93 kB)
models/blip.py
ADDED
@@ -0,0 +1,31 @@
+
+ from PIL import Image
+ from transformers import AutoProcessor, BlipForQuestionAnswering
+ import torch
+
+
+ class blip:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+     def __init__(self, model_pretrain: str = "Salesforce/blip-vqa-base"):
+         self.processor = AutoProcessor.from_pretrained(model_pretrain)
+         self.model = BlipForQuestionAnswering.from_pretrained(
+             model_pretrain, device_map={"": 0}, torch_dtype=torch.float16
+         )
+
+     def image_captioning(self, image: Image.Image) -> str:
+
+         text = "What are the features of this picture?"
+         inputs = self.processor(images=image, text=text, return_tensors="pt").to(self.device, torch.float16)
+         outputs = self.model.generate(**inputs)
+
+         return self.processor.decode(outputs[0], skip_special_tokens=True)
+
+     def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device, torch.float16)
+
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         return generated_text
models/blip2.py
ADDED
@@ -0,0 +1,33 @@
+
+ from PIL import Image
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
+ import torch
+ from models import load_transformers
+
+
+ class blip2:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+     def __init__(self, model_pretrain: str = "Salesforce/blip2-opt-2.7b"):
+         self.processor = Blip2Processor.from_pretrained(model_pretrain)
+         self.model = Blip2ForConditionalGeneration.from_pretrained(
+             model_pretrain, device_map={"": 0}, torch_dtype=torch.float16
+         )
+
+
+     def image_captioning(self, image: Image.Image) -> str:
+         inputs = self.processor(images=image, return_tensors="pt").to(self.device, torch.float16)
+
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         return generated_text
+
+     def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(device=self.device, dtype=torch.float16)
+
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         return generated_text
models/vit_gpt2.py
ADDED
@@ -0,0 +1,46 @@
+
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+ import torch
+ from PIL import Image
+
+
+
+ from PIL import Image  # duplicate import
+ from transformers import AutoProcessor, BlipForQuestionAnswering  # unused in this module
+ import torch  # duplicate import
+ from models import load_transformers  # unused in this module
+
+
+ class vit_gpt2:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     max_length = 16
+     num_beams = 4
+     gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+
+     def __init__(self, model_pretrain: str = "nlpconnect/vit-gpt2-image-captioning"):
+         self.model = VisionEncoderDecoderModel.from_pretrained(
+             model_pretrain, device_map={"": 0}, torch_dtype=torch.float16)
+         self.feature_extractor = ViTImageProcessor.from_pretrained(model_pretrain)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_pretrain)
+
+     def image_captioning(self, image: Image.Image) -> str:
+         pixel_values = self.feature_extractor(images=[image], return_tensors="pt").pixel_values
+         pixel_values = pixel_values.to(self.device, torch.float16)  # match the model's float16 weights
+
+         output_ids = self.model.generate(pixel_values, **self.gen_kwargs)
+
+         preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+
+         return preds[0]
+
+     def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device, torch.float16)  # no self.processor is defined here; vit-gpt2 is captioning-only, so this raises at runtime
+
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         return generated_text
+
+
+
+
requirements.txt
ADDED
@@ -0,0 +1,91 @@
+ accelerate==0.24.1
+ aiofiles==23.2.1
+ altair==5.1.2
+ annotated-types==0.6.0
+ anyio==3.7.1
+ attrs==23.1.0
+ certifi==2023.7.22
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.2.0
+ cycler==0.12.1
+ exceptiongroup==1.1.3
+ fastapi==0.104.1
+ ffmpy==0.3.1
+ filelock==3.13.1
+ fonttools==4.44.0
+ fsspec==2023.10.0
+ gradio==4.1.2
+ gradio_client==0.7.0
+ h11==0.14.0
+ httpcore==1.0.1
+ httpx==0.25.1
+ huggingface-hub==0.17.3
+ idna==3.4
+ importlib-resources==6.1.1
+ Jinja2==3.1.2
+ jsonschema==4.19.2
+ jsonschema-specifications==2023.7.1
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ matplotlib==3.8.1
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.2.1
+ numpy==1.26.1
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.18.1
+ nvidia-nvjitlink-cu12==12.3.52
+ nvidia-nvtx-cu12==12.1.105
+ orjson==3.9.10
+ packaging==23.2
+ pandas==2.1.2
+ Pillow==10.1.0
+ psutil==5.9.6
+ pydantic==2.4.2
+ pydantic_core==2.10.1
+ pydub==0.25.1
+ Pygments==2.16.1
+ pyparsing==3.1.1
+ python-dateutil==2.8.2
+ python-multipart==0.0.6
+ pytz==2023.3.post1
+ PyYAML==6.0.1
+ referencing==0.30.2
+ regex==2023.10.3
+ requests==2.31.0
+ rich==13.6.0
+ rpds-py==0.12.0
+ safetensors==0.4.0
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.0
+ starlette==0.27.0
+ sympy==1.12
+ tokenizers==0.14.1
+ tomlkit==0.12.0
+ toolz==0.12.0
+ torch==1.12.1+cu113
+ torchaudio==0.12.1+cu113
+ torchvision==0.13.1+cu113
+ tqdm==4.66.1
+ transformers==4.35.0
+ triton==2.1.0
+ typer==0.9.0
+ typing_extensions==4.8.0
+ tzdata==2023.3
+ urllib3==2.0.7
+ uvicorn==0.24.0.post1
+ websockets==11.0.3
+ zipp==3.17.0
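Note on the torch pins above: the +cu113 builds (torch==1.12.1+cu113, torchvision==0.13.1+cu113, torchaudio==0.12.1+cu113) are not hosted on PyPI, so a plain pip install of this file will usually fail to resolve them unless PyTorch's CUDA 11.3 wheel index is added, roughly along the lines of:

pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113

The index URL is the standard PyTorch wheel index for CUDA 11.3, not something defined in this repository.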