Spaces · Runtime error
hong_seungbum committed
Commit c7f5de3 · 1 Parent(s): ec2e55c

add application file

Files changed:
- .gitignore +1 -0
- README.md +0 -13
- __pycache__/app.cpython-39.pyc +0 -0
- __pycache__/main.cpython-39.pyc +0 -0
- app.py +42 -0
- libs/__pycache__/model_list.cpython-39.pyc +0 -0
- libs/model_list.py +6 -0
- model_test.py +27 -0
- models/InstructBlip.py +54 -0
- models/__init__.py +8 -0
- models/__pycache__/InstructBlip.cpython-39.pyc +0 -0
- models/__pycache__/__init__.cpython-39.pyc +0 -0
- models/__pycache__/blip.cpython-39.pyc +0 -0
- models/__pycache__/blip2.cpython-39.pyc +0 -0
- models/__pycache__/vit_gpt2.cpython-39.pyc +0 -0
- models/blip.py +31 -0
- models/blip2.py +33 -0
- models/vit_gpt2.py +46 -0
- requirements.txt +91 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+ venv
README.md
CHANGED
@@ -1,13 +0,0 @@
- ---
- title: Compare Image Question Answer
- emoji: π
- colorFrom: gray
- colorTo: blue
- sdk: gradio
- sdk_version: 4.1.2
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/app.cpython-39.pyc
ADDED
Binary file (1.45 kB)

__pycache__/main.cpython-39.pyc
ADDED
Binary file (622 Bytes)
app.py
ADDED
@@ -0,0 +1,42 @@
+ from typing import List
+ import gradio as gr
+ from PIL import Image
+ from models import load_transformers
+ from libs.model_list import model_list
+
+
+
+ def multiple_image_captioning(input_img: Image.Image, question: str) -> List:
+     results = []
+     for model_name, pretrained_paths in model_list.items():
+
+         for pretrained_path in pretrained_paths:
+             try:
+                 process = load_transformers(name=model_name, model_pretrain=pretrained_path)
+
+                 if question == '':
+                     text = process.image_captioning(input_img)
+                 else:
+                     text = process.visual_question_answering(input_img, question)  # forward the user's question to the model's VQA method
+             except Exception as e:
+                 text = str(e)
+
+             results.append(text)
+     return results
+
+
+
+ question_text_Box = gr.Textbox(label="Question")
+
+ outputs = []
+ for model_name, pretrained_paths in model_list.items():
+     for pretrained_path in pretrained_paths:
+         outputs.append(gr.Textbox(label=model_name, info=pretrained_path))
+
+
+ demo = gr.Interface(fn=multiple_image_captioning,
+                     inputs=[gr.Image(type='pil'), question_text_Box],
+                     outputs=outputs,
+                     )
+
+ demo.launch()
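A quick way to sanity-check this handler outside Gradio (not part of the commit, just a sketch; "example.jpg" is a placeholder for any local test image): an empty question takes the captioning branch, any other string takes the VQA branch, and the returned list is ordered the same way as the outputs Textboxes, one entry per (model, checkpoint) pair in model_list.

from PIL import Image

img = Image.open("example.jpg")                                    # placeholder test image
print(multiple_image_captioning(img, ""))                          # empty question -> image_captioning for every checkpoint
print(multiple_image_captioning(img, "How many cats are there?"))  # non-empty question -> visual_question_answering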
libs/__pycache__/model_list.cpython-39.pyc
ADDED
Binary file (402 Bytes)
libs/model_list.py
ADDED
@@ -0,0 +1,6 @@
+ model_list = {'blip2': ["Salesforce/blip2-opt-2.7b", "Salesforce/blip2-flan-t5-xxl"],
+               'blip': ["Salesforce/blip-vqa-base"],
+               'vit_gpt2': ["nlpconnect/vit-gpt2-image-captioning"],
+               'InstructBlip': ["Salesforce/instructblip-vicuna-7b"]
+               }
+
model_test.py
ADDED
@@ -0,0 +1,27 @@
+ from PIL import Image
+ import requests
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
+ import torch
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+ model = Blip2ForConditionalGeneration.from_pretrained(
+     "Salesforce/blip2-opt-2.7b", device_map={"": 0}, torch_dtype=torch.float16
+ )  # doctest: +IGNORE_RESULT
+
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ inputs = processor(images=image, return_tensors="pt").to(device, torch.float16)
+
+ generated_ids = model.generate(**inputs)
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+ print(generated_text)
+
+ prompt = "Question: how many cats are there? Answer:"
+ inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16)  # same device/dtype handling as above
+
+ generated_ids = model.generate(**inputs)
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+ print(generated_text)
models/InstructBlip.py
ADDED
@@ -0,0 +1,54 @@
+ from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
+ import torch
+ from PIL import Image
+
+
+
+ class InstructBlip:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     def __init__(self, model_pretrain: str = "Salesforce/instructblip-vicuna-7b"):
+         self.model = InstructBlipForConditionalGeneration.from_pretrained(
+             model_pretrain, device_map={"": 0}, torch_dtype=torch.float16)
+         self.processor = InstructBlipProcessor.from_pretrained(model_pretrain)
+
+     def image_captioning(self, image: Image.Image) -> str:
+         prompt = "What are the features of this picture?"
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
+
+         outputs = self.model.generate(
+             **inputs,
+             do_sample=False,
+             num_beams=5,
+             max_length=256,
+             min_length=1,
+             top_p=0.9,
+             repetition_penalty=1.5,
+             length_penalty=1.0,
+             temperature=1,
+         )
+         generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+
+         return generated_text
+
+     def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device)
+
+         outputs = self.model.generate(
+             **inputs,
+             do_sample=False,
+             num_beams=5,
+             max_length=256,
+             min_length=1,
+             top_p=0.9,
+             repetition_penalty=1.5,
+             length_penalty=1.0,
+             temperature=1,
+         )
+         generated_text = self.processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
+
+         return generated_text
+
+
+
+
models/__init__.py
ADDED
@@ -0,0 +1,8 @@
+ import importlib
+
+ def load_transformers(name: str, model_pretrain: str):
+     model_module = importlib.import_module(f"models.{name}")
+     model_class = getattr(model_module, name)
+     model_instance = model_class(model_pretrain=model_pretrain)
+
+     return model_instance
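For context, load_transformers is a convention-based loader rather than an explicit registry: each key in libs/model_list.py ("blip", "blip2", "vit_gpt2", "InstructBlip") must match both a module file under models/ and the class name defined in that module, and every class is expected to provide image_captioning(image) and visual_question_answering(image, prompt). A minimal usage sketch with values taken from this commit (some_pil_image is a placeholder for any PIL image):

from models import load_transformers

# "blip2" resolves to models/blip2.py and the class blip2 defined there
process = load_transformers(name="blip2", model_pretrain="Salesforce/blip2-opt-2.7b")
caption = process.image_captioning(some_pil_image)                                  # captioning path
answer = process.visual_question_answering(some_pil_image, "What is shown here?")   # VQA path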
models/__pycache__/InstructBlip.cpython-39.pyc
ADDED
Binary file (1.87 kB)

models/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (470 Bytes)

models/__pycache__/blip.cpython-39.pyc
ADDED
Binary file (1.62 kB)

models/__pycache__/blip2.cpython-39.pyc
ADDED
Binary file (1.62 kB)

models/__pycache__/vit_gpt2.cpython-39.pyc
ADDED
Binary file (1.93 kB)
models/blip.py
ADDED
@@ -0,0 +1,31 @@
+
+ from PIL import Image
+ from transformers import AutoProcessor, BlipForQuestionAnswering
+ import torch
+
+
+ class blip:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+     def __init__(self, model_pretrain: str = "Salesforce/blip-vqa-base"):
+         self.processor = AutoProcessor.from_pretrained(model_pretrain)
+         self.model = BlipForQuestionAnswering.from_pretrained(
+             model_pretrain, device_map={"": 0}, torch_dtype=torch.float16
+         )
+
+     def image_captioning(self, image: Image.Image) -> str:
+
+         text = "What are the features of this picture?"
+         inputs = self.processor(images=image, text=text, return_tensors="pt").to(self.device, torch.float16)
+         outputs = self.model.generate(**inputs)
+
+         return self.processor.decode(outputs[0], skip_special_tokens=True)
+
+     def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device, torch.float16)
+
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         return generated_text
models/blip2.py
ADDED
@@ -0,0 +1,33 @@
+
+ from PIL import Image
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
+ import torch
+ from models import load_transformers
+
+
+ class blip2:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+     def __init__(self, model_pretrain: str = "Salesforce/blip2-opt-2.7b"):
+         self.processor = Blip2Processor.from_pretrained(model_pretrain)
+         self.model = Blip2ForConditionalGeneration.from_pretrained(
+             model_pretrain, device_map={"": 0}, torch_dtype=torch.float16
+         )
+
+
+     def image_captioning(self, image: Image.Image) -> str:
+         inputs = self.processor(images=image, return_tensors="pt").to(self.device, torch.float16)
+
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         return generated_text
+
+     def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(device=self.device, dtype=torch.float16)
+
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         return generated_text
models/vit_gpt2.py
ADDED
@@ -0,0 +1,46 @@
+
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+ import torch
+ from PIL import Image
+
+
+
+ from PIL import Image  # duplicate import
+ from transformers import AutoProcessor, BlipForQuestionAnswering  # unused in this module
+ import torch  # duplicate import
+ from models import load_transformers  # unused in this module
+
+
+ class vit_gpt2:
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     max_length = 16
+     num_beams = 4
+     gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+
+     def __init__(self, model_pretrain: str = "nlpconnect/vit-gpt2-image-captioning"):
+         self.model = VisionEncoderDecoderModel.from_pretrained(
+             model_pretrain, device_map={"": 0}, torch_dtype=torch.float16)
+         self.feature_extractor = ViTImageProcessor.from_pretrained(model_pretrain)
+         self.tokenizer = AutoTokenizer.from_pretrained(model_pretrain)
+
+     def image_captioning(self, image: Image.Image) -> str:
+         pixel_values = self.feature_extractor(images=[image], return_tensors="pt").pixel_values
+         pixel_values = pixel_values.to(self.device, torch.float16)  # match the model's float16 weights
+
+         output_ids = self.model.generate(pixel_values, **self.gen_kwargs)
+
+         preds = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+
+         return preds[0]
+
+     def visual_question_answering(self, image: Image.Image, prompt: str) -> str:
+         inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.device, torch.float16)  # no self.processor is defined here; vit-gpt2 is captioning-only, so this raises at runtime
+
+         generated_ids = self.model.generate(**inputs)
+         generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+         return generated_text
+
+
+
+
requirements.txt
ADDED
@@ -0,0 +1,91 @@
+ accelerate==0.24.1
+ aiofiles==23.2.1
+ altair==5.1.2
+ annotated-types==0.6.0
+ anyio==3.7.1
+ attrs==23.1.0
+ certifi==2023.7.22
+ charset-normalizer==3.3.2
+ click==8.1.7
+ colorama==0.4.6
+ contourpy==1.2.0
+ cycler==0.12.1
+ exceptiongroup==1.1.3
+ fastapi==0.104.1
+ ffmpy==0.3.1
+ filelock==3.13.1
+ fonttools==4.44.0
+ fsspec==2023.10.0
+ gradio==4.1.2
+ gradio_client==0.7.0
+ h11==0.14.0
+ httpcore==1.0.1
+ httpx==0.25.1
+ huggingface-hub==0.17.3
+ idna==3.4
+ importlib-resources==6.1.1
+ Jinja2==3.1.2
+ jsonschema==4.19.2
+ jsonschema-specifications==2023.7.1
+ kiwisolver==1.4.5
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.3
+ matplotlib==3.8.1
+ mdurl==0.1.2
+ mpmath==1.3.0
+ networkx==3.2.1
+ numpy==1.26.1
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.18.1
+ nvidia-nvjitlink-cu12==12.3.52
+ nvidia-nvtx-cu12==12.1.105
+ orjson==3.9.10
+ packaging==23.2
+ pandas==2.1.2
+ Pillow==10.1.0
+ psutil==5.9.6
+ pydantic==2.4.2
+ pydantic_core==2.10.1
+ pydub==0.25.1
+ Pygments==2.16.1
+ pyparsing==3.1.1
+ python-dateutil==2.8.2
+ python-multipart==0.0.6
+ pytz==2023.3.post1
+ PyYAML==6.0.1
+ referencing==0.30.2
+ regex==2023.10.3
+ requests==2.31.0
+ rich==13.6.0
+ rpds-py==0.12.0
+ safetensors==0.4.0
+ semantic-version==2.10.0
+ shellingham==1.5.4
+ six==1.16.0
+ sniffio==1.3.0
+ starlette==0.27.0
+ sympy==1.12
+ tokenizers==0.14.1
+ tomlkit==0.12.0
+ toolz==0.12.0
+ torch==1.12.1+cu113
+ torchaudio==0.12.1+cu113
+ torchvision==0.13.1+cu113
+ tqdm==4.66.1
+ transformers==4.35.0
+ triton==2.1.0
+ typer==0.9.0
+ typing_extensions==4.8.0
+ tzdata==2023.3
+ urllib3==2.0.7
+ uvicorn==0.24.0.post1
+ websockets==11.0.3
+ zipp==3.17.0
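Note on the torch pins above: the +cu113 builds (torch==1.12.1+cu113, torchvision==0.13.1+cu113, torchaudio==0.12.1+cu113) are not hosted on PyPI, so a plain pip install of this file will usually fail to resolve them unless PyTorch's CUDA 11.3 wheel index is added, roughly along the lines of:

pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113

The index URL is the standard PyTorch wheel index for CUDA 11.3, not something defined in this repository.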