yesidcanoc
/

image-captioning-swin-tiny-distilgpt2

+# Image captioning model
+## How To use this model.
+Adapt the code below to your needs.
+```
+import os
+from PIL import Image
+import torchvision.transforms as transforms
+from transformers import GPT2TokenizerFast, VisionEncoderDecoderModel
+class DataProcessing:
+    def __init__(self):
+        # GPT-2 tokenizer
+        self.tokenizer = GPT2TokenizerFast.from_pretrained('distilgpt2')
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        # Define the transforms to be applied to the images
+        self.transform = transforms.Compose([
+            transforms.Resize((224, 224)),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+        ])
+class GenerateCaptions(DataProcessing):
+    NUM_BEAMS = 3
+    MAX_LENGTH = 15
+    EARLY_STOPPING = True
+    DO_SAMPLE = True
+    TOP_K = 10
+    NUM_RETURN_SEQUENCES = 2 # number of captions to generate
+    def __init__(self, captioning_model):
+        self.captioning_model = captioning_model
+        super().__init__()
+    def read_img_predict(self, path):
+        try:
+            with Image.open(path) as img:
+                if img.mode != "RGB":
+                    img = img.convert('RGB')
+                img_transformed = self.transform(img).unsqueeze(0)
+                # tensor dimensions max_lenght X num_return_sequences, where ij == some_token_id
+                model_output = self.captioning_model.generate(
+                    img_transformed,
+                    num_beams=self.NUM_BEAMS,
+                    max_length=self.MAX_LENGTH,
+                    early_stopping=self.EARLY_STOPPING,
+                    do_sample=self.DO_SAMPLE,
+                    top_k=self.TOP_K,
+                    num_return_sequences=self.NUM_RETURN_SEQUENCES,
+                )
+                # g is a tensor like this one: tensor([50256,    13,   198,   198,   198,   198,   198,   198,   198, 50256,
+                # 50256, 50256, 50256, 50256, 50256])
+                captions = [self.tokenizer.decode(g, skip_special_tokens=True).strip() for g in model_output]
+                return captions
+        except FileNotFoundError:
+            raise FileNotFoundError(f"File not found: {path}")
+    def generate_caption(self, path):
+        """
+        Generate captions for a single image or a directory of images
+        :param path: path to image or directory of images
+        :return: captions
+        """
+        if os.path.isdir(path):
+            self.decoded_predictions = []
+            for root, dirs, files in os.walk(path):
+                for file in files:
+                    self.decoded_predictions.append(self.read_img_predict(os.path.join(root, file)))
+            return self.decoded_predictions
+        elif os.path.isfile(path):
+            return self.read_img_predict(path)
+        else:
+            raise ValueError(f"Invalid path: {path}")
+image_captioning_model = VisionEncoderDecoderModel.from_pretrained("yesidcanoc/image-captioning-swin-tiny-distilgpt2")
+generate_captions = GenerateCaptions(image_captioning_model)
+captions = generate_captions.generate_caption('../data/test_data/images')
+print(captions)
+```