Upload 11 files
- .gitattributes +2 -0
- my_model/LLAMA2/LLAMA2_config.py +15 -0
- my_model/LLAMA2/LLAMA2_model.py +173 -0
- my_model/extract_objects.py +45 -0
- my_model/fine_tuner/fine_tuner.py +347 -0
- my_model/fine_tuner/fine_tuning_config.py +114 -0
- my_model/fine_tuner/fine_tuning_data/fine_tuning_data_detic.csv +3 -0
- my_model/fine_tuner/fine_tuning_data/fine_tuning_data_yolov5.csv +3 -0
- my_model/fine_tuner/fine_tuning_data/read_me.txt +8 -0
- my_model/fine_tuner/fine_tuning_data_handler.py +182 -0
- my_model/object_detection.py +259 -0
- my_model/utilities.py +278 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+my_model/fine_tuner/fine_tuning_data/fine_tuning_data_detic.csv filter=lfs diff=lfs merge=lfs -text
+my_model/fine_tuner/fine_tuning_data/fine_tuning_data_yolov5.csv filter=lfs diff=lfs merge=lfs -text
my_model/LLAMA2/LLAMA2_config.py
ADDED
@@ -0,0 +1,15 @@
# Configuration parameters for the LLaMA-2 model
import torch
import os

MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
TOKENIZER_NAME = "meta-llama/Llama-2-7b-chat-hf"
QUANTIZATION = '4bit'  # Options: '4bit', '8bit', or None
FROM_SAVED = False
MODEL_PATH = None
TRUST_REMOTE = False
USE_FAST = True
ADD_EOS_TOKEN = True
# ACCESS_TOKEN = "xx"  # My HF read-only token, to be added here if needed
ACCESS_TOKEN = os.getenv('HUGGINGFACE_TOKEN')  # read as a secret on the HF Space; referenced by LLAMA2_model.py
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
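As a rough guide to what the QUANTIZATION setting buys, a back-of-the-envelope estimate of weight memory only (illustrative figures, ignoring activations, the KV cache, and framework overhead):

    # Illustrative only: approximate weight memory for a 7B-parameter model per QUANTIZATION option.
    params = 7e9
    for mode, bytes_per_weight in {None: 2, '8bit': 1, '4bit': 0.5}.items():
        print(f"{mode}: ~{params * bytes_per_weight / 1e9:.1f} GB")  # None ~14 GB, 8bit ~7 GB, 4bit ~3.5 GB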
my_model/LLAMA2/LLAMA2_model.py
ADDED
@@ -0,0 +1,173 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from typing import Optional
import bitsandbytes  # only for use on GPU
import accelerate  # only for use on GPU
from my_model.LLAMA2 import LLAMA2_config as config  # Importing LLAMA2 configuration file
import warnings

# Suppress only FutureWarning from transformers
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")


class Llama2ModelManager:
    """
    Manages loading and configuring the LLaMA-2 model and tokenizer.

    Attributes:
        device (str): Device to use for the model ('cuda' or 'cpu').
        model_name (str): Name or path of the pre-trained model.
        tokenizer_name (str): Name or path of the tokenizer.
        quantization (str): Specifies the quantization level ('4bit', '8bit', or None).
        from_saved (bool): Flag to load the model from a saved path.
        model_path (str or None): Path to the saved model if `from_saved` is True.
        trust_remote (bool): Whether to trust remote code when loading the tokenizer.
        use_fast (bool): Whether to use the fast version of the tokenizer.
        add_eos_token (bool): Whether to add an EOS token to the tokenizer.
        access_token (str): Access token for the Hugging Face Hub.
        model (AutoModelForCausalLM or None): Loaded model, initially None.
        tokenizer (AutoTokenizer or None): Loaded tokenizer, initially None.
    """

    def __init__(self) -> None:
        """
        Initializes the Llama2ModelManager class with configuration settings.
        """
        self.device: str = config.DEVICE
        self.model_name: str = config.MODEL_NAME
        self.tokenizer_name: str = config.TOKENIZER_NAME
        self.quantization: str = config.QUANTIZATION
        self.from_saved: bool = config.FROM_SAVED
        self.model_path: Optional[str] = config.MODEL_PATH
        self.trust_remote: bool = config.TRUST_REMOTE
        self.use_fast: bool = config.USE_FAST
        self.add_eos_token: bool = config.ADD_EOS_TOKEN
        self.access_token: str = config.ACCESS_TOKEN
        self.model: Optional[AutoModelForCausalLM] = None
        self.tokenizer: Optional[AutoTokenizer] = None

    def create_bnb_config(self) -> BitsAndBytesConfig:
        """
        Creates a BitsAndBytes configuration based on the quantization setting.

        Returns:
            BitsAndBytesConfig: Configuration for a BitsAndBytes-quantized model.
        """
        if self.quantization == '4bit':
            return BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )
        elif self.quantization == '8bit':
            # 8-bit loading does not take the 4-bit-style options (double quant, nf4, compute dtype).
            return BitsAndBytesConfig(load_in_8bit=True)

    def load_model(self) -> AutoModelForCausalLM:
        """
        Loads the LLaMA-2 model based on the specified configuration.
        If the model is already loaded, returns the existing model.

        Returns:
            AutoModelForCausalLM: Loaded LLaMA-2 model.
        """
        if self.model is not None:
            print("Model is already loaded.")
            return self.model

        if self.from_saved:
            self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto")
        else:
            bnb_config = None if self.quantization is None else self.create_bnb_config()
            self.model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto",
                                                              quantization_config=bnb_config,
                                                              torch_dtype=torch.float16,
                                                              token=self.access_token)

        if self.model is not None:
            print(f"LLAMA2 Model loaded successfully in {self.quantization} quantization.")
        else:
            print("LLAMA2 Model failed to load.")
        return self.model

    def load_tokenizer(self) -> AutoTokenizer:
        """
        Loads the tokenizer for the LLaMA-2 model with the specified configuration.

        Returns:
            AutoTokenizer: Loaded tokenizer for the LLaMA-2 model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, use_fast=self.use_fast,
                                                       token=self.access_token,
                                                       trust_remote_code=self.trust_remote,
                                                       add_eos_token=self.add_eos_token)

        if self.tokenizer is not None:
            print("LLAMA2 Tokenizer loaded successfully.")
        else:
            print("LLAMA2 Tokenizer failed to load.")

        return self.tokenizer

    def load_model_and_tokenizer(self, for_fine_tuning):
        """
        Loads the LLAMA2 model and tokenizer in one method and adds special tokens if the purpose is fine-tuning.

        :param for_fine_tuning: True for fine-tuning, False otherwise.
        :return: LLAMA2 model and tokenizer.
        """
        if for_fine_tuning:
            self.tokenizer = self.load_tokenizer()
            self.model = self.load_model()
            self.add_special_tokens()
        else:
            self.tokenizer = self.load_tokenizer()
            self.model = self.load_model()

        return self.model, self.tokenizer

    def add_special_tokens(self, tokens: Optional[list[str]] = None) -> None:
        """
        Adds special tokens to the tokenizer and updates the model's token embeddings if the model is loaded.
        Does nothing unless the tokenizer is loaded.

        Args:
            tokens (list of str, optional): Special tokens to add. Defaults to a predefined set.

        Returns:
            None
        """
        if self.tokenizer is None:
            print("Tokenizer is not loaded. Cannot add special tokens.")
            return

        if tokens is None:
            tokens = ['[CAP]', '[/CAP]', '[QES]', '[/QES]', '[OBJ]', '[/OBJ]']

        # Update the tokenizer with new tokens
        print(f"Original vocabulary size: {len(self.tokenizer)}")
        print(f"Adding the following tokens: {tokens}")
        self.tokenizer.add_tokens(tokens, special_tokens=True)
        self.tokenizer.add_special_tokens({'pad_token': '<pad>'})
        print(f"Adding Padding Token {self.tokenizer.pad_token}")
        self.tokenizer.padding_side = "right"
        print(f'Padding side: {self.tokenizer.padding_side}')

        # Resize the model token embeddings if the model is loaded
        if self.model is not None:
            self.model.resize_token_embeddings(len(self.tokenizer))
            self.model.config.pad_token_id = self.tokenizer.pad_token_id

        print(f'Updated Vocabulary Size: {len(self.tokenizer)}')
        print(f'Padding Token: {self.tokenizer.pad_token}')
        print(f'Special Tokens: {self.tokenizer.added_tokens_decoder}')


if __name__ == "__main__":
    LLAMA2_manager = Llama2ModelManager()
    LLAMA2_model = LLAMA2_manager.load_model()  # First time loading the model
    LLAMA2_tokenizer = LLAMA2_manager.load_tokenizer()
    LLAMA2_manager.add_special_tokens()  # operates on the manager's own model and tokenizer
my_model/extract_objects.py
ADDED
@@ -0,0 +1,45 @@
from object_detection import ObjectDetector
import os


def detect_objects_for_image(image_name, image_directory, detector):
    """Runs object detection for a single image file and returns the detections string."""
    image_path = os.path.join(image_directory, image_name)
    if os.path.exists(image_path):
        image = detector.process_image(image_path)
        detected_objects_str, _ = detector.detect_objects(image)
        return detected_objects_str
    else:
        return "Image not found"


def add_detected_objects_to_dataframe(df, image_directory, detector):
    """
    Adds a column to the DataFrame with detected objects for each image specified in the 'image_name' column.

    Parameters:
        df (pd.DataFrame): DataFrame containing a column 'image_name' with image filenames.
        image_directory (str): Path to the directory containing images.
        detector (ObjectDetector): An instance of the ObjectDetector class.

    Returns:
        pd.DataFrame: The original DataFrame with an additional column 'detected_objects'.
    """

    # Ensure 'image_name' column exists in the DataFrame
    if 'image_name' not in df.columns:
        raise ValueError("DataFrame must contain an 'image_name' column.")

    # Apply the detection function to each row in the DataFrame
    df['detected_objects'] = df['image_name'].apply(
        lambda image_name: detect_objects_for_image(image_name, image_directory, detector)
    )

    return df


# Example usage (assuming the function will be used in a context where 'detector' is defined and configured):
# df_images = pd.DataFrame({"image_name": ["image1.jpg", "image2.jpg", ...]})
# image_directory = "path/to/image_directory"
# updated_df = add_detected_objects_to_dataframe(df_images, image_directory, detector)
# updated_df.head()
my_model/fine_tuner/fine_tuner.py
ADDED
@@ -0,0 +1,347 @@
# Main fine-tuning script for meta-llama/Llama-2-7b-chat-hf

# This script is the central executable for fine-tuning large language models, specifically the LLAMA2 model.
# It covers the entire fine-tuning process, from data preparation to the final model training.
# The script leverages the 'FinetuningDataHandler' class for data loading, inspection, preparation, and splitting,
# ensuring that the dataset is correctly processed and prepared for effective training.

# The fine-tuning itself is managed by the Finetuner class, which trains the model using specific
# training arguments and datasets. Advanced configurations for Quantized Low-Rank Adaptation (QLoRA) and
# Parameter-Efficient Fine-Tuning (PEFT) are used to optimize the training process on limited hardware resources.

# The script is designed to be executed as a standalone process, providing an end-to-end solution for fine-tuning
# LLMs. It is part of a larger project aimed at adapting the language model to the OK-VQA dataset.

# Ensure all dependencies are installed and the required files are in place before running this script.
# The configurations for the fine-tuning process are defined in the 'fine_tuning_config.py' file.

# ---------- Please run this file for the full fine-tuning process to start ----------#
# ---------- Please ensure this is run on a GPU ----------#


import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, TRANSFORMERS_CACHE
from trl import SFTTrainer
from datasets import Dataset, load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, PeftModel
import fine_tuning_config as config
from typing import List
import bitsandbytes  # only on GPU
import gc
import os
import shutil
from my_model.LLAMA2.LLAMA2_model import Llama2ModelManager
from fine_tuning_data_handler import FinetuningDataHandler


class QLoraConfig:
    """
    Configures QLoRA (Quantized Low-Rank Adaptation) parameters for efficient model fine-tuning.
    LoRA allows adapting large language models with a minimal number of trainable parameters.

    Attributes:
        lora_config (LoraConfig): Configuration object for LoRA parameters.
    """

    def __init__(self) -> None:
        """
        Initializes QLoraConfig with specific LoRA parameters.
        """
        # Please refer to the config file 'fine_tuning_config.py' for the QLoRA argument descriptions.
        self.lora_config = LoraConfig(
            lora_alpha=config.LORA_ALPHA,
            lora_dropout=config.LORA_DROPOUT,
            r=config.LORA_R,
            bias="none",  # bias is already accounted for in the LLAMA2 pre-trained model layers.
            task_type="CAUSAL_LM",
            target_modules=['up_proj', 'down_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']  # modules for fine-tuning.
        )


class Finetuner:
    """
    The Finetuner class manages the fine-tuning process of a pre-trained language model using specific
    training arguments and datasets. It is designed to adapt a pre-trained model to a specific dataset
    to enhance its performance on similar data.

    This class not only facilitates the fine-tuning of LLAMA2 but also includes advanced
    resource management capabilities. It provides methods for deleting model and trainer objects,
    clearing GPU memory, and cleaning up Hugging Face's Transformers cache. These functionalities
    make the Finetuner class especially useful in environments with limited computational resources
    or when managing multiple models or training sessions.

    Additionally, the class supports configurations for Quantized Low-Rank Adaptation (QLoRA)
    to fine-tune models with minimal trainable parameters, and Parameter-Efficient Fine-Tuning (PEFT)
    for training efficiency on limited hardware.

    Attributes:
        base_model (AutoModelForCausalLM): The pre-trained language model to be fine-tuned.
        tokenizer (AutoTokenizer): The tokenizer associated with the model.
        train_dataset (Dataset): The dataset used for training.
        eval_dataset (Dataset): The dataset used for evaluation.
        training_arguments (TrainingArguments): Configuration for training the model.

    Key Methods:
        - load_LLAMA2_for_finetuning: Loads the LLAMA2 model and tokenizer for fine-tuning.
        - train: Trains the model using the PEFT configuration.
        - delete_model: Deletes a specified model attribute.
        - delete_trainer: Deletes a specified trainer object.
        - clear_training_resources: Clears GPU memory.
        - clear_cache_and_collect_garbage: Clears the Transformers cache and performs garbage collection.
        - find_all_linear_names: Identifies linear layer names suitable for LoRA application.
        - print_trainable_parameters: Prints the number of trainable parameters in the model.
    """

    def __init__(self, train_dataset: Dataset, eval_dataset: Dataset) -> None:
        """
        Initializes the Finetuner class with the datasets; the model and tokenizer are loaded internally.

        Args:
            train_dataset (Dataset): The dataset for training the model.
            eval_dataset (Dataset): The dataset for evaluating the model.
        """

        self.base_model, self.tokenizer = self.load_LLAMA2_for_finetuning()
        self.merged_model = None
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        # Please refer to the config file 'fine_tuning_config.py' for the training argument descriptions.
        self.training_arguments = TrainingArguments(
            output_dir=config.OUTPUT_DIR,
            num_train_epochs=config.NUM_TRAIN_EPOCHS,
            per_device_train_batch_size=config.PER_DEVICE_TRAIN_BATCH_SIZE,
            per_device_eval_batch_size=config.PER_DEVICE_EVAL_BATCH_SIZE,
            gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
            fp16=config.FP16,
            bf16=config.BF16,
            evaluation_strategy=config.Evaluation_STRATEGY,
            eval_steps=config.EVALUATION_STEPS,
            max_grad_norm=config.MAX_GRAD_NORM,
            learning_rate=config.LEARNING_RATE,
            weight_decay=config.WEIGHT_DECAY,
            optim=config.OPTIM,
            lr_scheduler_type=config.LR_SCHEDULER_TYPE,
            max_steps=config.MAX_STEPS,
            warmup_ratio=config.WARMUP_RATIO,
            group_by_length=config.GROUP_BY_LENGTH,
            save_steps=config.SAVE_STEPS,
            logging_steps=config.LOGGING_STEPS,
            report_to="tensorboard"
        )

    def load_LLAMA2_for_finetuning(self):
        """
        Loads the LLAMA2 model and tokenizer, specifically configured for fine-tuning.
        This method ensures the model is ready to be adapted to a specific task or dataset.

        Returns:
            Tuple[AutoModelForCausalLM, AutoTokenizer]: The loaded model and tokenizer.
        """

        llm_manager = Llama2ModelManager()
        base_model, tokenizer = llm_manager.load_model_and_tokenizer(for_fine_tuning=True)

        return base_model, tokenizer

    def find_all_linear_names(self) -> List[str]:
        """
        Identifies all linear layer names in the model that are suitable for applying LoRA.

        Returns:
            List[str]: A list of linear layer names.
        """
        cls = bitsandbytes.nn.Linear4bit
        lora_module_names = set()
        for name, module in self.base_model.named_modules():
            if isinstance(module, cls):
                names = name.split('.')
                lora_module_names.add(names[0] if len(names) == 1 else names[-1])

        # We don't want to train these two modules, to avoid computational overhead.
        lora_module_names -= {'lm_head', 'gate_proj'}
        return list(lora_module_names)

    def print_trainable_parameters(self, use_4bit: bool = False) -> None:
        """
        Calculates and prints the number of trainable parameters in the model.

        Args:
            use_4bit (bool): If True, calculates the parameter count considering 4-bit quantization.
        """
        trainable_params = sum(p.numel() for p in self.base_model.parameters() if p.requires_grad)
        if use_4bit:
            trainable_params //= 2  # integer division keeps the ',d' format below valid

        total_params = sum(p.numel() for p in self.base_model.parameters())
        print(f"All Parameters: {total_params:,d} || Trainable Parameters: {trainable_params:,d} "
              f"|| Trainable Parameters %: {100 * trainable_params / total_params:.2f}%")

    def train(self, peft_config: LoraConfig) -> None:
        """
        Trains the model using the specified PEFT (Parameter-Efficient Fine-Tuning) configuration.

        Args:
            peft_config (LoraConfig): Configuration for the PEFT training process.
        """
        self.base_model.config.use_cache = False
        # Set pretraining_tp to 1 so the standard (non tensor-parallel) linear computation is used during fine-tuning.
        self.base_model.config.pretraining_tp = 1
        # Prepare the quantized model for k-bit (here 4-bit) training with bitsandbytes.
        self.base_model = prepare_model_for_kbit_training(self.base_model)
        self.trainer = SFTTrainer(
            model=self.base_model,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            peft_config=peft_config,
            dataset_text_field='text',
            max_seq_length=config.MAX_TOKEN_COUNT,
            tokenizer=self.tokenizer,
            args=self.training_arguments,
            packing=config.PACKING
        )
        self.trainer.train()

    def save_model(self):
        """
        Saves the fine-tuned adapter to the specified directory.

        This method saves the weights and configuration of the fine-tuned adapter.
        The save directory and filename are determined by the configuration provided in
        the 'fine_tuning_config.py' file. It is useful for persisting the fine-tuned model
        for later use or evaluation.

        The saved model can be easily loaded using Hugging Face's model loading utilities.
        """

        self.fine_tuned_adapter_name = config.ADAPTER_SAVE_NAME
        self.trainer.model.save_pretrained(self.fine_tuned_adapter_name)

    def merge_weights(self):
        """
        Merges the weights of the fine-tuned adapter with the base model.

        This method integrates the fine-tuned adapter weights into the base model,
        resulting in a single consolidated model. The merged model can then be used
        for inference or further training.

        After merging, the adapter weights are no longer separate from the
        base model, enabling more efficient storage and deployment. The merged model
        is stored in the 'self.merged_model' attribute of the Finetuner class.
        """

        self.merged_model = PeftModel.from_pretrained(self.base_model, self.fine_tuned_adapter_name)
        self.merged_model = self.merged_model.merge_and_unload()

    def delete_model(self, model_name: str):
        """
        Deletes a specified model attribute.

        Args:
            model_name (str): The name of the model attribute to delete.
        """
        try:
            if hasattr(self, model_name) and getattr(self, model_name) is not None:
                delattr(self, model_name)
                print(f"Model '{model_name}' has been deleted.")
            else:
                print(f"Warning: Model '{model_name}' has already been cleared or does not exist.")
        except Exception as e:
            print(f"Error occurred while deleting model '{model_name}': {str(e)}")

    def delete_trainer(self, trainer_name: str):
        """
        Deletes a specified trainer object.

        Args:
            trainer_name (str): The name of the trainer object to delete.
        """
        try:
            if hasattr(self, trainer_name) and getattr(self, trainer_name) is not None:
                delattr(self, trainer_name)
                print(f"Trainer object '{trainer_name}' has been deleted.")
            else:
                print(f"Warning: Trainer object '{trainer_name}' has already been cleared or does not exist.")
        except Exception as e:
            print(f"Error occurred while deleting trainer object '{trainer_name}': {str(e)}")

    def clear_training_resources(self):
        """
        Clears GPU memory.
        """
        try:
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print("GPU memory has been cleared.")
        except Exception as e:
            print(f"Error occurred while clearing GPU memory: {str(e)}")

    def clear_cache_and_collect_garbage(self):
        """
        Clears Hugging Face's Transformers cache and runs garbage collection.
        """
        try:
            if os.path.exists(TRANSFORMERS_CACHE):
                shutil.rmtree(TRANSFORMERS_CACHE, ignore_errors=True)
                print("Transformers cache has been cleared.")

            gc.collect()
            print("Garbage collection has been executed.")
        except Exception as e:
            print(f"Error occurred while clearing cache and collecting garbage: {str(e)}")


def fine_tune(save_fine_tuned_adapter=False, merge=False, delete_trainer_after_fine_tune=False):
    """
    Conducts the fine-tuning process of a pre-trained language model using the specified configurations.
    This function encompasses the complete workflow of fine-tuning, including data handling, training,
    and optional steps like saving the fine-tuned adapter and merging weights.

    Args:
        save_fine_tuned_adapter (bool): If True, saves the fine-tuned adapter after training.
        merge (bool): If True, merges the weights of the fine-tuned adapter into the base model.
        delete_trainer_after_fine_tune (bool): If True, deletes the trainer object after fine-tuning to free up resources.

    Returns:
        The fine-tuned model after the fine-tuning process. This is either the merged model
        or the trained model, depending on the provided arguments.

    The function starts by preparing the training and evaluation datasets using the `FinetuningDataHandler`.
    It then sets up the QLoRA configuration for the fine-tuning process. The actual training is carried out by
    the `Finetuner` class. After training, based on the arguments, the function can save the fine-tuned adapter,
    merge the adapter weights with the base model, and clean up resources by deleting the trainer object.
    """

    data_handler = FinetuningDataHandler()
    fine_tuning_data_train, fine_tuning_data_eval = data_handler.inspect_prepare_split_data()
    qlora = QLoraConfig()
    peft_config = qlora.lora_config
    tuner = Finetuner(fine_tuning_data_train, fine_tuning_data_eval)
    tuner.train(peft_config=peft_config)
    if save_fine_tuned_adapter:
        tuner.save_model()

    if merge:
        tuner.merge_weights()

    if delete_trainer_after_fine_tune:
        tuner.delete_trainer("trainer")

    tuner.delete_model("base_model")  # We always delete this as it is not required after the merge.

    if tuner.merged_model is not None:
        return tuner.merged_model
    else:
        return tuner.trainer.model


if __name__ == "__main__":
    fine_tune()
my_model/fine_tuner/fine_tuning_config.py
ADDED
@@ -0,0 +1,114 @@
# Configurable parameters for fine-tuning

import os


# *** Dataset ***
# Base directory where the script is running
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Path to the folder containing the data files, relative to the configuration file
DATA_FOLDER = 'fine_tuning_data'
# Full path to the data folder
DATA_FOLDER_PATH = os.path.join(BASE_DIR, DATA_FOLDER)
# Path to the dataset file (CSV format)
DATASET_FILE = os.path.join(DATA_FOLDER_PATH, 'fine_tuning_data_yolov5.csv')  # or 'fine_tuning_data_detic.csv'


# *** Fine-tuned Adapter ***
TRAINED_ADAPTER_NAME = 'fine_tuned_adapter'  # name of the fine-tuned adapter.
FINE_TUNED_ADAPTER_FOLDER = 'fine_tuned_model'
FINE_TUNED_ADAPTER_PATH = os.path.join(BASE_DIR, FINE_TUNED_ADAPTER_FOLDER)
ADAPTER_SAVE_NAME = os.path.join(FINE_TUNED_ADAPTER_PATH, TRAINED_ADAPTER_NAME)


# Proportion of the dataset to include in the test split (e.g., 0.1 for 10%)
TEST_SIZE = 0.1

# Seed for random operations to ensure reproducibility
SEED = 123

# *** QLoRA Configuration Parameters ***
# LoRA attention dimension: the rank r of the low-rank update matrices
LORA_R = 64

# Alpha parameter for LoRA scaling: controls the scaling of LoRA weights
LORA_ALPHA = 32

# Dropout probability for LoRA layers: probability of dropping a unit in LoRA layers
LORA_DROPOUT = 0.05


# *** TrainingArguments Configuration Parameters for the Transformers library ***
# Output directory to save model predictions and checkpoints
OUTPUT_DIR = "./TUNED_MODEL_LLAMA"

# Number of epochs to train the model
NUM_TRAIN_EPOCHS = 1

# Enable mixed-precision training using fp16 (set to True for faster training)
FP16 = True

# Enable mixed-precision training using bf16 (set to True if using an A100 GPU)
BF16 = False

# Batch size per GPU/device for training
PER_DEVICE_TRAIN_BATCH_SIZE = 16

# Batch size per GPU/device for evaluation
PER_DEVICE_EVAL_BATCH_SIZE = 8

# Number of update steps to accumulate gradients before performing a backward/update pass
GRADIENT_ACCUMULATION_STEPS = 1

# Enable gradient checkpointing to reduce memory usage at the cost of a slight slowdown
GRADIENT_CHECKPOINTING = True

# Maximum gradient norm for gradient clipping to prevent exploding gradients
MAX_GRAD_NORM = 0.3

# Initial learning rate for the AdamW optimizer
LEARNING_RATE = 2e-4

# Weight decay coefficient for regularization (applied to all layers except bias/LayerNorm weights)
WEIGHT_DECAY = 0.01

# Optimizer type, here using 'paged_adamw_8bit' for efficient training
OPTIM = "paged_adamw_8bit"

# Learning rate scheduler type (e.g., 'linear', 'cosine', etc.)
LR_SCHEDULER_TYPE = "linear"

# Maximum number of training steps; overrides 'num_train_epochs' if set to a positive number.
# Setting MAX_STEPS = -1 in the training arguments for SFTTrainer means that the number of steps will be determined by
# the number of epochs, the size of the dataset, the batch size, and the number of GPUs. This is the default behavior
# when MAX_STEPS is not specified or is set to a negative value.
MAX_STEPS = -1

# Ratio of the total number of training steps used for linear warmup
WARMUP_RATIO = 0.03

# Whether to group sequences of similar length into batches to save memory and increase speed
GROUP_BY_LENGTH = False

# Save a model checkpoint every X update steps
SAVE_STEPS = 50

# Log training information every X update steps
LOGGING_STEPS = 25

# Whether SFTTrainer should pack multiple short samples into a single sequence
PACKING = False

# Evaluation strategy during training ("steps", "epoch", "no")
Evaluation_STRATEGY = "steps"

# Number of update steps between two evaluations if `evaluation_strategy="steps"`.
# Will default to the same value as `logging_steps` if not set.
EVALUATION_STEPS = 5

# Maximum number of tokens per sample in the dataset
MAX_TOKEN_COUNT = 1024


if __name__ == "__main__":
    pass
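Since MAX_STEPS is -1, the total number of optimizer steps follows from the other settings above. A minimal sketch of that arithmetic, assuming a single GPU and a hypothetical dataset size of 9,000 rows (not the actual size of the fine-tuning CSVs):

    import math
    num_samples = 9000   # hypothetical example value
    num_gpus = 1         # assumed single-GPU run
    # PER_DEVICE_TRAIN_BATCH_SIZE (16) * GRADIENT_ACCUMULATION_STEPS (1) * num_gpus
    steps_per_epoch = math.ceil(num_samples / (16 * 1 * num_gpus))
    total_steps = 1 * steps_per_epoch  # NUM_TRAIN_EPOCHS * steps_per_epoch = 563 optimizer steps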
my_model/fine_tuner/fine_tuning_data/fine_tuning_data_detic.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:77089f24dd5414b0d1dcb5b8f3b34aac3daea86e68c1c70e2da6490482ac9d4b
size 54670629
my_model/fine_tuner/fine_tuning_data/fine_tuning_data_yolov5.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a44d22827c212a9d7a30bb3fd94cb7d7ad82a968a55eaa09e0ff5a61f85fde05
size 14547559
my_model/fine_tuner/fine_tuning_data/read_me.txt
ADDED
@@ -0,0 +1,8 @@
The data files 'fine_tuning_data_detic.csv' and 'fine_tuning_data_yolov5.csv' are the result of the preparation and
filtering performed in the steps below:

- Generate the captions for all the images.
- Delete all samples with corrupted or rubbish data. (Please refer to the report for details.)
- Run the object detection models ('yolov5' and 'detic') and generate the corresponding objects for the images of the remaining samples.
- Convert the question, answer, caption, and objects, together with the system prompt, into the desired template for all
the samples. (Please refer to the report for the detailed template design.)
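The last step above is where each sample becomes the single 'text' field that the data handler and SFTTrainer consume. The exact template is documented in the report; the sketch below is only an illustration that reuses the special tokens added in LLAMA2_model.py ([CAP], [QES], [OBJ]) — the field order and the idea of a separate system prompt argument are assumptions, not the project's actual template.

    def build_training_text(system_prompt, caption, question, objects, answer):
        # Hypothetical layout for one fine-tuning sample; see the report for the real template design.
        return (f"{system_prompt}\n"
                f"[CAP]{caption}[/CAP]\n"
                f"[QES]{question}[/QES]\n"
                f"[OBJ]{objects}[/OBJ]\n"
                f"{answer}")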
my_model/fine_tuner/fine_tuning_data_handler.py
ADDED
@@ -0,0 +1,182 @@
from my_model.utilities import is_pycharm
import seaborn as sns
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset
import fine_tuning_config as config
from my_model.LLAMA2.LLAMA2_model import Llama2ModelManager
from typing import Tuple


class FinetuningDataHandler:
    """
    A class dedicated to handling data for fine-tuning language models. It manages loading,
    inspecting, preparing, and splitting the dataset, and is specifically designed to filter out
    data samples that exceed a specified token count limit. This is crucial for models with
    token count constraints, helps keep GPU RAM usage under control, and ensures efficient and
    effective model fine-tuning.

    Attributes:
        tokenizer (AutoTokenizer): Tokenizer used for tokenizing the dataset.
        dataset_file (str): File path to the dataset.
        max_token_count (int): Maximum allowable token count per data sample.

    Methods:
        load_llm_tokenizer(): Loads the LLM tokenizer and adds special tokens, if not already loaded.
        load_dataset(): Loads the dataset from a specified file path.
        plot_tokens_count_distribution(token_counts, title): Plots the distribution of token counts in the dataset.
        filter_dataset_by_indices(dataset, valid_indices): Filters the dataset based on valid indices, removing samples exceeding token limits.
        get_token_counts(dataset): Calculates token counts for each sample in the dataset.
        prepare_dataset(): Tokenizes and filters the dataset, preparing it for training. Also visualizes the token count distribution before and after filtering.
        split_dataset_for_train_eval(dataset): Divides the dataset into training and evaluation sets.
        inspect_prepare_split_data(): Coordinates the data preparation and splitting process for fine-tuning.
    """

    def __init__(self, tokenizer: AutoTokenizer = None, dataset_file: str = config.DATASET_FILE) -> None:
        """
        Initializes the FinetuningDataHandler class.

        Args:
            tokenizer (AutoTokenizer): Tokenizer to use for tokenizing the dataset.
            dataset_file (str): Path to the dataset file.
        """
        self.tokenizer = tokenizer  # The tokenizer used for processing the dataset.
        self.dataset_file = dataset_file  # Path to the fine-tuning dataset file.
        self.max_token_count = config.MAX_TOKEN_COUNT  # Max token count for filtering.

    def load_llm_tokenizer(self):
        """
        Loads the LLM tokenizer and adds special tokens, if not already loaded.
        If the tokenizer is already loaded, this method does nothing.
        """

        if self.tokenizer is None:
            llm_manager = Llama2ModelManager()  # Initialize the Llama2 model manager.
            # We only need the tokenizer for the data inspection, not the model itself.
            self.tokenizer = llm_manager.load_tokenizer()
            llm_manager.add_special_tokens()  # Add special tokens specific to the LLAMA2 vocab for efficient tokenization.

    def load_dataset(self) -> Dataset:
        """
        Loads the dataset from the specified file path. The dataset is expected to be in CSV format.

        Returns:
            Dataset: The loaded dataset, ready for processing.
        """
        return load_dataset('csv', data_files=self.dataset_file)

    def plot_tokens_count_distribution(self, token_counts: list, title: str = "Token Count Distribution") -> None:
        """
        Plots the distribution of token counts in the dataset for visualization purposes.

        Args:
            token_counts (list): List of token counts, each count representing the number of tokens in a dataset sample.
            title (str): Title for the plot, highlighting the nature of the distribution.
        """

        if is_pycharm():  # Ensuring compatibility with PyCharm's environment for interactive plots.
            import matplotlib
            matplotlib.use('TkAgg')  # Set the backend to 'TkAgg'
        import matplotlib.pyplot as plt
        sns.set_style("whitegrid")
        plt.figure(figsize=(15, 6))
        plt.hist(token_counts, bins=50, color='#3498db', edgecolor='black')
        plt.title(title, fontsize=16)
        plt.xlabel("Number of Tokens", fontsize=14)
        plt.ylabel("Number of Samples", fontsize=14)
        plt.xticks(fontsize=12)
        plt.yticks(fontsize=12)
        plt.tight_layout()
        plt.show()

    def filter_dataset_by_indices(self, dataset: Dataset, valid_indices: list) -> Dataset:
        """
        Filters the dataset based on a list of valid indices. This method is used to exclude
        data samples that have a token count exceeding the specified maximum token count.

        Args:
            dataset (Dataset): The dataset to be filtered.
            valid_indices (list): Indices of samples with token counts within the limit.

        Returns:
            Dataset: Filtered dataset containing only samples with valid indices.
        """
        return dataset['train'].select(valid_indices)  # Select only samples with valid indices based on token count.

    def get_token_counts(self, dataset):
        """
        Calculates and returns the token counts for each sample in the dataset.
        This function assumes the dataset has a 'train' split and a 'text' field.

        Args:
            dataset (Dataset): The dataset for which to count tokens.

        Returns:
            List[int]: List of token counts per sample in the dataset.
        """

        if 'train' in dataset:
            return [len(self.tokenizer.tokenize(s)) for s in dataset["train"]["text"]]
        else:
            # After filtering out samples with an unacceptable token count, the dataset is already flattened
            # (effectively dataset['train']), so it can be indexed by 'text' directly.
            return [len(self.tokenizer.tokenize(s)) for s in dataset["text"]]

    def prepare_dataset(self) -> Tuple[Dataset, Dataset]:
        """
        Prepares the dataset for fine-tuning by tokenizing the data and filtering out samples
        that exceed the maximum used context window (configurable through max_token_count).
        It also visualizes the token count distribution before and after filtering.

        Returns:
            Tuple[Dataset, Dataset]: The train and evaluation datasets, post-filtering.
        """
        dataset = self.load_dataset()
        self.load_llm_tokenizer()

        # Count tokens in each dataset sample before filtering.
        token_counts_before_filtering = self.get_token_counts(dataset)
        # Plot the token count distribution before filtering for visualization.
        self.plot_tokens_count_distribution(token_counts_before_filtering, "Token Count Distribution Before Filtration")
        # Identify valid indices based on the max token count.
        valid_indices = [i for i, count in enumerate(token_counts_before_filtering) if count <= self.max_token_count]
        # Filter the dataset to exclude samples with excessive token counts.
        filtered_dataset = self.filter_dataset_by_indices(dataset, valid_indices)

        token_counts_after_filtering = self.get_token_counts(filtered_dataset)
        self.plot_tokens_count_distribution(token_counts_after_filtering, "Token Count Distribution After Filtration")

        return self.split_dataset_for_train_eval(filtered_dataset)  # Split the dataset into training and evaluation.

    def split_dataset_for_train_eval(self, dataset) -> Tuple[Dataset, Dataset]:
        """
        Splits the dataset into training and evaluation datasets.

        Args:
            dataset (Dataset): The dataset to split.

        Returns:
            Tuple[Dataset, Dataset]: The split training and evaluation datasets.
        """
        split_data = dataset.train_test_split(test_size=config.TEST_SIZE, shuffle=True, seed=config.SEED)
        train_data, eval_data = split_data['train'], split_data['test']
        return train_data, eval_data

    def inspect_prepare_split_data(self) -> Tuple[Dataset, Dataset]:
        """
        Orchestrates the process of inspecting, preparing, and splitting the dataset for fine-tuning.

        Returns:
            Tuple[Dataset, Dataset]: The prepared training and evaluation datasets.
        """
        return self.prepare_dataset()


# Example usage
if __name__ == "__main__":

    # Please uncomment the lines below to test the data preparation.
    # data_handler = FinetuningDataHandler()
    # fine_tuning_data_train, fine_tuning_data_eval = data_handler.inspect_prepare_split_data()
    # print(fine_tuning_data_train, fine_tuning_data_eval)
    pass
my_model/object_detection.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from transformers import AutoImageProcessor, AutoModelForObjectDetection
|
| 3 |
+
import torch
|
| 4 |
+
from PIL import Image, ImageDraw, ImageFont
|
| 5 |
+
import numpy as np
|
| 6 |
+
import cv2
|
| 7 |
+
import os
|
| 8 |
+
from utilities import get_path, show_image
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class ObjectDetector:
|
| 12 |
+
"""
|
| 13 |
+
A class for detecting objects in images using models like Detic and YOLOv5.
|
| 14 |
+
|
| 15 |
+
This class supports loading and using different object detection models to identify objects
|
| 16 |
+
in images and draw bounding boxes around them.
|
| 17 |
+
|
| 18 |
+
Attributes:
|
| 19 |
+
model (torch.nn.Module): The loaded object detection model.
|
| 20 |
+
processor (transformers.AutoImageProcessor): Processor for the Detic model.
|
| 21 |
+
model_name (str): Name of the model used for detection.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
def __init__(self):
|
| 25 |
+
"""
|
| 26 |
+
Initializes the ObjectDetector class with default values.
|
| 27 |
+
"""
|
| 28 |
+
|
| 29 |
+
self.model = None
|
| 30 |
+
self.processor = None
|
| 31 |
+
self.model_name = None
|
| 32 |
+
|
| 33 |
+
def load_model(self, model_name='detic', pretrained=True, model_version='yolov5s'):
|
| 34 |
+
"""
|
| 35 |
+
Load the specified object detection model.
|
| 36 |
+
|
| 37 |
+
Args:
|
| 38 |
+
model_name (str): Name of the model to load. Options are 'detic' and 'yolov5'.
|
| 39 |
+
pretrained (bool): Boolean indicating if a pretrained model should be used.
|
| 40 |
+
model_version (str): Version of the YOLOv5 model, applicable only when using YOLOv5.
|
| 41 |
+
|
| 42 |
+
Raises:
|
| 43 |
+
ValueError: If an unsupported model name is provided.
|
| 44 |
+
"""
|
| 45 |
+
|
| 46 |
+
self.model_name = model_name
|
| 47 |
+
if model_name == 'detic':
|
| 48 |
+
self._load_detic_model(pretrained)
|
| 49 |
+
elif model_name == 'yolov5':
|
| 50 |
+
self._load_yolov5_model(pretrained, model_version)
|
| 51 |
+
else:
|
| 52 |
+
raise ValueError(f"Unsupported model name: {model_name}")
|
| 53 |
+
|
| 54 |
+
def _load_detic_model(self, pretrained):
|
| 55 |
+
"""
|
| 56 |
+
Load the Detic model.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
pretrained (bool): If True, load a pretrained model.
|
| 60 |
+
"""
|
| 61 |
+
|
| 62 |
+
try:
|
| 63 |
+
model_path = get_path('deformable-detr-detic', 'models')
|
| 64 |
+
self.processor = AutoImageProcessor.from_pretrained(model_path)
|
| 65 |
+
self.model = AutoModelForObjectDetection.from_pretrained(model_path)
|
| 66 |
+
except Exception as e:
|
| 67 |
+
print(f"Error loading Detic model: {e}")
|
| 68 |
+
raise
|
| 69 |
+
|
| 70 |
+
def _load_yolov5_model(self, pretrained, model_version):
|
| 71 |
+
"""
|
| 72 |
+
Load the YOLOv5 model.
|
| 73 |
+
|
| 74 |
+
Args:
|
| 75 |
+
pretrained (bool): If True, load a pretrained model.
|
| 76 |
+
model_version (str): Version of the YOLOv5 model.
|
| 77 |
+
"""
|
| 78 |
+
|
| 79 |
+
try:
|
| 80 |
+
model_path = get_path('yolov5', 'models')
|
| 81 |
+
if model_path and os.path.exists(model_path):
|
| 82 |
+
self.model = torch.hub.load(model_path, model_version, pretrained=pretrained, source='local')
|
| 83 |
+
else:
|
| 84 |
+
self.model = torch.hub.load('ultralytics/yolov5', model_version, pretrained=pretrained)
|
| 85 |
+
except Exception as e:
|
| 86 |
+
print(f"Error loading YOLOv5 model: {e}")
|
| 87 |
+
raise
|
| 88 |
+
|
| 89 |
+
def process_image(self, image_path):
|
| 90 |
+
"""
|
| 91 |
+
Process the image from the given path.
|
| 92 |
+
|
| 93 |
+
Args:
|
| 94 |
+
image_path (str): Path to the image file.
|
| 95 |
+
|
| 96 |
+
Returns:
|
| 97 |
+
Image.Image: Processed image in RGB format.
|
| 98 |
+
|
| 99 |
+
Raises:
|
| 100 |
+
Exception: If an error occurs during image processing.
|
| 101 |
+
"""
|
| 102 |
+
|
| 103 |
+
try:
|
| 104 |
+
with Image.open(image_path) as image:
|
| 105 |
+
return image.convert("RGB")
|
| 106 |
+
except Exception as e:
|
| 107 |
+
print(f"Error processing image: {e}")
|
| 108 |
+
raise
|
| 109 |
+
|
| 110 |
+
def detect_objects(self, image, threshold=0.4):
|
| 111 |
+
"""
|
| 112 |
+
Detect objects in the given image using the loaded model.
|
| 113 |
+
|
| 114 |
+
Args:
|
| 115 |
+
image (Image.Image): Image in which to detect objects.
|
| 116 |
+
threshold (float): Model detection confidence.
|

        Returns:
            tuple: A tuple containing a string representation and a list of detected objects.

        Raises:
            ValueError: If the model is not loaded or the model name is unsupported.
        """

        if self.model_name == 'detic':
            return self._detect_with_detic(image, threshold)
        elif self.model_name == 'yolov5':
            return self._detect_with_yolov5(image, threshold)
        else:
            raise ValueError("Model not loaded or unsupported model name")

    def _detect_with_detic(self, image, threshold):
        """
        Detect objects using the Detic model.

        Args:
            image (Image.Image): The image in which to detect objects.
            threshold (float): The confidence threshold for detections.

        Returns:
            tuple: A tuple containing a string representation and a list of detected objects.
            Each object in the list is represented as a tuple (label_name, box_rounded, certainty).
        """

        inputs = self.processor(images=image, return_tensors="pt")
        outputs = self.model(**inputs)
        target_sizes = torch.tensor([image.size[::-1]])
        results = self.processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=threshold)[0]

        detected_objects_str = ""
        detected_objects_list = []
        for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
            if score >= threshold:
                label_name = self.model.config.id2label[label.item()]
                box_rounded = [round(coord, 2) for coord in box.tolist()]
                certainty = round(score.item() * 100, 2)
                detected_objects_str += f"{{object: {label_name}, bounding box: {box_rounded}, certainty: {certainty}%}}\n"
                detected_objects_list.append((label_name, box_rounded, certainty))
        return detected_objects_str, detected_objects_list

    def _detect_with_yolov5(self, image, threshold):
        """
        Detect objects using the YOLOv5 model.

        Args:
            image (Image.Image): The image in which to detect objects.
            threshold (float): The confidence threshold for detections.

        Returns:
            tuple: A tuple containing a string representation and a list of detected objects.
            Each object in the list is represented as a tuple (label_name, box_rounded, certainty).
        """

        cv2_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        results = self.model(cv2_img)

        detected_objects_str = ""
        detected_objects_list = []
        for *bbox, conf, cls in results.xyxy[0]:
            if conf >= threshold:
                label_name = results.names[int(cls)]
                box_rounded = [round(coord.item(), 2) for coord in bbox]
                certainty = round(conf.item() * 100, 2)
                detected_objects_str += f"{{object: {label_name}, bounding box: {box_rounded}, certainty: {certainty}%}}\n"
                detected_objects_list.append((label_name, box_rounded, certainty))
        return detected_objects_str, detected_objects_list

    def draw_boxes(self, image, detected_objects, show_confidence=True):
        """
        Draw bounding boxes around detected objects in the image.

        Args:
            image (Image.Image): Image on which to draw.
            detected_objects (list): List of detected objects.
            show_confidence (bool): Whether to show confidence scores.

        Returns:
            Image.Image: Image with drawn boxes.
        """

        draw = ImageDraw.Draw(image)
        try:
            font = ImageFont.truetype("arial.ttf", 15)
        except IOError:
            font = ImageFont.load_default()

        colors = ["red", "green", "blue", "yellow", "purple", "orange"]
        label_color_map = {}

        for label_name, box, score in detected_objects:
            if label_name not in label_color_map:
                label_color_map[label_name] = colors[len(label_color_map) % len(colors)]

            color = label_color_map[label_name]
            draw.rectangle(box, outline=color, width=3)

            label_text = f"{label_name}"
            if show_confidence:
                label_text += f" ({round(score, 2)}%)"
            draw.text((box[0], box[1]), label_text, fill=color, font=font)

        return image


def detect_and_draw_objects(image_path, model_type='yolov5', threshold=0.2, show_confidence=True):
    """
    Detects objects in an image, draws bounding boxes around them, and returns the processed image and a string description.

    Args:
        image_path (str): Path to the image file.
        model_type (str): Type of model to use for detection ('yolov5' or 'detic').
        threshold (float): Detection threshold.
        show_confidence (bool): Whether to show confidence scores on the output image.

    Returns:
        tuple: A tuple containing the processed Image.Image and a string of detected objects.
    """

    detector = ObjectDetector()
    detector.load_model(model_type)
    image = detector.process_image(image_path)
    detected_objects_string, detected_objects_list = detector.detect_objects(image, threshold=threshold)
    image_with_boxes = detector.draw_boxes(image, detected_objects_list, show_confidence=show_confidence)
    return image_with_boxes, detected_objects_string


# Example usage
if __name__ == "__main__":
    pass

    # 'Sample_Images' is the folder containing sample images for demo.
    image_path = get_path('horse.jpg', 'Sample_Images')
    processed_image, objects_string = detect_and_draw_objects(image_path,
                                                              model_type='detic',
                                                              threshold=0.2,
                                                              show_confidence=False)
    show_image(processed_image)
    print("Detected Objects:", objects_string)
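For orientation, a minimal sketch of how the ObjectDetector class above might be driven directly; the image path and threshold below are placeholders for illustration, not values taken from the project:

# Hypothetical usage sketch -- assumes a local sample image and a successful YOLOv5 hub download.
detector = ObjectDetector()
detector.load_model('yolov5', model_version='yolov5s')
img = detector.process_image('some_image.jpg')  # placeholder path
objects_str, objects_list = detector.detect_objects(img, threshold=0.3)
annotated = detector.draw_boxes(img, objects_list, show_confidence=True)
print(objects_str)  # one "{object: ..., bounding box: ..., certainty: ...%}" line per detection

The same flow with 'detic' goes through the local deformable-detr-detic checkpoint resolved by get_path(), as in the __main__ example above.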
my_model/utilities.py
ADDED
@@ -0,0 +1,278 @@
import pandas as pd
from collections import Counter
import json
import os
from PIL import Image
import numpy as np
import torch
import matplotlib.pyplot as plt
from IPython import get_ipython
import sys


class VQADataProcessor:
    """
    A class to process the OK-VQA dataset.

    Attributes:
        questions_file_path (str): The file path for the questions JSON file.
        annotations_file_path (str): The file path for the annotations JSON file.
        questions (list): List of questions extracted from the JSON file.
        annotations (list): List of annotations extracted from the JSON file.
        df_questions (DataFrame): DataFrame created from the questions list.
        df_answers (DataFrame): DataFrame created from the annotations list.
        merged_df (DataFrame): DataFrame resulting from merging questions and answers.
    """

    def __init__(self, questions_file_path, annotations_file_path):
        """
        Initializes the VQADataProcessor with file paths for questions and annotations.

        Parameters:
            questions_file_path (str): The file path for the questions JSON file.
            annotations_file_path (str): The file path for the annotations JSON file.
        """
        self.questions_file_path = questions_file_path
        self.annotations_file_path = annotations_file_path
        self.questions, self.annotations = self.read_json_files()
        self.df_questions = pd.DataFrame(self.questions)
        self.df_answers = pd.DataFrame(self.annotations)
        self.merged_df = None

    def read_json_files(self):
        """
        Reads the JSON files for questions and annotations.

        Returns:
            tuple: A tuple containing two lists: questions and annotations.
        """
        with open(self.questions_file_path, 'r') as file:
            data = json.load(file)
            questions = data['questions']

        with open(self.annotations_file_path, 'r') as file:
            data = json.load(file)
            annotations = data['annotations']

        return questions, annotations

    @staticmethod
    def find_most_frequent(my_list):
        """
        Finds the most frequent item in a list.

        Parameters:
            my_list (list): A list of items.

        Returns:
            The most frequent item in the list. Returns None if the list is empty.
        """
        if not my_list:
            return None
        counter = Counter(my_list)
        most_common = counter.most_common(1)
        return most_common[0][0]

    def merge_dataframes(self):
        """
        Merges the questions and answers DataFrames on 'question_id' and 'image_id'.
        """
        self.merged_df = pd.merge(self.df_questions, self.df_answers, on=['question_id', 'image_id'])

    def join_words_with_hyphen(self, sentence):
        """Joins the words of a multi-word answer with hyphens so it forms a single token."""

        return '-'.join(sentence.split())

    def process_answers(self):
        """
        Processes the answers by extracting raw and processed answers and finding the most frequent ones.
        """
        if self.merged_df is not None:
            self.merged_df['raw_answers'] = self.merged_df['answers'].apply(lambda x: [ans['raw_answer'] for ans in x])
            self.merged_df['processed_answers'] = self.merged_df['answers'].apply(
                lambda x: [ans['answer'] for ans in x])
            self.merged_df['most_frequent_raw_answer'] = self.merged_df['raw_answers'].apply(self.find_most_frequent)
            self.merged_df['most_frequent_processed_answer'] = self.merged_df['processed_answers'].apply(
                self.find_most_frequent)
            self.merged_df.drop(columns=['answers'], inplace=True)
        else:
            print("DataFrames have not been merged yet.")
            return

        # Apply the function to the 'most_frequent_processed_answer' column
        self.merged_df['single_word_answers'] = self.merged_df['most_frequent_processed_answer'].apply(
            self.join_words_with_hyphen)

    def get_processed_data(self):
        """
        Retrieves the processed DataFrame.

        Returns:
            DataFrame: The processed DataFrame. Returns None if the DataFrame is empty or not processed.
        """
        if self.merged_df is not None:
            return self.merged_df
        else:
            print("DataFrame is empty or not processed yet.")
            return None

    def save_to_csv(self, df, saved_file_name):
        """
        Saves the given DataFrame to a CSV file, appending '.csv' to the name if it is missing.
        Falls back to 'data.csv' when no file name is provided.
        """
        if saved_file_name is not None:
            if ".csv" not in saved_file_name:
                df.to_csv(saved_file_name + ".csv", index=None)

            else:
                df.to_csv(saved_file_name, index=None)

        else:
            df.to_csv("data.csv", index=None)

    def display_dataframe(self):
        """
        Displays the processed DataFrame.
        """
        if self.merged_df is not None:
            print(self.merged_df)
        else:
            print("DataFrame is empty.")


def process_okvqa_dataset(questions_file_path, annotations_file_path, save_to_csv=False, saved_file_name=None):
    """
    Processes the OK-VQA dataset given the file paths for questions and annotations.

    Parameters:
        questions_file_path (str): The file path for the questions JSON file.
        annotations_file_path (str): The file path for the annotations JSON file.
        save_to_csv (bool): Whether to save the processed DataFrame to a CSV file.
        saved_file_name (str): File name to use when saving the CSV file.

    Returns:
        DataFrame: The processed DataFrame containing merged and processed VQA data.
    """
    # Create an instance of the class
    processor = VQADataProcessor(questions_file_path, annotations_file_path)

    # Process the data
    processor.merge_dataframes()
    processor.process_answers()

    # Retrieve the processed DataFrame
    processed_data = processor.get_processed_data()

    if save_to_csv:
        processor.save_to_csv(processed_data, saved_file_name)

    return processed_data


def show_image(image):
    """
    Display an image in various environments (Jupyter, PyCharm, Hugging Face Spaces).
    Handles different types of image inputs (file path, PIL Image, numpy array, OpenCV, PyTorch tensor).

    Args:
        image (str or PIL.Image or numpy.ndarray or torch.Tensor): The image to display.
    """
    in_jupyter = is_jupyter_notebook()
    in_colab = is_google_colab()

    # Convert image to PIL Image if it's a file path, numpy array, or PyTorch tensor
    if isinstance(image, str):
        if os.path.isfile(image):
            image = Image.open(image)
        else:
            raise ValueError("File path provided does not exist.")
    elif isinstance(image, np.ndarray):
        if image.ndim == 3 and image.shape[2] in [3, 4]:
            image = Image.fromarray(image[..., ::-1] if image.shape[2] == 3 else image)
        else:
            image = Image.fromarray(image)
    elif torch.is_tensor(image):
        image = Image.fromarray(image.permute(1, 2, 0).numpy().astype(np.uint8))

    # Display the image
    if in_jupyter or in_colab:
        from IPython.display import display
        display(image)
    else:
        image.show()


def show_image_with_matplotlib(image):
    """Displays an image with matplotlib, accepting a file path, PIL Image, numpy array, or PyTorch tensor."""
    if isinstance(image, str):
        image = Image.open(image)
    elif isinstance(image, np.ndarray):
        image = Image.fromarray(image)
    elif torch.is_tensor(image):
        image = Image.fromarray(image.permute(1, 2, 0).numpy().astype(np.uint8))

    plt.imshow(image)
    plt.axis('off')  # Turn off axis numbers
    plt.show()


def is_jupyter_notebook():
    """
    Check if the code is running in a Jupyter notebook.

    Returns:
        bool: True if running in a Jupyter notebook, False otherwise.
    """
    try:
        from IPython import get_ipython
        if 'IPKernelApp' not in get_ipython().config:
            return False
        if 'ipykernel' in str(type(get_ipython())):
            return True  # Running in Jupyter Notebook
    except (NameError, AttributeError):
        return False  # Not running in Jupyter Notebook

    return False  # Default to False if none of the above conditions are met


def is_pycharm():
    return 'PYCHARM_HOSTED' in os.environ


def is_google_colab():
    return 'COLAB_GPU' in os.environ or 'google.colab' in sys.modules


def get_path(name, path_type):
    """
    Generates a path for models, images, or data based on the specified type.

    Args:
        name (str): The name of the model, image, or data folder/file.
        path_type (str): The type of path needed ('models', 'images', or 'data').

    Returns:
        str: The full path to the specified resource.
    """
    # Get the current working directory (assumed to be inside 'code' folder)
    current_dir = os.getcwd()

    # Get the directory one level up (the parent directory)
    parent_dir = os.path.dirname(current_dir)

    # Construct the path to the specified folder
    folder_path = os.path.join(parent_dir, path_type)

    # Construct the full path to the specific resource
    full_path = os.path.join(folder_path, name)

    return full_path


if __name__ == "__main__":
    pass
    #val_data = process_okvqa_dataset('OpenEnded_mscoco_val2014_questions.json', 'mscoco_val2014_annotations.json', save_to_csv=True, saved_file_name="okvqa_val.csv")
    #train_data = process_okvqa_dataset('OpenEnded_mscoco_train2014_questions.json', 'mscoco_train2014_annotations.json', save_to_csv=True, saved_file_name="okvqa_train.csv")
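For orientation, a minimal sketch of how these utilities might be called end to end; the JSON file names mirror the commented examples above and are placeholders for wherever the OK-VQA files actually live:

# Hypothetical usage sketch -- assumes the OK-VQA question/annotation JSON files are available locally.
train_df = process_okvqa_dataset('OpenEnded_mscoco_train2014_questions.json',
                                 'mscoco_train2014_annotations.json',
                                 save_to_csv=True, saved_file_name='okvqa_train.csv')
print(train_df.head())  # includes the generated most_frequent_* and single_word_answers columns

# get_path() resolves resources relative to the parent of the current working directory,
# e.g. <parent>/Sample_Images/horse.jpg; show_image() then picks a display backend
# (IPython display in notebooks/Colab, PIL's viewer elsewhere).
sample = get_path('horse.jpg', 'Sample_Images')
show_image(sample)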