Upload model

Files changed:
- configuration_cxrmate_ed.py  +0 -164
- modelling_cxrmate_ed.py  +267 -445

configuration_cxrmate_ed.py
CHANGED
@@ -46,167 +46,3 @@ class CXRMateEDConfig(transformers.PretrainedConfig):
             text_config = CONFIG_MAPPING[text_config['model_type']](**text_config)
 
         self.text_config = text_config
-
-# class CXRMateEDConfig(transformers.PretrainedConfig):
-
-#     model_type = 'cxrmate-ed'
-
-#     # def __init__(
-#     #     self,
-#     #     index_value_encoder_intermediate_size: int = 2048,
-#     #     include_time_delta: bool = True,
-#     #     time_delta_monotonic_inversion: bool = True,
-#     #     add_time_deltas: bool = True,
-#     #     history: int = 0,
-#     #     tables_filter: list = ['mimic_cxr_sectioned', 'triage', 'medrecon'],
-#     #     prompt_report_sections_filter: list = ['indication', 'history'],
-#     #     pad_token_id: int = 4,
-#     #     **kwargs: Any,
-#     # ) -> None:
-#     #     super().__init__(**kwargs)
-#     #     self.index_value_encoder_intermediate_size = index_value_encoder_intermediate_size
-#     #     self.include_time_delta = include_time_delta
-#     #     self.time_delta_monotonic_inversion = time_delta_monotonic_inversion
-#     #     self.add_time_deltas = add_time_deltas
-#     #     self.history = history
-#     #     self.tables_filter = tables_filter
-#     #     self.prompt_report_sections_filter = prompt_report_sections_filter
-#     #     self.pad_token_id = pad_token_id
-
-#     #     self.hidden_size = self.text_config.hidden_size
-
-#     def __init__(
-#         self,
-#         vision_config=None,
-#         text_config=None,
-#         # ignore_index=-100,
-#         # image_token_index=32000,
-#         # projector_hidden_act="gelu",
-#         # vision_feature_select_strategy="default",
-#         # vision_feature_layer=-2,
-#         # image_seq_length=576,
-#         index_value_encoder_intermediate_size: int = 2048,
-#         include_time_delta: bool = True,
-#         time_delta_monotonic_inversion: bool = True,
-#         add_time_deltas: bool = True,
-#         history: int = 0,
-#         tables_filter: list = ['mimic_cxr_sectioned', 'triage', 'medrecon'],
-#         prompt_report_sections_filter: list = ['indication', 'history'],
-#         pad_token_id: int = 4,
-#         **kwargs,
-#     ):
-#         transformers.PretrainedConfig.__init__(self, **kwargs)
-
-#         self.vision_config = vision_config
-#         self.text_config = text_config
-#         self.index_value_encoder_intermediate_size = index_value_encoder_intermediate_size
-#         self.include_time_delta = include_time_delta
-#         self.time_delta_monotonic_inversion = time_delta_monotonic_inversion
-#         self.add_time_deltas = add_time_deltas
-#         self.history = history
-#         self.tables_filter = tables_filter
-#         self.prompt_report_sections_filter = prompt_report_sections_filter
-#         self.pad_token_id = pad_token_id
-
-#         self.ignore_index = ignore_index
-#         self.image_token_index = image_token_index
-#         self.projector_hidden_act = projector_hidden_act
-#         self.image_seq_length = image_seq_length
-
-#         if vision_feature_select_strategy not in ["default", "full"]:
-#             raise ValueError(
-#                 "vision_feature_select_strategy should be one of 'default', 'full'."
-#                 f"Got: {vision_feature_select_strategy}"
-#             )
-
-#         self.vision_feature_select_strategy = vision_feature_select_strategy
-#         self.vision_feature_layer = vision_feature_layer
-
-#         if isinstance(vision_config, dict):
-#             vision_config["model_type"] = (
-#                 vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
-#             )
-#             vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
-#         elif vision_config is None:
-#             vision_config = CONFIG_MAPPING["clip_vision_model"](
-#                 intermediate_size=4096,
-#                 hidden_size=1024,
-#                 patch_size=14,
-#                 image_size=336,
-#                 num_hidden_layers=24,
-#                 num_attention_heads=16,
-#                 vocab_size=32000,
-#                 projection_dim=768,
-#             )
-
-
-#         if isinstance(text_config, dict):
-#             text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
-#             text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
-#         elif text_config is None:
-#             text_config = CONFIG_MAPPING["llama"]()
-
-#         super().__init__(**kwargs)
-
-
-# import transformers
-# from transformers.configuration_utils import PretrainedConfig
-# from transformers.utils import logging
-
-# logger = logging.get_logger(__name__)
-
-
-# class CXRMateEDConfig(PretrainedConfig):
-
-#     model_type = "cxrmate-ed"
-
-#     def __init__(self, **kwargs):
-#         super().__init__(**kwargs)
-
-#         if 'decoder' not in kwargs:
-
-#             self.decoder = transformers.LlamaConfig(
-#                 vocab_size=30000,
-#                 hidden_size=768,
-#                 intermediate_size=3072,
-#                 num_attention_heads=12,
-#                 num_hidden_layers=6,
-#                 max_position_embeddings=2048,
-#             )
-#             self.decoder.is_decoder = True
-
-#             self.decoder.index_value_encoder_intermediate_size = 2048
-#             self.decoder.include_time_delta = True
-#             self.decoder.time_delta_monotonic_inversion = True
-#             self.decoder.add_time_deltas = True
-#             self.decoder.history = 0
-#             self.decoder.tables_filter = ["mimic_cxr_sectioned", "triage", "medrecon"]
-#             self.decoder.prompt_report_sections_filter = ["indication", "history"]
-#             self.decoder.pad_token_id = 4
-
-#         else:
-#             self.decoder = kwargs.pop("decoder")
-
-
-#         if 'encoder' not in kwargs:
-#             self.encoder = transformers.AutoConfig.from_pretrained(
-#                 'aehrc/uniformer_base_tl_384',
-#                 projection_size=768,
-#                 trust_remote_code=True,
-#             )
-#         else:
-#             self.encoder = kwargs.pop("encoder")
-
-
-#         self.is_encoder_decoder = True
-
-#     @classmethod
-#     def from_encoder_decoder_configs(
-#         cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
-#     ) -> PretrainedConfig:
-
-#         logger.info("Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
-#         decoder_config.is_decoder = True
-#         decoder_config.add_cross_attention = True
-
-#         return cls(encoder=encoder_config, decoder=decoder_config, **kwargs)
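Illustrative note (not part of the commit): after the deletion above, the retained constructor code builds the text sub-config by mapping a plain dict through CONFIG_MAPPING. A minimal sketch of instantiation, assuming the module can be imported directly and that no other arguments are required (the direct import and the 'llama' hyperparameters below are placeholders, not taken from the repository):

    import transformers
    from configuration_cxrmate_ed import CXRMateEDConfig  # hypothetical direct import of the remote-code module

    # The kept lines do: text_config = CONFIG_MAPPING[text_config['model_type']](**text_config),
    # so passing a dict with a 'model_type' key is enough to build the sub-config object.
    config = CXRMateEDConfig(text_config={'model_type': 'llama', 'hidden_size': 768, 'num_hidden_layers': 6})
    assert isinstance(config.text_config, transformers.LlamaConfig)  # constructed via CONFIG_MAPPING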
modelling_cxrmate_ed.py
CHANGED
@@ -8,13 +8,13 @@ import datasets
 import torch
 import transformers
 from huggingface_hub import hf_hub_download
+from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from torch.nn import CrossEntropyLoss
 from torch.utils.data import Subset
 from torchvision.io import decode_image
-from …
-from transformers …
+from torchvision.transforms import v2
+from transformers import PreTrainedTokenizerFast
 from transformers.modeling_outputs import ModelOutput, Seq2SeqLMOutput
-from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import check_min_version, logging
 
 from .configuration_cxrmate_ed import CXRMateEDConfig

@@ -187,162 +187,39 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
 
         self.inf_time_delta_value = self.time_delta_map(float('inf'))
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        # decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
-        #     Information necessary to initiate the text decoder. Can be either:
-
-        #         - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
-        #         - A path to a *directory* containing model weights saved using
-        #           [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
-        #         - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
-        #           this case, `from_tf` should be set to `True` and a configuration object should be provided as
-        #           `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
-        #           PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-
-        # model_args (remaining positional arguments, *optional*):
-        #     All remaning positional arguments will be passed to the underlying model's `__init__` method.
-
-        # kwargs (remaining dictionary of keyword arguments, *optional*):
-        #     Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-        #     `output_attentions=True`).
-
-        #     - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter.
-        #     - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter.
-        #     - To update the parent model configuration, do not use a prefix for each configuration parameter.
-
-        #     Behaves differently depending on whether a `config` is provided or automatically loaded.
-
-        # Example:
-
-        # ```python
-        # >>> from transformers import VisionEncoderDecoderModel
-
-        # >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
-        # >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
-        # ...     "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased"
-        # ... )
-        # >>> # saving model after fine-tuning
-        # >>> model.save_pretrained("./vit-bert")
-        # >>> # load fine-tuned model
-        # >>> model = VisionEncoderDecoderModel.from_pretrained("./vit-bert")
-        # ```"""
-
-        # kwargs_encoder = {
-        #     argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_")
-        # }
-
-        # kwargs_decoder = {
-        #     argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
-        # }
-
-        # # remove encoder, decoder kwargs from kwargs
-        # for key in kwargs_encoder.keys():
-        #     del kwargs["encoder_" + key]
-        # for key in kwargs_decoder.keys():
-        #     del kwargs["decoder_" + key]
-
-        # # Load and initialize the encoder and decoder
-        # # The distinction between encoder and decoder at the model level is made
-        # # by the value of the flag `is_decoder` that we need to set correctly.
-        # encoder = kwargs_encoder.pop("model", None)
-        # if encoder is None:
-        #     if encoder_pretrained_model_name_or_path is None:
-        #         raise ValueError(
-        #             "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has "
-        #             "to be defined."
-        #         )
-
-        #     if "config" not in kwargs_encoder:
-        #         encoder_config, kwargs_encoder = transformers.AutoConfig.from_pretrained(
-        #             encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True
-        #         )
-
-        #         if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True:
-        #             logger.info(
-        #                 f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model "
-        #                 "from a decoder model. Cross-attention and casual mask are disabled."
-        #             )
-        #             encoder_config.is_decoder = False
-        #             encoder_config.add_cross_attention = False
-
-        #         kwargs_encoder["config"] = encoder_config
-
-        #     encoder = transformers.AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
-
-        # decoder = kwargs_decoder.pop("model", None)
-        # if decoder is None:
-        #     if decoder_pretrained_model_name_or_path is None:
-        #         raise ValueError(
-        #             "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has "
-        #             "to be defined."
-        #         )
-
-        #     if "config" not in kwargs_decoder:
-        #         decoder_config, kwargs_decoder = transformers.AutoConfig.from_pretrained(
-        #             decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True
-        #         )
-
-        #         if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False:
-        #             logger.info(
-        #                 f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. Cross attention"
-        #                 f" layers are added to {decoder_pretrained_model_name_or_path} and randomly initialized if"
-        #                 f" {decoder_pretrained_model_name_or_path}'s architecture allows for cross attention layers."
-        #             )
-        #             decoder_config.is_decoder = True
-        #             decoder_config.add_cross_attention = False
-
-        #         kwargs_decoder["config"] = decoder_config
-
-        #     if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False:
-        #         logger.warning(
-        #             f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. "
-        #             f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, "
-        #             "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` "
-        #             "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a "
-        #             "`decoder_config` to `.from_encoder_decoder_pretrained(...)`"
-        #         )
-
-        #     decoder = transformers.AutoModelForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
-
-        # # instantiate config with corresponding kwargs
-        # config = CXRMateEDConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs)
-
-        # # make sure input & output embeddings is not tied
-        # config.tie_word_embeddings = False
 
-
-
-        # return cls(encoder=encoder, decoder=decoder, config=config)
+        # Image transformations:
+        self.train_transforms = v2.Compose(
+            [
+                v2.Grayscale(num_output_channels=3),
+                v2.Resize(
+                    size=self.config.vision_config.image_size,
+                    antialias=True,
+                    interpolation=v2.InterpolationMode.BICUBIC,
+                ),
+                v2.RandomCrop(
+                    size=[self.config.vision_config.image_size, self.config.vision_config.image_size],
+                    pad_if_needed=True,
+                ),
+                v2.RandomRotation(degrees=5),
+                v2.ToDtype(torch.float32, scale=True),
+                v2.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+            ]
+        )
+        self.test_transforms = v2.Compose(
+            [
+                v2.Grayscale(num_output_channels=3),
+                v2.Resize(
+                    size=self.config.vision_config.image_size,
+                    antialias=True,
+                    interpolation=v2.InterpolationMode.BICUBIC,
+                ),
+                v2.CenterCrop(size=[self.config.vision_config.image_size, self.config.vision_config.image_size]),
+                v2.ToDtype(torch.float32, scale=True),
+                v2.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
+            ]
+        )
+
+        self.post_init()
 
     def forward(
         self,

@@ -712,80 +589,7 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
                 sections[j].append(section_string)
 
         return tuple(sections.values())
[-715..-788: the tokenize_text_prompt() method is deleted from this position; it is re-added unchanged at new lines 722-793 in the @@ -914,7 +718,219 @@ hunk below (a move, not a removal).]
 
     def prepare_inputs(
         self,
         images,

@@ -914,7 +718,219 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
         assert inputs_embeds.shape[1] == token_type_ids.shape[1]
 
         return inputs_embeds, attention_mask, token_type_ids, position_ids, bos_token_ids
+
+    def tokenize_text_prompt(self, tokenizer: PreTrainedTokenizerFast, **kwargs):
+        """
+        Tokenize the text columns from MIMIC-IV ED and MIMIC-CXR (excluding the findings and impression sections).
+        Time deltas for the input_ids are also prepared here.
+
+        Argument/s:
+            tokenizer - Hugging Face tokenizer.
+
+        Returns:
+            ed - dictionary containing the input_ids, token_type_ids, attention_mask and time_deltas for the ED module columns.
+            cxr - dictionary containing the input_ids, token_type_ids, and attention_mask for MIMIC-CXR columns.
+        """
+
+        batch_size = len(kwargs['study_id'])
+
+        tokenized = {
+            'input_ids': {i: [] for i in range(batch_size)},
+            'token_type_ids': {i: [] for i in range(batch_size)},
+            'time_delta': {i: [] for i in range(batch_size)},
+            'attention_mask': torch.empty(batch_size, 0, 1, device=self.device),
+        }
+
+        prompt_text_columns = [f'{k}_{j}' if k != 'mimic_cxr_sectioned' else j for k, v in self.tables.items() if 'text_columns' in v for j in (v['text_columns'] if isinstance(v['text_columns'], list) else [v['text_columns']])] + ['prior_findings', 'prior_impression']
+
+        for i in prompt_text_columns:
+            if i in kwargs:
+                if f'{i}_time_delta' not in kwargs:
+                    kwargs[f'{i}_time_delta'] = [[self.zero_time_delta_value for _ in j] if j is not None else None for j in kwargs[i]]
+                for x, (y, z) in enumerate(zip(kwargs[i], kwargs[f'{i}_time_delta'])):
+                    if y is not None:
+                        assert isinstance(y, list)
+                        assert isinstance(z, list)
+                        for text, time_delta in zip(y, z):
+                            if text is not None:
+                                tokenized['input_ids'][x].append(
+                                    tokenizer(text, add_special_tokens=False, return_tensors='pt')['input_ids'].to(device=self.device)
+                                )
+                                tokenized['token_type_ids'][x].append(
+                                    torch.full(
+                                        (1, tokenized['input_ids'][x][-1].shape[-1]),
+                                        self.token_type_to_token_type_id[i],
+                                        dtype=torch.long,
+                                        device=self.device,
+                                    )
+                                )
+                                tokenized['time_delta'][x].append(
+                                    torch.full(
+                                        (1, tokenized['input_ids'][x][-1].shape[-1]),
+                                        time_delta,
+                                        dtype=torch.float32,
+                                        device=self.device,
+                                    )
+                                )
+
+        tokenized['input_ids'] = [torch.cat(j, dim=1).T if j else torch.empty(0, 1, dtype=torch.long, device=self.device) for j in tokenized['input_ids'].values()]
+        tokenized['token_type_ids'] = [torch.cat(j, dim=1).T if j else torch.empty(0, 1, dtype=torch.long, device=self.device) for j in tokenized['token_type_ids'].values()]
+        tokenized['time_delta'] = [torch.cat(j, dim=1).T if j else torch.empty(0, 1, device=self.device) for j in tokenized['time_delta'].values()]
+
+        tokenized['input_ids'] = torch.nn.utils.rnn.pad_sequence(
+            tokenized['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
+        )[:, :, 0]
+        tokenized['token_type_ids'] = torch.nn.utils.rnn.pad_sequence(
+            tokenized['token_type_ids'], batch_first=True, padding_value=0,
+        )[:, :, 0]
+
+        tokenized['attention_mask'] = (tokenized['input_ids'] != tokenizer.pad_token_id).int()
+
+        tokenized['time_delta'] = torch.nn.utils.rnn.pad_sequence(
+            tokenized['time_delta'], batch_first=True, padding_value=0,
+        )
+
+        return tokenized
 
+    def position_ids_from_time_deltas_and_attention_mask(self, time_deltas, attention_mask):
+        mask_value = torch.finfo(time_deltas.dtype).max if self.config.time_delta_monotonic_inversion else torch.finfo(time_deltas.dtype).min
+
+        masked_time_deltas = torch.where(attention_mask == 1, time_deltas[:, :, 0], mask_value)
+        _, col_indices = torch.sort(masked_time_deltas, descending=not self.config.time_delta_monotonic_inversion)
+
+        num_rows, num_cols, _ = time_deltas.shape
+
+        row_indices = torch.arange(num_rows, device=time_deltas.device).view(-1, 1).repeat(1, num_cols).view(-1)
+        position_ids = torch.zeros_like(col_indices, device=time_deltas.device)
+        position_ids[row_indices, col_indices.flatten()] = torch.arange(num_cols, device=time_deltas.device)[None, :].expand(num_rows, -1).flatten()
+        position_ids.masked_fill_(attention_mask == 0, 1)  # Following: https://github.com/huggingface/transformers/blob/c5f0288bc7d76f65996586f79f69fba8867a0e67/src/transformers/models/llama/modeling_llama.py#L1285
+
+        return position_ids
+
+    def prepare_index_value_feats(self, table, batch):
+
+        index_value_columns = (self.tables[table].get('index_columns', []) + self.tables[table].get('value_columns', []))
+        index_value_columns = [f'{table}_{i}' for i in index_value_columns] if table != 'mimic_cxr_2_0_0_metadata' else index_value_columns
+
+        # Map to indices with lookup table:
+        if 'index_columns' in self.tables[table]:
+            for i in self.tables[table]['index_columns']:
+                k = f'{table}_{i}' if not table == 'mimic_cxr_2_0_0_metadata' else i
+                batch[k] = [
+                    [self.luts[table][i][str(k)] if k is not None else None for k in j] if j is not None else None for j in batch[k]
+                ]
+
+        batch_index_value_feats_list = []
+        batch_token_type_ids_list = []
+        batch_time_deltas_list = []
+
+        for batch_idx in range(len(batch['study_id'])):
+
+            if any([batch[k][batch_idx] for k in index_value_columns]):
+
+                num_rows = [len(batch[i][batch_idx]) for i in index_value_columns]
+                assert all(x == num_rows[0] for x in num_rows)
+                num_rows = num_rows[0]
+
+                # The y-index and the datetime for each group:
+                if isinstance(batch[self.tables[table]['groupby']][batch_idx], list):
+                    y_indices = [d.setdefault(x, len(d)) for d in [{}] for x in batch[self.tables[table]['groupby']][batch_idx]]
+                    datetime = [j for i, j in enumerate(batch[self.tables[table]['time_column']][batch_idx]) if j not in batch[self.tables[table]['time_column']][batch_idx][:i]]
+                    assert len(set(y_indices)) == len(datetime)
+                else:
+                    y_indices = [0] * num_rows
+                    datetime = batch[self.tables[table]['time_column']][batch_idx] if 'time_column' in self.tables[table] else [batch['latest_study_datetime'][batch_idx]]
+
+                time_deltas = torch.tensor([compute_time_delta(i, batch['latest_study_datetime'][batch_idx], self.time_delta_map, to_tensor=False) for i in datetime])[:, None]
+
+                tensor = torch.zeros(max(y_indices) + 1, self.luts[table]['total'])
+
+                # Index columns to feats:
+                if 'index_columns' in self.tables[table]:
+
+                    for i in self.tables[table]['index_columns']:
+                        k = f'{table}_{i}' if not table == 'mimic_cxr_2_0_0_metadata' else i
+                        y_indices_column = [y_idx for y_idx, x_idx in zip(y_indices, batch[k][batch_idx]) if x_idx is not None]
+                        x_indices_column = [x_idx for x_idx in batch[k][batch_idx] if x_idx is not None]
+
+                        tensor[y_indices_column, x_indices_column] = 1.0
+
+                if 'value_columns' in self.tables[table]:
+                    for i in self.tables[table]['value_columns']:
+
+                        k = f'{table}_{i}' if not table == 'mimic_cxr_2_0_0_metadata' else i
+                        y_indices_column = [y_idx for y_idx, value in zip(y_indices, batch[k][batch_idx]) if value is not None]
+                        x_indices_column = [self.luts[table][i] for value in batch[k][batch_idx] if value is not None]
+                        values = [value for value in batch[k][batch_idx] if value is not None]
+
+                        tensor[y_indices_column, x_indices_column] = torch.tensor(values, dtype=tensor.dtype)
+                        assert not torch.isnan(tensor).any()
+            else:
+                tensor = torch.empty(0, self.luts[table]['total'])
+                time_deltas = torch.empty(0, 1)
+
+            batch_index_value_feats_list.append(tensor)
+            batch_token_type_ids_list.append(torch.full(
+                [tensor.shape[0]],
+                self.token_type_to_token_type_id[table],
+                dtype=torch.long,
+                )
+            )
+            batch_time_deltas_list.append(time_deltas)
+
+            assert tensor.shape[0] == batch_token_type_ids_list[-1].shape[0]
+            assert tensor.shape[0] == time_deltas.shape[0]
+
+        batch_index_value_feats = torch.nn.utils.rnn.pad_sequence(batch_index_value_feats_list, batch_first=True, padding_value=-1)  # Pad value of -1 is not ideal. Need to use something else.
+        batch_token_type_ids = torch.nn.utils.rnn.pad_sequence(batch_token_type_ids_list, batch_first=True, padding_value=0)
+        batch_time_deltas = torch.nn.utils.rnn.pad_sequence(batch_time_deltas_list, batch_first=True, padding_value=0)
+
+        batch_mask = (batch_index_value_feats != -1).any(dim=-1).int()
+
+        return batch_index_value_feats, batch_token_type_ids, batch_time_deltas, batch_mask
+
+    def prepare_text_prompt(self, table, column, batch):
+
+        key = f'{table}_{column}' if not table == 'mimic_cxr_sectioned' else column
+
+        batch_text_list = []
+        batch_time_deltas_list = []
+
+        for batch_idx in range(len(batch['study_id'])):
+            if batch[key][batch_idx]:
+
+                num_rows = len(batch[key][batch_idx])
+
+                # The y-index and the datetime for each group:
+                if isinstance(batch[self.tables[table]['groupby']][batch_idx], list):
+                    y_indices = [d.setdefault(x, len(d)) for d in [{}] for x in batch[self.tables[table]['groupby']][batch_idx]]
+                    datetime = [j for i, j in enumerate(batch[self.tables[table]['time_column']][batch_idx]) if j not in batch[self.tables[table]['time_column']][batch_idx][:i]]
+                    assert len(set(y_indices)) == len(datetime)
+                else:
+                    y_indices = [0] * num_rows
+                    datetime = batch[self.tables[table]['time_column']][batch_idx] if 'time_column' in self.tables[table] else [batch['latest_study_datetime'][batch_idx]]
+
+                # Remove None values:
+                text_rows = batch[key][batch_idx] if isinstance(batch[key][batch_idx], list) else [batch[key][batch_idx]]
+                y_indices = [i for i, j in zip(y_indices, text_rows) if j is not None]
+                text_rows = [i for i in text_rows if i is not None]
+                datetime = [datetime[i] for i in set(y_indices)]
+                if text_rows:
+
+                    # Those in the same group (or those with the same y-index) get joined as the same string:
+                    batch_text_list.append([', '.join([text_rows[j] for j in range(len(y_indices)) if y_indices[j] == k]) + '.' for k in set(y_indices)])
+                    batch_time_deltas_list.append([compute_time_delta(i, batch['latest_study_datetime'][batch_idx], self.time_delta_map, to_tensor=False) for i in datetime])
+
+                    assert len(batch_time_deltas_list[-1]) == len(batch_text_list[-1])
+                else:
+                    batch_text_list.append([])
+                    batch_time_deltas_list.append([])
+            else:
+                batch_text_list.append([])
+                batch_time_deltas_list.append([])
+
+        return batch_text_list, batch_time_deltas_list
+
     @staticmethod
     def create_4d_attention_mask_mixed_causality(non_causal_2d_attention_mask, causal_2d_attention_mask, dtype):
 
@@ -983,86 +999,24 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
         mixed_causality_4d_attention_mask[mixed_causality_4d_attention_mask == 1] = 0.0
 
         return mixed_causality_4d_attention_mask
-
-    # @staticmethod
-    # def create_4d_attention_mask_mixed_causality(non_causal_2d_attention_mask, causal_2d_attention_mask):
-
-    #     prompt_seq_len = non_causal_2d_attention_mask.shape[-1]
-    #     report_seq_len = causal_2d_attention_mask.shape[-1]
-
-    #     non_causal_2d_attention_mask = non_causal_2d_attention_mask[:, None, None, :]
-    #     causal_2d_attention_mask = causal_2d_attention_mask[:, None, None, :]
-
-    #     # Upper left of attention matrix:
-    #     upper_left = non_causal_2d_attention_mask.expand(-1, -1, prompt_seq_len, -1)
-    #     upper_left = upper_left * non_causal_2d_attention_mask
-    #     upper_left = upper_left * non_causal_2d_attention_mask.permute(0, 1, 3, 2)
-
-    #     causal_mask = torch.tril(
-    #         torch.ones(
-    #             (
-    #                 report_seq_len,
-    #                 report_seq_len,
-    #             ),
-    #             dtype=torch.long,
-    #             device=causal_2d_attention_mask.device,
-    #         ),
-    #     )
-
-    #     # Lower right of attention matrix:
-    #     lower_right = causal_2d_attention_mask.expand(-1, -1, report_seq_len, -1)
-    #     lower_right = lower_right * causal_2d_attention_mask.permute(0, 1, 3, 2)
-    #     lower_right = lower_right * causal_mask
-
-    #     # Upper right of attention matrix:
-    #     upper_right = torch.zeros(
-    #         causal_2d_attention_mask.shape[0],
-    #         1,
-    #         prompt_seq_len,
-    #         report_seq_len,
-    #         dtype=torch.long,
-    #         device=causal_2d_attention_mask.device,
-    #     )
-
-    #     # Lower left of attention matrix:
-    #     lower_left = non_causal_2d_attention_mask.expand(-1, -1, report_seq_len, -1)
-    #     lower_left = lower_left * causal_2d_attention_mask.permute(0, 1, 3, 2)
-
-    #     left = torch.cat((upper_left, lower_left), dim=2)
-    #     right = torch.cat((upper_right, lower_right), dim=2)
 
-
-
-
-
-
-
-    #     non_causal_2d_attention_mask = non_causal_2d_attention_mask[:, None, None, :]
-    #     causal_2d_attention_mask = causal_2d_attention_mask[:, None, None, :]
-
-    #     mixed_causality_4d_attention_mask = torch.cat((non_causal_2d_attention_mask, causal_2d_attention_mask), dim=-1)
-    #     return mixed_causality_4d_attention_mask
 
-
-
-        masked_time_deltas = torch.where(attention_mask == 1, time_deltas[:, :, 0], mask_value)
-        _, col_indices = torch.sort(masked_time_deltas, descending=not self.config.time_delta_monotonic_inversion)
 
-
-        position_ids = torch.zeros_like(col_indices, device=time_deltas.device)
-        position_ids[row_indices, col_indices.flatten()] = torch.arange(num_cols, device=time_deltas.device)[None, :].expand(num_rows, -1).flatten()
-        position_ids.masked_fill_(attention_mask == 0, 1)  # Following: https://github.com/huggingface/transformers/blob/c5f0288bc7d76f65996586f79f69fba8867a0e67/src/transformers/models/llama/modeling_llama.py#L1285
 
-
-    def get_dataset(self, dataset_path, train_transforms=None, test_transforms=None, max_train_images_per_study=None, study_id_split='mimic_iv_ed_mimic_cxr_jpg', test_set_only=False):
 
-        assert max_train_images_per_study is not None, 'max_train_images_per_study must be defined.'
-        assert test_transforms is not None, 'test_transforms must be defined.'
+    @staticmethod
+    def collate_fn(batch):
+        keys = set().union(*(d.keys() for d in batch))
+        batch = {j: [i.setdefault(j, None) for i in batch] for j in keys}
+        batch = {k: torch.stack(v) if isinstance(v[0], torch.Tensor) else v for k, v in batch.items()}
+        return batch
+
+    @staticmethod
+    def prepare_dataset(physionet_dir: str, database_dir: str):
+        prepare_dataset(physionet_dir=physionet_dir, database_dir=database_dir)
+
+    def get_dataset(self, database_dir, max_train_images_per_study=None, study_id_split='mimic_iv_ed_mimic_cxr_jpg', test_set_only=False):
+
+        dataset_path = os.path.join(database_dir, 'mimic_iv_ed_mimic_cxr_jpg_dataset')
+
+        assert max_train_images_per_study is not None or test_set_only, 'max_train_images_per_study must be defined if training.'
 
         def train_set_transform(batch):
 

@@ -1081,7 +1035,7 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
 
             # Sort based on ViewPosition:
             batch['images'] = [list(zip(*sorted(zip(i, v), key=lambda x: VIEW_ORDER.index(x[1]))))[0] for i, v in zip(batch['images'], batch['ViewPosition'])]
-            batch['images'] = [torch.stack([train_transforms(j) for j in i]) for i in batch['images']]
+            batch['images'] = [torch.stack([self.train_transforms(j) for j in i]) for i in batch['images']]
             max_size = max(i.shape[0] for i in batch['images'])
 
             batch['image_time_deltas'] = [[self.zero_time_delta_value if j < i.shape[0] else self.inf_time_delta_value for j in range(max_size)] for i in batch['images']]

@@ -1104,7 +1058,7 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
 
            # Sort based on ViewPosition:
            batch['images'] = [list(zip(*sorted(zip(i, v), key=lambda x: VIEW_ORDER.index(x[1]))))[0] for i, v in zip(batch['images'], batch['ViewPosition'])]
-           batch['images'] = [torch.stack([test_transforms(j) for j in i]) for i in batch['images']]
+           batch['images'] = [torch.stack([self.test_transforms(j) for j in i]) for i in batch['images']]
            max_size = max(i.shape[0] for i in batch['images'])
            batch['image_time_deltas'] = [[self.zero_time_delta_value if j < i.shape[0] else self.inf_time_delta_value for j in range(max_size)] for i in batch['images']]
            batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)

@@ -1177,7 +1131,9 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
         else:
             return test_set
 
-    def get_stage_1_dataset(self, …
+    def get_stage_1_dataset(self, database_dir, max_train_images_per_study):
+
+        dataset_path = os.path.join(database_dir, 'mimic_iv_ed_mimic_cxr_jpg_dataset')
 
         def train_set_transform(batch):
 

@@ -1192,7 +1148,7 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
 
            # Sort based on ViewPosition:
            batch['images'] = [list(zip(*sorted(zip(i, v), key=lambda x: VIEW_ORDER.index(x[1]))))[0] for i, v in zip(batch['images'], batch['ViewPosition'])]
-           batch['images'] = [torch.stack([train_transforms(j) for j in i]) for i in batch['images']]
+           batch['images'] = [torch.stack([self.train_transforms(j) for j in i]) for i in batch['images']]
            max_size = max(i.shape[0] for i in batch['images'])
            batch['image_time_deltas'] = [[self.zero_time_delta_value if j < i.shape[0] else self.inf_time_delta_value for j in range(max_size)] for i in batch['images']]
            batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)

@@ -1204,7 +1160,7 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
 
            # Sort based on ViewPosition:
            batch['images'] = [list(zip(*sorted(zip(i, v), key=lambda x: VIEW_ORDER.index(x[1]))))[0] for i, v in zip(batch['images'], batch['ViewPosition'])]
-           batch['images'] = [torch.stack([test_transforms(j) for j in i]) for i in batch['images']]
+           batch['images'] = [torch.stack([self.test_transforms(j) for j in i]) for i in batch['images']]
            max_size = max(i.shape[0] for i in batch['images'])
            batch['image_time_deltas'] = [[self.zero_time_delta_value if j < i.shape[0] else self.inf_time_delta_value for j in range(max_size)] for i in batch['images']]
            batch['images'] = torch.nn.utils.rnn.pad_sequence(batch['images'], batch_first=True, padding_value=0.0)

@@ -1256,138 +1212,4 @@ class CXRMateEDModel(transformers.LlavaForConditionalGeneration):
             test_set = Subset(test_set, indices)
 
         return train_set, val_set, test_set
+
[-1259..-1393: prepare_index_value_feats(), prepare_text_prompt(), collate_fn() and prepare_dataset() are deleted from the end of the class; they are re-added unchanged earlier in the file (new lines 810-933 and 1003-1013 in the hunks above) — a move, not a removal.]
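Illustrative note (not part of the commit): a standalone sketch of the ranking trick used by the new position_ids_from_time_deltas_and_attention_mask() — tokens are given position ids according to the sorted order of their time deltas, and padded positions are pushed to the end of the sort and then overwritten with 1. The tensor values below are made up.

    import torch

    time_deltas = torch.tensor([[[0.7], [0.1], [0.4], [0.0]]])   # (batch=1, seq=4, 1)
    attention_mask = torch.tensor([[1, 1, 1, 0]])                # last token is padding

    mask_value = torch.finfo(time_deltas.dtype).max              # the time_delta_monotonic_inversion=True case
    masked = torch.where(attention_mask == 1, time_deltas[:, :, 0], mask_value)
    _, col_indices = torch.sort(masked, descending=False)        # ascending: smallest time delta first

    num_rows, num_cols, _ = time_deltas.shape
    row_indices = torch.arange(num_rows).view(-1, 1).repeat(1, num_cols).view(-1)
    position_ids = torch.zeros_like(col_indices)
    position_ids[row_indices, col_indices.flatten()] = torch.arange(num_cols)[None, :].expand(num_rows, -1).flatten()
    position_ids.masked_fill_(attention_mask == 0, 1)

    print(position_ids)  # tensor([[2, 0, 1, 1]]): the token with the smallest time delta gets position 0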
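Illustrative note (not part of the commit): after this change the image transforms live on the model (self.train_transforms / self.test_transforms), so callers no longer pass transforms into get_dataset(); they pass the database directory instead. A rough usage sketch, in which the checkpoint name, directory paths and max_train_images_per_study value are placeholders and the trust_remote_code loading path is assumed:

    import torch
    import transformers

    model = transformers.AutoModel.from_pretrained('aehrc/cxrmate-ed', trust_remote_code=True)  # checkpoint name assumed

    # prepare_dataset() and the new get_dataset() signature as added in this commit:
    model.prepare_dataset(physionet_dir='/path/to/physionet', database_dir='/path/to/database')
    train_set, val_set, test_set = model.get_dataset(database_dir='/path/to/database', max_train_images_per_study=5)

    # collate_fn is a staticmethod on the model and can be handed to a DataLoader directly:
    loader = torch.utils.data.DataLoader(train_set, batch_size=2, shuffle=True, collate_fn=model.collate_fn)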