Geneformer / geneformer /collator_for_classification.py

Christina Theodoris

Subclass collator for cell classification

402ba9b over 1 year ago

29.8 kB

	"""
	Geneformer collator for gene and cell classification.

	Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
	"""
	import numpy as np
	import torch
	import warnings
	from enum import Enum
	from typing import Dict, List, Optional, Union

	from transformers import (
	DataCollatorForTokenClassification,
	SpecialTokensMixin,
	BatchEncoding,
	)
	from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
	from transformers.utils.generic import _is_tensorflow, _is_torch

	from .pretrainer import token_dictionary

	EncodedInput = List[int]
	logger = logging.get_logger(__name__)
	VERY_LARGE_INTEGER = int(
	1e30
	) # This is used to set the max input length for a model with infinite size input
	LARGE_INTEGER = int(
	1e20
	) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER

	# precollator functions

	class ExplicitEnum(Enum):
	"""
	Enum with more explicit error message for missing values.
	"""

	@classmethod
	def _missing_(cls, value):
	raise ValueError(
	"%r is not a valid %s, please select one of %s"
	% (value, cls.__name__, str(list(cls._value2member_map_.keys())))
	)

	class TruncationStrategy(ExplicitEnum):
	"""
	Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
	tab-completion in an IDE.
	"""

	ONLY_FIRST = "only_first"
	ONLY_SECOND = "only_second"
	LONGEST_FIRST = "longest_first"
	DO_NOT_TRUNCATE = "do_not_truncate"



	class PaddingStrategy(ExplicitEnum):
	"""
	Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
	in an IDE.
	"""

	LONGEST = "longest"
	MAX_LENGTH = "max_length"
	DO_NOT_PAD = "do_not_pad"



	class TensorType(ExplicitEnum):
	"""
	Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
	tab-completion in an IDE.
	"""

	PYTORCH = "pt"
	TENSORFLOW = "tf"
	NUMPY = "np"
	JAX = "jax"


	class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
	mask_token = "<mask>"
	mask_token_id = token_dictionary.get("<mask>")
	pad_token = "<pad>"
	pad_token_id = token_dictionary.get("<pad>")
	padding_side = "right"
	all_special_ids = [
	token_dictionary.get("<mask>"),
	token_dictionary.get("<pad>")
	]
	model_input_names = ["input_ids"]

	def _get_padding_truncation_strategies(
	self, padding=True, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
	):
	"""
	Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
	and pad_to_max_length) and behaviors.
	"""
	old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate")
	old_pad_to_max_length = kwargs.pop("pad_to_max_length", False)

	# Backward compatibility for previous behavior, maybe we should deprecate it:
	# If you only set max_length, it activates truncation for max_length
	if max_length is not None and padding is False and truncation is False:
	if verbose:
	if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
	logger.warning(
	"Truncation was not explicitly activated but `max_length` is provided a specific value, "
	"please use `truncation=True` to explicitly truncate examples to max length. "
	"Defaulting to 'longest_first' truncation strategy. "
	"If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy "
	"more precisely by providing a specific strategy to `truncation`."
	)
	self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
	truncation = "longest_first"

	# Get padding strategy
	if padding is False and old_pad_to_max_length:
	if verbose:
	warnings.warn(
	"The `pad_to_max_length` argument is deprecated and will be removed in a future version, "
	"use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or "
	"use `padding='max_length'` to pad to a max length. In this case, you can give a specific "
	"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the "
	"maximal input size of the model (e.g. 512 for Bert).",
	FutureWarning,
	)
	if max_length is None:
	padding_strategy = PaddingStrategy.LONGEST
	else:
	padding_strategy = PaddingStrategy.MAX_LENGTH
	elif padding is not False:
	if padding is True:
	padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
	elif not isinstance(padding, PaddingStrategy):
	padding_strategy = PaddingStrategy(padding)
	elif isinstance(padding, PaddingStrategy):
	padding_strategy = padding
	else:
	padding_strategy = PaddingStrategy.DO_NOT_PAD

	# Get truncation strategy
	if truncation is False and old_truncation_strategy != "do_not_truncate":
	if verbose:
	warnings.warn(
	"The `truncation_strategy` argument is deprecated and will be removed in a future version, "
	"use `truncation=True` to truncate examples to a max length. You can give a specific "
	"length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the "
	"maximal input size of the model (e.g. 512 for Bert). "
	" If you have pairs of inputs, you can give a specific truncation strategy selected among "
	"`truncation='only_first'` (will only truncate the first sentence in the pairs) "
	"`truncation='only_second'` (will only truncate the second sentence in the pairs) "
	"or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).",
	FutureWarning,
	)
	truncation_strategy = TruncationStrategy(old_truncation_strategy)
	elif truncation is not False:
	if truncation is True:
	truncation_strategy = (
	TruncationStrategy.LONGEST_FIRST
	) # Default to truncate the longest sequences in pairs of inputs
	elif not isinstance(truncation, TruncationStrategy):
	truncation_strategy = TruncationStrategy(truncation)
	elif isinstance(truncation, TruncationStrategy):
	truncation_strategy = truncation
	else:
	truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

	# Set max length if needed
	if max_length is None:
	if padding_strategy == PaddingStrategy.MAX_LENGTH:
	if self.model_max_length > LARGE_INTEGER:
	if verbose:
	if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
	logger.warning(
	"Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
	"Default to no padding."
	)
	self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
	padding_strategy = PaddingStrategy.DO_NOT_PAD
	else:
	max_length = self.model_max_length

	if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
	if self.model_max_length > LARGE_INTEGER:
	if verbose:
	if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
	logger.warning(
	"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
	"Default to no truncation."
	)
	self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
	truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
	else:
	max_length = self.model_max_length

	# Test if we have a padding token
	if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
	raise ValueError(
	"Asking to pad but the tokenizer does not have a padding token. "
	"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
	"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
	)

	# Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
	if (
	truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
	and padding_strategy != PaddingStrategy.DO_NOT_PAD
	and pad_to_multiple_of is not None
	and max_length is not None
	and (max_length % pad_to_multiple_of != 0)
	):
	raise ValueError(
	f"Truncation and padding are both activated but "
	f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
	)

	return padding_strategy, truncation_strategy, max_length, kwargs

	def pad(
	self,
	encoded_inputs: Union[
	BatchEncoding,
	List[BatchEncoding],
	Dict[str, EncodedInput],
	Dict[str, List[EncodedInput]],
	List[Dict[str, EncodedInput]],
	],
	class_type, # options: "gene" or "cell"
	padding: Union[bool, str, PaddingStrategy] = True,
	max_length: Optional[int] = None,
	pad_to_multiple_of: Optional[int] = None,
	return_attention_mask: Optional[bool] = True,
	return_tensors: Optional[Union[str, TensorType]] = None,
	verbose: bool = True,
	) -> BatchEncoding:
	"""
	Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
	in the batch.

	Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
	``self.pad_token_id`` and ``self.pad_token_type_id``)

	.. note::

	If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
	result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
	case of PyTorch tensors, you will lose the specific device of your tensors however.

	Args:
	encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
	Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
	List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
	List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
	well as in a PyTorch Dataloader collate function.

	Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
	see the note above for the return type.
	padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
	Select a strategy to pad the returned sequences (according to the model's padding side and padding
	index) among:

	* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
	single sequence if provided).
	* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
	maximum acceptable input length for the model if that argument is not provided.
	* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
	different lengths).
	max_length (:obj:`int`, `optional`):
	Maximum length of the returned list and optionally padding length (see above).
	pad_to_multiple_of (:obj:`int`, `optional`):
	If set will pad the sequence to a multiple of the provided value.

	This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
	>= 7.5 (Volta).
	return_attention_mask (:obj:`bool`, `optional`):
	Whether to return the attention mask. If left to the default, will return the attention mask according
	to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.

	`What are attention masks? <../glossary.html#attention-mask>`__
	return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
	If set, will return tensors instead of list of python integers. Acceptable values are:

	* :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
	* :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
	* :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
	verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
	Whether or not to print more information and warnings.
	"""
	# If we have a list of dicts, let's convert it in a dict of lists
	# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
	if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
	encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

	# The model's main input name, usually `input_ids`, has be passed for padding
	if self.model_input_names[0] not in encoded_inputs:
	raise ValueError(
	"You should supply an encoding or a list of encodings to this method"
	f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
	)

	required_input = encoded_inputs[self.model_input_names[0]]

	if not required_input:
	if return_attention_mask:
	encoded_inputs["attention_mask"] = []
	return encoded_inputs

	# If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
	# and rebuild them afterwards if no return_tensors is specified
	# Note that we lose the specific device the tensor may be on for PyTorch

	first_element = required_input[0]
	if isinstance(first_element, (list, tuple)):
	# first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
	index = 0
	while len(required_input[index]) == 0:
	index += 1
	if index < len(required_input):
	first_element = required_input[index][0]
	# At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
	if not isinstance(first_element, (int, list, tuple)):
	if is_tf_available() and _is_tensorflow(first_element):
	return_tensors = "tf" if return_tensors is None else return_tensors
	elif is_torch_available() and _is_torch(first_element):
	return_tensors = "pt" if return_tensors is None else return_tensors
	elif isinstance(first_element, np.ndarray):
	return_tensors = "np" if return_tensors is None else return_tensors
	else:
	raise ValueError(
	f"type of {first_element} unknown: {type(first_element)}. "
	f"Should be one of a python, numpy, pytorch or tensorflow object."
	)

	for key, value in encoded_inputs.items():
	encoded_inputs[key] = to_py_obj(value)

	# Convert padding_strategy in PaddingStrategy
	padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies(
	padding=padding, max_length=max_length, verbose=verbose
	)

	required_input = encoded_inputs[self.model_input_names[0]]
	if required_input and not isinstance(required_input[0], (list, tuple)):
	encoded_inputs = self._pad(
	encoded_inputs,
	class_type=class_type,
	max_length=max_length,
	padding_strategy=padding_strategy,
	pad_to_multiple_of=pad_to_multiple_of,
	return_attention_mask=return_attention_mask,
	)
	return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

	batch_size = len(required_input)
	assert all(
	len(v) == batch_size for v in encoded_inputs.values()
	), "Some items in the output dictionary have a different batch size than others."

	if padding_strategy == PaddingStrategy.LONGEST:
	max_length = max(len(inputs) for inputs in required_input)
	padding_strategy = PaddingStrategy.MAX_LENGTH

	batch_outputs = {}
	for i in range(batch_size):
	inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
	outputs = self._pad(
	inputs,
	class_type=class_type,
	max_length=max_length,
	padding_strategy=padding_strategy,
	pad_to_multiple_of=pad_to_multiple_of,
	return_attention_mask=return_attention_mask,
	)

	for key, value in outputs.items():
	if key not in batch_outputs:
	batch_outputs[key] = []
	batch_outputs[key].append(value)
	if class_type == "cell":
	del batch_outputs["label"]
	return BatchEncoding(batch_outputs, tensor_type=return_tensors)

	def _pad(
	self,
	encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
	class_type, # options: "gene" or "cell"
	max_length: Optional[int] = None,
	padding_strategy: PaddingStrategy = PaddingStrategy.LONGEST,
	pad_to_multiple_of: Optional[int] = None,
	return_attention_mask: Optional[bool] = True,
	) -> dict:
	"""
	Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

	Args:
	encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
	max_length: maximum length of the returned list and optionally padding length (see below).
	Will truncate by taking into account the special tokens.
	padding_strategy: PaddingStrategy to use for padding.

	- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
	- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
	- PaddingStrategy.DO_NOT_PAD: Do not pad
	The tokenizer padding sides are defined in self.padding_side:

	- 'left': pads on the left of the sequences
	- 'right': pads on the right of the sequences
	pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
	This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
	>= 7.5 (Volta).
	return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics)
	"""
	# Load from model defaults
	if return_attention_mask is None:
	return_attention_mask = "attention_mask" in self.model_input_names

	required_input = encoded_inputs[self.model_input_names[0]]

	if padding_strategy == PaddingStrategy.LONGEST:
	max_length = len(required_input)

	if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
	max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

	needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

	if needs_to_be_padded:
	difference = max_length - len(required_input)
	if self.padding_side == "right":
	if return_attention_mask:
	encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
	if "token_type_ids" in encoded_inputs:
	encoded_inputs["token_type_ids"] = (
	encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
	)
	if "special_tokens_mask" in encoded_inputs:
	encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
	encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
	if class_type == "gene":
	encoded_inputs["labels"] = encoded_inputs["labels"] + [-100] * difference
	elif self.padding_side == "left":
	if return_attention_mask:
	encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
	if "token_type_ids" in encoded_inputs:
	encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
	"token_type_ids"
	]
	if "special_tokens_mask" in encoded_inputs:
	encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
	encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
	if class_type == "gene":
	encoded_inputs["labels"] = [-100] * difference + encoded_inputs["labels"]
	else:
	raise ValueError("Invalid padding strategy:" + str(self.padding_side))
	elif return_attention_mask and "attention_mask" not in encoded_inputs:
	encoded_inputs["attention_mask"] = [1] * len(required_input)

	return encoded_inputs

	def get_special_tokens_mask(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
	) -> List[int]:
	"""
	Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
	special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
	Args:
	token_ids_0 (:obj:`List[int]`):
	List of ids of the first sequence.
	token_ids_1 (:obj:`List[int]`, `optional`):
	List of ids of the second sequence.
	already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
	Whether or not the token list is already formatted with special tokens for the model.
	Returns:
	A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
	"""
	assert already_has_special_tokens and token_ids_1 is None, (
	"You cannot use ``already_has_special_tokens=False`` with this tokenizer. "
	"Please use a slow (full python) tokenizer to activate this argument."
	"Or set `return_special_tokens_mask=True` when calling the encoding method "
	"to get the special tokens mask in any tokenizer. "
	)

	all_special_ids = self.all_special_ids # cache the property

	special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]

	return special_tokens_mask

	def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
	"""
	Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
	vocabulary.
	Args:
	tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
	Returns:
	:obj:`int` or :obj:`List[int]`: The token id or list of token ids.
	"""
	if tokens is None:
	return None

	if isinstance(tokens, str):
	return self._convert_token_to_id_with_added_voc(tokens)

	ids = []
	for token in tokens:
	ids.append(self._convert_token_to_id_with_added_voc(token))
	return ids

	def _convert_token_to_id_with_added_voc(self, token):
	if token is None:
	return None

	return token_dictionary.get(token)

	def __len__(self):
	return len(token_dictionary)


	# collator functions

	class DataCollatorForGeneClassification(DataCollatorForTokenClassification):
	"""
	Data collator that will dynamically pad the inputs received, as well as the labels.
	Args:
	tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
	The tokenizer used for encoding the data.
	padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
	Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
	among:
	* :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
	sequence if provided).
	* :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
	maximum acceptable input length for the model if that argument is not provided.
	* :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
	different lengths).
	max_length (:obj:`int`, `optional`):
	Maximum length of the returned list and optionally padding length (see above).
	pad_to_multiple_of (:obj:`int`, `optional`):
	If set will pad the sequence to a multiple of the provided value.
	This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
	7.5 (Volta).
	label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
	The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
	"""

	tokenizer = PrecollatorForGeneAndCellClassification()
	class_type = "gene"
	padding: Union[bool, str, PaddingStrategy] = True
	max_length: Optional[int] = None
	pad_to_multiple_of: Optional[int] = None
	label_pad_token_id: int = -100

	def __init__(self, args, *kwargs) -> None:
	super().__init__(
	tokenizer=self.tokenizer,
	padding=self.padding,
	max_length=self.max_length,
	pad_to_multiple_of=self.pad_to_multiple_of,
	label_pad_token_id=self.label_pad_token_id,
	args, *kwargs)

	def _prepare_batch(self, features):
	label_name = "label" if "label" in features[0].keys() else "labels"
	labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
	batch = self.tokenizer.pad(
	features,
	class_type=self.class_type,
	padding=self.padding,
	max_length=self.max_length,
	pad_to_multiple_of=self.pad_to_multiple_of,
	return_tensors="pt",
	)
	return batch

	def __call__(self, features):
	batch = self._prepare_batch(features)

	batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
	return batch


	class DataCollatorForCellClassification(DataCollatorForGeneClassification):

	class_type = "cell"

	def _prepare_batch(self, features):

	batch = super()._prepare_batch(features)

	# Special handling for labels.
	# Ensure that tensor is created with the correct type
	# (it should be automatically the case, but let's make sure of it.)
	first = features[0]
	if "label" in first and first["label"] is not None:
	label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
	dtype = torch.long if isinstance(label, int) else torch.float
	batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)

	return batch