| | import numpy as np
|
| | from .constants import (
|
| | QUESTION_COLUMN_NAME,
|
| | CONTEXT_COLUMN_NAME,
|
| | ANSWER_COLUMN_NAME,
|
| | ANSWERABLE_COLUMN_NAME,
|
| | ID_COLUMN_NAME
|
| | )
|
| |
|
def get_sketch_features(
    tokenizer,
    mode,
    data_args
):
    """
    Build the feature-preparation function for the sketch (answerability) model.

    Args:
        tokenizer (Tokenizer): Tokenizer for tokenizing input examples.
        mode (str): Mode of operation ("train", "eval", or "test").
        data_args: Data configuration; must provide ``max_seq_length``,
            ``doc_stride``, ``return_token_type_ids`` and ``pad_to_max_length``.

    Returns:
        tuple: ``(prepare_features_fn, True)``. The boolean is always True
            here; it is kept for interface parity with other feature builders.

    Raises:
        ValueError: If ``mode`` is not one of "train", "eval", or "test".
    """
    pad_on_right = tokenizer.padding_side == "right"
    # Never request more positions than the model can actually consume.
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def tokenize_fn(examples):
        """
        Tokenize question/context pairs, overflowing long contexts into
        multiple stride-overlapped features.

        Args:
            examples (dict): Batch of input examples.

        Returns:
            dict: Tokenized examples (BatchEncoding), including
                ``overflow_to_sample_mapping`` and ``offset_mapping``.
        """
        # The question is the first segment only when the tokenizer pads on
        # the right; truncation is applied to the context segment only.
        questions = [q.strip() for q in examples[QUESTION_COLUMN_NAME if pad_on_right else CONTEXT_COLUMN_NAME]]
        contexts = [c.strip() for c in examples[CONTEXT_COLUMN_NAME if pad_on_right else QUESTION_COLUMN_NAME]]

        tokenized_examples = tokenizer(
            questions,
            contexts,
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_token_type_ids=data_args.return_token_type_ids,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        return tokenized_examples

    def prepare_train_features(examples):
        """
        Tokenize examples and attach a binary label per feature.

        Args:
            examples (dict): Batch of input examples.

        Returns:
            dict: Tokenized examples with a ``labels`` list
                (1 for impossible/unanswerable, 0 otherwise).
        """
        tokenized_examples = tokenize_fn(examples)
        # Maps each overflowed feature back to the example it came from.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["labels"] = []
        for i in range(len(tokenized_examples["input_ids"])):
            sample_index = sample_mapping[i]
            # NOTE(review): the value of this column is used directly as
            # "is impossible"; confirm the column stores impossibility rather
            # than answerability.
            is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]
            tokenized_examples["labels"].append(1 if is_impossible else 0)

        return tokenized_examples

    def prepare_eval_features(examples):
        """
        Tokenize examples and attach both example ids and binary labels.

        Args:
            examples (dict): Batch of input examples.

        Returns:
            dict: Tokenized examples with ``example_id`` and ``labels`` lists.
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["example_id"] = []
        tokenized_examples["labels"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            sample_index = sample_mapping[i]

            # Keep the originating example id so predictions over overflowed
            # features can be aggregated per example.
            id_col = examples[ID_COLUMN_NAME][sample_index]
            tokenized_examples["example_id"].append(id_col)

            is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]
            tokenized_examples["labels"].append(1 if is_impossible else 0)

        return tokenized_examples

    def prepare_test_features(examples):
        """
        Tokenize examples and attach example ids (no labels at test time).

        Args:
            examples (dict): Batch of input examples.

        Returns:
            dict: Tokenized examples with an ``example_id`` list.
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            sample_index = sample_mapping[i]
            id_col = examples[ID_COLUMN_NAME][sample_index]
            tokenized_examples["example_id"].append(id_col)

        return tokenized_examples

    if mode == "train":
        get_features_fn = prepare_train_features
    elif mode == "eval":
        get_features_fn = prepare_eval_features
    elif mode == "test":
        get_features_fn = prepare_test_features
    else:
        # Previously an unknown mode crashed with UnboundLocalError at the
        # return statement; fail fast with a clear message instead.
        raise ValueError(
            f"Invalid mode {mode!r}; expected 'train', 'eval', or 'test'."
        )

    return get_features_fn, True
|
| |
|
| | def get_intensive_features(
|
| | tokenizer,
|
| | mode,
|
| | data_args
|
| | ):
|
| | """
|
| | Generate intensive features for training, evaluation, or testing.
|
| |
|
| | Args:
|
| | tokenizer (Tokenizer): The tokenizer used to tokenize the input examples.
|
| | mode (str): The mode of operation. Must be one of "train", "eval", or "test".
|
| | data_args (DataArguments): The data arguments containing the configuration for tokenization.
|
| |
|
| | Returns:
|
| | tuple: A tuple containing the function to prepare the features and a boolean indicating if the tokenizer is beam-based.
|
| |
|
| | Raises:
|
| | ValueError: If the mode is not one of "train", "eval", or "test".
|
| |
|
| | """
|
| | pad_on_right = tokenizer.padding_side == "right"
|
| | max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
|
| | beam_based = data_args.intensive_model_type in ["xlnet", "xlm"]
|
| |
|
| | def tokenize_fn(examples):
|
| | """
|
| | Tokenize input examples.
|
| |
|
| | Args:
|
| | examples (dict): Input examples.
|
| |
|
| | Returns:
|
| | dict: Tokenized examples.
|
| | """
|
| |
|
| |
|
| |
|
| |
|
| | tokenized_examples = tokenizer(
|
| | examples[QUESTION_COLUMN_NAME if pad_on_right else CONTEXT_COLUMN_NAME],
|
| | examples[CONTEXT_COLUMN_NAME if pad_on_right else QUESTION_COLUMN_NAME],
|
| | truncation="only_second" if pad_on_right else "only_first",
|
| | max_length=max_seq_length,
|
| | stride=data_args.doc_stride,
|
| | return_overflowing_tokens=True,
|
| | return_offsets_mapping=True,
|
| | return_token_type_ids=data_args.return_token_type_ids,
|
| | padding="max_length" if data_args.pad_to_max_length else False,
|
| | )
|
| |
|
| | return tokenized_examples
|
| |
|
| | def prepare_train_features(examples):
|
| | """
|
| | Prepare training features by tokenizing the input examples and adding labels.
|
| |
|
| | Args:
|
| | examples (dict): Input examples.
|
| |
|
| | Returns:
|
| | dict: Tokenized and labeled examples.
|
| | """
|
| |
|
| | tokenized_examples = tokenize_fn(examples)
|
| | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
|
| | offset_mapping = tokenized_examples.pop("offset_mapping")
|
| |
|
| |
|
| | tokenized_examples["start_positions"] = []
|
| | tokenized_examples["end_positions"] = []
|
| | tokenized_examples["is_impossibles"] = []
|
| |
|
| | if beam_based:
|
| |
|
| | tokenized_examples["cls_index"] = []
|
| | tokenized_examples["p_mask"] = []
|
| |
|
| | for i, offsets in enumerate(offset_mapping):
|
| |
|
| |
|
| | input_ids = tokenized_examples["input_ids"][i]
|
| | cls_index = input_ids.index(tokenizer.cls_token_id)
|
| |
|
| |
|
| | sequence_ids = tokenized_examples.sequence_ids(i)
|
| | context_index = 1 if pad_on_right else 0
|
| |
|
| |
|
| |
|
| |
|
| | if beam_based:
|
| | tokenized_examples["cls_index"].append(cls_index)
|
| | tokenized_examples["p_mask"].append(
|
| | [
|
| | 0.0 if s == context_index or k == cls_index else 1.0
|
| | for s, k in enumerate(sequence_ids)
|
| | ]
|
| | )
|
| |
|
| |
|
| | sample_index = sample_mapping[i]
|
| | answers = examples[ANSWER_COLUMN_NAME][sample_index]
|
| | is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]
|
| |
|
| |
|
| | if is_impossible or len(answers["answer_start"]) == 0:
|
| | tokenized_examples["start_positions"].append(cls_index)
|
| | tokenized_examples["end_positions"].append(cls_index)
|
| | tokenized_examples["is_impossibles"].append(1.0)
|
| | else:
|
| |
|
| | start_char = answers["answer_start"][0]
|
| | end_char = start_char + len(answers["text"][0])
|
| |
|
| |
|
| |
|
| |
|
| | token_start_index = 0
|
| | while sequence_ids[token_start_index] != context_index:
|
| | token_start_index += 1
|
| |
|
| |
|
| | token_end_index = len(input_ids) - 1
|
| | while sequence_ids[token_end_index] != context_index:
|
| | token_end_index -= 1
|
| |
|
| |
|
| | if not (offsets[token_start_index][0] <= start_char and
|
| | offsets[token_end_index][1] >= end_char
|
| | ):
|
| | tokenized_examples["start_positions"].append(cls_index)
|
| | tokenized_examples["end_positions"].append(cls_index)
|
| | tokenized_examples["is_impossibles"].append(1.0)
|
| | else:
|
| |
|
| |
|
| | while (token_start_index < len(offsets) and
|
| | offsets[token_start_index][0] <= start_char):
|
| | token_start_index += 1
|
| | tokenized_examples["start_positions"].append(token_start_index - 1)
|
| |
|
| | while offsets[token_end_index][1] >= end_char:
|
| | token_end_index -= 1
|
| | tokenized_examples["end_positions"].append(token_end_index + 1)
|
| | tokenized_examples["is_impossibles"].append(0.0)
|
| |
|
| | return tokenized_examples
|
| |
|
| |
|
| | def prepare_eval_features(examples):
|
| | """
|
| | Prepare evaluation features by tokenizing the input examples and adding labels.
|
| |
|
| | Args:
|
| | examples (dict): Input examples.
|
| |
|
| | Returns:
|
| | dict: Tokenized and labeled examples.
|
| | """
|
| |
|
| | tokenized_examples = tokenize_fn(examples)
|
| | sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
|
| |
|
| |
|
| | tokenized_examples["example_id"] = []
|
| |
|
| | if beam_based:
|
| |
|
| | tokenized_examples["cls_index"] = []
|
| | tokenized_examples["p_mask"] = []
|
| |
|
| | for i, input_ids in enumerate(tokenized_examples["input_ids"]):
|
| |
|
| | cls_index = input_ids.index(tokenizer.cls_token_id)
|
| |
|
| | sequence_ids = tokenized_examples.sequence_ids(i)
|
| | context_index = 1 if pad_on_right else 0
|
| |
|
| | if beam_based:
|
| |
|
| |
|
| |
|
| | tokenized_examples["cls_index"].append(cls_index)
|
| | tokenized_examples["p_mask"].append(
|
| | [
|
| | 0.0 if s == context_index or k == cls_index else 1.0
|
| | for s, k in enumerate(sequence_ids)
|
| | ]
|
| | )
|
| |
|
| | sample_index = sample_mapping[i]
|
| | id_col = examples[ID_COLUMN_NAME][sample_index]
|
| | tokenized_examples["example_id"].append(id_col)
|
| |
|
| |
|
| |
|
| | tokenized_examples["offset_mapping"][i] = [
|
| | (o if sequence_ids[k] == context_index else None)
|
| | for k, o in enumerate(tokenized_examples["offset_mapping"][i])
|
| | ]
|
| |
|
| | return tokenized_examples
|
| |
|
| | if mode == "train":
|
| | get_features_fn = prepare_train_features
|
| | elif mode == "eval":
|
| | get_features_fn = prepare_eval_features
|
| | elif mode == "test":
|
| | get_features_fn = prepare_eval_features
|
| |
|
| | return get_features_fn, True
|
| |
|
| |
|