import numpy as np

from .constants import (
    QUESTION_COLUMN_NAME,
    CONTEXT_COLUMN_NAME,
    ANSWER_COLUMN_NAME,
    ANSWERABLE_COLUMN_NAME,
    ID_COLUMN_NAME,
)


def get_sketch_features(tokenizer, mode, data_args):
    """
    Get the feature-preparation function for the sketch model.

    Args:
        tokenizer (Tokenizer): Tokenizer for tokenizing input examples.
        mode (str): Mode of operation ("train", "eval", or "test").
        data_args (DataArguments): Data arguments configuring tokenization.

    Returns:
        tuple: The feature-preparation function for the given mode and a
            boolean indicating that the function expects batched examples.

    Raises:
        ValueError: If mode is not one of "train", "eval", or "test".
    """
    pad_on_right = tokenizer.padding_side == "right"
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def tokenize_fn(examples):
        """
        Tokenize input examples.

        Args:
            examples (dict): Input examples.

        Returns:
            dict: Tokenized examples.
        """
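        # Questions (and contexts) may carry stray leading/trailing whitespace;
        # strip it so truncation and offset mapping behave consistently.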
        questions = [q.strip() for q in examples[QUESTION_COLUMN_NAME if pad_on_right else CONTEXT_COLUMN_NAME]]
        contexts = [c.strip() for c in examples[CONTEXT_COLUMN_NAME if pad_on_right else QUESTION_COLUMN_NAME]]

        tokenized_examples = tokenizer(
            questions,
            contexts,
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_token_type_ids=data_args.return_token_type_ids,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        return tokenized_examples

    def prepare_train_features(examples):
        """
        Prepare training features by tokenizing the input examples and adding
        answerability labels.

        Args:
            examples (dict): Input examples.

        Returns:
            dict: Tokenized and labeled examples.
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
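
        # One example can overflow into several features; sample_mapping maps
        # each feature back to its source example. The sketch label follows the
        # convention 1 = unanswerable, 0 = answerable (assuming
        # ANSWERABLE_COLUMN_NAME stores an "is impossible" flag).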
        tokenized_examples["labels"] = []
        for i in range(len(tokenized_examples["input_ids"])):
            sample_index = sample_mapping[i]
            is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]
            tokenized_examples["labels"].append(1 if is_impossible else 0)

        return tokenized_examples

    def prepare_eval_features(examples):
        """
        Prepare evaluation features by tokenizing the input examples and
        adding example ids and labels.

        Args:
            examples (dict): Input examples.

        Returns:
            dict: Tokenized and labeled examples.
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["example_id"] = []
        tokenized_examples["labels"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            sample_index = sample_mapping[i]

            id_col = examples[ID_COLUMN_NAME][sample_index]
            tokenized_examples["example_id"].append(id_col)

            is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]
            tokenized_examples["labels"].append(1 if is_impossible else 0)

        return tokenized_examples

    def prepare_test_features(examples):
        """
        Prepare test features by tokenizing the input examples and adding
        example ids.

        Args:
            examples (dict): Input examples.

        Returns:
            dict: Tokenized examples with example ids (no labels).
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            sample_index = sample_mapping[i]

            id_col = examples[ID_COLUMN_NAME][sample_index]
            tokenized_examples["example_id"].append(id_col)

        return tokenized_examples

    if mode == "train":
        get_features_fn = prepare_train_features
    elif mode == "eval":
        get_features_fn = prepare_eval_features
    elif mode == "test":
        get_features_fn = prepare_test_features
    else:
        raise ValueError(f'mode must be one of "train", "eval", or "test", got {mode!r}')

    return get_features_fn, True
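

# Usage sketch (hypothetical names; assumes a HuggingFace datasets.Dataset and
# that the second return value is the `batched` flag for Dataset.map):
#
#     prepare_features_fn, batched = get_sketch_features(tokenizer, "train", data_args)
#     train_dataset = raw_datasets["train"].map(
#         prepare_features_fn,
#         batched=batched,
#         remove_columns=raw_datasets["train"].column_names,
#     )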


def get_intensive_features(tokenizer, mode, data_args):
    """
    Generate intensive (span-extraction) features for training, evaluation,
    or testing.

    Args:
        tokenizer (Tokenizer): The tokenizer used to tokenize the input examples.
        mode (str): The mode of operation. Must be one of "train", "eval", or "test".
        data_args (DataArguments): The data arguments containing the configuration
            for tokenization.

    Returns:
        tuple: The feature-preparation function for the given mode and a
            boolean indicating that the function expects batched examples.

    Raises:
        ValueError: If the mode is not one of "train", "eval", or "test".
    """
    pad_on_right = tokenizer.padding_side == "right"
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
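    # Beam-search models (e.g. XLNet, XLM) also consume cls_index and p_mask.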
    beam_based = data_args.intensive_model_type in ["xlnet", "xlm"]

    def tokenize_fn(examples):
        """
        Tokenize input examples.

        Args:
            examples (dict): Input examples.

        Returns:
            dict: Tokenized examples.
        """
        tokenized_examples = tokenizer(
            examples[QUESTION_COLUMN_NAME if pad_on_right else CONTEXT_COLUMN_NAME],
            examples[CONTEXT_COLUMN_NAME if pad_on_right else QUESTION_COLUMN_NAME],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            return_token_type_ids=data_args.return_token_type_ids,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        return tokenized_examples

    def prepare_train_features(examples):
        """
        Prepare training features by tokenizing the input examples and adding
        start/end positions and answerability labels.

        Args:
            examples (dict): Input examples.

        Returns:
            dict: Tokenized and labeled examples.
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mapping = tokenized_examples.pop("offset_mapping")

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []
        tokenized_examples["is_impossibles"] = []
        if beam_based:
            tokenized_examples["cls_index"] = []
            tokenized_examples["p_mask"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            if beam_based:
                tokenized_examples["cls_index"].append(cls_index)
                tokenized_examples["p_mask"].append(
                    [
                        0.0 if s == context_index or k == cls_index else 1.0
                        for k, s in enumerate(sequence_ids)
                    ]
                )

            sample_index = sample_mapping[i]
            answers = examples[ANSWER_COLUMN_NAME][sample_index]
            is_impossible = examples[ANSWERABLE_COLUMN_NAME][sample_index]

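            # Unanswerable questions target the CLS token for both positions.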
            if is_impossible or len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                tokenized_examples["is_impossibles"].append(1.0)
            else:
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

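                # Walk inward from both ends to the first/last context token.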
                token_start_index = 0
                while sequence_ids[token_start_index] != context_index:
                    token_start_index += 1

                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != context_index:
                    token_end_index -= 1

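                # If the answer's character span falls outside this feature's
                # context window, label the feature unanswerable.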
                if not (offsets[token_start_index][0] <= start_char and
                        offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                    tokenized_examples["is_impossibles"].append(1.0)
                else:
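                    # Tighten both indices onto the tokens whose character
                    # offsets bound the answer span.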
                    while (token_start_index < len(offsets) and
                           offsets[token_start_index][0] <= start_char):
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)

                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)
                    tokenized_examples["is_impossibles"].append(0.0)

        return tokenized_examples

    def prepare_eval_features(examples):
        """
        Prepare evaluation features by tokenizing the input examples and adding
        example ids (plus cls_index/p_mask for beam-based models).

        Args:
            examples (dict): Input examples.

        Returns:
            dict: Tokenized examples ready for post-processing.
        """
        tokenized_examples = tokenize_fn(examples)
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["example_id"] = []

        if beam_based:
            tokenized_examples["cls_index"] = []
            tokenized_examples["p_mask"] = []

        for i, input_ids in enumerate(tokenized_examples["input_ids"]):
            cls_index = input_ids.index(tokenizer.cls_token_id)

            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            if beam_based:
                tokenized_examples["cls_index"].append(cls_index)
                tokenized_examples["p_mask"].append(
                    [
                        0.0 if s == context_index or k == cls_index else 1.0
                        for k, s in enumerate(sequence_ids)
                    ]
                )

            sample_index = sample_mapping[i]
            id_col = examples[ID_COLUMN_NAME][sample_index]
            tokenized_examples["example_id"].append(id_col)

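            # Null out offsets that do not belong to the context so the
            # post-processing step can skip non-context tokens.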
tokenized_examples["offset_mapping"][i] = [
|
|
(o if sequence_ids[k] == context_index else None)
|
|
for k, o in enumerate(tokenized_examples["offset_mapping"][i])
|
|
]
|
|
|
|
return tokenized_examples

    if mode == "train":
        get_features_fn = prepare_train_features
    elif mode == "eval":
        get_features_fn = prepare_eval_features
    elif mode == "test":
        get_features_fn = prepare_eval_features
    else:
        raise ValueError(f'mode must be one of "train", "eval", or "test", got {mode!r}')

    return get_features_fn, True
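

# Usage sketch (hypothetical names; mirrors the sketch-reader example above,
# under the same assumption about the `batched` flag):
#
#     prepare_features_fn, batched = get_intensive_features(tokenizer, "eval", data_args)
#     eval_dataset = raw_datasets["validation"].map(
#         prepare_features_fn,
#         batched=batched,
#         remove_columns=raw_datasets["validation"].column_names,
#     )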