import numpy as np
from scipy.special import softmax
import collections
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator
def preprocess_examples(examples, tokenizer, max_length=384, stride=128):
"""
Preprocesses and tokenizes examples in preparation for inference
Parameters:
-----------
examples : datasets.Dataset
The dataset of examples. Must have columns:
'id', 'question', 'context'
tokenizer : transformers.AutoTokenizer
The tokenizer for the model
    max_length : int
        The maximum feature length, in tokens; longer contexts are
        split into overlapping features
    stride : int
        The number of overlapping tokens between consecutive features
Returns:
--------
    inputs : dict
        The tokenized and processed data dictionary with
        keys 'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
        All values are lists of length = # of features output by the tokenizer
        inputs['input_ids'][k] : list
            token ids corresponding to tokens in feature k
        inputs['attention_mask'][k] : list
            attention mask for feature k
        inputs['offset_mapping'][k] : list
            character offset pairs for feature k; entries for tokens
            outside the context are set to None
        inputs['example_id'][k] : int
            id of the example from which feature k originated
"""
    questions = [q.strip() for q in examples["question"]]
    # Tokenize question/context pairs; long contexts are split into multiple
    # overlapping features via return_overflowing_tokens and stride
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # overflow_to_sample_mapping[i] gives the example index feature i came from
    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []
    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        # Mask offsets of tokens outside the context: sequence_ids is 1 only
        # for context tokens, so question and special tokens become None
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
inputs["example_id"] = example_ids
return inputs
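
# Usage sketch (hedged): the checkpoint name and toy example below are
# illustrative assumptions, not part of this module. Applying this function
# through datasets.Dataset.map yields the feature dataset that
# make_predictions expects:
#
#     from datasets import Dataset
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
#     examples = Dataset.from_dict({
#         "id": ["0"],
#         "question": ["Who wrote Hamlet?"],
#         "context": ["Hamlet is a tragedy written by William Shakespeare."],
#     })
#     inputs = examples.map(
#         lambda batch: preprocess_examples(batch, tokenizer),
#         batched=True,
#         remove_columns=examples.column_names,
#     )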
def make_predictions(model, tokenizer, inputs, examples,
                     n_best=20, max_answer_length=30):
"""
Generates a list of prediction data based on logits
Parameters:
-----------
model : transformers.AutoModelForQuestionAnswering
The trained model
tokenizer : transformers.AutoTokenizer
The model's tokenizer
    inputs : datasets.Dataset
        The tokenized and processed dataset, e.g. as produced by mapping
        preprocess_examples over examples. Must have columns
        'input_ids', 'attention_mask', 'offset_mapping', 'example_id'
        inputs['input_ids'][k] : list
            token ids corresponding to tokens in feature k
        inputs['attention_mask'][k] : list
            attention mask for feature k
        inputs['offset_mapping'][k] : list
            character offset pairs for feature k; entries for tokens
            outside the context are None
        inputs['example_id'][k] : int
            id of the example from which feature k originated
examples : datasets.Dataset
The dataset of examples. Must have columns:
'id', 'question', 'context'
n_best : int
The number of top start/end (by logit) indices to consider
    max_answer_length : int
        The maximum length (in tokens) allowed for a candidate answer
Returns:
--------
predicted_answers : list(dict)
predicted_answers[k] has keys 'id','prediction_text','confidence'
predicted_answers[k]['id'] : int
The unique id of the example
predicted_answers[k]['prediction_text'] : str
The predicted answer as a string
        predicted_answers[k]['confidence'] : float
            The probability assigned to the answer by a softmax over the
            logit scores of all candidate answers for the example
"""
assert n_best <= len(inputs['input_ids'][0]), 'n_best cannot be larger than max_length'
    # Select the best available device: Apple MPS, then CUDA, then CPU
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    # Strip bookkeeping columns the model does not accept as inputs
    data_for_model = inputs.remove_columns(["example_id", "offset_mapping"])
    data_for_model.set_format("torch", device=device)
    # The whole dataset is served as a single batch, so the loop runs once
    dl = DataLoader(
        data_for_model,
        collate_fn=default_data_collator,
        batch_size=len(inputs),
    )
    model = model.to(device)
    model.eval()
    with torch.no_grad():
        for batch in dl:
            outputs = model(**batch)
            start_logits = outputs.start_logits.cpu().numpy()
            end_logits = outputs.end_logits.cpu().numpy()
    # Map each example id to the indices of the features derived from it
    example_to_inputs = collections.defaultdict(list)
    for idx, feature in enumerate(inputs):
        example_to_inputs[feature["example_id"]].append(idx)
predicted_answers = []
for example in examples:
example_id = example["id"]
context = example["context"]
answers = []
        # Gather candidate answers from every feature built from this example
        for feature_index in example_to_inputs[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = inputs[feature_index]['offset_mapping']
            # Indices of the n_best largest start/end logits, best first
            start_indices = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indices = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
for start_index in start_indices:
for end_index in end_indices:
# Skip answers with a length that is either < 0 or > max_answer_length.
if(
end_index < start_index
or end_index - start_index + 1 > max_answer_length
):
continue
                    # Exactly one endpoint outside the context: invalid span
                    if (offsets[start_index] is None) != (offsets[end_index] is None):
                        continue
                    # Both endpoints outside the context: a "no answer" candidate
                    if (offsets[start_index] is None) and (offsets[end_index] is None):
answers.append(
{
"text": '',
"logit_score": start_logit[start_index] + end_logit[end_index],
}
)
else:
answers.append(
{
"text": context[offsets[start_index][0] : offsets[end_index][1]],
"logit_score": start_logit[start_index] + end_logit[end_index],
}
)
        # Convert the candidate logit scores to probabilities and keep the
        # candidate with the highest score as the prediction for this example
        if len(answers) > 0:
            answer_logits = [a['logit_score'] for a in answers]
            answer_probs = softmax(answer_logits)
            best_idx = int(np.argmax(answer_logits))
            predicted_answers.append(
                {
                    'id': example_id,
                    'prediction_text': answers[best_idx]['text'],
                    'confidence': float(answer_probs[best_idx]),
                }
            )
        else:
            # No valid candidate span was found for this example
            predicted_answers.append(
                {'id': example_id, 'prediction_text': '', 'confidence': 0.0}
            )
    # Replace empty predictions with a human-readable fallback message
    for pred in predicted_answers:
if pred['prediction_text'] == '':
pred['prediction_text'] = "I don't have an answer based on the context provided."
return predicted_answers
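
if __name__ == "__main__":
    # Minimal end-to-end smoke test: a sketch assuming a generic SQuAD-tuned
    # checkpoint stands in for the model this module is normally paired with.
    # The checkpoint name and toy example are illustrative assumptions.
    from datasets import Dataset
    from transformers import AutoModelForQuestionAnswering, AutoTokenizer

    checkpoint = "distilbert-base-uncased-distilled-squad"  # assumed checkpoint
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

    examples = Dataset.from_dict({
        "id": ["0"],
        "question": ["Who wrote Hamlet?"],
        "context": ["Hamlet is a tragedy written by William Shakespeare."],
    })
    inputs = examples.map(
        lambda batch: preprocess_examples(batch, tokenizer),
        batched=True,
        remove_columns=examples.column_names,
    )
    for pred in make_predictions(model, tokenizer, inputs, examples):
        print(pred)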