# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import namedtuple
from copy import deepcopy
from typing import Sequence, Optional

import datasets

import evaluate

_CITATION = """\
@misc{nereval,
    title={{NER-Evaluation}: Named Entity Evaluation as in SemEval 2013 task 9.1},
    url={https://github.com/davidsbatista/NER-Evaluation},
    note={Software available from https://github.com/davidsbatista/NER-Evaluation},
    author={Batista, David},
    year={2018},
}
"""

_DESCRIPTION = """\
ner-eval is a Python framework for sequence labeling evaluation. It implements the
evaluation scheme used in SemEval 2013 task 9.1 and supports strict, exact, partial,
and entity-type matching as well as missed and spurious errors.
"""

_KWARGS_DESCRIPTION = """
Computes precision, recall, and F1 for predicted label sequences against reference
label sequences under four matching schemes: strict, ent_type, partial, and exact.
Args:
    predictions: List of lists of predicted labels (estimated targets as returned by a tagger)
    references: List of lists of reference labels (ground truth (correct) target values)
    tags: List of tags to evaluate. default: None (tags are inferred from the data)
Returns:
    'scores' dict. Summary of the scores, overall and per tag. The "overall" entry and
    every tag entry (e.g. "ORG", "PER", "LOC") share the same keys:
    {
        "overall": {
            "strict_precision": 0.0,
            "strict_recall": 0.0,
            "strict_f1": 0,
            "ent_type_precision": 0.0,
            "ent_type_recall": 0.0,
            "ent_type_f1": 0,
            "partial_precision": 0.0,
            "partial_recall": 0.0,
            "partial_f1": 0,
            "exact_precision": 0.0,
            "exact_recall": 0.0,
            "exact_f1": 0,
        },
        "ORG": {...},
        "PER": {...},
        "LOC": {...},
    }
Examples:
    >>> ner_eval = evaluate.load("fschlatt/ner_eval")
    >>> results = ner_eval.compute(
    ...     references=[["B-LOC", "I-LOC", "I-LOC", "B-ORG", "I-ORG", "O", "B-PER", "I-PER", "I-PER", "O"]],
    ...     predictions=[["B-LOC", "I-LOC", "O", "O", "B-ORG", "I-ORG", "O", "B-PER", "I-PER", "O"]]
    ... )
    >>> print(results)
    {
        "overall": {
            "strict_precision": 0.0,
            "strict_recall": 0.0,
            "strict_f1": 0,
            "ent_type_precision": 2 / 3,
            "ent_type_recall": 2 / 3,
            "ent_type_f1": 2 / 3,
            "partial_precision": 1 / 3,
            "partial_recall": 1 / 3,
            "partial_f1": 1 / 3,
            "exact_precision": 0.0,
            "exact_recall": 0.0,
            "exact_f1": 0,
        },
        "ORG": {
            "strict_precision": 0.0,
            "strict_recall": 0.0,
            "strict_f1": 0,
            "ent_type_precision": 0.0,
            "ent_type_recall": 0.0,
            "ent_type_f1": 0,
            "partial_precision": 0.0,
            "partial_recall": 0.0,
            "partial_f1": 0,
            "exact_precision": 0.0,
            "exact_recall": 0.0,
            "exact_f1": 0,
        },
        "PER": {
            "strict_precision": 0.0,
            "strict_recall": 0.0,
            "strict_f1": 0,
            "ent_type_precision": 0.5,
            "ent_type_recall": 1.0,
            "ent_type_f1": 2 / 3,
            "partial_precision": 0.25,
            "partial_recall": 0.5,
            "partial_f1": 1 / 3,
            "exact_precision": 0.0,
            "exact_recall": 0.0,
            "exact_f1": 0,
        },
        "LOC": {
            "strict_precision": 0.0,
            "strict_recall": 0.0,
            "strict_f1": 0,
            "ent_type_precision": 0.5,
            "ent_type_recall": 1.0,
            "ent_type_f1": 2 / 3,
            "partial_precision": 0.25,
            "partial_recall": 0.5,
            "partial_f1": 1 / 3,
            "exact_precision": 0.0,
            "exact_recall": 0.0,
            "exact_f1": 0,
        },
    }
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class NEREval(evaluate.Metric):
    """NER evaluation as in SemEval 2013 task 9.1: strict, exact, partial, and entity-type matching."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage="https://github.com/davidsbatista/NER-Evaluation",
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(
                        datasets.Value("string", id="label"), id="sequence"
                    ),
                    "references": datasets.Sequence(
                        datasets.Value("string", id="label"), id="sequence"
                    ),
                }
            ),
            # Additional links to the codebase or references
            codebase_urls=["https://github.com/davidsbatista/NER-Evaluation"],
            reference_urls=[
                "https://github.com/davidsbatista/NER-Evaluation",
                "https://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/",
            ],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores."""
        # No external resources are needed for this metric.
        pass

    def _compute(
        self,
        predictions: Sequence[Sequence[str]],
        references: Sequence[Sequence[str]],
        tags: Optional[Sequence[str]] = None,
        modes: Optional[Sequence[str]] = None,
    ):
        if tags is None:
            tags = list(parse_tags(predictions).union(parse_tags(references)))

        # Evaluator expects the gold (true) sequences first and the predictions second.
        evaluator = Evaluator(references, predictions, tags)
        results, agg_results = evaluator.evaluate()

        out = {"overall": parse_results(results, modes)}
        for tag, tag_result in agg_results.items():
            out[tag] = parse_results(tag_result, modes)
        return out


def parse_results(results, modes: Optional[Sequence[str]] = None):
    if modes is None:
        modes = ["strict", "ent_type", "partial", "exact"]

    out = {}
    for mode in modes:
        out[f"{mode}_precision"] = results[mode]["precision"]
        out[f"{mode}_recall"] = results[mode]["recall"]
        out[f"{mode}_f1"] = results[mode]["f1"]
    return out


def parse_tags(tokens: Sequence[Sequence[str]]):
    tags = set()
    for seq in tokens:
        for t in seq:
            tags.add(t.split("-")[-1])
    tags.discard("O")
    return tags


Entity = namedtuple("Entity", "e_type start_offset end_offset")


class Evaluator:
    def __init__(self, true, pred, tags):
        """Evaluate predicted label sequences (`pred`) against gold sequences (`true`) for the given `tags`."""

        if len(true) != len(pred):
            raise ValueError("Number of predicted documents does not equal the number of true documents")

        self.true = true
        self.pred = pred
        self.tags = tags

        # Setup dict into which metrics will be stored.
        self.metrics_results = {
            "correct": 0,
            "incorrect": 0,
            "partial": 0,
            "missed": 0,
            "spurious": 0,
            "possible": 0,
            "actual": 0,
            "precision": 0,
            "recall": 0,
            "f1": 0,
        }

        # Copy results dict to cover the four schemes.
        self.results = {
            "strict": deepcopy(self.metrics_results),
            "ent_type": deepcopy(self.metrics_results),
            "partial": deepcopy(self.metrics_results),
            "exact": deepcopy(self.metrics_results),
        }

        # Create an accumulator to store results
        self.evaluation_agg_entities_type = {e: deepcopy(self.results) for e in tags}

    def evaluate(self):
        for true_ents, pred_ents in zip(self.true, self.pred):
            # Check that the length of the true and predicted examples are the
            # same. This must be checked here, because another error may not
            # be thrown if the lengths do not match.
            if len(true_ents) != len(pred_ents):
                raise ValueError("Prediction length does not match true example length")

            # Compute results for one message
            tmp_results, tmp_agg_results = compute_metrics(
                collect_named_entities(true_ents),
                collect_named_entities(pred_ents),
                self.tags,
            )

            # Cycle through each result and accumulate
            # TODO: Combine these loops below:
            for eval_schema in self.results:
                for metric in self.results[eval_schema]:
                    self.results[eval_schema][metric] += tmp_results[eval_schema][metric]

            # Calculate global precision and recall
            self.results = compute_precision_recall_f1_wrapper(self.results)

            # Aggregate results by entity type
            for e_type in self.tags:
                for eval_schema in tmp_agg_results[e_type]:
                    for metric in tmp_agg_results[e_type][eval_schema]:
                        self.evaluation_agg_entities_type[e_type][eval_schema][
                            metric
                        ] += tmp_agg_results[e_type][eval_schema][metric]

                # Calculate precision recall at the individual entity level
                self.evaluation_agg_entities_type[e_type] = compute_precision_recall_f1_wrapper(
                    self.evaluation_agg_entities_type[e_type]
                )

        return self.results, self.evaluation_agg_entities_type


def collect_named_entities(tokens):
    """
    Creates a list of Entity named-tuples, storing the entity type and the
    start and end offsets of the entity.

    :param tokens: a list of tags
    :return: a list of Entity named-tuples
    """
    named_entities = []
    start_offset = None
    end_offset = None
    ent_type = None

    for offset, token_tag in enumerate(tokens):
        if token_tag == "O":
            if ent_type is not None and start_offset is not None:
                end_offset = offset - 1
                named_entities.append(Entity(ent_type, start_offset, end_offset))
                start_offset = None
                end_offset = None
                ent_type = None

        elif ent_type is None:
            ent_type = token_tag[2:]
            start_offset = offset

        elif ent_type != token_tag[2:] or (
            ent_type == token_tag[2:] and token_tag[:1] == "B"
        ):
            end_offset = offset - 1
            named_entities.append(Entity(ent_type, start_offset, end_offset))

            # start of a new entity
            ent_type = token_tag[2:]
            start_offset = offset
            end_offset = None

    # catches an entity that goes up until the last token
    if ent_type is not None and start_offset is not None and end_offset is None:
        named_entities.append(Entity(ent_type, start_offset, len(tokens) - 1))

    return named_entities


def compute_metrics(true_named_entities, pred_named_entities, tags):
    eval_metrics = {
        "correct": 0,
        "incorrect": 0,
        "partial": 0,
        "missed": 0,
        "spurious": 0,
        "precision": 0,
        "recall": 0,
        "f1": 0,
    }

    # overall results
    evaluation = {
        "strict": deepcopy(eval_metrics),
        "ent_type": deepcopy(eval_metrics),
        "partial": deepcopy(eval_metrics),
        "exact": deepcopy(eval_metrics),
    }

    # results by entity type
    evaluation_agg_entities_type = {e: deepcopy(evaluation) for e in tags}

    # keep track of entities that overlapped
    true_which_overlapped_with_pred = []

    # Subset into only the tags that we are interested in.
    # NOTE: we remove the tags we don't want from both the predicted and the
    # true entities. This covers the two cases where mismatches can occur:
    #
    # 1) Where the model predicts a tag that is not present in the true data
    # 2) Where there is a tag in the true data that the model is not capable of
    #    predicting.
    true_named_entities = [ent for ent in true_named_entities if ent.e_type in tags]
    pred_named_entities = [ent for ent in pred_named_entities if ent.e_type in tags]

    # go through each predicted named-entity
    for pred in pred_named_entities:
        found_overlap = False

        # Check each of the potential scenarios in turn. See
        # http://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/
        # for scenario explanation.

        # Scenario I: Exact match between true and pred
        if pred in true_named_entities:
            true_which_overlapped_with_pred.append(pred)
            evaluation["strict"]["correct"] += 1
            evaluation["ent_type"]["correct"] += 1
            evaluation["exact"]["correct"] += 1
            evaluation["partial"]["correct"] += 1

            # for the agg. by e_type results
            evaluation_agg_entities_type[pred.e_type]["strict"]["correct"] += 1
            evaluation_agg_entities_type[pred.e_type]["ent_type"]["correct"] += 1
            evaluation_agg_entities_type[pred.e_type]["exact"]["correct"] += 1
            evaluation_agg_entities_type[pred.e_type]["partial"]["correct"] += 1

        else:
            # check for overlaps with any of the true entities
            for true in true_named_entities:
                pred_range = range(pred.start_offset, pred.end_offset)
                true_range = range(true.start_offset, true.end_offset)

                # Scenario IV: Offsets match, but entity type is wrong
                if (
                    true.start_offset == pred.start_offset
                    and pred.end_offset == true.end_offset
                    and true.e_type != pred.e_type
                ):
                    # overall results
                    evaluation["strict"]["incorrect"] += 1
                    evaluation["ent_type"]["incorrect"] += 1
                    evaluation["partial"]["correct"] += 1
                    evaluation["exact"]["correct"] += 1

                    # aggregated by entity type results
                    evaluation_agg_entities_type[true.e_type]["strict"]["incorrect"] += 1
                    evaluation_agg_entities_type[true.e_type]["ent_type"]["incorrect"] += 1
                    evaluation_agg_entities_type[true.e_type]["partial"]["correct"] += 1
                    evaluation_agg_entities_type[true.e_type]["exact"]["correct"] += 1

                    true_which_overlapped_with_pred.append(true)
                    found_overlap = True
                    break

                # check for an overlap i.e. not exact boundary match, with true entities
                elif find_overlap(true_range, pred_range):
                    true_which_overlapped_with_pred.append(true)

                    # Scenario V: There is an overlap (but offsets do not match
                    # exactly), and the entity type is the same.
                    # 2.1 overlaps with the same entity type
                    if pred.e_type == true.e_type:
                        # overall results
                        evaluation["strict"]["incorrect"] += 1
                        evaluation["ent_type"]["correct"] += 1
                        evaluation["partial"]["partial"] += 1
                        evaluation["exact"]["incorrect"] += 1

                        # aggregated by entity type results
                        evaluation_agg_entities_type[true.e_type]["strict"]["incorrect"] += 1
                        evaluation_agg_entities_type[true.e_type]["ent_type"]["correct"] += 1
                        evaluation_agg_entities_type[true.e_type]["partial"]["partial"] += 1
                        evaluation_agg_entities_type[true.e_type]["exact"]["incorrect"] += 1

                        found_overlap = True
                        break

                    # Scenario VI: Entities overlap, but the entity type is
                    # different.
                    else:
                        # overall results
                        evaluation["strict"]["incorrect"] += 1
                        evaluation["ent_type"]["incorrect"] += 1
                        evaluation["partial"]["partial"] += 1
                        evaluation["exact"]["incorrect"] += 1

                        # aggregated by entity type results
                        # Results against the true entity
                        evaluation_agg_entities_type[true.e_type]["strict"]["incorrect"] += 1
                        evaluation_agg_entities_type[true.e_type]["partial"]["partial"] += 1
                        evaluation_agg_entities_type[true.e_type]["ent_type"]["incorrect"] += 1
                        evaluation_agg_entities_type[true.e_type]["exact"]["incorrect"] += 1

                        # Results against the predicted entity
                        # evaluation_agg_entities_type[pred.e_type]['strict']['spurious'] += 1

                        found_overlap = True
                        break

            # Scenario II: Entities are spurious (i.e., over-generated).
            if not found_overlap:
                # Overall results
                evaluation["strict"]["spurious"] += 1
                evaluation["ent_type"]["spurious"] += 1
                evaluation["partial"]["spurious"] += 1
                evaluation["exact"]["spurious"] += 1

                # Aggregated by entity type results
                # NOTE: when pred.e_type is not found in tags,
                # or when it simply does not appear in the test set, then it is
                # spurious, but it is not clear where to assign it at the tag
                # level. In this case, it is applied to all target_tags
                # found in this example. This will mean that the sum of the
                # evaluation_agg_entities will not equal evaluation.
                for true in tags:
                    evaluation_agg_entities_type[true]["strict"]["spurious"] += 1
                    evaluation_agg_entities_type[true]["ent_type"]["spurious"] += 1
                    evaluation_agg_entities_type[true]["partial"]["spurious"] += 1
                    evaluation_agg_entities_type[true]["exact"]["spurious"] += 1

    # Scenario III: Entity was missed entirely.
    for true in true_named_entities:
        if true in true_which_overlapped_with_pred:
            continue
        else:
            # overall results
            evaluation["strict"]["missed"] += 1
            evaluation["ent_type"]["missed"] += 1
            evaluation["partial"]["missed"] += 1
            evaluation["exact"]["missed"] += 1

            # for the agg. by e_type
            evaluation_agg_entities_type[true.e_type]["strict"]["missed"] += 1
            evaluation_agg_entities_type[true.e_type]["ent_type"]["missed"] += 1
            evaluation_agg_entities_type[true.e_type]["partial"]["missed"] += 1
            evaluation_agg_entities_type[true.e_type]["exact"]["missed"] += 1

    # Compute 'possible', 'actual' according to SemEval-2013 Task 9.1 on the
    # overall results, and use these to calculate precision and recall.
    for eval_type in evaluation:
        evaluation[eval_type] = compute_actual_possible(evaluation[eval_type])

    # Compute 'possible', 'actual', and precision and recall on entity level
    # results. Start by cycling through the accumulated results.
    for entity_type, entity_level in evaluation_agg_entities_type.items():
        # Cycle through the evaluation types for each dict containing entity
        # level results.
        for eval_type in entity_level:
            evaluation_agg_entities_type[entity_type][eval_type] = compute_actual_possible(
                entity_level[eval_type]
            )

    return evaluation, evaluation_agg_entities_type


def find_overlap(true_range, pred_range):
    """Find the overlap between two ranges

    Find the overlap between two ranges. Return the overlapping values if
    present, else return an empty set().

    Examples:
    >>> find_overlap((1, 2), (2, 3))
    {2}
    >>> find_overlap((1, 2), (3, 4))
    set()
    """
    true_set = set(true_range)
    pred_set = set(pred_range)

    overlaps = true_set.intersection(pred_set)

    return overlaps


def compute_actual_possible(results):
    """
    Takes a result dict that has been output by compute_metrics.
    Returns the results dict with the 'actual' and 'possible' counts populated.
    """
    correct = results["correct"]
    incorrect = results["incorrect"]
    partial = results["partial"]
    missed = results["missed"]
    spurious = results["spurious"]

    # Possible: number of annotations in the gold-standard which contribute to the
    # final score
    possible = correct + incorrect + partial + missed

    # Actual: number of annotations produced by the NER system
    actual = correct + incorrect + partial + spurious

    results["actual"] = actual
    results["possible"] = possible

    return results


def compute_precision_recall_f1(results, partial_or_type=False):
    """
    Takes a result dict that has been output by compute_metrics.
    Returns the results dict with precision, recall, and f1 populated.

    When the results dict is from the partial or ent_type schemes, pass
    partial_or_type=True to ensure the right calculation is used for
    precision and recall.
""" actual = results["actual"] possible = results["possible"] partial = results["partial"] correct = results["correct"] if partial_or_type: precision = (correct + 0.5 * partial) / actual if actual > 0 else 0 recall = (correct + 0.5 * partial) / possible if possible > 0 else 0 else: precision = correct / actual if actual > 0 else 0 recall = correct / possible if possible > 0 else 0 results["precision"] = precision results["recall"] = recall results["f1"] = ( precision * recall * 2 / (precision + recall) if precision + recall > 0 else 0 ) return results def compute_precision_recall_f1_wrapper(results): """ Wraps the compute_precision_recall_f1 function and runs on a dict of results """ results_a = { key: compute_precision_recall_f1(value, True) for key, value in results.items() if key in ["partial", "ent_type"] } results_b = { key: compute_precision_recall_f1(value) for key, value in results.items() if key in ["strict", "exact"] } results = {**results_a, **results_b} return results