fschlatt committed on
Commit
d93bc17
1 Parent(s): ebf03b2

initial commit

Files changed (4)
  1. .gitignore +1 -0
  2. ner_eval.py +668 -40
  3. tests.py +0 -17
  4. tests/test_ner_eval.py +319 -0
.gitignore ADDED
@@ -0,0 +1 @@
1
+ __pycache__
ner_eval.py CHANGED
@@ -11,24 +11,29 @@
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
- """TODO: Add a description here."""
15
 
16
- import evaluate
17
- import datasets
 
18
 
 
 
19
 
20
  # TODO: Add BibTeX citation
21
  _CITATION = """\
22
- @InProceedings{huggingface:module,
23
- title = {A great new module},
24
- authors={huggingface, Inc.},
25
- year={2020}
 
 
26
  }
27
  """
28
 
29
  # TODO: Add description of the module here
30
  _DESCRIPTION = """\
31
- This new module is designed to solve this great ML task and is crafted with a lot of care.
 
32
  """
33
 
34
 
@@ -36,49 +41,166 @@ This new module is designed to solve this great ML task and is crafted with a lo
36
  _KWARGS_DESCRIPTION = """
37
  Calculates how well predictions match the given references, using several scoring schemes
38
  Args:
39
- predictions: list of predictions to score. Each predictions
40
- should be a string with tokens separated by spaces.
41
- references: list of reference for each prediction. Each
42
- reference should be a string with tokens separated by spaces.
43
  Returns:
44
- accuracy: description of the first score,
45
- another_score: description of the second score,
46
  Examples:
47
- Examples should be written in doctest format, and should illustrate how
48
- to use the function.
49
-
50
- >>> my_new_module = evaluate.load("my_new_module")
51
- >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
52
  >>> print(results)
53
- {'accuracy': 1.0}
54
  """
55
 
56
- # TODO: Define external resources urls if needed
57
- BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
58
-
59
 
60
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61
- class ner_eval(evaluate.Metric):
62
  """TODO: Short description of my evaluation module."""
63
 
64
  def _info(self):
65
- # TODO: Specifies the evaluate.EvaluationModuleInfo object
66
  return evaluate.MetricInfo(
67
  # This is the description that will appear on the modules page.
68
  module_type="metric",
69
  description=_DESCRIPTION,
70
  citation=_CITATION,
 
71
  inputs_description=_KWARGS_DESCRIPTION,
72
  # This defines the format of each prediction and reference
73
- features=datasets.Features({
74
- 'predictions': datasets.Value('int64'),
75
- 'references': datasets.Value('int64'),
76
- }),
77
- # Homepage of the module for documentation
78
- homepage="http://module.homepage",
79
  # Additional links to the codebase or references
80
- codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
81
- reference_urls=["http://path.to.reference.url/new_module"]
82
  )
83
 
84
  def _download_and_prepare(self, dl_manager):
@@ -86,10 +208,516 @@ class ner_eval(evaluate.Metric):
86
  # TODO: Download external resources if needed
87
  pass
88
 
89
- def _compute(self, predictions, references):
90
- """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
93
- return {
94
- "accuracy": accuracy,
95
- }
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
 
14
 
15
+ from collections import namedtuple
16
+ from copy import deepcopy
17
+ from typing import Sequence, Optional
18
 
19
+ import datasets
20
+ import evaluate
21
 
22
  # TODO: Add BibTeX citation
23
  _CITATION = """\
24
+ @misc{nereval,
25
+ title={{NER-Evaluation}: Named Entity Evaluation as in SemEval 2013 task 9.1},
26
+ url={https://github.com/davidsbatista/NER-Evaluation},
27
+ note={Software available from https://github.com/davidsbatista/NER-Evaluation},
28
+ author={Batista, David},
29
+ year={2018},
30
  }
31
  """
32
 
33
  # TODO: Add description of the module here
34
  _DESCRIPTION = """\
35
+ ner-eval is a Python framework for sequence labeling evaluation. It was used in SemEval 2013 task 9.1.
36
+ It reports exact, partial, and entity-type matches as well as missed and spurious entities.
37
  """
38
 
39
 
 
41
  _KWARGS_DESCRIPTION = """
42
  Calculates how well predictions match the given references, using several scoring schemes
43
  Args:
44
+ predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
45
+ references: List of List of reference labels (Ground truth (correct) target values)
46
+ tags: List of tags to evaluate. default: None
 
47
  Returns:
48
+ 'scores' dict. Summary of the scores for overall and each tag.
49
+ {
50
+ "overall": {
51
+ "strict_precision": 0.0,
52
+ "strict_recall": 0.0,
53
+ "strict_f1": 0,
54
+ "ent_type_precision": 0.0,
55
+ "ent_type_recall": 0.0,
56
+ "ent_type_f1": 0,
57
+ "partial_precision": 0.0,
58
+ "partial_recall": 0.0,
59
+ "partial_f1": 0,
60
+ "exact_precision": 0.0,
61
+ "exact_recall": 0.0,
62
+ "exact_f1": 0,
63
+ },
64
+ "ORG": {
65
+ "strict_precision": 0.0,
66
+ "strict_recall": 0.0,
67
+ "strict_f1": 0,
68
+ "ent_type_precision": 0.0,
69
+ "ent_type_recall": 0.0,
70
+ "ent_type_f1": 0,
71
+ "partial_precision": 0.0,
72
+ "partial_recall": 0.0,
73
+ "partial_f1": 0,
74
+ "exact_precision": 0.0,
75
+ "exact_recall": 0.0,
76
+ "exact_f1": 0,
77
+ },
78
+ "PER": {
79
+ "strict_precision": 0.0,
80
+ "strict_recall": 0.0,
81
+ "strict_f1": 0,
82
+ "ent_type_precision": 0.0,
83
+ "ent_type_recall": 0.0,
84
+ "ent_type_f1": 0,
85
+ "partial_precision": 0.0,
86
+ "partial_recall": 0.0,
87
+ "partial_f1": 0,
88
+ "exact_precision": 0.0,
89
+ "exact_recall": 0.0,
90
+ "exact_f1": 0,
91
+ },
92
+ "LOC": {
93
+ "strict_precision": 0.0,
94
+ "strict_recall": 0.0,
95
+ "strict_f1": 0,
96
+ "ent_type_precision": 0.0,
97
+ "ent_type_recall": 0.0,
98
+ "ent_type_f1": 0,
99
+ "partial_precision": 0.0,
100
+ "partial_recall": 0.0,
101
+ "partial_f1": 0,
102
+ "exact_precision": 0.0,
103
+ "exact_recall": 0.0,
104
+ "exact_f1": 0,
105
+ },
106
+ }
107
  Examples:
108
+ >>> my_new_module = evaluate.load("fschlatt/ner_eval")
109
+ >>> results = my_new_module.compute(
110
+ ... references=[["B-LOC", "I-LOC", "I-LOC", "B-ORG", "I-ORG", "O", "B-PER", "I-PER", "I-PER", "O"]],
111
+ ... predictions=[["B-LOC", "I-LOC", "O", "O", "B-ORG", "I-ORG", "O", "B-PER", "I-PER", "O"]]
112
+ ... )
113
  >>> print(results)
114
+ {
115
+ "overall": {
116
+ "strict_precision": 0.0,
117
+ "strict_recall": 0.0,
118
+ "strict_f1": 0,
119
+ "ent_type_precision": 2 / 3,
120
+ "ent_type_recall": 2 / 3,
121
+ "ent_type_f1": 2 / 3,
122
+ "partial_precision": 1 / 3,
123
+ "partial_recall": 1 / 3,
124
+ "partial_f1": 1 / 3,
125
+ "exact_precision": 0.0,
126
+ "exact_recall": 0.0,
127
+ "exact_f1": 0,
128
+ },
129
+ "ORG": {
130
+ "strict_precision": 0.0,
131
+ "strict_recall": 0.0,
132
+ "strict_f1": 0,
133
+ "ent_type_precision": 0.0,
134
+ "ent_type_recall": 0.0,
135
+ "ent_type_f1": 0,
136
+ "partial_precision": 0.0,
137
+ "partial_recall": 0.0,
138
+ "partial_f1": 0,
139
+ "exact_precision": 0.0,
140
+ "exact_recall": 0.0,
141
+ "exact_f1": 0,
142
+ },
143
+ "PER": {
144
+ "strict_precision": 0.0,
145
+ "strict_recall": 0.0,
146
+ "strict_f1": 0,
147
+ "ent_type_precision": 0.5,
148
+ "ent_type_recall": 1.0,
149
+ "ent_type_f1": 2 / 3,
150
+ "partial_precision": 0.25,
151
+ "partial_recall": 0.5,
152
+ "partial_f1": 1 / 3,
153
+ "exact_precision": 0.0,
154
+ "exact_recall": 0.0,
155
+ "exact_f1": 0,
156
+ },
157
+ "LOC": {
158
+ "strict_precision": 0.0,
159
+ "strict_recall": 0.0,
160
+ "strict_f1": 0,
161
+ "ent_type_precision": 0.5,
162
+ "ent_type_recall": 1.0,
163
+ "ent_type_f1": 2 / 3,
164
+ "partial_precision": 0.25,
165
+ "partial_recall": 0.5,
166
+ "partial_f1": 1 / 3,
167
+ "exact_precision": 0.0,
168
+ "exact_recall": 0.0,
169
+ "exact_f1": 0,
170
+ }
171
+ }
172
  """
173
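To complement the doctest above, a minimal usage sketch of the compute API described under Args (assuming the module file is available locally as ner_eval.py; the tags and modes keyword arguments are optional and are forwarded to _compute below):

import evaluate

ner_eval = evaluate.load("ner_eval.py")  # or "fschlatt/ner_eval" from the Hub
results = ner_eval.compute(
    predictions=[["B-PER", "I-PER", "O", "B-LOC", "I-LOC"]],
    references=[["B-PER", "I-PER", "O", "B-LOC", "I-LOC"]],
    tags=["PER", "LOC"],          # optional; inferred from the data when omitted
    modes=["strict", "partial"],  # optional; all four schemes when omitted
)
print(results["overall"]["strict_f1"])   # 1.0 for a perfect match
print(results["PER"]["partial_recall"])  # 1.0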
 
 
 
 
174
 
175
  @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
176
+ class NEREval(evaluate.Metric):
177
  """TODO: Short description of my evaluation module."""
178
 
179
  def _info(self):
 
180
  return evaluate.MetricInfo(
181
  # This is the description that will appear on the modules page.
182
  module_type="metric",
183
  description=_DESCRIPTION,
184
  citation=_CITATION,
185
+ homepage="https://github.com/davidsbatista/NER-Evaluation",
186
  inputs_description=_KWARGS_DESCRIPTION,
187
  # This defines the format of each prediction and reference
188
+ features=datasets.Features(
189
+ {
190
+ "predictions": datasets.Sequence(
191
+ datasets.Value("string", id="label"), id="sequence"
192
+ ),
193
+ "references": datasets.Sequence(
194
+ datasets.Value("string", id="label"), id="sequence"
195
+ ),
196
+ }
197
+ ),
198
  # Additional links to the codebase or references
199
+ codebase_urls=["https://github.com/davidsbatista/NER-Evaluation"],
200
+ reference_urls=[
201
+ "https://github.com/davidsbatista/NER-Evaluation",
202
+ "https://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/",
203
+ ],
204
  )
205
 
206
  def _download_and_prepare(self, dl_manager):
 
208
  # TODO: Download external resources if needed
209
  pass
210
 
211
+ def _compute(
212
+ self,
213
+ predictions: Sequence[Sequence[str]],
214
+ references: Sequence[Sequence[str]],
215
+ tags: Optional[Sequence[str]] = None,
216
+ modes: Optional[Sequence[str]] = None,
217
+ ):
218
+ if tags is None:
219
+ tags = list(parse_tags(predictions).union(parse_tags(references)))
220
+
221
+ evaluator = Evaluator(predictions, references, tags)
222
+ results, agg_results = evaluator.evaluate()
223
+
224
+ out = {"overall": parse_results(results, modes)}
225
+ for tag, tag_result in agg_results.items():
226
+ out = {**out, tag: parse_results(tag_result, modes)}
227
+
228
+ return out
229
+
230
+
231
+ def parse_results(results, modes: Optional[Sequence[str]] = None):
232
+ if modes is None:
233
+ modes = ["strict", "ent_type", "partial", "exact"]
234
+
235
+ out = {}
236
+ for mode in modes:
237
+ out[f"{mode}_precision"] = results[mode]["precision"]
238
+ out[f"{mode}_recall"] = results[mode]["recall"]
239
+ out[f"{mode}_f1"] = results[mode]["f1"]
240
+ return out
241
+
242
+
243
+ def parse_tags(tokens: Sequence[Sequence[str]]):
244
+ tags = set()
245
+ for seq in tokens:
246
+ for t in seq:
247
+ tags.add(t.split("-")[-1])
248
+ tags.discard("O")
249
+ return tags
250
+
251
+
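A quick sketch of the tag inference behind the tags=None default (assuming parse_tags is importable from ner_eval):

from ner_eval import parse_tags

# Entity types are the suffix after the final "-", with "O" discarded.
print(parse_tags([["B-PER", "I-PER", "O"], ["B-LOC", "O"]]))  # {'LOC', 'PER'} (a set; order may vary)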
252
+ Entity = namedtuple("Entity", "e_type start_offset end_offset")
253
+
254
+
255
+ class Evaluator:
256
+ def __init__(self, true, pred, tags):
257
+ """ """
258
+
259
+ if len(true) != len(pred):
260
+ raise ValueError("Number of predicted documents does not equal true")
261
+
262
+ self.true = true
263
+ self.pred = pred
264
+ self.tags = tags
265
+
266
+ # Setup dict into which metrics will be stored.
267
+
268
+ self.metrics_results = {
269
+ "correct": 0,
270
+ "incorrect": 0,
271
+ "partial": 0,
272
+ "missed": 0,
273
+ "spurious": 0,
274
+ "possible": 0,
275
+ "actual": 0,
276
+ "precision": 0,
277
+ "recall": 0,
278
+ }
279
+
280
+ # Copy results dict to cover the four schemes.
281
+
282
+ self.results = {
283
+ "strict": deepcopy(self.metrics_results),
284
+ "ent_type": deepcopy(self.metrics_results),
285
+ "partial": deepcopy(self.metrics_results),
286
+ "exact": deepcopy(self.metrics_results),
287
+ }
288
+
289
+ # Create an accumulator to store results
290
+
291
+ self.evaluation_agg_entities_type = {e: deepcopy(self.results) for e in tags}
292
+
293
+ def evaluate(self):
294
+ for true_ents, pred_ents in zip(self.true, self.pred):
295
+ # Check that the length of the true and predicted examples are the
296
+ # same. This must be checked here, because another error may not
297
+ # be thrown if the lengths do not match.
298
+
299
+ if len(true_ents) != len(pred_ents):
300
+ raise ValueError("Prediction length does not match true example length")
301
+
302
+ # Compute results for one message
303
+
304
+ tmp_results, tmp_agg_results = compute_metrics(
305
+ collect_named_entities(true_ents),
306
+ collect_named_entities(pred_ents),
307
+ self.tags,
308
+ )
309
+
310
+ # Cycle through each result and accumulate
311
+
312
+ # TODO: Combine these loops below:
313
+
314
+ for eval_schema in self.results:
315
+ for metric in self.results[eval_schema]:
316
+ self.results[eval_schema][metric] += tmp_results[eval_schema][
317
+ metric
318
+ ]
319
+
320
+ # Calculate global precision and recall
321
+
322
+ self.results = compute_precision_recall_f1_wrapper(self.results)
323
+
324
+ # Aggregate results by entity type
325
+
326
+ for e_type in self.tags:
327
+ for eval_schema in tmp_agg_results[e_type]:
328
+ for metric in tmp_agg_results[e_type][eval_schema]:
329
+ self.evaluation_agg_entities_type[e_type][eval_schema][
330
+ metric
331
+ ] += tmp_agg_results[e_type][eval_schema][metric]
332
+
333
+ # Calculate precision recall at the individual entity level
334
+
335
+ self.evaluation_agg_entities_type[
336
+ e_type
337
+ ] = compute_precision_recall_f1_wrapper(
338
+ self.evaluation_agg_entities_type[e_type]
339
+ )
340
+
341
+ return self.results, self.evaluation_agg_entities_type
342
+
343
+
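The Evaluator can also be used directly to inspect the raw error counts (correct / incorrect / partial / missed / spurious) that the metric wrapper condenses into precision, recall and F1. A sketch, assuming the class is importable from ner_eval; as in _compute above, the first positional argument receives the predictions and the second the references:

from ner_eval import Evaluator

pred = [["B-LOC", "I-LOC", "I-LOC", "B-ORG", "I-ORG", "O", "B-PER", "I-PER", "I-PER", "O"]]
ref = [["B-LOC", "I-LOC", "O", "O", "B-ORG", "I-ORG", "O", "B-PER", "I-PER", "O"]]
evaluator = Evaluator(pred, ref, tags=["LOC", "ORG", "PER"])
overall, by_tag = evaluator.evaluate()
print(overall["ent_type"]["correct"])     # 2: two entities overlap with the right type
print(overall["strict"]["spurious"])      # 1: one entity has no counterpart at all
print(by_tag["ORG"]["strict"]["missed"])  # 1: the ORG entity is missed entirely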
344
+ def collect_named_entities(tokens):
345
+ """
346
+ Creates a list of Entity named-tuples, storing the entity type and the start and end
347
+ offsets of the entity.
348
+
349
+ :param tokens: a list of tags
350
+ :return: a list of Entity named-tuples
351
+ """
352
+
353
+ named_entities = []
354
+ start_offset = None
355
+ end_offset = None
356
+ ent_type = None
357
+
358
+ for offset, token_tag in enumerate(tokens):
359
+ if token_tag == "O":
360
+ if ent_type is not None and start_offset is not None:
361
+ end_offset = offset - 1
362
+ named_entities.append(Entity(ent_type, start_offset, end_offset))
363
+ start_offset = None
364
+ end_offset = None
365
+ ent_type = None
366
+
367
+ elif ent_type is None:
368
+ ent_type = token_tag[2:]
369
+ start_offset = offset
370
+
371
+ elif ent_type != token_tag[2:] or (
372
+ ent_type == token_tag[2:] and token_tag[:1] == "B"
373
+ ):
374
+ end_offset = offset - 1
375
+ named_entities.append(Entity(ent_type, start_offset, end_offset))
376
+
377
+ # start of a new entity
378
+ ent_type = token_tag[2:]
379
+ start_offset = offset
380
+ end_offset = None
381
+
382
+ # catches an entity that goes up until the last token
383
+
384
+ if ent_type is not None and start_offset is not None and end_offset is None:
385
+ named_entities.append(Entity(ent_type, start_offset, len(tokens) - 1))
386
+
387
+ return named_entities
388
+
389
+
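A sketch of the BIO-to-span conversion described in the docstring above (offsets are inclusive token indices; assumes collect_named_entities is importable from ner_eval):

from ner_eval import collect_named_entities

print(collect_named_entities(["O", "B-PER", "I-PER", "O", "B-LOC"]))
# [Entity(e_type='PER', start_offset=1, end_offset=2), Entity(e_type='LOC', start_offset=4, end_offset=4)]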
390
+ def compute_metrics(true_named_entities, pred_named_entities, tags):
391
+ eval_metrics = {
392
+ "correct": 0,
393
+ "incorrect": 0,
394
+ "partial": 0,
395
+ "missed": 0,
396
+ "spurious": 0,
397
+ "precision": 0,
398
+ "recall": 0,
399
+ }
400
+
401
+ # overall results
402
+
403
+ evaluation = {
404
+ "strict": deepcopy(eval_metrics),
405
+ "ent_type": deepcopy(eval_metrics),
406
+ "partial": deepcopy(eval_metrics),
407
+ "exact": deepcopy(eval_metrics),
408
+ }
409
+
410
+ # results by entity type
411
+
412
+ evaluation_agg_entities_type = {e: deepcopy(evaluation) for e in tags}
413
+
414
+ # keep track of entities that overlapped
415
+
416
+ true_which_overlapped_with_pred = []
417
+
418
+ # Subset into only the tags that we are interested in.
419
+ # NOTE: we remove the tags we don't want from both the predicted and the
420
+ # true entities. This covers the two cases where mismatches can occur:
421
+ #
422
+ # 1) Where the model predicts a tag that is not present in the true data
423
+ # 2) Where there is a tag in the true data that the model is not capable of
424
+ # predicting.
425
+
426
+ true_named_entities = [ent for ent in true_named_entities if ent.e_type in tags]
427
+ pred_named_entities = [ent for ent in pred_named_entities if ent.e_type in tags]
428
+
429
+ # go through each predicted named-entity
430
+
431
+ for pred in pred_named_entities:
432
+ found_overlap = False
433
+
434
+ # Check each of the potential scenarios in turn. See
435
+ # http://www.davidsbatista.net/blog/2018/05/09/Named_Entity_Evaluation/
436
+ # for scenario explanation.
437
+
438
+ # Scenario I: Exact match between true and pred
439
+
440
+ if pred in true_named_entities:
441
+ true_which_overlapped_with_pred.append(pred)
442
+ evaluation["strict"]["correct"] += 1
443
+ evaluation["ent_type"]["correct"] += 1
444
+ evaluation["exact"]["correct"] += 1
445
+ evaluation["partial"]["correct"] += 1
446
+
447
+ # for the agg. by e_type results
448
+ evaluation_agg_entities_type[pred.e_type]["strict"]["correct"] += 1
449
+ evaluation_agg_entities_type[pred.e_type]["ent_type"]["correct"] += 1
450
+ evaluation_agg_entities_type[pred.e_type]["exact"]["correct"] += 1
451
+ evaluation_agg_entities_type[pred.e_type]["partial"]["correct"] += 1
452
+
453
+ else:
454
+ # check for overlaps with any of the true entities
455
+
456
+ for true in true_named_entities:
457
+ pred_range = range(pred.start_offset, pred.end_offset)
458
+ true_range = range(true.start_offset, true.end_offset)
459
+
460
+ # Scenario IV: Offsets match, but entity type is wrong
461
+
462
+ if (
463
+ true.start_offset == pred.start_offset
464
+ and pred.end_offset == true.end_offset
465
+ and true.e_type != pred.e_type
466
+ ):
467
+ # overall results
468
+ evaluation["strict"]["incorrect"] += 1
469
+ evaluation["ent_type"]["incorrect"] += 1
470
+ evaluation["partial"]["correct"] += 1
471
+ evaluation["exact"]["correct"] += 1
472
+
473
+ # aggregated by entity type results
474
+ evaluation_agg_entities_type[true.e_type]["strict"][
475
+ "incorrect"
476
+ ] += 1
477
+ evaluation_agg_entities_type[true.e_type]["ent_type"][
478
+ "incorrect"
479
+ ] += 1
480
+ evaluation_agg_entities_type[true.e_type]["partial"]["correct"] += 1
481
+ evaluation_agg_entities_type[true.e_type]["exact"]["correct"] += 1
482
+
483
+ true_which_overlapped_with_pred.append(true)
484
+ found_overlap = True
485
+
486
+ break
487
+
488
+ # check for an overlap i.e. not exact boundary match, with true entities
489
+
490
+ elif find_overlap(true_range, pred_range):
491
+ true_which_overlapped_with_pred.append(true)
492
+
493
+ # Scenario V: There is an overlap (but offsets do not match
494
+ # exactly), and the entity type is the same.
495
+ # 2.1 overlaps with the same entity type
496
+
497
+ if pred.e_type == true.e_type:
498
+ # overall results
499
+ evaluation["strict"]["incorrect"] += 1
500
+ evaluation["ent_type"]["correct"] += 1
501
+ evaluation["partial"]["partial"] += 1
502
+ evaluation["exact"]["incorrect"] += 1
503
+
504
+ # aggregated by entity type results
505
+ evaluation_agg_entities_type[true.e_type]["strict"][
506
+ "incorrect"
507
+ ] += 1
508
+ evaluation_agg_entities_type[true.e_type]["ent_type"][
509
+ "correct"
510
+ ] += 1
511
+ evaluation_agg_entities_type[true.e_type]["partial"][
512
+ "partial"
513
+ ] += 1
514
+ evaluation_agg_entities_type[true.e_type]["exact"][
515
+ "incorrect"
516
+ ] += 1
517
+
518
+ found_overlap = True
519
+
520
+ break
521
+
522
+ # Scenario VI: Entities overlap, but the entity type is
523
+ # different.
524
+
525
+ else:
526
+ # overall results
527
+ evaluation["strict"]["incorrect"] += 1
528
+ evaluation["ent_type"]["incorrect"] += 1
529
+ evaluation["partial"]["partial"] += 1
530
+ evaluation["exact"]["incorrect"] += 1
531
+
532
+ # aggregated by entity type results
533
+ # Results against the true entity
534
+
535
+ evaluation_agg_entities_type[true.e_type]["strict"][
536
+ "incorrect"
537
+ ] += 1
538
+ evaluation_agg_entities_type[true.e_type]["partial"][
539
+ "partial"
540
+ ] += 1
541
+ evaluation_agg_entities_type[true.e_type]["ent_type"][
542
+ "incorrect"
543
+ ] += 1
544
+ evaluation_agg_entities_type[true.e_type]["exact"][
545
+ "incorrect"
546
+ ] += 1
547
+
548
+ # Results against the predicted entity
549
+
550
+ # evaluation_agg_entities_type[pred.e_type]['strict']['spurious'] += 1
551
+
552
+ found_overlap = True
553
+
554
+ break
555
+
556
+ # Scenario II: Entities are spurious (i.e., over-generated).
557
+
558
+ if not found_overlap:
559
+ # Overall results
560
+
561
+ evaluation["strict"]["spurious"] += 1
562
+ evaluation["ent_type"]["spurious"] += 1
563
+ evaluation["partial"]["spurious"] += 1
564
+ evaluation["exact"]["spurious"] += 1
565
+
566
+ # Aggregated by entity type results
567
+
568
+ # NOTE: when pred.e_type is not found in tags
569
+ # or when it simply does not appear in the test set, then it is
570
+ # spurious, but it is not clear where to assign it at the tag
571
+ # level. In this case, it is applied to all target_tags
572
+ # found in this example. This will mean that the sum of the
573
+ # evaluation_agg_entities will not equal evaluation.
574
+
575
+ for true in tags:
576
+ evaluation_agg_entities_type[true]["strict"]["spurious"] += 1
577
+ evaluation_agg_entities_type[true]["ent_type"]["spurious"] += 1
578
+ evaluation_agg_entities_type[true]["partial"]["spurious"] += 1
579
+ evaluation_agg_entities_type[true]["exact"]["spurious"] += 1
580
+
581
+ # Scenario III: Entity was missed entirely.
582
+
583
+ for true in true_named_entities:
584
+ if true in true_which_overlapped_with_pred:
585
+ continue
586
+ else:
587
+ # overall results
588
+ evaluation["strict"]["missed"] += 1
589
+ evaluation["ent_type"]["missed"] += 1
590
+ evaluation["partial"]["missed"] += 1
591
+ evaluation["exact"]["missed"] += 1
592
+
593
+ # for the agg. by e_type
594
+ evaluation_agg_entities_type[true.e_type]["strict"]["missed"] += 1
595
+ evaluation_agg_entities_type[true.e_type]["ent_type"]["missed"] += 1
596
+ evaluation_agg_entities_type[true.e_type]["partial"]["missed"] += 1
597
+ evaluation_agg_entities_type[true.e_type]["exact"]["missed"] += 1
598
+
599
+ # Compute 'possible', 'actual' according to SemEval-2013 Task 9.1 on the
600
+ # overall results, and use these to calculate precision and recall.
601
+
602
+ for eval_type in evaluation:
603
+ evaluation[eval_type] = compute_actual_possible(evaluation[eval_type])
604
+
605
+ # Compute 'possible', 'actual', and precision and recall on entity level
606
+ # results. Start by cycling through the accumulated results.
607
+
608
+ for entity_type, entity_level in evaluation_agg_entities_type.items():
609
+ # Cycle through the evaluation types for each dict containing entity
610
+ # level results.
611
+
612
+ for eval_type in entity_level:
613
+ evaluation_agg_entities_type[entity_type][
614
+ eval_type
615
+ ] = compute_actual_possible(entity_level[eval_type])
616
+
617
+ return evaluation, evaluation_agg_entities_type
618
+
619
+
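To make the scenario bookkeeping above concrete, a small sketch comparing one gold and one predicted sentence: an exact PER match plus a LOC/ORG confusion with matching boundaries (Scenario IV). Assumes the functions are importable from ner_eval; as in Evaluator.evaluate, the first argument plays the true role and the second the pred role.

from ner_eval import collect_named_entities, compute_metrics

true_ents = collect_named_entities(["B-PER", "I-PER", "O", "B-LOC"])
pred_ents = collect_named_entities(["B-PER", "I-PER", "O", "B-ORG"])
overall, by_tag = compute_metrics(true_ents, pred_ents, tags=["PER", "LOC", "ORG"])
print(overall["strict"]["correct"], overall["strict"]["incorrect"])  # 1 1
print(overall["partial"]["correct"])           # 2: boundaries match even where the type is wrong
print(by_tag["LOC"]["ent_type"]["incorrect"])  # 1: right span, wrong entity type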
620
+ def find_overlap(true_range, pred_range):
621
+ """Find the overlap between two ranges
622
+
623
+ Find the overlap between two ranges. Return the overlapping values if
624
+ present, else return an empty set().
625
+
626
+ Examples:
627
+
628
+ >>> find_overlap((1, 2), (2, 3))
629
+ {2}
630
+ >>> find_overlap((1, 2), (3, 4))
631
+ set()
632
+ """
633
+
634
+ true_set = set(true_range)
635
+ pred_set = set(pred_range)
636
+
637
+ overlaps = true_set.intersection(pred_set)
638
+
639
+ return overlaps
640
+
641
+
642
+ def compute_actual_possible(results):
643
+ """
644
+ Takes a result dict that has been output by compute metrics.
645
+ Returns the results dict with actual, possible populated.
646
+
647
+ 'possible' counts gold-standard annotations (correct + incorrect + partial + missed);
648
+ 'actual' counts system annotations (correct + incorrect + partial + spurious).
649
+ Precision and recall are later derived from these in compute_precision_recall_f1.
650
+ """
651
+
652
+ correct = results["correct"]
653
+ incorrect = results["incorrect"]
654
+ partial = results["partial"]
655
+ missed = results["missed"]
656
+ spurious = results["spurious"]
657
+
658
+ # Possible: number annotations in the gold-standard which contribute to the
659
+ # final score
660
+
661
+ possible = correct + incorrect + partial + missed
662
+
663
+ # Actual: number of annotations produced by the NER system
664
+
665
+ actual = correct + incorrect + partial + spurious
666
+
667
+ results["actual"] = actual
668
+ results["possible"] = possible
669
+
670
+ return results
671
+
672
+
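A worked example of the bookkeeping defined above (assuming compute_actual_possible is importable from ner_eval):

from ner_eval import compute_actual_possible

counts = {"correct": 3, "incorrect": 1, "partial": 2, "missed": 1, "spurious": 2}
counts = compute_actual_possible(counts)
print(counts["possible"])  # 7 = 3 correct + 1 incorrect + 2 partial + 1 missed
print(counts["actual"])    # 8 = 3 correct + 1 incorrect + 2 partial + 2 spurious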
673
+ def compute_precision_recall_f1(results, partial_or_type=False):
674
+ """
675
+ Takes a result dict that has been output by compute metrics.
676
+ Returns the results dict with precision, recall and f1 populated.
677
+
678
+ When the results dict is from the partial or ent_type schemes, pass
679
+ partial_or_type=True to ensure the right calculation is used for
680
+ calculating precision and recall.
681
+ """
682
+
683
+ actual = results["actual"]
684
+ possible = results["possible"]
685
+ partial = results["partial"]
686
+ correct = results["correct"]
687
+
688
+ if partial_or_type:
689
+ precision = (correct + 0.5 * partial) / actual if actual > 0 else 0
690
+ recall = (correct + 0.5 * partial) / possible if possible > 0 else 0
691
+
692
+ else:
693
+ precision = correct / actual if actual > 0 else 0
694
+ recall = correct / possible if possible > 0 else 0
695
+
696
+ results["precision"] = precision
697
+ results["recall"] = recall
698
+ results["f1"] = (
699
+ precision * recall * 2 / (precision + recall) if precision + recall > 0 else 0
700
+ )
701
+
702
+ return results
703
+
704
+
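Continuing the worked counts from above, a sketch of the two precision/recall variants (assuming compute_precision_recall_f1 is importable from ner_eval); with partial_or_type=True each partial match contributes half a point to both precision and recall:

from ner_eval import compute_precision_recall_f1

counts = {"correct": 3, "incorrect": 1, "partial": 2, "missed": 1,
          "spurious": 2, "actual": 8, "possible": 7}
strict_like = compute_precision_recall_f1(dict(counts))
print(strict_like["precision"], strict_like["recall"])  # 0.375 0.42857... (3/8 and 3/7)
lenient = compute_precision_recall_f1(dict(counts), partial_or_type=True)
print(lenient["precision"], lenient["recall"])  # 0.5 0.57142... ((3 + 0.5*2)/8 and (3 + 0.5*2)/7)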
705
+ def compute_precision_recall_f1_wrapper(results):
706
+ """
707
+ Wraps the compute_precision_recall_f1 function and runs on a dict of results
708
+ """
709
+
710
+ results_a = {
711
+ key: compute_precision_recall_f1(value, True)
712
+ for key, value in results.items()
713
+ if key in ["partial", "ent_type"]
714
+ }
715
+ results_b = {
716
+ key: compute_precision_recall_f1(value)
717
+ for key, value in results.items()
718
+ if key in ["strict", "exact"]
719
+ }
720
+
721
+ results = {**results_a, **results_b}
722
+
723
+ return results
tests.py DELETED
@@ -1,17 +0,0 @@
1
- test_cases = [
2
- {
3
- "predictions": [0, 0],
4
- "references": [1, 1],
5
- "result": {"metric_score": 0}
6
- },
7
- {
8
- "predictions": [1, 1],
9
- "references": [1, 1],
10
- "result": {"metric_score": 1}
11
- },
12
- {
13
- "predictions": [1, 0],
14
- "references": [1, 1],
15
- "result": {"metric_score": 0.5}
16
- }
17
- ]
tests/test_ner_eval.py ADDED
@@ -0,0 +1,319 @@
1
+ import evaluate
2
+ import pytest
3
+
4
+ ner_eval = evaluate.load("ner_eval.py")
5
+
6
+ test_cases = [
7
+ {
8
+ "predictions": ["B-PER", "I-PER", "O", "B-LOC", "I-LOC", "O", "O", "B-ORG"],
9
+ "references": ["B-PER", "I-PER", "O", "B-LOC", "I-LOC", "O", "O", "B-ORG"],
10
+ "results": {
11
+ "overall": {
12
+ "strict_precision": 1.0,
13
+ "strict_recall": 1.0,
14
+ "strict_f1": 1.0,
15
+ "ent_type_precision": 1.0,
16
+ "ent_type_recall": 1.0,
17
+ "ent_type_f1": 1.0,
18
+ "partial_precision": 1.0,
19
+ "partial_recall": 1.0,
20
+ "partial_f1": 1.0,
21
+ "exact_precision": 1.0,
22
+ "exact_recall": 1.0,
23
+ "exact_f1": 1.0,
24
+ },
25
+ "LOC": {
26
+ "strict_precision": 1.0,
27
+ "strict_recall": 1.0,
28
+ "strict_f1": 1.0,
29
+ "ent_type_precision": 1.0,
30
+ "ent_type_recall": 1.0,
31
+ "ent_type_f1": 1.0,
32
+ "partial_precision": 1.0,
33
+ "partial_recall": 1.0,
34
+ "partial_f1": 1.0,
35
+ "exact_precision": 1.0,
36
+ "exact_recall": 1.0,
37
+ "exact_f1": 1.0,
38
+ },
39
+ "PER": {
40
+ "strict_precision": 1.0,
41
+ "strict_recall": 1.0,
42
+ "strict_f1": 1.0,
43
+ "ent_type_precision": 1.0,
44
+ "ent_type_recall": 1.0,
45
+ "ent_type_f1": 1.0,
46
+ "partial_precision": 1.0,
47
+ "partial_recall": 1.0,
48
+ "partial_f1": 1.0,
49
+ "exact_precision": 1.0,
50
+ "exact_recall": 1.0,
51
+ "exact_f1": 1.0,
52
+ },
53
+ "ORG": {
54
+ "strict_precision": 1.0,
55
+ "strict_recall": 1.0,
56
+ "strict_f1": 1.0,
57
+ "ent_type_precision": 1.0,
58
+ "ent_type_recall": 1.0,
59
+ "ent_type_f1": 1.0,
60
+ "partial_precision": 1.0,
61
+ "partial_recall": 1.0,
62
+ "partial_f1": 1.0,
63
+ "exact_precision": 1.0,
64
+ "exact_recall": 1.0,
65
+ "exact_f1": 1.0,
66
+ },
67
+ },
68
+ },
69
+ {
70
+ "predictions": [
71
+ "B-LOC",
72
+ "I-LOC",
73
+ "O",
74
+ "B-PER",
75
+ "I-PER",
76
+ "I-PER",
77
+ "I-PER",
78
+ "O",
79
+ "B-LOC",
80
+ "O",
81
+ ],
82
+ "references": [
83
+ "B-LOC",
84
+ "I-LOC",
85
+ "O",
86
+ "B-PER",
87
+ "I-PER",
88
+ "I-PER",
89
+ "I-PER",
90
+ "O",
91
+ "B-LOC",
92
+ "O",
93
+ ],
94
+ "results": {
95
+ "overall": {
96
+ "strict_precision": 1.0,
97
+ "strict_recall": 1.0,
98
+ "strict_f1": 1.0,
99
+ "ent_type_precision": 1.0,
100
+ "ent_type_recall": 1.0,
101
+ "ent_type_f1": 1.0,
102
+ "partial_precision": 1.0,
103
+ "partial_recall": 1.0,
104
+ "partial_f1": 1.0,
105
+ "exact_precision": 1.0,
106
+ "exact_recall": 1.0,
107
+ "exact_f1": 1.0,
108
+ },
109
+ "LOC": {
110
+ "strict_precision": 1.0,
111
+ "strict_recall": 1.0,
112
+ "strict_f1": 1.0,
113
+ "ent_type_precision": 1.0,
114
+ "ent_type_recall": 1.0,
115
+ "ent_type_f1": 1.0,
116
+ "partial_precision": 1.0,
117
+ "partial_recall": 1.0,
118
+ "partial_f1": 1.0,
119
+ "exact_precision": 1.0,
120
+ "exact_recall": 1.0,
121
+ "exact_f1": 1.0,
122
+ },
123
+ "PER": {
124
+ "strict_precision": 1.0,
125
+ "strict_recall": 1.0,
126
+ "strict_f1": 1.0,
127
+ "ent_type_precision": 1.0,
128
+ "ent_type_recall": 1.0,
129
+ "ent_type_f1": 1.0,
130
+ "partial_precision": 1.0,
131
+ "partial_recall": 1.0,
132
+ "partial_f1": 1.0,
133
+ "exact_precision": 1.0,
134
+ "exact_recall": 1.0,
135
+ "exact_f1": 1.0,
136
+ },
137
+ },
138
+ },
139
+ {
140
+ "predictions": ["O", "B-LOC", "I-LOC", "B-PER", "I-PER", "O", "B-ORG"],
141
+ "references": ["O", "B-LOC", "I-LOC", "O", "B-PER", "I-PER", "O", "B-ORG"],
142
+ },
143
+ {
144
+ "predictions": ["B-PER", "O", "B-LOC", "I-LOC", "O", "B-ORG", "I-ORG"],
145
+ "references": ["B-PER", "I-PER", "O", "B-LOC", "I-LOC", "O", "B-ORG"],
146
+ "results": {
147
+ "overall": {
148
+ "strict_precision": 0.0,
149
+ "strict_recall": 0.0,
150
+ "strict_f1": 0,
151
+ "ent_type_precision": 0.0,
152
+ "ent_type_recall": 0.0,
153
+ "ent_type_f1": 0,
154
+ "partial_precision": 0.0,
155
+ "partial_recall": 0.0,
156
+ "partial_f1": 0,
157
+ "exact_precision": 0.0,
158
+ "exact_recall": 0.0,
159
+ "exact_f1": 0,
160
+ },
161
+ "ORG": {
162
+ "strict_precision": 0.0,
163
+ "strict_recall": 0.0,
164
+ "strict_f1": 0,
165
+ "ent_type_precision": 0.0,
166
+ "ent_type_recall": 0.0,
167
+ "ent_type_f1": 0,
168
+ "partial_precision": 0.0,
169
+ "partial_recall": 0.0,
170
+ "partial_f1": 0,
171
+ "exact_precision": 0.0,
172
+ "exact_recall": 0.0,
173
+ "exact_f1": 0,
174
+ },
175
+ "PER": {
176
+ "strict_precision": 0.0,
177
+ "strict_recall": 0.0,
178
+ "strict_f1": 0,
179
+ "ent_type_precision": 0.0,
180
+ "ent_type_recall": 0.0,
181
+ "ent_type_f1": 0,
182
+ "partial_precision": 0.0,
183
+ "partial_recall": 0.0,
184
+ "partial_f1": 0,
185
+ "exact_precision": 0.0,
186
+ "exact_recall": 0.0,
187
+ "exact_f1": 0,
188
+ },
189
+ "LOC": {
190
+ "strict_precision": 0.0,
191
+ "strict_recall": 0.0,
192
+ "strict_f1": 0,
193
+ "ent_type_precision": 0.0,
194
+ "ent_type_recall": 0.0,
195
+ "ent_type_f1": 0,
196
+ "partial_precision": 0.0,
197
+ "partial_recall": 0.0,
198
+ "partial_f1": 0,
199
+ "exact_precision": 0.0,
200
+ "exact_recall": 0.0,
201
+ "exact_f1": 0,
202
+ },
203
+ },
204
+ },
205
+ {
206
+ "predictions": [
207
+ "B-LOC",
208
+ "I-LOC",
209
+ "I-LOC",
210
+ "B-ORG",
211
+ "I-ORG",
212
+ "O",
213
+ "B-PER",
214
+ "I-PER",
215
+ "I-PER",
216
+ "O",
217
+ ],
218
+ "references": [
219
+ "B-LOC",
220
+ "I-LOC",
221
+ "O",
222
+ "O",
223
+ "B-ORG",
224
+ "I-ORG",
225
+ "O",
226
+ "B-PER",
227
+ "I-PER",
228
+ "O",
229
+ ],
230
+ "results": {
231
+ "overall": {
232
+ "strict_precision": 0.0,
233
+ "strict_recall": 0.0,
234
+ "strict_f1": 0,
235
+ "ent_type_precision": 2 / 3,
236
+ "ent_type_recall": 2 / 3,
237
+ "ent_type_f1": 2 / 3,
238
+ "partial_precision": 1 / 3,
239
+ "partial_recall": 1 / 3,
240
+ "partial_f1": 1 / 3,
241
+ "exact_precision": 0.0,
242
+ "exact_recall": 0.0,
243
+ "exact_f1": 0,
244
+ },
245
+ "ORG": {
246
+ "strict_precision": 0.0,
247
+ "strict_recall": 0.0,
248
+ "strict_f1": 0,
249
+ "ent_type_precision": 0.0,
250
+ "ent_type_recall": 0.0,
251
+ "ent_type_f1": 0,
252
+ "partial_precision": 0.0,
253
+ "partial_recall": 0.0,
254
+ "partial_f1": 0,
255
+ "exact_precision": 0.0,
256
+ "exact_recall": 0.0,
257
+ "exact_f1": 0,
258
+ },
259
+ "PER": {
260
+ "strict_precision": 0.0,
261
+ "strict_recall": 0.0,
262
+ "strict_f1": 0,
263
+ "ent_type_precision": 0.5,
264
+ "ent_type_recall": 1.0,
265
+ "ent_type_f1": 2 / 3,
266
+ "partial_precision": 0.25,
267
+ "partial_recall": 0.5,
268
+ "partial_f1": 1 / 3,
269
+ "exact_precision": 0.0,
270
+ "exact_recall": 0.0,
271
+ "exact_f1": 0,
272
+ },
273
+ "LOC": {
274
+ "strict_precision": 0.0,
275
+ "strict_recall": 0.0,
276
+ "strict_f1": 0,
277
+ "ent_type_precision": 0.5,
278
+ "ent_type_recall": 1.0,
279
+ "ent_type_f1": 2 / 3,
280
+ "partial_precision": 0.25,
281
+ "partial_recall": 0.5,
282
+ "partial_f1": 1 / 3,
283
+ "exact_precision": 0.0,
284
+ "exact_recall": 0.0,
285
+ "exact_f1": 0,
286
+ },
287
+ },
288
+ },
289
+ ]
290
+
291
+
292
+ def compare_results(result1, result2):
293
+ # recursively check if dictionaries are equal
294
+ if isinstance(result1, dict):
295
+ for key in result1.keys():
296
+ if not compare_results(result1[key], result2[key]):
297
+ return False
298
+ return True
299
+ elif isinstance(result1, list):
300
+ for item1, item2 in zip(result1, result2):
301
+ if not compare_results(item1, item2):
302
+ return False
303
+ return True
304
+ else:
305
+ return result1 == result2
306
+
307
+
308
+ @pytest.mark.parametrize("case", test_cases)
309
+ def test_metric(case):
310
+ if "results" not in case:
311
+ with pytest.raises(ValueError):
312
+ results = ner_eval.compute(
313
+ predictions=[case["predictions"]], references=[case["references"]]
314
+ )
315
+ else:
316
+ results = ner_eval.compute(
317
+ predictions=[case["predictions"]], references=[case["references"]]
318
+ )
319
+ assert compare_results(results, case["results"])
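A case without a "results" key signals that the metric is expected to raise a ValueError (its prediction and reference sequences have different lengths). A sketch for running the suite programmatically, assuming pytest and evaluate are installed and the working directory is the repository root so that evaluate.load("ner_eval.py") resolves:

import pytest

# Equivalent to running "pytest tests/test_ner_eval.py -q" from the repository root.
raise SystemExit(pytest.main(["tests/test_ner_eval.py", "-q"]))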