# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Precision/recall/F1 metric for generated action tags."""

import evaluate
import datasets


# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

# Description of the module
_DESCRIPTION = """\
Evaluates generated action tags against reference tags. Tags are '/'-separated
strings whose leading components identify the action class and whose last
component is the free-text phrase. Predictions are scored twice with set-based
precision, recall and F1 -- once on the class part and once on the phrase part --
and the two scores are combined as a weighted sum.
"""

# Description of the arguments of the module
_KWARGS_DESCRIPTION = """
Computes precision, recall and F1 for generated action tags.

Args:
    predictions: list of predictions to score. Each prediction is a list of
        '/'-separated action tags.
    references: list of references, one per prediction. Each reference is a
        list of '/'-separated action tags.
    valid_labels: optional list of allowed class prefixes; tags whose class is
        not in this list are ignored. If None, all tags are kept.
    detailed_scores: if True, also return the separate class-level and
        phrase-level scores.
    weights: dict with "class" and "phrase" keys used to combine the two score
        levels (defaults to {"class": 0.8, "phrase": 0.2}).
Returns:
    precision: weighted precision over class- and phrase-level matches.
    recall: weighted recall over class- and phrase-level matches.
    f1: weighted F1 over class- and phrase-level matches.
Examples:
    >>> metric = evaluate.load("DarrenChensformer/action_generation")
    >>> results = metric.compute(
    ...     predictions=[["request/weather/what is the weather"]],
    ...     references=[["request/weather/what is the weather"]],
    ... )
    >>> print(results)
    {'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
"""

# TODO: Define external resources urls if needed
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


class BaseEvaluater:
    """Shared set-based precision/recall/F1 computation."""

    eps = 1e-8

    def __call__(self, preds, labels):
        return self._compute(preds, labels)

    def _compute(self, preds, labels):
        # Accumulate true positives, false positives and false negatives over
        # the set of tags extracted for each example.
        tp, fp, fn = 0, 0, 0
        for pred, label in zip(preds, labels):
            tp += len(set(pred) & set(label))
            fp += len(set(pred) - set(label))
            fn += len(set(label) - set(pred))
        # eps avoids division by zero when there are no matches at all
        precision = tp / (tp + fp + self.eps)
        recall = tp / (tp + fn + self.eps)
        f1 = 2 * precision * recall / (precision + recall + self.eps)
        return {
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1": round(f1, 4),
        }


class ClassEvaluater(BaseEvaluater):
    """Scores predictions on the class part of each tag."""

    def __init__(self, valid_labels=None):
        self.valid_labels = valid_labels

    def __call__(self, preds, labels):
        preds = map(self.extract_class, preds)
        labels = map(self.extract_class, labels)
        # keep only classes listed in valid_labels (if provided)
        preds = list(map(self.extract_valid, preds))
        labels = list(map(self.extract_valid, labels))
        return self._compute(preds, labels)

    def extract_valid(self, tags):
        # when valid_labels is None, keep all tags
        if self.valid_labels is not None:
            tags = list(filter(lambda tag: tag in self.valid_labels, tags))
        return tags

    def extract_class(self, tags):
        tags = map(lambda tag: tag.replace("/ ", "/"), tags)
        tags = list(map(self.batch_extract_class, tags))
        # deduplicate while preserving order
        tags = list(dict.fromkeys(tags))
        return tags

    def batch_extract_class(self, tag):
        # keep the class prefix of the tag; tags that do not match the
        # expected format are mapped to ""
        parts = tag.split('/')
        if len(parts) == 3:
            _class = '/'.join(parts[:2])
        elif len(parts) == 4:
            _class = '/'.join(parts[:3])
        elif len(parts) == 1:
            _class = ''
        else:
            _class = None
        if _class and (self.valid_labels is None or _class in self.valid_labels):
            return _class
        return ""


class PhraseEvaluater(BaseEvaluater):
    """Scores predictions on the phrase part of each tag."""

    def __init__(self, valid_labels=None):
        self.valid_labels = valid_labels

    def __call__(self, preds, labels):
        preds = map(self.extract_phrase, preds)
        labels = map(self.extract_phrase, labels)
        return self._compute(preds, labels)

    def extract_phrase(self, tags):
        tags = map(lambda tag: tag.replace("/ ", "/"), tags)
        tags = list(map(self.batch_extract_phrase, tags))
        # deduplicate while preserving order
        tags = list(dict.fromkeys(tags))
        return tags

    def batch_extract_phrase(self, phrase):
        # strip the class prefix and keep only the phrase part; tags that do
        # not match the expected format are mapped to ""
        parts = phrase.split('/')
        if len(parts) == 3:
            _class = '/'.join(parts[:2])
        elif len(parts) == 4:
            _class = '/'.join(parts[:3])
        elif len(parts) == 1:
            _class = ''
        else:
            _class = None
        if _class and (self.valid_labels is None or _class in self.valid_labels):
            return phrase.replace(_class, '')
        return ""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class action_generation(evaluate.Metric):
    """Weighted class- and phrase-level precision/recall/F1 for action generation."""

    def _info(self):
        return evaluate.MetricInfo(
            # This is the description that will appear on the modules page.
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=datasets.Features({
                'predictions': datasets.Sequence(datasets.Value('string')),
                'references': datasets.Sequence(datasets.Value('string')),
            }),
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _download_and_prepare(self, dl_manager):
        """Optional: download external resources useful to compute the scores"""
        # TODO: Download external resources if needed
        pass

    def _compute(self, predictions, references, valid_labels=None,
                 detailed_scores=False, weights=None):
        """Returns the scores"""
        # default weighting between class-level and phrase-level scores
        if weights is None:
            weights = {"class": 0.8, "phrase": 0.2}
        class_eval = ClassEvaluater(valid_labels)(predictions, references)
        phrase_eval = PhraseEvaluater(valid_labels)(predictions, references)
        weight_sum = {
            key: round(
                (class_eval[key] * weights["class"])
                + (phrase_eval[key] * weights["phrase"]),
                4,
            )
            for key in class_eval
        }
        if detailed_scores:
            results = {
                "class": class_eval,
                "phrase": phrase_eval,
                "weighted_sum": weight_sum,
            }
        else:
            results = weight_sum
        return results
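

# Minimal local sanity-check sketch (illustrative only): the tag strings and
# valid_labels below are assumed examples of the '/'-separated tag format
# implied by the splitting logic above, not real project data.
if __name__ == "__main__":
    metric = action_generation()
    results = metric.compute(
        predictions=[["request/weather/what is the weather"]],
        references=[["request/weather/what is the weather",
                     "request/time/what time is it"]],
        valid_labels=["request/weather", "request/time"],
        detailed_scores=True,
    )
    print(results)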