# Copyright 2022 The HuggingFace Evaluate Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" XTREME-S benchmark metric. """

from typing import List

import datasets
from datasets.config import PY_VERSION
from packaging import version
from sklearn.metrics import f1_score

import evaluate


# `importlib.metadata` is only part of the standard library from Python 3.8 on;
# older interpreters need the `importlib_metadata` backport.
if PY_VERSION < version.parse("3.8"):
    import importlib_metadata
else:
    import importlib.metadata as importlib_metadata


# TODO(Patrick/Anton)
_CITATION = """\
"""

_DESCRIPTION = """\
    XTREME-S is a benchmark to evaluate universal cross-lingual speech representations in many languages.
    XTREME-S covers four task families: speech recognition, classification, speech-to-text translation and retrieval.
"""

_KWARGS_DESCRIPTION = """
Compute XTREME-S evaluation metric associated to each XTREME-S dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
    bleu_kwargs: optional dict of keywords to be passed when computing 'bleu'.
        Keywords include Dict can be one of 'smooth_method', 'smooth_value', 'force', 'lowercase',
        'tokenize', 'use_effective_order'.
    wer_kwargs: optional dict of keywords to be passed when computing 'wer' and 'cer'.
        Keywords include 'concatenate_texts'.
Returns: depending on the XTREME-S task, one or several of:
    "accuracy": Accuracy - for 'fleurs-lang_id', 'minds14'
    "f1": F1 score - for 'minds14'
    "wer": Word error rate - for 'mls', 'fleurs-asr', 'voxpopuli', 'babel'
    "cer": Character error rate - for 'mls', 'fleurs-asr', 'voxpopuli', 'babel'
    "bleu": BLEU score according to the `sacrebleu` metric - for 'covost2'
Examples:

    >>> xtreme_s_metric = evaluate.load('xtreme_s', 'mls')  # 'mls', 'voxpopuli', 'fleurs-asr' or 'babel'
    >>> references = ["it is sunny here", "paper and pen are essentials"]
    >>> predictions = ["it's sunny", "paper pen are essential"]
    >>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
    >>> print({k: round(v, 2) for k, v in results.items()})
    {'wer': 0.56, 'cer': 0.27}

    >>> xtreme_s_metric = evaluate.load('xtreme_s', 'covost2')
    >>> references = ["bonjour paris", "il est necessaire de faire du sport de temps en temp"]
    >>> predictions = ["bonjour paris", "il est important de faire du sport souvent"]
    >>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
    >>> print({k: round(v, 2) for k, v in results.items()})
    {'bleu': 31.65}

    >>> xtreme_s_metric = evaluate.load('xtreme_s', 'fleurs-lang_id')
    >>> references = [0, 1, 0, 0, 1]
    >>> predictions = [0, 1, 1, 0, 0]
    >>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
    >>> print({k: round(v, 2) for k, v in results.items()})
    {'accuracy': 0.6}

    >>> xtreme_s_metric = evaluate.load('xtreme_s', 'minds14')
    >>> references = [0, 1, 0, 0, 1]
    >>> predictions = [0, 1, 1, 0, 0]
    >>> results = xtreme_s_metric.compute(predictions=predictions, references=references)
    >>> print({k: round(v, 2) for k, v in results.items()})
    {'f1': 0.58, 'accuracy': 0.6}
"""

# Configuration names accepted by `evaluate.load('xtreme_s', <config_name>)`.
_CONFIG_NAMES = ["fleurs-asr", "mls", "voxpopuli", "babel", "covost2", "fleurs-lang_id", "minds14"]

# Delimiter inserted between sentences by the character-level transform below.
# It is empty, so sentences are joined without any separator.
SENTENCE_DELIMITER = ""

# jiwer is an optional dependency: it is only needed for the WER/CER configurations.
try:
    from jiwer import transforms as tr

    _jiwer_available = True
except ImportError:
    _jiwer_available = False

if _jiwer_available and version.parse(importlib_metadata.version("jiwer")) < version.parse("2.3.0"):
    # NOTE(review): presumably jiwer < 2.3.0 lacks `ReduceToListOfListOfChars`, hence this
    # hand-written equivalent - confirm against the jiwer changelog.

    class SentencesToListOfCharacters(tr.AbstractTransform):
        """jiwer transform that flattens a list of sentences into one list of characters."""

        def __init__(self, sentence_delimiter: str = " "):
            self.sentence_delimiter = sentence_delimiter

        def process_string(self, s: str):
            """Split a single sentence into its individual characters."""
            return list(s)

        def process_list(self, inp: List[str]):
            """Concatenate the characters of all sentences, separated by the delimiter (if any)."""
            chars = []
            last_idx = len(inp) - 1
            for sent_idx, sentence in enumerate(inp):
                chars.extend(self.process_string(sentence))
                # A `None` or empty delimiter means "no separator"; never append one after
                # the last sentence.
                if self.sentence_delimiter and sent_idx < last_idx:
                    chars.append(self.sentence_delimiter)
            return chars

    cer_transform = tr.Compose(
        [tr.RemoveMultipleSpaces(), tr.Strip(), SentencesToListOfCharacters(SENTENCE_DELIMITER)]
    )
elif _jiwer_available:
    cer_transform = tr.Compose(
        [
            tr.RemoveMultipleSpaces(),
            tr.Strip(),
            tr.ReduceToSingleSentence(SENTENCE_DELIMITER),
            tr.ReduceToListOfListOfChars(),
        ]
    )
else:
    # Without jiwer no CER can be computed; `wer_and_cer` raises before this is ever used.
    cer_transform = None


def simple_accuracy(preds, labels):
    """Return the fraction of predictions equal to their label, as a Python float.

    Relies on element-wise `==` followed by `.mean()`, so both arguments are expected to be
    array-like objects supporting that (the module declares `format="numpy"`).
    """
    return float((preds == labels).mean())


def f1_and_simple_accuracy(preds, labels):
    """Return a dict with the macro-averaged F1 score (`"f1"`) and the accuracy (`"accuracy"`)."""
    return {
        "f1": float(f1_score(y_true=labels, y_pred=preds, average="macro")),
        "accuracy": simple_accuracy(preds, labels),
    }


def bleu(
    preds,
    labels,
    smooth_method="exp",
    smooth_value=None,
    force=False,
    lowercase=False,
    tokenize=None,
    use_effective_order=False,
):
    """Compute the corpus-level BLEU score with `sacrebleu`.

    Args:
        preds: iterable of predicted sentences.
        labels: iterable of reference sentences, exactly one per prediction.
        smooth_method, smooth_value, force, lowercase, use_effective_order: forwarded
            unchanged to `sacrebleu.corpus_bleu`.
        tokenize: forwarded to `sacrebleu.corpus_bleu` only when truthy, so that sacrebleu's
            own default tokenizer is used otherwise.

    Returns:
        A dict `{"bleu": <score>}`.

    Raises:
        ValueError: if `sacrebleu` is not installed, or if the references do not all have
            the same length.
        ImportWarning: if the installed `sacrebleu` is older than 1.4.12.
    """
    # xtreme-s can only have one label
    labels = [[label] for label in labels]
    preds = list(preds)
    try:
        import sacrebleu as scb
    except ImportError as err:
        raise ValueError(
            "sacrebleu has to be installed in order to apply the bleu metric for covost2. "
            "You can install it via `pip install sacrebleu`."
        ) from err

    if version.parse(scb.__version__) < version.parse("1.4.12"):
        raise ImportWarning(
            "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
            'You can install it with `pip install "sacrebleu>=1.4.12"`.'
        )

    references_per_prediction = len(labels[0])
    if any(len(refs) != references_per_prediction for refs in labels):
        raise ValueError("Sacrebleu requires the same number of references for each prediction")
    # Transpose from "one list of references per prediction" to "one list of sentences per
    # reference set", which is the layout passed to `corpus_bleu`.
    transformed_references = [[refs[i] for refs in labels] for i in range(references_per_prediction)]
    output = scb.corpus_bleu(
        preds,
        transformed_references,
        smooth_method=smooth_method,
        smooth_value=smooth_value,
        force=force,
        lowercase=lowercase,
        use_effective_order=use_effective_order,
        **(dict(tokenize=tokenize) if tokenize else {}),
    )
    return {"bleu": output.score}


def wer_and_cer(preds, labels, concatenate_texts, config_name):
    """Compute word error rate and character error rate with `jiwer`.

    Args:
        preds: iterable of predicted transcriptions.
        labels: iterable of reference transcriptions, aligned with `preds`.
        concatenate_texts: if truthy, hand all texts to jiwer in a single call; otherwise
            score each (prediction, reference) pair separately and pool the edit counts.
        config_name: configuration name, only used in the error message.

    Returns:
        A dict `{"wer": <float>, "cer": <float>}`.

    Raises:
        ValueError: if `jiwer` is not installed.
        ZeroDivisionError: in the non-concatenated mode, when the pooled reference length
            is zero (for example with empty inputs).
    """
    try:
        from jiwer import compute_measures
    except ImportError as err:
        raise ValueError(
            f"jiwer has to be installed in order to apply the wer metric for {config_name}. "
            "You can install it via `pip install jiwer`."
        ) from err

    # Applying the character-level transform to both sides makes jiwer's "wer" measure
    # operate on characters instead of words, which is what is reported as "cer".
    cer_kwargs = {"truth_transform": cer_transform, "hypothesis_transform": cer_transform}

    if concatenate_texts:
        wer = compute_measures(labels, preds)["wer"]
        cer = compute_measures(labels, preds, **cer_kwargs)["wer"]
        return {"wer": wer, "cer": cer}

    def compute_score(transform_kwargs):
        """Pool edit counts over all pairs and return errors / reference length."""
        incorrect = 0
        total = 0
        for prediction, reference in zip(preds, labels):
            measures = compute_measures(reference, prediction, **transform_kwargs)
            incorrect += measures["substitutions"] + measures["deletions"] + measures["insertions"]
            total += measures["substitutions"] + measures["deletions"] + measures["hits"]
        return incorrect / total

    return {"wer": compute_score({}), "cer": compute_score(cer_kwargs)}


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class XtremeS(evaluate.EvaluationModule):
    """Evaluation module dispatching to the metric(s) matching the selected XTREME-S config."""

    def _info(self):
        """Describe the module; inputs are int64 for classification configs, strings otherwise.

        Raises:
            KeyError: if `self.config_name` is not one of `_CONFIG_NAMES`.
        """
        if self.config_name not in _CONFIG_NAMES:
            raise KeyError(f"You should supply a configuration name selected in {_CONFIG_NAMES}")

        pred_type = "int64" if self.config_name in ["fleurs-lang_id", "minds14"] else "string"

        return evaluate.EvaluationModuleInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {"predictions": datasets.Value(pred_type), "references": datasets.Value(pred_type)}
            ),
            codebase_urls=[],
            reference_urls=[],
            format="numpy",
        )

    def _compute(self, predictions, references, bleu_kwargs=None, wer_kwargs=None):
        """Compute the metric(s) for the current configuration.

        Args:
            predictions: model predictions.
            references: ground-truth values aligned with `predictions`.
            bleu_kwargs: optional dict of BLEU options ('smooth_method', 'smooth_value',
                'force', 'lowercase', 'tokenize', 'use_effective_order'); other keys are
                ignored. Only read for 'covost2'. The dict is not modified.
            wer_kwargs: optional dict of WER/CER options ('concatenate_texts'); other keys
                are ignored. Only read for the ASR configs. The dict is not modified.

        Returns:
            A dict mapping metric names to scores.

        Raises:
            KeyError: if `self.config_name` is not a supported configuration.
        """
        bleu_kwargs = bleu_kwargs if bleu_kwargs is not None else {}
        wer_kwargs = wer_kwargs if wer_kwargs is not None else {}

        if self.config_name == "fleurs-lang_id":
            return {"accuracy": simple_accuracy(predictions, references)}
        elif self.config_name == "minds14":
            return f1_and_simple_accuracy(predictions, references)
        elif self.config_name == "covost2":
            # Use `.get` rather than `.pop`: the dicts belong to the caller and must stay
            # intact so that they can be reused across several `compute()` calls.
            return bleu(
                preds=predictions,
                labels=references,
                smooth_method=bleu_kwargs.get("smooth_method", "exp"),
                smooth_value=bleu_kwargs.get("smooth_value", None),
                force=bleu_kwargs.get("force", False),
                lowercase=bleu_kwargs.get("lowercase", False),
                tokenize=bleu_kwargs.get("tokenize", None),
                use_effective_order=bleu_kwargs.get("use_effective_order", False),
            )
        elif self.config_name in ["fleurs-asr", "mls", "voxpopuli", "babel"]:
            concatenate_texts = wer_kwargs.get("concatenate_texts", False)
            return wer_and_cer(predictions, references, concatenate_texts, self.config_name)
        else:
            raise KeyError(f"You should supply a configuration name selected in {_CONFIG_NAMES}")