Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
import sacrebleu | |
import numpy as np | |
from rouge_score import rouge_scorer, scoring | |
def process_results(doc, results): | |
# (Pdb)doc.keys() | |
# dict_keys(['document', 'summary', 'id']) | |
# (Pdb++) results | |
# [' The Welsh Government has announced | |
# breakpoint() | |
completion = results[0] | |
# true_refs, false_refs = doc["correct_answers"], doc["incorrect_answers"] | |
# all_refs = true_refs + false_refs | |
document = doc["document"] | |
true_refs = [doc["summary"]] | |
all_refs = true_refs | |
# ROUGE-N | |
rouge_scores = [rouge([ref], [completion]) for ref in all_refs] | |
# ROUGE-1 | |
rouge1_scores = [score["rouge1"] for score in rouge_scores] | |
# ROUGE-2 | |
rouge2_scores = [score["rouge2"] for score in rouge_scores] | |
# ROUGE-L | |
rougeL_scores = [score["rougeLsum"] for score in rouge_scores] | |
res = { | |
"rouge1": rouge1_scores[0], | |
"rouge2": rouge2_scores[0], | |
"rougeL": rougeL_scores[0], | |
} | |
return res | |
def bleu(refs, preds): | |
""" | |
Returns `t5` style BLEU scores. See the related implementation: | |
https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41 | |
:param refs: | |
A `list` of `list` of reference `str`s. | |
:param preds: | |
A `list` of predicted `str`s. | |
""" | |
score = sacrebleu.corpus_bleu( | |
preds, | |
refs, | |
smooth_method="exp", | |
smooth_value=0.0, | |
force=False, | |
lowercase=False, | |
tokenize="intl", | |
use_effective_order=False, | |
).score | |
return score | |
def rouge(refs, preds): | |
""" | |
Returns `t5` style ROUGE scores. See the related implementation: | |
https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68 | |
:param refs: | |
A `list` of reference `strs`. | |
:param preds: | |
A `list` of predicted `strs`. | |
""" | |
rouge_types = ["rouge1", "rouge2", "rougeLsum"] | |
scorer = rouge_scorer.RougeScorer(rouge_types) | |
# Add newlines between sentences to correctly compute `rougeLsum`. | |
def _prepare_summary(summary): | |
summary = summary.replace(" . ", ".\n") | |
return summary | |
# Accumulate confidence intervals. | |
aggregator = scoring.BootstrapAggregator() | |
for ref, pred in zip(refs, preds): | |
ref = _prepare_summary(ref) | |
pred = _prepare_summary(pred) | |
aggregator.add_scores(scorer.score(ref, pred)) | |
result = aggregator.aggregate() | |
return {type: result[type].mid.fmeasure * 100 for type in rouge_types} | |