import datasets
import evaluate

from harim_scorer import Harimplus_Scorer

logger = evaluate.logging.get_logger(__name__)

CODEBASE_URL='https://huggingface.co/spaces/NCSOFT/harim_plus'
PAPER_URL='https://arxiv.org/abs/2211.12118'

_CITATION = """\
@inproceedings{son-etal-2022-harim,
    title = "{H}a{R}i{M}$^+$: Evaluating Summary Quality with Hallucination Risk",
    author = "Son, Seonil (Simon) and
      Park, Junsoo and
      Hwang, Jeong-in and
      Lee, Junghwa and
      Noh, Hyungjong and
      Lee, Yeonsoo",
    booktitle = "Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing",
    month = nov,
    year = "2022",
    address = "Online only",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2022.aacl-main.66",
    pages = "895--924",
    abstract = "One of the challenges of developing a summarization model arises from the difficulty in measuring the factual inconsistency of the generated text. In this study, we reinterpret the decoder overconfidence-regularizing objective suggested in (Miao et al., 2021) as a hallucination risk measurement to better estimate the quality of generated summaries. We propose a reference-free metric, HaRiM+, which only requires an off-the-shelf summarization model to compute the hallucination risk based on token likelihoods. Deploying it requires no additional training of models or ad-hoc modules, which usually need alignment to human judgments. For summary-quality estimation, HaRiM+ records state-of-the-art correlation to human judgment on three summary-quality annotation sets: FRANK, QAGS, and SummEval. We hope that our work, which merits the use of summarization models, facilitates the progress of both automated evaluation and generation of summary.",
}
"""

_DESCRIPTION = """\
HaRiM+ is a reference-free evaluation metric for summarization (i.e., scoring a summary's quality requires only the source article) that harnesses the power of a summarization model.
It works well for ranking summary-article pairs by summary quality.
Note that the score range is unbounded.
The summarization model inside HaRiM+ reads the paired source article and evaluates how good the summary is given that article.
HaRiM+ has proven effective both for benchmarking summarization systems (system-level performance) and for ranking article-summary pairs (segment-level performance) across comprehensive aspects such as factuality, consistency, coherence, fluency, and relevance. For details, refer to our paper published at AACL 2022.
"""

_KWARGS_DESCRIPTION = """
HaRiM+ score.

Args:
    For scorer = evaluate.load():
        `pretrained_name` (str or pathlib.Path): summarization model checkpoint or path, loaded with transformers.AutoModelForSeq2SeqLM.from_pretrained(). Defaults to facebook/bart-large-cnn.
        `tokenizer`: tokenizer compatible with transformers.PreTrainedTokenizer (pass one only when your tokenizer cannot be loaded with from_pretrained). It must provide tokenizer.pad_token|eos_token|bos_token and the tokenizer.__call__() method used for HaRiM+ score computation.
    For scorer.compute():
        `predictions` (list of str): generated summaries
        `references` (list of str): source articles to be summarized
        `use_aggregator` (bool): if True, the average of the scores is returned

Returns:
    'results' (dict): {
        'harim+' (List[float] or float): the HaRiM+ score to use,
        'harim' (List[float] or float): the HaRiM term used to compute the score above,
        'log_ppl' (List[float] or float): the log-perplexity term, as in (Yuan et al., NeurIPS 2021),
        'lambda' (float): balancing coefficient for computing harim+ from harim and log_ppl (modifying it is not recommended),
        }

Examples:
    >>> summaries = ["hello there", "hello there"]
    >>> articles = ["hello, this is the article to be summarized", "hello, this is the article to be summarized"]
    >>> scorer = evaluate.load("NCSOFT/harim_plus")  # optionally: pretrained_name='PRETRAINEDNAME', tokenizer=TOKENIZER
    >>> results = scorer.compute(predictions=summaries, references=articles)  # optionally: use_aggregator=True
    >>> print([round(v, 2) for v in results["harim+"]])
    [float, float]
"""
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Harimplus(evaluate.Metric):
    def __init__(self,
                 pretrained_name='facebook/bart-large-cnn',
                 tokenizer=None,
                 device='cuda',
                 **kwargs
                 ):
        super().__init__(**kwargs)
        # Only store the configuration here; the scorer model is loaded in _download_and_prepare().
        self.myconfig = dict(
            pretrained_name=pretrained_name,
            tokenizer=tokenizer,
            device=device,
        )

    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage=CODEBASE_URL,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Value("string", id="sequence"),
                    "references": datasets.Value("string", id="sequence"),
                }
            ),
            codebase_urls=[CODEBASE_URL],
            reference_urls=[CODEBASE_URL, PAPER_URL],
        )

    def _download_and_prepare(self, dl_manager):
        pretrained_name = self.myconfig['pretrained_name']
        is_custom_tokenizer = self.myconfig['tokenizer'] is not None
        logger.warning(
            "Loading HaRiM+ score"
            f"\tpretrained_name = {pretrained_name}"
        )
        if is_custom_tokenizer:
            logger.warning(
                "The pretrained tokenizer is overridden by self.myconfig['tokenizer']."
            )
        logger.warning(
            "You can change checkpoints with the `pretrained_name` kwarg of evaluate.load(). "
            "We strongly recommend using *-large or larger checkpoints. "
            "Refrain from using checkpoints trained on a noisy corpus such as bbc-XSUM."
        )
        # Download the model checkpoint specified by self.myconfig and set up the scorer.
        self.scorer = Harimplus_Scorer(**self.myconfig)

    def _compute(self,
                 predictions=None,
                 references=None,
                 use_aggregator=False,
                 bsz=32,
                 tokenwise_score=False):
        # `predictions` are the generated summaries, `references` the source articles.
        summaries = predictions
        articles = references
        scores = self.scorer.compute(predictions=summaries, references=articles,
                                     use_aggregator=use_aggregator, bsz=bsz,
                                     tokenwise_score=tokenwise_score)
        return scores
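

# Minimal usage sketch (illustrative only, not part of the metric's API): it mirrors the docstring
# example above. Assumes the `evaluate` package can fetch this Space and that a CUDA device is
# available (the default); pass device='cpu' through evaluate.load() to run on CPU.
if __name__ == "__main__":
    summaries = ["hello there", "hello there"]
    articles = [
        "hello, this is the article to be summarized",
        "hello, this is the article to be summarized",
    ]

    scorer = evaluate.load("NCSOFT/harim_plus")  # optionally: pretrained_name=..., tokenizer=..., device=...
    results = scorer.compute(predictions=summaries, references=articles)  # use_aggregator=True averages the scores
    print([round(v, 2) for v in results["harim+"]])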