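# NOTE(review): the text originally here appears to be file-viewer page residue
# captured together with the source, not part of the module:
#   author caption "pminervini's picture", commit message "update",
#   commit c323865, "raw" / "history blame" links, file size 5.83 kB.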
import os
from typing import Union, List
from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task
from lm_eval.api.metrics import mean
import spacy
from selfcheckgpt.modeling_selfcheck import SelfCheckMQAG, SelfCheckNLI, SelfCheckBERTScore, SelfCheckNgram
@register_task("selfcheckgpt")
class SelfCheckGpt(Task):
    """Self-consistency hallucination task built on the ``selfcheckgpt`` scorers.

    For every document the model is asked to write a Wikipedia-style passage
    once with deterministic decoding and several more times with sampling.
    The deterministic response is split into sentences and scored against the
    sampled responses by one of the ``selfcheckgpt`` scorers.

    The scorer is selected with the ``SELFCHECKGPTTYPE`` environment variable
    (default ``'SelfCheckNgram'``); the device used by the MQAG and NLI
    scorers is selected with ``SELFCHECKGPTDEVICE`` (default ``'cpu'``).
    """

    VERSION = 0.0
    DATASET_PATH = "potsawee/wiki_bio_gpt3_hallucination"
    DATASET_NAME = None
    OUTPUT_TYPE = 'generate_until'

    # Names of the two per-document metrics returned by ``process_results``.
    _METRIC_NAMES = ("avg-selfcheckgpt", "max-selfcheckgpt")

    def __init__(self, data_dir=None, cache_dir=None, download_mode=None, config=None):
        """Set up generation settings, the sentence splitter and the scorer.

        Raises:
            ValueError: if ``SELFCHECKGPTTYPE`` names an unsupported scorer.
        """
        super().__init__(data_dir=data_dir, cache_dir=cache_dir, download_mode=download_mode, config=config)

        # Reference response: deterministic decoding.
        self.generation_kwargs = {"temperature": 0.0, "do_sample": False}
        # Number of additional sampled responses used for the self-consistency check.
        self.generation_kwargs_sampling_number = 5
        # Sampled responses must really be sampled: with do_sample=False the
        # temperature is ignored and every "sample" is the same greedy output,
        # which makes the consistency scores meaningless.
        self.generation_kwargs_sampling = {"temperature": 1.0, "do_sample": True}

        self.selfcheckgpt_type = os.environ.get('SELFCHECKGPTTYPE', 'SelfCheckNgram')
        self.selfcheckgpt_device = os.environ.get('SELFCHECKGPTDEVICE', 'cpu')
        # spaCy pipeline used only to split the reference response into sentences.
        self.selfcheckgpt_nlp = spacy.load("en_core_web_sm")

        if self.selfcheckgpt_type == 'SelfCheckNgram':
            self.selfcheckgpt = SelfCheckNgram(n=1)
        elif self.selfcheckgpt_type == 'SelfCheckBERTScore':
            self.selfcheckgpt = SelfCheckBERTScore(rescale_with_baseline=True)
        elif self.selfcheckgpt_type == 'SelfCheckMQAG':
            self.selfcheckgpt = SelfCheckMQAG(device=self.selfcheckgpt_device)
        elif self.selfcheckgpt_type == 'SelfCheckNLI':
            self.selfcheckgpt = SelfCheckNLI(device=self.selfcheckgpt_device)
        else:
            # Fail early instead of hitting a missing attribute during scoring.
            raise ValueError(
                f"Unsupported SELFCHECKGPTTYPE: {self.selfcheckgpt_type!r}; expected one of "
                "'SelfCheckNgram', 'SelfCheckBERTScore', 'SelfCheckMQAG', 'SelfCheckNLI'"
            )

    def has_training_docs(self):
        """Return False: this task exposes no training split."""
        return False

    def has_validation_docs(self):
        """Return True: documents are served as the validation split."""
        return True

    def has_test_docs(self):
        """Return False: this task exposes no test split."""
        return False

    def validation_docs(self):
        """Return the dataset's ``"evaluation"`` split."""
        return self.dataset["evaluation"]

    def doc_to_text(self, doc):
        """Build the prompt from the first five whitespace-separated tokens of the passage."""
        leading_words = doc["wiki_bio_text"].split()[:5]
        doc_text = " ".join(leading_words)
        # NOTE: the prompt wording is kept verbatim; changing it would change
        # the generations and make scores incomparable with earlier runs.
        doc_text = f"Please generating a Wikipedia passage starting with: {doc_text}\n"
        return doc_text

    def doc_to_target(self, doc):
        """Return the document's ``wiki_bio_text`` field."""
        answer = doc['wiki_bio_text']
        return answer

    def construct_requests(
        self, doc: dict, ctx: str, **kwargs
    ) -> Union[List[Instance], Instance]:
        """Create one deterministic request followed by the sampled requests.

        The request with ``idx=0`` uses ``self.generation_kwargs``; requests
        with ``idx`` 1..``generation_kwargs_sampling_number`` use
        ``self.generation_kwargs_sampling``. ``process_results`` relies on
        this ordering.
        """
        arguments = (ctx, self.generation_kwargs)
        request_list = [
            Instance(
                request_type=self.OUTPUT_TYPE,
                doc=doc,
                arguments=arguments,
                idx=0,
                **kwargs
            ),
        ]

        sampling_arguments = (ctx, self.generation_kwargs_sampling)
        request_list.extend(
            Instance(
                request_type=self.OUTPUT_TYPE,
                doc=doc,
                arguments=sampling_arguments,
                idx=idx,
                **kwargs
            )
            for idx in range(1, self.generation_kwargs_sampling_number + 1)
        )
        return request_list

    def process_results(self, doc, results):
        """Score the deterministic response against the sampled responses.

        Args:
            doc: the dataset document (not used for scoring).
            results: generated texts in request order; ``results[0]`` is the
                deterministic response, the rest are the sampled responses.

        Returns:
            A dict with keys ``'avg-selfcheckgpt'`` and ``'max-selfcheckgpt'``.
            For ``SelfCheckNgram`` these are the scorer's document-level
            ``avg_neg_logprob`` and ``avg_max_neg_logprob``; for the other
            scorers they are the mean and maximum of the returned scores
            (both 0 when no scores are returned).

        Raises:
            ValueError: if ``self.selfcheckgpt_type`` is not a supported scorer.
        """
        response_temperature_0 = results[0]
        other_responses = results[1:]

        parsed_response = self.selfcheckgpt_nlp(response_temperature_0)
        sentences = [sent.text.strip() for sent in parsed_response.sents]

        if self.selfcheckgpt_type == 'SelfCheckNgram':
            selfcheckgpt_scores = self.selfcheckgpt.predict(
                sentences=sentences,
                passage=response_temperature_0,
                sampled_passages=other_responses,
            )
            return {'avg-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_neg_logprob'],
                    'max-selfcheckgpt': selfcheckgpt_scores['doc_level']['avg_max_neg_logprob']}

        if self.selfcheckgpt_type in ('SelfCheckBERTScore', 'SelfCheckNLI'):
            # BERTScore and NLI scorers take only the sentences and the samples.
            selfcheckgpt_scores = self.selfcheckgpt.predict(
                sentences=sentences,
                sampled_passages=other_responses,
            )
        elif self.selfcheckgpt_type == 'SelfCheckMQAG':
            # MQAG additionally needs the full passage and its question-generation settings.
            selfcheckgpt_scores = self.selfcheckgpt.predict(
                sentences=sentences,
                passage=response_temperature_0,
                sampled_passages=other_responses,
                num_questions_per_sent=5,  # number of questions to be drawn
                scoring_method='bayes_with_alpha',  # options = 'counting', 'bayes', 'bayes_with_alpha'
                beta1=0.8, beta2=0.8,  # additional params depending on scoring_method
            )
        else:
            raise ValueError(f"Unsupported SELFCHECKGPTTYPE: {self.selfcheckgpt_type!r}")

        # Use len() rather than truthiness: the scorers may return an array,
        # whose truth value is ambiguous.
        if len(selfcheckgpt_scores) > 0:
            selfcheckgpt_scores_avg = sum(selfcheckgpt_scores) / len(selfcheckgpt_scores)
            selfcheckgpt_scores_max = max(selfcheckgpt_scores)
        else:
            # No sentences were scored (e.g. empty generation): avoid max() on an empty sequence.
            selfcheckgpt_scores_avg = 0
            selfcheckgpt_scores_max = 0

        return {'avg-selfcheckgpt': selfcheckgpt_scores_avg, 'max-selfcheckgpt': selfcheckgpt_scores_max}

    def aggregation(self):
        """
        :returns: {str: [float] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metrics
        """
        return {k: mean for k in self._METRIC_NAMES}

    def higher_is_better(self):
        """
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        """
        return {k: False for k in self._METRIC_NAMES}