# metric/llm_as_judge.py
from typing import Any, Dict, List

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge based metric class for evaluating correctness.

    Attributes:
        main_score (str): The main score used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The number of instances processed in one bulk.
        recipe (str): The unitxt recipe that will be used to create the judge dataset.
        inference_model (InferenceEngine): The module that runs inference with the judge model.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(inference_model=inference_model, recipe=recipe)
        scores = metric.compute(references, predictions, task_data)
    """

    main_score: str = "llm_as_judge"
    reduction_map: Dict[str, List[str]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        if self.reduction_map is None:
            # Default to averaging the main score over all instances.
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
        # Pair each prediction with its task data; "rating_label" is a placeholder
        # label required by the judge recipe's rating task schema.
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]
        # Render the judge prompts with the unitxt recipe and run the judge model.
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
        # Post-process the raw judge outputs with the unitxt meta-metric; each
        # resulting instance's "prediction" holds the extracted rating.
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
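

# --- Usage sketch (editor's addition, not part of the original file) ----------
# A minimal sketch of how this metric might be wired up, assuming the unitxt
# package is installed. The inference engine, model name, and recipe string
# below are illustrative assumptions, not values taken from this file; adjust
# them to the cards/templates available in your unitxt version.
#
#     from unitxt.inference import HFPipelineBasedInferenceEngine
#     from unitxt.llm_as_judge import LLMAsJudge
#
#     judge_engine = HFPipelineBasedInferenceEngine(
#         model_name="google/flan-t5-large", max_new_tokens=32
#     )
#     metric = LLMAsJudge(
#         inference_model=judge_engine,
#         recipe="card=cards.<your_judge_card>,"
#                "template=templates.<your_judge_template>",
#     )
#     # references, predictions, and task_data come from the evaluated dataset.
#     scores = metric.compute(references, predictions, task_data)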