from typing import Any, Dict, List, Optional

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge metric class for evaluating the correctness of model output.

    Attributes:
        main_score (str): The main score label used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The number of instances processed in each inference batch.
        recipe (str): The unitxt recipe used to create the judge dataset.
        inference_model (InferenceEngine): The module that runs inference on the judge dataset.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(recipe=..., inference_model=...)
        scores = metric.compute(references, predictions, task_data)
    """

    main_score: str = "llm_as_judge"
    reduction_map: Optional[Dict[str, List[str]]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        # By default, reduce by averaging the main score across all instances.
        if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
        # Pair each prediction with its task data so the judge recipe can
        # render it into a judgment prompt. "rating_label" supplies a dummy
        # reference in the "[[rating]]" format to satisfy the recipe's schema.
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]

        # Build the judge dataset from the recipe, run the judge model on it,
        # and post-process the raw verdicts with the unitxt meta-metric.
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
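
# A minimal usage sketch (kept in a comment so nothing runs on import). The
# judge recipe string and the HFPipelineBasedInferenceEngine arguments below
# are illustrative assumptions, not values prescribed by this module:
#
#     from unitxt.inference import HFPipelineBasedInferenceEngine
#     from unitxt.llm_as_judge import LLMAsJudge
#
#     inference_model = HFPipelineBasedInferenceEngine(
#         model_name="mistralai/Mistral-7B-Instruct-v0.2",  # hypothetical judge model
#         max_new_tokens=256,
#     )
#     metric = LLMAsJudge(
#         recipe="card=...,template=...",  # a unitxt recipe for the judge task
#         inference_model=inference_model,
#     )
#     scores = metric.compute(references, predictions, task_data)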