from typing import Any, Dict, List, Optional

import evaluate

from .api import produce
from .inference import InferenceEngine
from .metrics import BulkInstanceMetric


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge metric class for evaluating the correctness of model output.

    Attributes:
        main_score (str): The main score label used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The number of instances processed in each inference batch.
        recipe (str): The unitxt recipe used to create the judge dataset.
        inference_model (InferenceEngine): The module that runs inference on the judge dataset.

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute the metric.

    Usage:
        metric = LLMAsJudge(recipe=..., inference_model=...)
        scores = metric.compute(references, predictions, task_data)
    """

    main_score: str = "llm_as_judge"
    reduction_map: Optional[Dict[str, List[str]]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        super().prepare()
        # By default, reduce by averaging the main score across all instances.
        if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
        # Pair each prediction with its task data so the judge recipe can
        # render it into a judgment prompt. "rating_label" supplies a dummy
        # reference in the "[[rating]]" format to satisfy the recipe's schema.
        instances = [
            {
                **task_data_instance,
                **{"model_output": prediction, "rating_label": "[[5]]"},
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]

        # Build the judge dataset from the recipe, run the judge model on it,
        # and post-process the raw verdicts with the unitxt meta-metric.
        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]
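
# A minimal usage sketch (kept in a comment so nothing runs on import). The
# judge recipe string and the HFPipelineBasedInferenceEngine arguments below
# are illustrative assumptions, not values prescribed by this module:
#
#     from unitxt.inference import HFPipelineBasedInferenceEngine
#     from unitxt.llm_as_judge import LLMAsJudge
#
#     inference_model = HFPipelineBasedInferenceEngine(
#         model_name="mistralai/Mistral-7B-Instruct-v0.2",  # hypothetical judge model
#         max_new_tokens=256,
#     )
#     metric = LLMAsJudge(
#         recipe="card=...,template=...",  # a unitxt recipe for the judge task
#         inference_model=inference_model,
#     )
#     scores = metric.compute(references, predictions, task_data)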