Elron committed on
Commit
f418928
·
verified ·
1 Parent(s): 5c531b1

Upload llm_as_judge.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. llm_as_judge.py +58 -0
llm_as_judge.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List
2
+
3
+ import evaluate
4
+
5
+ from .api import produce
6
+ from .inference import InferenceEngine
7
+ from .metrics import BulkInstanceMetric
8
+
9
+
10
class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge metric: scores predictions using verdicts from a judge model.

    For every prediction, a judge instance is built from the instance's task
    data plus the prediction, turned into a dataset with a unitxt recipe, and
    passed to an inference engine. The resulting verdicts are then processed
    by the ``unitxt/metric`` metric and reported under ``main_score``.

    Attributes:
        main_score (str): The main score used for evaluation.
        reduction_map (dict): A dictionary specifying the reduction method for
            the metric. If left as ``None``, ``prepare`` sets it to the mean
            of ``main_score``.
        batch_size (int): The size of the bulk.
        recipe (str): The unitxt recipe that will be used to create the judge
            dataset.
        inference_model (InferenceEngine): The module that creates the
            inference (the judge's verdicts).

    Methods:
        prepare(self): Initialization method for the metric.
        compute(self, references, predictions, task_data): Method to compute
            the metric.

    Usage:
        # ``recipe`` and ``inference_model`` have no defaults and must be set;
        # passing them as keyword arguments is assumed here -- confirm against
        # the base class constructor.
        metric = LLMAsJudge(recipe=recipe, inference_model=inference_model)
        scores = metric.compute(references, predictions, task_data)
    """

    main_score: str = "llm_as_judge"
    reduction_map: Dict[str, List[str]] = None
    batch_size: int = 32
    recipe: str
    inference_model: InferenceEngine

    def prepare(self):
        """Run the base preparation and default ``reduction_map`` to the mean."""
        super().prepare()
        # Built here rather than as a class-level default so the mapping
        # follows whatever ``main_score`` the instance ends up with.
        if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def compute(
        self,
        references: List[List[Any]],
        predictions: List[Any],
        task_data: List[Dict],
    ) -> List[Dict[str, Any]]:
        """Score each prediction with the judge model.

        Args:
            references: Gold references per instance. Not used by this metric;
                accepted to keep the metric ``compute`` signature.
            predictions: Model outputs to be judged, one per instance.
            task_data: Per-instance task fields; each dict is copied into the
                corresponding judge instance.

        Returns:
            One dict per instance mapping ``main_score`` to the ``"prediction"``
            field of the corresponding ``unitxt/metric`` result.

        Raises:
            ValueError: If ``predictions`` and ``task_data`` differ in length.
        """
        # zip() would silently truncate to the shorter input and return fewer
        # scores than predictions, so fail loudly on a mismatch instead.
        if len(predictions) != len(task_data):
            raise ValueError(
                f"predictions and task_data must have the same length, got "
                f"{len(predictions)} and {len(task_data)}."
            )

        instances = [
            {
                **task_data_instance,
                "model_output": prediction,
                # NOTE(review): constant placeholder rating; presumably only
                # needed so the recipe can render a reference field -- confirm.
                "rating_label": "[[5]]",
            }
            for task_data_instance, prediction in zip(task_data, predictions)
        ]

        dataset = produce(instances, self.recipe)
        verdicts = self.inference_model.infer(dataset)
        # NOTE: the metric is loaded on every compute() call.
        meta_metric = evaluate.load("unitxt/metric")
        meta_scores = meta_metric.compute(predictions=verdicts, references=dataset)
        return [{self.main_score: instance["prediction"]} for instance in meta_scores]