# metric/llm_as_judge.py
from typing import Any, Dict, List, Literal, Optional

from .api import evaluate, produce
from .artifact import Artifact, fetch_artifact, settings
from .formats import Format
from .inference import InferenceEngine, OpenAiInferenceEngine
from .metrics import BulkInstanceMetric
from .operator import SequentialOperator
from .system_prompts import SystemPrompt
from .templates import Template


class LLMAsJudge(BulkInstanceMetric):
    """LLM-as-judge based metric class for evaluating correctness.

    Attributes:
        main_score (str): The main score label used for evaluation.
        task (str): The type of task the llm-as-judge runs. This defines the input and output
            format of the judge model. One of "rating.single_turn",
            "rating.single_turn_with_reference", or "pairwise_comparative_rating.single_turn".
        template (Template): The template used when generating inputs for the judge llm.
        format (Format): The format used when generating inputs for the judge llm.
        system_prompt (SystemPrompt): The system prompt used when generating inputs for the judge llm.
        strip_system_prompt_and_format_from_inputs (bool): Whether to strip the system prompt and formatting
            from the inputs that the judged model received, when they are inserted into the llm-as-judge prompt.
        inference_model (InferenceEngine): The module that runs inference with the judge llm.
        reduction_map (dict): A dictionary specifying the reduction method for the metric.
        batch_size (int): The size of the bulk.
    """

main_score: str = "llm_as_judge"
task: Literal[
"rating.single_turn",
"rating.single_turn_with_reference",
"pairwise_comparative_rating.single_turn",
]
template: Template
format: Format = None
system_prompt: SystemPrompt = None
strip_system_prompt_and_format_from_inputs: bool = True
inference_model: InferenceEngine
reduction_map: Optional[Dict[str, List[str]]] = None
batch_size: int = 32
    prediction_type = Any  # Any, because this metric handles multiple tasks with different prediction types

    def _get_input_instances(self, task_data: List[Dict]) -> List:
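        # Optionally re-render each judged instance through its original template with
        # "formats.empty", so the judge sees the task input without the judged model's
        # system prompt and formatting.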
if self.strip_system_prompt_and_format_from_inputs:
instances = []
for task_data_instance in task_data:
template = task_data_instance["metadata"]["template"]
template, _ = fetch_artifact(template)
instance = SequentialOperator(
steps=[template, "formats.empty"]
).process_instance(
{
"input_fields": task_data_instance,
"reference_fields": task_data_instance,
}
)
instances.append(instance["source"])
"""
We also have access to: instance["target"]
instance["references"]
"""
return instances
        return [t["source"] for t in task_data]

    def _get_instance_for_judge_model(
self, input_instances: List[str], predictions: List, references: List
) -> List[Dict]:
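        # Build instances that match the input schema of the selected judge task.
        # Fields such as "rating" and "answer_a_preference" are required by the task
        # definition, so dummy values are filled in; they are never used as labels.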
if self.task == "rating.single_turn":
instances = [
{
"question": input_instance,
"answer": prediction,
"rating": 5.0, # This is a dummy value that is not used in practice
}
for input_instance, prediction, reference in zip(
input_instances, predictions, references
)
]
elif self.task == "rating.single_turn_with_reference":
instances = [
{
"question": input_instance,
"answer": prediction,
"reference_answer": reference[0],
"rating": 5.0, # This is a dummy value that is not used in practice
}
for input_instance, prediction, reference in zip(
input_instances, predictions, references
)
]
elif self.task == "pairwise_comparative_rating.single_turn":
instances = [
{
"question": input_instance,
"answer_a": prediction,
"answer_b": reference[0],
"model_a": "input_model",
"model_b": "baseline_model",
"answer_a_preference": 0, # This is a dummy value that is not used in practice,
}
for input_instance, prediction, reference in zip(
input_instances, predictions, references
)
]
else:
raise NotImplementedError(
f"Error in 'LLMAsJudge' metric. {self.task} is not a supported task type."
)
        return instances

    @staticmethod
def _add_metadata_to_judge_instances(
instances: List[List[Any]], task_data: List[Dict]
):
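        # Propagate each instance's data classification policy to the judge instances,
        # so it is respected when the judge model runs inference.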
for instance, data in zip(instances, task_data):
instance["data_classification_policy"] = data["metadata"][
"data_classification_policy"
            ]

    def prepare(self):
super().prepare()
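        # Pairwise comparison is aggregated as a weighted win rate; other tasks
        # default to the mean of the per-instance scores.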
if self.task == "pairwise_comparative_rating.single_turn":
self.reduction_map = {"weighted_win_rate": [self.main_score]}
if self.reduction_map is None:
            self.reduction_map = {"mean": [self.main_score]}

    def verify(self):
supported_tasks = [
"rating.single_turn",
"rating.single_turn_with_reference",
"pairwise_comparative_rating.single_turn",
]
assert self.task in supported_tasks, (
f"Error in 'LLMAsJudge' metric. {self.task} is not a supported task type."
f"The supported tasks types are: {', '.join(supported_tasks)}."
)
if not isinstance(self.template, Template):
raise ValueError(
f"Provided template argument to 'LLMAsJudge' metric is not of type Template, but {type(self.template)}"
)
if self.format and not isinstance(self.format, Format):
raise ValueError(
f"Provided format argument to 'LLMAsJudge' metric is not of type Format, but {type(self.format)}"
)
if self.system_prompt and not isinstance(self.system_prompt, SystemPrompt):
raise ValueError(
f"Provided system_prompt argument to 'LLMAsJudge' metric is not of type SystemPrompt, but {type(self.system_prompt)}"
)
if isinstance(self.inference_model, OpenAiInferenceEngine):
if self.format:
raise ValueError(
"Error in 'LLMAsJudge' metric. Inference model 'OpenAiInferenceEngine' does "
"not support formatting. Please remove the format definition from the recipe"
" (OpenAi Chat API take care of the formatting automatically)."
)
if self.system_prompt:
raise ValueError(
"Error in 'LLMAsJudge' metric. Inference model 'OpenAiInferenceEngine' does "
"not support system prompt. Please remove the system_prompt definition from the recipe"
" (Current implementation of Unitxt does not support this."
" Support will be added in future updates)."
                )

    def compute(
self,
references: List[List[Any]],
predictions: List[Any],
task_data: List[Dict],
) -> List[Dict[str, Any]]:
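        # Build judge inputs from the original task data, wrap them in a dynamic-card
        # recipe for the selected judge task, run the judge model on the produced
        # dataset, and map its verdicts back to per-instance scores.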
input_instances = self._get_input_instances(task_data)
instances = self._get_instance_for_judge_model(
input_instances, predictions, references
)
self._add_metadata_to_judge_instances(instances, task_data)
card = f"cards.dynamic_cards_for_llm_judges.{self.task}"
recipe_args = {
"card": card,
"template": self.template,
"demos_pool_size": 0,
"num_demos": 0,
"__type__": settings.default_recipe,
}
if self.system_prompt:
recipe_args["system_prompt"] = self.system_prompt
if self.format:
recipe_args["format"] = self.format
recipe = Artifact.from_dict(recipe_args)
dataset = produce(instances, recipe)
verdicts = self.inference_model.infer(dataset)
meta_scores = evaluate(predictions=verdicts, data=dataset)
res_list = []
for instance, verdict in zip(meta_scores, verdicts):
if self.task == "pairwise_comparative_rating.single_turn":
is_model_b_the_baseline = (
instance["task_data"]["model_b"] == "baseline_model"
)
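                # The judge's processed prediction scores a preference for answer_a.
                # When the baseline is not in slot "b", flip the sign so a positive
                # score always favors the evaluated model over the baseline.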
if is_model_b_the_baseline:
model_a_preference_score = instance["processed_prediction"]
else:
model_a_preference_score = instance["processed_prediction"] * -1
res = {
self.main_score: model_a_preference_score,
"judge_raw_output": verdict,
"judge_raw_input": instance["source"],
}
else:
res = {
self.main_score: instance["processed_prediction"],
"judge_raw_output": verdict,
"judge_raw_input": instance["source"],
}
res_list.append(res)
return res_list
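

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal way to wire this metric up,
# assuming a judge template and an inference engine are available. The artifact
# name and engine variable below are placeholders, not part of this module;
# swap in ones that exist in your catalog and environment.
#
#     judge_template, _ = fetch_artifact("templates.some_rating_template")  # placeholder name
#     judge_metric = LLMAsJudge(
#         task="rating.single_turn",
#         template=judge_template,
#         inference_model=my_inference_engine,  # any InferenceEngine instance
#     )
#     # The metric is then used like any other Unitxt metric; the framework calls
#     # compute() with references, predictions, and task_data.
# ---------------------------------------------------------------------------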