from importlib import resources as pkg_resources
from typing import Union

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix

from openfactcheck.core.base import OpenFactCheck
from openfactcheck.templates import factchecker as gold_templates_dir

gold_claims_template_path = str(pkg_resources.files(gold_templates_dir) / "gold/claims.jsonl")
gold_documents_template_path = str(pkg_resources.files(gold_templates_dir) / "gold/documents.jsonl")
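# Note on the gold templates (added for clarity; schema inferred from the loading code
# below): both files are JSON Lines. gold/claims.jsonl is expected to provide a boolean
# "claim_label" per line and gold/documents.jsonl a boolean "response_label"; a custom
# file passed via `gold_path` should follow the same schema.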
class FactCheckerEvaluator(OpenFactCheck):
    """
    This class is used to evaluate the performance of a FactChecker.

    Parameters
    ----------
    input : Union[str, pd.DataFrame]
        The path to the CSV file or the DataFrame containing the FactChecker responses.
        The CSV file should have the following three columns:
        - label: The label assigned by the FactChecker. This should be a boolean value.
        - time: The time taken by the FactChecker to respond.
        - cost: The cost of the FactChecker response.
    eval_type : str
        The type of evaluation to perform. Either "claims" or "documents".
    gold_path : str
        Optional. The path to the gold standard file. If not provided, the default gold standard file will be used.
        This is useful when evaluating the FactChecker on a different dataset.

    Attributes
    ----------
    input : Union[str, pd.DataFrame]
        The path to the CSV file or the DataFrame containing the FactChecker responses.
    gold_path : str
        The path to the gold standard file.
    eval_type : str
        The type of evaluation to perform. Either "claims" or "documents".
    results : dict
        The evaluation results.
    confusion_matrix : numpy.ndarray
        The confusion matrix of the evaluation.
    classification_report : str
        The classification report of the evaluation.

    Methods
    -------
    __call__():
        Evaluates the performance of the FactChecker.
    evaluate_binary_classification(y_true, y_pred, pos_label="yes"):
        Evaluates the performance of a binary classification model.
    """
    def __init__(self, input: Union[str, pd.DataFrame], eval_type: str, gold_path: str = ""):
        # Fall back to the bundled gold standard templates when no custom path is given
        if gold_path == "":
            if eval_type == "claims":
                gold_path = gold_claims_template_path
            elif eval_type == "documents":
                gold_path = gold_documents_template_path
            else:
                raise ValueError("Invalid evaluation type. Please provide a valid evaluation type.")

        self.input = input
        self.gold_path = gold_path
        self.eval_type = eval_type

        # Populated by __call__()
        self.results = None
        self.confusion_matrix = None
        self.classification_report = None

    @staticmethod
    def evaluate_binary_classification(y_true, y_pred, pos_label="yes"):
        """
        Evaluate the performance of a binary classification model, returning accuracy,
        precision, recall and F1 (rounded to three decimal places) with respect to the
        given positive label.
        """
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, pos_label=pos_label)
        recall = recall_score(y_true, y_pred, pos_label=pos_label)
        F1 = f1_score(y_true, y_pred, pos_label=pos_label)

        metrics = {
            "accuracy": round(accuracy, 3),
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "F1": round(F1, 3),
        }
        return metrics
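    # Illustrative example (not from the original source) of the metric output shape,
    # assuming boolean labels as used by __call__ below:
    #   evaluate_binary_classification(
    #       y_true=[True, False, True, True],
    #       y_pred=[True, False, False, True],
    #       pos_label=True,
    #   )
    #   -> {"accuracy": 0.75, "precision": 1.0, "recall": 0.667, "F1": 0.8}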
    def __call__(self):
        """
        Evaluate the performance of the FactChecker. Returns the results dictionary and
        populates the results, confusion_matrix and classification_report attributes.
        """
        # Load the gold standard labels (JSON Lines)
        df_gold = pd.read_json(self.gold_path, lines=True)

        # Load the FactChecker responses
        if isinstance(self.input, pd.DataFrame):
            df_input = self.input
        else:
            df_input = pd.read_csv(self.input)

        # The input must contain exactly the columns "label", "time" and "cost", in that order
        assert len(df_input.columns) == 3, "The input should have exactly three columns: 'label', 'time' and 'cost'."
        assert df_input.columns[0] == "label", f"The first column should be 'label' but is {df_input.columns[0]}."
        assert df_input.columns[1] == "time", f"The second column should be 'time' but is {df_input.columns[1]}."
        assert df_input.columns[2] == "cost", f"The third column should be 'cost' but is {df_input.columns[2]}."

        # Select the gold labels for the requested evaluation type
        if self.eval_type == "claims":
            gold_labels = df_gold['claim_label'].to_list()
        elif self.eval_type == "documents":
            gold_labels = df_gold['response_label'].to_list()
        else:
            raise ValueError("Invalid evaluation type. Please provide a valid evaluation type.")
        predictions = df_input[df_input.columns[0]].to_list()

        assert len(gold_labels) == len(predictions), "The number of gold labels and predictions should be the same."
        assert all(isinstance(label, bool) for label in gold_labels), "The gold labels should be boolean values."
        assert all(isinstance(label, bool) for label in predictions), "The predictions should be boolean values."

        # Evaluate twice: once with True as the positive class, once with False
        r1 = self.evaluate_binary_classification(y_true=gold_labels, y_pred=predictions, pos_label=True)
        r2 = self.evaluate_binary_classification(y_true=gold_labels, y_pred=predictions, pos_label=False)

        # Sum the reported time and cost (the column names are guaranteed by the asserts above)
        total_time = 0
        total_cost = 0
        if "time" in df_input.columns[1]:
            total_time = df_input[df_input.columns[1]].astype(float).sum()
        if "cost" in df_input.columns[2]:
            total_cost = df_input[df_input.columns[2]].astype(float).sum()

        self.results = {
            "True_as_positive": r1,
            "False_as_positive": r2,
            "total_time": total_time,
            "total_cost": total_cost,
            "num_samples": len(predictions)
        }

        self.confusion_matrix = confusion_matrix(y_true=gold_labels, y_pred=predictions, labels=[True, False])
        self.classification_report = classification_report(gold_labels, predictions)

        return self.results
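# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# "responses.csv" is a hypothetical file with the columns "label", "time" and
# "cost" described in the class docstring.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    evaluator = FactCheckerEvaluator(input="responses.csv", eval_type="claims")
    results = evaluator()
    print(results)
    print(evaluator.confusion_matrix)
    print(evaluator.classification_report)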