import pandas as pd
from typing import Union
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
from openfactcheck.core.base import OpenFactCheck
from importlib import resources as pkg_resources
from openfactcheck.templates import factchecker as gold_templates_dir
# Paths to the bundled gold standard template files (claims and documents)
gold_claims_template_path = str(pkg_resources.files(gold_templates_dir) / "gold/claims.jsonl")
gold_documents_template_path = str(pkg_resources.files(gold_templates_dir) / "gold/documents.jsonl")
class FactCheckerEvaluator(OpenFactCheck):
"""
This class is used to evaluate the performance of a FactChecker.
Parameters
----------
    input : Union[str, pd.DataFrame]
The path to the CSV file or the DataFrame containing the FactChecker responses.
The CSV file should have the following three columns:
- label: The label assigned by the FactChecker. This should be a boolean value.
- time: The time taken by the FactChecker to respond.
- cost: The cost of the FactChecker response.
    eval_type : str
        The type of evaluation to perform. Either "claims" or "documents".
    gold_path : str
        Optional. The path to the gold standard JSONL file. If not provided, the
        default gold standard file for the chosen evaluation type is used. This is
        useful when evaluating the FactChecker on a different dataset.
Attributes
----------
input : Union[str, pd.DataFrame]
The path to the CSV file or the DataFrame containing the FactChecker responses.
gold_path : str
The path to the gold standard file.
    eval_type : str
        The type of evaluation to perform. Either "claims" or "documents".
results : dict
The evaluation results.
confusion_matrix : numpy.ndarray
The confusion matrix of the evaluation.
classification_report : dict
The classification report of the evaluation.
Methods
-------
__call__():
This function evaluates the performance of the FactChecker.
evaluate_binary_classification(y_true, y_pred, pos_label="yes"):
This function evaluates the performance of a binary classification model.
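
    Examples
    --------
    A minimal usage sketch; ``factchecker_responses.csv`` is an illustrative file name
    for a CSV with the required ``label``, ``time`` and ``cost`` columns.

    >>> evaluator = FactCheckerEvaluator(input="factchecker_responses.csv", eval_type="claims")
    >>> results = evaluator()  # doctest: +SKIP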
"""
    def __init__(self, input: Union[str, pd.DataFrame], eval_type: str, gold_path: str = ""):
        # Validate the evaluation type up front so a misspelled value fails early,
        # even when a custom gold standard path is supplied
        if eval_type not in ("claims", "documents"):
            raise ValueError("Invalid evaluation type. Please provide either 'claims' or 'documents'.")

        # Fall back to the bundled gold standard template for the chosen evaluation type
        if gold_path == "":
            gold_path = gold_claims_template_path if eval_type == "claims" else gold_documents_template_path

        self.input = input
        self.gold_path = gold_path
        self.eval_type = eval_type

# Results
self.results = None
self.confusion_matrix = None
self.classification_report = None
@staticmethod
def evaluate_binary_classification(y_true, y_pred, pos_label="yes"):
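        """
        Compute accuracy, precision, recall and F1 for binary predictions, treating
        ``pos_label`` as the positive class, and return them rounded to 3 decimals.
        """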
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, pos_label=pos_label)
recall = recall_score(y_true, y_pred, pos_label=pos_label)
F1 = f1_score(y_true, y_pred, pos_label=pos_label)
metrics = {
"accuracy": round(accuracy, 3),
"precision": round(precision, 3),
"recall": round(recall, 3),
"F1": round(F1, 3),
}
return metrics
def __call__(self):
"""
This function evaluates the performance of the FactChecker.
"""
# Load the gold standard file
df_gold = pd.read_json(self.gold_path, lines=True)
# Check if the input is a DataFrame
if isinstance(self.input, pd.DataFrame):
df_input = self.input
else:
# Read the CSV file
df_input = pd.read_csv(self.input)
# Check if the FactChecker responses have the correct number of columns
        assert len(df_input.columns) == 3, "The FactChecker responses should have exactly three columns: label, time and cost."
# Check if the FactChecker responses have the correct column names
assert df_input.columns[0] == "label", f"The first column should be 'label' but is {df_input.columns[0]}."
assert df_input.columns[1] == "time", f"The second column should be 'time' but is {df_input.columns[1]}."
assert df_input.columns[2] == "cost", f"The third column should be 'cost' but is {df_input.columns[2]}."
# Get the gold labels and the predictions
if self.eval_type == "claims":
gold_labels = df_gold['claim_label'].to_list()
elif self.eval_type == "documents":
gold_labels = df_gold['response_label'].to_list()
predictions = df_input[df_input.columns[0]].to_list()
# Check if the number of gold labels and predictions are the same
assert (len(gold_labels) == len(predictions)), "The number of gold labels and predictions should be the same."
# Verify that the gold labels and predictions are boolean values
assert all(isinstance(label, bool) for label in gold_labels), "The gold labels should be boolean values."
assert all(isinstance(label, bool) for label in predictions), "The predictions should be boolean values."
        # Evaluate performance, treating True and then False as the positive class
r1 = self.evaluate_binary_classification(y_true=gold_labels, y_pred=predictions, pos_label=True)
r2 = self.evaluate_binary_classification(y_true=gold_labels, y_pred=predictions, pos_label=False)
        # Calculate total time and cost
        total_time = 0
        total_cost = 0

        # Sum the time column if present in the FactChecker responses
        if "time" in df_input.columns:
            total_time = df_input["time"].astype(float).sum()

        # Sum the cost column if present in the FactChecker responses
        if "cost" in df_input.columns:
            total_cost = df_input["cost"].astype(float).sum()
self.results = {
"True_as_positive": r1,
"False_as_positive": r2,
"total_time": total_time,
"total_cost": total_cost,
"num_samples": len(predictions)
}
# Calculate the confusion matrix
self.confusion_matrix = confusion_matrix(y_true=gold_labels, y_pred=predictions, labels=[True, False])
# Calculate the classification report
self.classification_report = classification_report(gold_labels, predictions)
        return self.results
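

# Illustrative smoke test: "factchecker_responses.csv" is a hypothetical local file
# with the required label/time/cost columns, not a path shipped with the package.
if __name__ == "__main__":
    evaluator = FactCheckerEvaluator(input="factchecker_responses.csv", eval_type="claims")
    results = evaluator()
    print(results)
    print(evaluator.confusion_matrix)
    print(evaluator.classification_report)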