File size: 6,600 Bytes
974cf69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import pandas as pd
from typing import Union
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix

from openfactcheck.core.base import OpenFactCheck
from importlib import resources as pkg_resources

from openfactcheck.templates import factchecker as gold_templates_dir

# Filesystem paths of the bundled gold-standard files (JSON Lines) that serve
# as the default reference when no explicit gold path is supplied.
gold_claims_template_path = str(
    pkg_resources.files(gold_templates_dir).joinpath("gold/claims.jsonl")
)
gold_documents_template_path = str(
    pkg_resources.files(gold_templates_dir).joinpath("gold/documents.jsonl")
)

class FactCheckerEvaluator(OpenFactCheck):
    """
    Evaluate the performance of a FactChecker against gold-standard labels.

    Parameters
    ----------
    input : Union[str, pd.DataFrame]
        The path to the CSV file or the DataFrame containing the FactChecker responses.
        It must have exactly the following three columns, in this order:
        - label: The label assigned by the FactChecker. This should be a boolean value.
        - time: The time taken by the FactChecker to respond.
        - cost: The cost of the FactChecker response.
    eval_type : str
        The type of evaluation to perform. Either "claims" or "documents".
    gold_path : str
        Optional. The path to the gold standard file (JSON Lines). If not provided,
        the default gold standard file for ``eval_type`` will be used.
        This is useful when evaluating the FactChecker on a different dataset.

    Attributes
    ----------
    input : Union[str, pd.DataFrame]
        The path to the CSV file or the DataFrame containing the FactChecker responses.
    gold_path : str
        The path to the gold standard file.
    eval_type : str
        The type of evaluation to perform. Either "claims" or "documents".
    results : dict
        The evaluation results. ``None`` until the evaluator has been called.
    confusion_matrix : numpy.ndarray
        The confusion matrix of the evaluation, computed with label order
        ``[True, False]``. ``None`` until the evaluator has been called.
    classification_report : str
        The text classification report of the evaluation. ``None`` until the
        evaluator has been called.

    Methods
    -------
    __call__():
        This function evaluates the performance of the FactChecker.
    evaluate_binary_classification(y_true, y_pred, pos_label="yes"):
        This function evaluates the performance of a binary classification model.
    """

    # Gold-file column holding the reference labels for each evaluation type.
    _GOLD_LABEL_COLUMNS = {"claims": "claim_label", "documents": "response_label"}

    # (ordinal word, required name) for each column of the FactChecker responses.
    _EXPECTED_COLUMNS = (("first", "label"), ("second", "time"), ("third", "cost"))

    def __init__(self, input: Union[str, pd.DataFrame], eval_type: str, gold_path: str = ""):
        """
        Store the evaluation inputs and resolve the gold standard file.

        Raises
        ------
        ValueError
            If ``eval_type`` is neither "claims" nor "documents".
        """
        # Validate up front, even when a custom gold_path is given; otherwise an
        # unsupported eval_type would only surface later, inside __call__.
        if eval_type not in tuple(self._GOLD_LABEL_COLUMNS):
            raise ValueError("Invalid evaluation type. Please provide a valid evaluation type.")

        if gold_path == "":
            if eval_type == "claims":
                gold_path = gold_claims_template_path
            else:
                gold_path = gold_documents_template_path

        # NOTE(review): the base-class initializer is not invoked here; this
        # mirrors the existing behaviour - confirm that it is intentional.
        self.input = input
        self.gold_path = gold_path
        self.eval_type = eval_type

        # Results (populated by __call__)
        self.results = None
        self.confusion_matrix = None
        self.classification_report = None

    @staticmethod
    def _require(condition, message):
        """
        Raise ``AssertionError(message)`` when ``condition`` is falsy.

        Used instead of bare ``assert`` statements so that the validation is
        not stripped under ``python -O``, while the exception type seen by
        callers stays the same.
        """
        if not condition:
            raise AssertionError(message)

    @staticmethod
    def evaluate_binary_classification(y_true, y_pred, pos_label="yes"):
        """
        Compute accuracy, precision, recall and F1 for a binary classifier.

        Parameters
        ----------
        y_true : list
            The gold labels.
        y_pred : list
            The predicted labels.
        pos_label : object
            The label to treat as the positive class for precision, recall and F1.

        Returns
        -------
        dict
            The keys "accuracy", "precision", "recall" and "F1", each rounded
            to three decimal places.
        """
        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred, pos_label=pos_label)
        recall = recall_score(y_true, y_pred, pos_label=pos_label)
        f1 = f1_score(y_true, y_pred, pos_label=pos_label)

        return {
            "accuracy": round(accuracy, 3),
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "F1": round(f1, 3),
        }

    def __call__(self):
        """
        This function evaluates the performance of the FactChecker.

        Returns
        -------
        dict
            The evaluation results (also stored in ``self.results``) with the keys
            "True_as_positive", "False_as_positive", "total_time", "total_cost"
            and "num_samples".

        Raises
        ------
        ValueError
            If ``self.eval_type`` is not a supported evaluation type.
        AssertionError
            If the FactChecker responses do not have exactly the columns
            "label", "time" and "cost" (in that order), if the number of
            predictions differs from the number of gold labels, or if any
            gold label or prediction is not a boolean.
        """
        # Re-check here because the attribute may have been changed after construction.
        try:
            label_column = self._GOLD_LABEL_COLUMNS.get(self.eval_type)
        except TypeError:  # unhashable eval_type
            label_column = None
        if label_column is None:
            raise ValueError("Invalid evaluation type. Please provide a valid evaluation type.")

        # Load the gold standard file
        df_gold = pd.read_json(self.gold_path, lines=True)

        # Use the DataFrame directly, or read the CSV file it points to
        if isinstance(self.input, pd.DataFrame):
            df_input = self.input
        else:
            df_input = pd.read_csv(self.input)

        # Check if the FactChecker responses have the correct number of columns
        self._require(
            len(df_input.columns) == len(self._EXPECTED_COLUMNS),
            f"The FactChecker responses should have exactly {len(self._EXPECTED_COLUMNS)} columns "
            f"but have {len(df_input.columns)}.",
        )

        # Check if the FactChecker responses have the correct column names
        for position, (ordinal, expected) in enumerate(self._EXPECTED_COLUMNS):
            actual = df_input.columns[position]
            self._require(
                actual == expected,
                f"The {ordinal} column should be '{expected}' but is {actual}.",
            )

        # Get the gold labels and the predictions
        gold_labels = df_gold[label_column].to_list()
        predictions = df_input["label"].to_list()

        # Check if the number of gold labels and predictions are the same
        self._require(
            len(gold_labels) == len(predictions),
            "The number of gold labels and predictions should be the same.",
        )

        # Verify that the gold labels and predictions are boolean values
        self._require(
            all(isinstance(label, bool) for label in gold_labels),
            "The gold labels should be boolean values.",
        )
        self._require(
            all(isinstance(label, bool) for label in predictions),
            "The predictions should be boolean values.",
        )

        # Evaluate performance, treating each of the two labels as the positive class in turn
        r1 = self.evaluate_binary_classification(y_true=gold_labels, y_pred=predictions, pos_label=True)
        r2 = self.evaluate_binary_classification(y_true=gold_labels, y_pred=predictions, pos_label=False)

        # The column checks above guarantee that "time" and "cost" exist
        total_time = df_input["time"].astype(float).sum()
        total_cost = df_input["cost"].astype(float).sum()

        self.results = {
            "True_as_positive": r1,
            "False_as_positive": r2,
            "total_time": total_time,
            "total_cost": total_cost,
            "num_samples": len(predictions)
        }

        # Calculate the confusion matrix
        self.confusion_matrix = confusion_matrix(y_true=gold_labels, y_pred=predictions, labels=[True, False])

        # Calculate the classification report
        self.classification_report = classification_report(gold_labels, predictions)

        return self.results