import os
import json
import uuid
from typing import Union
import pandas as pd
from importlib import resources as pkg_resources

from openfactcheck.lib.logger import logger
from openfactcheck.core.base import OpenFactCheck
from openfactcheck.evaluator.llm.evaluate_snowballing import SnowballingEvaluator
from openfactcheck.evaluator.llm.evaluate_selfaware import SelfAwareEvaluator
from openfactcheck.evaluator.llm.evaluate_freshqa import FreshQAEvaluator
from openfactcheck.evaluator.llm.evaluate_freetext import FreeTextEvaluator
from openfactcheck.evaluator.llm.report import create_report

from openfactcheck import data as data_dir

# Default paths for the bundled LLM evaluation dataset and the evaluator output
default_dataset_path = str(pkg_resources.files(data_dir))
default_output_path = "tmp/output/llm_evaluator"

class LLMEvaluator(SnowballingEvaluator, SelfAwareEvaluator, FreshQAEvaluator, FreeTextEvaluator):
    """
    This class is used to evaluate the performance of a Language Model.

    Parameters
    ----------
    model_name : str
        The name of the Language Model.
    input_path : Union[str, pd.DataFrame]
        The path to the CSV file or the DataFrame containing the LLM responses.
        The CSV file should have the following two columns:
        - index: The index of the response.
        - response: The response generated by the LLM.
    output_path : str
        The path to store the output files.
    dataset_path : str
        The path to the dataset file containing the questions.
    datasets : list
        The list of datasets to evaluate the LLM on. 
    analyze : bool
        Whether to analyze the results and generate (and save) the plots.
    save_report : bool
        Whether to save the report.

    Attributes
    ----------
    model_name : str
        The name of the Language Model.
    run_id : str
        The unique identifier for the run.
    input_path : Union[str, pd.DataFrame]
        The path to the CSV file or the DataFrame containing the LLM responses.
    output_path : str
        The path to store the output files.
    dataset_path : str
        The path to the dataset file containing the questions.
    datasets : list
        The list of datasets to evaluate the LLM on.
    combined_result : dict
        The combined evaluation results for all datasets.

    Methods
    -------
    evaluate(model_name: str, input_path: Union[str, pd.DataFrame], output_path: str = "", dataset_path: str = "", datasets: list = ["snowballing", "selfaware", "freshqa", "factoolqa", "felm-wk", "factcheck-bench", "factscore-bio"], analyze: bool = True, save_report: bool = True):
        This function evaluates the performance of the Language Model on the selected datasets.
    read_input():
        This function reads the input file and dataset file and returns a DataFrame containing the combined data.
    filter_responses(df: pd.DataFrame, dataset: str):
        Filter the responses based on the dataset.
    generate_plots(fig_path, save_plots=True):
        Generate plots for the evaluation
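
    Examples
    --------
    A minimal usage sketch; ``ofc`` is an already-initialized
    ``OpenFactCheck`` instance, and the model name and CSV path below are
    illustrative placeholders rather than required values:

    >>> evaluator = LLMEvaluator(ofc)
    >>> results = evaluator.evaluate(
    ...     model_name="my-llm",
    ...     input_path="responses.csv",
    ...     datasets=["snowballing"],
    ... )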
    """
    def __init__(self, ofc: OpenFactCheck):
        SnowballingEvaluator.__init__(self)
        SelfAwareEvaluator.__init__(self)
        FreshQAEvaluator.__init__(self)
        FreeTextEvaluator.__init__(self, ofc)
        self.logger = logger

        # Set the attributes
        self.model_name = None
        self.run_id = str(uuid.uuid4().hex)
        self.input_path = None
        self.dataset_path = None
        self.output_path = None
        self.datasets = None

        self.combined_result = None
        self.labels = None
        self.predictions = None

        self.logger.info(f"LLM Evaluator initialized with run_id: {self.run_id}")

    def read_input(self):
        """
        This function reads the input file and dataset file and returns a DataFrame containing the combined data.
        """

        # Check if the input_path is a DataFrame
        if isinstance(self.input_path, pd.DataFrame):
            df_responses = self.input_path
        else:
            # Read the CSV file
            self.logger.info(f"Reading the LLM responses from {self.input_path}...")
            df_responses = pd.read_csv(self.input_path)

        # Check the number of columns and that no response is missing
        assert df_responses.shape[1] == 2, "The LLM responses should have 2 columns."
        assert not df_responses.iloc[:, 1].isna().any(), "Some LLM responses are missing."

        # Use the first column as index and rename the index and response column
        df_responses.set_index(df_responses.columns[0], inplace=True)
        df_responses.index.name = None
        df_responses.columns = ["response"]

        # Read the available datasets
        self.logger.info(f"Reading the dataset from {self.dataset_path}...")
        df_dataset = pd.DataFrame()
        # Loop through each file in the directory
        for filename in os.listdir(self.dataset_path):
            if filename.endswith('.jsonl'):  # Check if the file is a JSONL file
                file_path = os.path.join(self.dataset_path, filename)

                logger.info(f"Reading {filename.split('.')[0]} dataset...")
                # Read the JSONL file and append it to the combined DataFrame
                df = pd.read_json(file_path, lines=True)
                df_dataset = pd.concat([df_dataset, df], ignore_index=True)

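        # NOTE: responses are joined to questions on the index column, which is
        # assumed to match the row order of the concatenated dataset files
        # (read in directory-listing order).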
        # Combine the responses and questions
        assert len(df_responses) == len(df_dataset), "The number of responses and questions should be the same."
        df_combined = pd.concat([df_dataset, df_responses], axis=1)

        return df_combined       

    @staticmethod
    def filter_responses(df: pd.DataFrame, dataset: str):
        logger.info(f"Filtering responses for dataset: {dataset}...")
        # Filter the DataFrame based on the 'source' column
        if dataset == "snowballing":
            filtered_df = df[df['source'] == dataset]
            
            # Create a new DataFrame with only the required columns
            responses_df = filtered_df[['topic', 'response']].copy()
            
            # Convert the DataFrame to a list of dictionaries if needed
            responses = responses_df.to_dict(orient='records')

        elif dataset == "selfaware":
            selfaware_subset = [
                "selfaware-hotpot_train",
                "selfaware-squadqa_train",
                "selfaware-triviaqa_train",
                "selfaware-squadqa_dev",
                "selfaware-hotpot_dev",
                "selfaware-triviaqa_dev",
                "selfaware-SelfAware",
            ]

            responses = []
            for k, row in df.iterrows():
                if row["source"] in selfaware_subset:
                    responses.append({
                        # The annotation is expected to look like "answerable: False";
                        # use removeprefix (Python 3.9+), not lstrip, which strips
                        # characters rather than a prefix.
                        "label_unanswerable": row["ability_to_test"].removeprefix("answerable: ") == "False",
                        "response": row["response"]})
                    
        elif dataset == "freshqa":
            responses = []
            for k, row in df.iterrows():
                if row["source"] == dataset:
                    responses.append(
                        {
                            "question": row["question"],
                            "reference_answer": row["reference_answer"],
                            "response": row["response"],
                        }
                    )
        
        elif dataset in ["factoolqa", "felm-wk", "factcheck-bench"]:
            responses = []
            for k, row in df.iterrows():
                if row["source"] == dataset:
                    responses.append(
                        {
                            "source": row["source"],
                            "prompt": row["prompt"],
                            "response": row["response"],
                        }
                    )

        elif dataset == "factscore-bio":
            factscore_subset = [
                "factscore-labelled",
                "factscore-unlabelled",
            ]

            responses = []
            for k, row in df.iterrows():
                if row["source"] in factscore_subset:
                    responses.append(
                        {
                            "source": row["source"],
                            "prompt": row["prompt"],
                            "response": row["response"],
                        }
                    )
        else:
            raise ValueError(f"Dataset {dataset} is not supported.")
        
        return responses
    
    def generate_plots(self, fig_path: str = "", save_plots=True):
        # Generate plots for each evaluated dataset: accuracy bar plots and
        # confusion matrices for snowballing and selfaware, a pie chart for
        # freshqa, and bar plots for the free-text datasets.
        plots = {}
        for dataset in self.combined_result:
            if dataset == "snowballing":
                plots["snowballing"] = {}
                plots["snowballing"]["barplot"] = self.snowballing_barplot(self.combined_result[dataset], fig_path, save=save_plots)
                plots["snowballing"]["cm"] = self.snowballing_cm(self.labels[dataset], self.predictions[dataset], fig_path, save=save_plots)

            elif dataset == "selfaware":
                plots["selfaware"] = {}
                plots["selfaware"]["barplot"] = self.selfaware_barplot(self.combined_result[dataset], fig_path, save=save_plots)
                plots["selfaware"]["cm"] = self.selfaware_cm(self.labels[dataset], self.predictions[dataset], fig_path, save=save_plots)
            
            elif dataset == "freshqa":
                plots["freshqa"] = {}
                plots["freshqa"]["piechart"] = self.freshqa_piechart(self.combined_result[dataset], fig_path, save=save_plots)

            elif dataset == "freetext":
                plots["freetext"] = {}
                plots["freetext"]["barplot"] = self.freetext_barplot(self.combined_result["freetext"], fig_path, save=save_plots)

        return plots
    
    def generate_report(self, report_path: str):
        # Create a LaTeX report and return the path to the generated PDF
        return create_report(self.model_name, report_path)

    def evaluate(self, 
                 model_name: str,
                 input_path: Union[str, pd.DataFrame],
                 output_path: str = "",
                 dataset_path: str = "", 
                 datasets: list = [
                     "snowballing",
                     "selfaware",
                     "freshqa",
                     "factoolqa",
                     "felm-wk",
                     "factcheck-bench",
                     "factscore-bio"
                 ],
                 analyze: bool = True,
                 save_report: bool = True):
        self.logger.info("Evaluating LLM responses...")

        # Set the attributes
        self.model_name = model_name
        self.input_path = input_path
        self.output_path = output_path
        self.dataset_path = dataset_path
        self.datasets = datasets
        
        # Check if the output path is provided (if not, use the default path)
        if self.output_path == "":
            self.output_path = default_output_path

        # Check if the output path exists (if not, create it)
        if not os.path.exists(f"{self.output_path}/{self.run_id}"):
            os.makedirs(f"{self.output_path}/{self.run_id}")
        
        # Check if the dataset path is provided (if not, use the default path)
        if self.dataset_path == "":
            self.dataset_path = default_dataset_path

        # Read the input
        self.logger.info("Reading the input...")
        df = self.read_input()
        self.logger.info(f"Combined data contains {len(df)} rows")
        
        # Evaluate model responses over each dataset
        self.combined_result = {}
        self.labels = {}
        self.predictions = {}
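        # combined_result maps each dataset name to its evaluation result;
        # the free-text datasets are grouped together under the "freetext" key.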
        for dataset in self.datasets:
            logger.info(f"Evaluating responses for dataset: {dataset}...")
            if dataset == "snowballing":
                # Filter responses based on the dataset
                responses = self.filter_responses(df, dataset)

                # Evaluate the responses
                result, labels, preds = self.evaluate_snowballing(responses)

                # Store the output and save the results
                df_out = pd.DataFrame({"gold_labels": labels, "predictions": preds})
                df_out.to_json(f"{self.output_path}/{self.run_id}/{dataset}_output.jsonl", orient="records", lines=True)
                self.combined_result[dataset] = result
                self.labels[dataset] = labels
                self.predictions[dataset] = preds
            
            elif dataset == "selfaware":
                # Filter responses based on the dataset
                responses = self.filter_responses(df, dataset)

                # Evaluate the responses
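                # NOTE: only the first 30 filtered responses are passed to the
                # evaluator (the same hard-coded cap is applied to the freshqa
                # and free-text evaluations below).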
                result, labels, preds = self.evaluate_selfaware(responses[:30])

                # Store the output and save the results
                df_out = pd.DataFrame({"gold_labels": labels, "predictions": preds})
                df_out.to_json(f"{self.output_path}/{self.run_id}/{dataset}_output.jsonl", orient="records", lines=True)
                self.combined_result[dataset] = result
                self.labels[dataset] = labels
                self.predictions[dataset] = preds

            elif dataset == "freshqa":
                # Filter responses based on the dataset
                responses = self.filter_responses(df, dataset)

                # Evaluate the responses
                result, raw_evals, preds = self.evaluate_freshqa(responses[:30])

                # Store the output and save the results
                df_out = pd.DataFrame({"raw_evaluations": raw_evals, "predictions": preds})
                df_out.to_json(f"{self.output_path}/{self.run_id}/{dataset}_output.jsonl", orient="records", lines=True)
                self.combined_result[dataset] = result

            elif dataset in ["factoolqa", "felm-wk", "factcheck-bench", "factscore-bio"]:
                # Check if the freetext key exists
                if self.combined_result.get("freetext") is None:
                    self.combined_result["freetext"] = {}

                # Filter responses based on the dataset
                responses = self.filter_responses(df, dataset)

                # Evaluate the responses
                results, evaluations = self.evaluate_freetext(responses[:30], self.model_name, self.run_id)

                # Store the output and save the results
                df_out = pd.DataFrame(evaluations)
                df_out.to_json(f"{self.output_path}/{self.run_id}/{dataset}_output.jsonl", orient="records", lines=True)
                self.combined_result["freetext"][dataset] = results
            
            else:
                logger.error(f"Dataset {dataset} is not supported.")
                raise ValueError(f"Dataset {dataset} is not supported.")

            logger.info(f"Finished evaluating responses for dataset: {dataset}")

        # Save all evaluation results
        with open(f"{self.output_path}/{self.run_id}/result.json", "w") as json_file:
            json.dump(self.combined_result, json_file, indent=4)

        # Analyze the results
        if analyze:
            self.logger.info("Analyzing the results...")
            self.generate_plots(save_plots=True, fig_path=f"{self.output_path}/{self.run_id}")
        
        # Create a report
        if save_report:
            self.logger.info("Creating the report...")
            self.generate_report(report_path=f"{self.output_path}/{self.run_id}")
        
        return self.combined_result