Spaces:

hasaniqbal777
/

OpenFactCheck-Prerelease

Sleeping

File size: 15,645 Bytes

import json
import os
import uuid
from importlib import resources as pkg_resources
from typing import Optional, Union

import pandas as pd

from openfactcheck.lib.logger import logger
from openfactcheck.core.base import OpenFactCheck
from openfactcheck.evaluator.llm.evaluate_snowballing import SnowballingEvaluator
from openfactcheck.evaluator.llm.evaluate_selfaware import SelfAwareEvaluator
from openfactcheck.evaluator.llm.evaluate_freshqa import FreshQAEvaluator
from openfactcheck.evaluator.llm.evaluate_freetext import FreeTextEvaluator
from openfactcheck.evaluator.llm.report import create_report

from openfactcheck import data as data_dir

# Fallback locations used by LLMEvaluator.evaluate() when the caller passes an
# empty string for the corresponding path:
# - default_dataset_path: directory of the `openfactcheck.data` package,
#   resolved through importlib.resources and converted to a plain string.
# - default_output_path: a relative path; a per-run sub-directory named after
#   the run id is created underneath it.
default_dataset_path = str(pkg_resources.files(data_dir))
default_output_path = "tmp/output/llm_evaluator"

class LLMEvaluator(SnowballingEvaluator, SelfAwareEvaluator, FreshQAEvaluator, FreeTextEvaluator):
    """
    Evaluate the responses of a Language Model on a set of benchmark datasets.

    The class combines the dataset-specific evaluators it inherits from and
    drives them from a single entry point, :meth:`evaluate`.

    Parameters
    ----------
    ofc : OpenFactCheck
        Forwarded unchanged to ``FreeTextEvaluator.__init__``.

    Attributes
    ----------
    model_name : str
        The name of the Language Model (set by :meth:`evaluate`).
    run_id : str
        The unique identifier for the run (a random UUID4 hex string).
    input_path : Union[str, pd.DataFrame]
        The path to the CSV file or the DataFrame containing the LLM responses.
    output_path : str
        The path to store the output files.
    dataset_path : str
        The path to the directory containing the ``.jsonl`` dataset files.
    datasets : list
        The list of datasets to evaluate the LLM on.
    combined_result : dict
        The combined evaluation results for all datasets.
    labels : dict
        Gold labels per dataset (filled for "snowballing" and "selfaware").
    predictions : dict
        Predictions per dataset (filled for "snowballing" and "selfaware").

    Methods
    -------
    evaluate(model_name, input_path, output_path="", dataset_path="", datasets=None, analyze=True, save_report=True, save_plots=True):
        Evaluate the LLM responses and write the results to the output path.
    read_input():
        Read the responses and the dataset files and return them combined.
    filter_responses(df, dataset):
        Extract the records of one dataset from the combined DataFrame.
    generate_plots(fig_path="", save_plots=True):
        Generate the plots for the evaluated datasets.
    generate_report(report_path):
        Create the report for the evaluated model.
    """

    # Maximum number of responses evaluated for the "selfaware", "freshqa" and
    # free-text datasets ("snowballing" is always evaluated in full).
    # NOTE(review): looks like a cost-limiting cap for the prerelease — confirm.
    _RESPONSE_LIMIT = 30

    # Datasets whose results are grouped under the "freetext" key.
    _FREETEXT_DATASETS = ("factoolqa", "felm-wk", "factcheck-bench", "factscore-bio")

    def __init__(self, ofc: OpenFactCheck):
        # The parent initialisers take different arguments, so each one is
        # called explicitly instead of going through super().
        SnowballingEvaluator.__init__(self)
        SelfAwareEvaluator.__init__(self)
        FreshQAEvaluator.__init__(self)
        FreeTextEvaluator.__init__(self, ofc)
        self.logger = logger

        # Run configuration; populated by evaluate()
        self.model_name = None
        self.run_id = str(uuid.uuid4().hex)
        self.input_path = None
        self.dataset_path = None
        self.output_path = None
        self.datasets = None

        # Evaluation state; populated by evaluate()
        self.combined_result = None
        self.labels = None
        self.predictions = None

        self.logger.info(f"LLM Evaluator initialized with run_id: {self.run_id}")

    def read_input(self):
        """
        Read the LLM responses and the dataset files into one DataFrame.

        The responses come from ``self.input_path`` (a DataFrame or the path of
        a CSV file) and must have exactly two columns: the first one is used
        as the index, the second one becomes the ``response`` column. The
        questions are read from every ``.jsonl`` file found directly inside
        ``self.dataset_path``.

        Returns
        -------
        pd.DataFrame
            The dataset columns with the ``response`` column appended. Rows
            are aligned on the index, so the index values of the responses
            are expected to match the row positions of the combined dataset.

        Raises
        ------
        AssertionError
            If the responses do not have exactly two columns, or if the number
            of responses differs from the number of dataset rows.
        """
        if isinstance(self.input_path, pd.DataFrame):
            df_responses = self.input_path
        else:
            self.logger.info(f"Reading the LLM responses from {self.input_path}...")
            df_responses = pd.read_csv(self.input_path)

        # Raised explicitly (not via `assert`) so the check survives `python -O`;
        # the exception type is kept for backward compatibility.
        if df_responses.shape[1] != 2:
            raise AssertionError("The LLM responses should have 2 columns.")

        # set_index() returns a new frame, so a DataFrame supplied by the
        # caller is left untouched and can be reused for another run.
        df_responses = df_responses.set_index(df_responses.columns[0])
        df_responses.index.name = None
        df_responses.columns = ["response"]

        # Read the available datasets
        self.logger.info(f"Reading the dataset from {self.dataset_path}...")
        # NOTE(review): os.listdir() returns files in arbitrary order, and the
        # row order of the combined dataset depends on it — confirm whether a
        # fixed order is required for the response index to line up.
        frames = []
        for filename in os.listdir(self.dataset_path):
            if not filename.endswith('.jsonl'):
                continue
            file_path = os.path.join(self.dataset_path, filename)

            self.logger.info(f"Reading {filename.split('.')[0]} dataset...")
            frames.append(pd.read_json(file_path, lines=True))

        # Concatenate once; fall back to an empty frame when nothing was found
        df_dataset = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        # Combine the responses and questions
        if len(df_responses) != len(df_dataset):
            raise AssertionError("The number of responses and questions should be the same.")

        return pd.concat([df_dataset, df_responses], axis=1)

    @staticmethod
    def filter_responses(df: pd.DataFrame, dataset: str):
        """
        Extract the records belonging to one dataset from the combined data.

        Parameters
        ----------
        df : pd.DataFrame
            The combined DataFrame as returned by :meth:`read_input`. It must
            have a ``source`` column.
        dataset : str
            One of "snowballing", "selfaware", "freshqa", "factoolqa",
            "felm-wk", "factcheck-bench" or "factscore-bio".

        Returns
        -------
        list of dict
            One dictionary per matching row. The keys depend on the dataset:
            ``topic``/``response`` for snowballing,
            ``label_unanswerable``/``response`` for selfaware,
            ``question``/``reference_answer``/``response`` for freshqa and
            ``source``/``prompt``/``response`` for the free-text datasets.

        Raises
        ------
        ValueError
            If the dataset is not supported.
        """
        logger.info(f"Filtering responses for dataset: {dataset}...")

        def rows_to_records(rows, columns):
            # Columns are looked up row by row, so a column that is absent
            # only matters when at least one row actually matches.
            return [{column: row[column] for column in columns} for _, row in rows.iterrows()]

        if dataset == "snowballing":
            matching = df[df['source'] == dataset]
            return matching[['topic', 'response']].copy().to_dict(orient='records')

        if dataset == "selfaware":
            selfaware_subset = [
                "selfaware-hotpot_train",
                "selfaware-squadqa_train",
                "selfaware-triviaqa_train",
                "selfaware-squadqa_dev",
                "selfaware-hotpot_dev",
                "selfaware-triviaqa_dev",
                "selfaware-SelfAware",
            ]
            matching = df[df["source"].isin(selfaware_subset)]

            responses = []
            for _, row in matching.iterrows():
                # The value looks like "answerable: False". Strip the prefix
                # itself: str.lstrip() would strip a *set of characters* and
                # could eat into the value.
                answerable = row["ability_to_test"].strip().removeprefix("answerable:").strip()
                responses.append({
                    "label_unanswerable": answerable == "False",
                    "response": row["response"]})
            return responses

        if dataset == "freshqa":
            matching = df[df["source"] == dataset]
            return rows_to_records(matching, ("question", "reference_answer", "response"))

        if dataset in ["factoolqa", "felm-wk", "factcheck-bench"]:
            matching = df[df["source"] == dataset]
            return rows_to_records(matching, ("source", "prompt", "response"))

        if dataset == "factscore-bio":
            factscore_subset = [
                "factscore-labelled",
                "factscore-unlabelled",
            ]
            matching = df[df["source"].isin(factscore_subset)]
            return rows_to_records(matching, ("source", "prompt", "response"))

        raise ValueError(f"Dataset {dataset} is not supported.")

    def generate_plots(self, fig_path: str = "", save_plots=True):
        """
        Generate the plots for every dataset present in ``self.combined_result``.

        Parameters
        ----------
        fig_path : str
            Passed to the plotting helpers as the figure location.
        save_plots : bool
            Passed to the plotting helpers as their ``save`` argument.

        Returns
        -------
        dict
            Nested dictionary ``{dataset: {plot_name: plot}}`` holding whatever
            the inherited plotting helpers return. Keys that are not one of
            "snowballing", "selfaware", "freshqa" or "freetext" are ignored.
        """
        plots = {}
        for dataset, result in self.combined_result.items():
            if dataset in ("snowballing", "selfaware"):
                # Both classification-style datasets get a bar plot and a confusion matrix
                if dataset == "snowballing":
                    barplot, confusion_matrix = self.snowballing_barplot, self.snowballing_cm
                else:
                    barplot, confusion_matrix = self.selfaware_barplot, self.selfaware_cm

                plots[dataset] = {}
                plots[dataset]["barplot"] = barplot(result, fig_path, save=save_plots)
                plots[dataset]["cm"] = confusion_matrix(
                    self.labels[dataset], self.predictions[dataset], fig_path, save=save_plots)

            elif dataset == "freshqa":
                plots["freshqa"] = {
                    "piechart": self.freshqa_piechart(result, fig_path, save=save_plots),
                }

            elif dataset == "freetext":
                plots["freetext"] = {
                    "barplot": self.freetext_barplot(result, fig_path, save=save_plots),
                }

        return plots

    def generate_report(self, report_path: str):
        """
        Create the report for the evaluated model.

        Delegates to ``create_report`` with the model name and ``report_path``
        and returns its result (per the original comment, a LaTeX report is
        built and the path of the generated PDF is returned).
        """
        return create_report(self.model_name, report_path)

    def evaluate(self,
                 model_name: str,
                 input_path: Union[str, pd.DataFrame],
                 output_path: str = "",
                 dataset_path: str = "",
                 datasets: Optional[list] = None,
                 analyze: bool = True,
                 save_report: bool = True,
                 save_plots: bool = True):
        """
        Evaluate the LLM responses on the requested datasets.

        Per-dataset outputs (``<dataset>_output.jsonl``) and the combined
        ``result.json`` are written to ``<output_path>/<run_id>``.

        Parameters
        ----------
        model_name : str
            The name of the Language Model.
        input_path : Union[str, pd.DataFrame]
            The path to the CSV file or the DataFrame containing the LLM
            responses. It must have two columns: the index of the response
            and the response generated by the LLM.
        output_path : str
            The path to store the output files. An empty string selects the
            module default.
        dataset_path : str
            The directory containing the ``.jsonl`` dataset files. An empty
            string selects the module default.
        datasets : list, optional
            The datasets to evaluate the LLM on. Defaults to all supported
            datasets: "snowballing", "selfaware", "freshqa", "factoolqa",
            "felm-wk", "factcheck-bench" and "factscore-bio".
        analyze : bool
            Whether to generate the plots for the results.
        save_report : bool
            Whether to create the report.
        save_plots : bool
            Whether the generated plots are saved (only used when ``analyze``
            is true).

        Returns
        -------
        dict
            The combined evaluation results, keyed by dataset; the free-text
            datasets are grouped under the "freetext" key.

        Raises
        ------
        ValueError
            If any requested dataset is not supported. All names are checked
            before the first dataset is evaluated.
        """
        self.logger.info("Evaluating LLM responses...")

        # A fresh list per call avoids sharing one mutable default between calls
        if datasets is None:
            datasets = [
                "snowballing",
                "selfaware",
                "freshqa",
                "factoolqa",
                "felm-wk",
                "factcheck-bench",
                "factscore-bio",
            ]

        # Set the attributes, falling back to the module defaults for empty paths
        self.model_name = model_name
        self.input_path = input_path
        self.output_path = default_output_path if output_path == "" else output_path
        self.dataset_path = default_dataset_path if dataset_path == "" else dataset_path
        self.datasets = datasets

        # Every output of this run goes into its own sub-directory
        run_dir = f"{self.output_path}/{self.run_id}"
        os.makedirs(run_dir, exist_ok=True)

        # Read the input
        self.logger.info("Reading the input...")
        df = self.read_input()
        self.logger.info(f"Combined data contains {len(df)} rows")

        # Fail fast: reject unsupported names before any evaluation is run
        supported = ("snowballing", "selfaware", "freshqa") + self._FREETEXT_DATASETS
        for dataset in self.datasets:
            if dataset not in supported:
                self.logger.error(f"Dataset {dataset} is not supported.")
                raise ValueError(f"Dataset {dataset} is not supported.")

        # Evaluate model responses over each dataset
        self.combined_result = {}
        self.labels = {}
        self.predictions = {}
        limit = self._RESPONSE_LIMIT
        for dataset in self.datasets:
            self.logger.info(f"Evaluating responses for dataset: {dataset}...")
            output_file = f"{run_dir}/{dataset}_output.jsonl"

            if dataset in ("snowballing", "selfaware"):
                responses = self.filter_responses(df, dataset)

                # Snowballing is evaluated in full, selfaware only up to the limit
                if dataset == "snowballing":
                    result, labels, preds = self.evaluate_snowballing(responses)
                else:
                    result, labels, preds = self.evaluate_selfaware(responses[:limit])

                df_out = pd.DataFrame({"gold_labels": labels, "predictions": preds})
                df_out.to_json(output_file, orient="records", lines=True)
                self.combined_result[dataset] = result
                # Kept for the confusion matrices drawn by generate_plots()
                self.labels[dataset] = labels
                self.predictions[dataset] = preds

            elif dataset == "freshqa":
                responses = self.filter_responses(df, dataset)
                result, raw_evals, preds = self.evaluate_freshqa(responses[:limit])

                df_out = pd.DataFrame({"raw_evaluations": raw_evals, "predictions": preds})
                df_out.to_json(output_file, orient="records", lines=True)
                self.combined_result[dataset] = result

            else:
                # Free-text datasets share one "freetext" entry in the results
                if self.combined_result.get("freetext") is None:
                    self.combined_result["freetext"] = {}

                responses = self.filter_responses(df, dataset)
                results, evaluations = self.evaluate_freetext(
                    responses[:limit], self.model_name, self.run_id)

                df_out = pd.DataFrame(evaluations)
                df_out.to_json(output_file, orient="records", lines=True)
                self.combined_result["freetext"][dataset] = results

            self.logger.info(f"Finished evaluating responses for dataset: {dataset}")

        # Save all evaluation results
        with open(f"{run_dir}/result.json", "w", encoding="utf-8") as json_file:
            json.dump(self.combined_result, json_file, indent=4)

        # Analyze the results
        if analyze:
            self.logger.info("Analyzing the results...")
            self.generate_plots(save_plots=save_plots, fig_path=run_dir)

        # Create a report
        if save_report:
            self.logger.info("Creating the report...")
            self.generate_report(report_path=run_dir)

        return self.combined_result