Spaces:

hasaniqbal777
/

OpenFactCheck-Prerelease

Sleeping

OpenFactCheck-Prerelease / src /openfactcheck /evaluator /llm /evaluate_snowballing.py

Hasan Iqbal

Added LLM Evaluation on Datasets

eca534f unverified 7 months ago

6.26 kB

	import os
	import re
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	from sklearn.metrics import classification_report, confusion_matrix

	class SnowballingEvaluator():
	"""
	Evaluate the LLM responses on the Snowballing dataset.

	Parameters
	----------
	LLMEvaluator : class
	The LLMEvaluator class.

	Methods
	-------
	evaluate_snowballing(llm_responses: list):
	Evaluate the LLM responses on the Snowballing dataset
	snowballing_barplot(result: dict, fig_path: str, save: bool = False):
	Create a bar plot of the accuracy of the LLM responses on the Snowballing dataset
	for each topic and the overall accuracy.
	get_boolean(response: str, strict=False):
	Get a boolean value from the response.
	"""
	def __init__(self):
	pass

	def get_boolean(self, response: str, strict=False):
	"""
	Get a boolean value from the response.

	"""
	low_response = response.lower()
	if strict:
	if low_response.startswith("yes"):
	return True
	elif low_response.startswith("no"):
	return False
	return None
	else:
	# Check if the response contains any of the specified words
	pattern = r"{}".format("\|".join(["n't", "no"]))
	if bool(re.search(pattern, response, re.IGNORECASE)):
	return False
	else:
	return True

	def snowballing_barplot(self, result: dict, fig_path: str = "", save: bool = False):
	"""
	Create a bar plot of the accuracy of the LLM responses on the Snowballing dataset
	for each topic and the overall accuracy.

	Parameters
	----------
	cresult : dict
	The evaluation results for the LLM responses on the Snowballing dataset
	fig_path : str
	The path to save the figure.
	save : bool, optional
	Whether to save the figure, by default True.
	"""

	# Data
	items = result.keys()

	# Extract the accuracy values for each topic
	values = [round(v["accuracy"], 2) for k, v in result.items()]

	# Create a new figure
	fig, ax = plt.subplots()

	# Plotting
	bars = sns.barplot(x=items, y=values, palette="rocket", hue=items, ax=ax)

	# Adding values on top of each bar
	for bar in bars.patches:
	ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(),
	f'{bar.get_height():.2f}',
	ha='center',
	va='bottom',
	color='black',
	rotation='horizontal')

	# Rotating x-axis tick labels
	plt.xticks(rotation=20)

	# Set y-axis limits to accommodate annotations
	plt.ylim((0, max(values) + 0.1))

	# Adding labels and title
	plt.xlabel("Topics")
	plt.ylabel("Accuracy")
	plt.title("Performance on Snowballing Dataset.")

	if save:
	# Save the figure
	plt.tight_layout()
	plt.savefig(os.path.join(fig_path, "snowballing_barplot.pdf"), format="pdf")
	plt.savefig(os.path.join(fig_path, "snowballing_barplot.png"), format="png")

	# Return the figure
	return fig

	def snowballing_cm(self, labels: list, preds: list, fig_path: str = "", save: bool = False):
	"""
	Create a confusion matrix for the Snowballing dataset.

	Parameters
	----------
	labels : list
	The true labels.
	preds : list
	The predicted labels.
	fig_path : str
	The path to save the figure.
	save : bool, optional
	Whether to save the figure, by default True.
	"""

	# Create a new figure
	fig, ax = plt.subplots()

	# Plotting
	cm = sns.heatmap(confusion_matrix(labels, preds), annot=True, fmt="d", cmap="Blues", ax=ax)

	# Adding labels and title
	plt.xticks(ticks=[0.5, 1.5], labels=["True", "False"])
	plt.yticks(ticks=[0.5, 1.5], labels=["True", "False"])
	plt.ylabel("True label")
	plt.xlabel("Predicted label")
	plt.title("Confusion Matrix on Snowballing dataset.")

	if save:
	# Save the figure
	plt.tight_layout()
	plt.savefig(os.path.join(fig_path, "snowballing_cm.pdf"), format="pdf")
	plt.savefig(os.path.join(fig_path, "snowballing_cm.png"), format="png")

	# Return the figure
	return fig

	def evaluate_snowballing(self, llm_responses: list):
	"""
	Evaluate the LLM responses on the Snowballing dataset.
	"""

	# Store evaluation results for three specific topics and aggregate results
	# for the entire dataset, indexed by topic names.
	results = {}

	# Define the ground truth answers for the three specific topics.
	topic_answers = {
	"Primality Testing": True,
	"US Senator Search": True,
	"Graph Connectivity-Flight Search": False,
	}

	# Store the responses for each topic.
	topic_responses = {}
	for key in topic_answers:
	topic_responses[key] = []

	# Store the responses for each topic.
	for item in llm_responses:
	topic_responses[item["topic"]].append(self.get_boolean(item["response"]))

	# Evaluate the LLM responses
	labels, preds = [], []
	for key in topic_answers:
	# Evaluate the responses for each topic.
	y_true = [topic_answers[key]] * len(topic_responses[key])
	y_pred = topic_responses[key]
	results[key] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

	# Aggregate the results for the entire dataset.
	labels += [topic_answers[key]] * len(topic_responses[key])
	preds += topic_responses[key]

	# Evaluate the responses for the entire dataset.
	results["All"] = classification_report(labels, preds, output_dict=True, zero_division=0)

	return results, labels, preds