|
import os |
|
import re |
|
import torch |
|
import string |
|
import numpy as np |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
from transformers import AutoTokenizer, AutoModel |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from sklearn.metrics import classification_report, confusion_matrix |
|
|
|
os.environ["TOKENIZERS_PARALLELISM"] = "false" |
|
|
|
class SelfAwareEvaluator(): |
|
def __init__(self): |
|
pass |
|
|
|
def remove_punctuation(self, input_string): |
|
""" |
|
Remove the punctuation from the input string. |
|
""" |
|
input_string = input_string.strip().lower() |
|
if input_string and input_string[-1] in string.punctuation: |
|
return input_string[:-1] |
|
return input_string |
|
|
|
|
|
def cut_sentences(self, content): |
|
""" |
|
Cut the content into sentences. |
|
""" |
|
sentences = re.split(r"(\.|\!|\?|。|!|?|\.{6})", content) |
|
return sentences |
|
|
|
|
|
def cut_sub_string(self, input_string, window_size=5, punctuation=".,?!"): |
|
""" |
|
Cut the input string into sub-strings of a fixed window size. |
|
""" |
|
input_string = input_string.strip().lower() |
|
|
|
|
|
if len(input_string) < 2: |
|
return [""] |
|
|
|
|
|
if input_string[-1] in punctuation: |
|
input_string = input_string[:-1] |
|
|
|
|
|
string_list = input_string.split() |
|
|
|
|
|
length = len(string_list) |
|
if length <= window_size: |
|
return [input_string] |
|
else: |
|
res = [] |
|
for i in range(length - window_size + 1): |
|
sub_string = " ".join(string_list[i: i + window_size]) |
|
if sub_string != "" or sub_string != " ": |
|
res.append(sub_string) |
|
return res |
|
|
|
|
|
def group_cosine_similarity(self, model, tokenizer, sentences1, sentences2): |
|
""" |
|
Calculate the cosine similarity between two groups of sentences. |
|
""" |
|
|
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
model.to(device) |
|
|
|
|
|
encoded1 = tokenizer(sentences1, padding=True, truncation=True, return_tensors="pt").to(device) |
|
encoded2 = tokenizer(sentences2, padding=True, truncation=True, return_tensors="pt").to(device) |
|
|
|
|
|
embeddings1 = model(**encoded1).pooler_output |
|
embeddings2 = model(**encoded2).pooler_output |
|
|
|
|
|
similarities = cosine_similarity(embeddings1.detach().cpu().numpy(), embeddings2.detach().cpu().numpy()) |
|
|
|
return similarities |
|
|
|
|
|
def get_unanswerable(self, response, model, tokenizer, threshold=0.75): |
|
""" |
|
Predict whether the response is unanswerable or not. |
|
""" |
|
|
|
uncertain_list = [ |
|
"The answer is unknown.", |
|
"The answer is uncertain.", |
|
"The answer is unclear.", |
|
"There is no scientific evidence.", |
|
"There is no definitive answer.", |
|
"There is no right answer.", |
|
"There is much debate.", |
|
"There is no known case.", |
|
"There is no concrete answer to this question.", |
|
"There is no public information available.", |
|
"It is impossible to know.", |
|
"It is impossible to answer.", |
|
"It is difficult to predict.", |
|
"It is not known.", |
|
"We do not know.", |
|
"I'm not sure.", |
|
] |
|
uncertain_list = [self.remove_punctuation(_) for _ in uncertain_list] |
|
|
|
|
|
pred_unanswerable = False |
|
response = response.lower() |
|
for uncertain in uncertain_list: |
|
if uncertain in response: |
|
pred_unanswerable = True |
|
|
|
|
|
if pred_unanswerable == False: |
|
|
|
sub_sen_list = self.cut_sentences(response) |
|
|
|
|
|
sub_str_list = [] |
|
for sub_sen in sub_sen_list: |
|
if len(sub_sen) >= 2: |
|
sub_str_list.extend(self.cut_sub_string(sub_sen)) |
|
|
|
|
|
if len(sub_str_list) != 0: |
|
similarities = self.group_cosine_similarity(model, tokenizer, sub_str_list, uncertain_list) |
|
else: |
|
similarities = [0] |
|
|
|
|
|
max_uncertainty = np.max(similarities) |
|
|
|
|
|
if max_uncertainty > threshold: |
|
pred_unanswerable = True |
|
|
|
return pred_unanswerable |
|
|
|
def selfaware_barplot(self, result: dict, fig_path: str = "", save: bool = False): |
|
""" |
|
Create a bar plot of the performance on the SelfAware dataset. |
|
|
|
Parameters |
|
---------- |
|
result : dict |
|
The evaluation results for the LLM responses on the SelfAware dataset. |
|
fig_path : str |
|
The path to save the figure. |
|
save : bool, optional |
|
Whether to save the figure, by default True. |
|
""" |
|
|
|
|
|
unanswerable_as_pos = result["unanswerable_as_pos"] |
|
answerable_as_pos = result["answerable_as_pos"] |
|
|
|
|
|
unanswerable_as_pos.pop("support", None) |
|
answerable_as_pos.pop("support", None) |
|
|
|
|
|
metrics = list(unanswerable_as_pos.keys()) |
|
unanswerable_values = [round(v, 2) for k, v in unanswerable_as_pos.items()] |
|
answerable_values = [round(v, 2) for k, v in answerable_as_pos.items()] |
|
|
|
|
|
fig, ax = plt.subplots() |
|
|
|
|
|
n_groups = len(metrics) |
|
index = np.arange(n_groups) |
|
bar_width = 0.35 |
|
|
|
|
|
colors = sns.color_palette("rocket", n_colors=10) |
|
color_unanswerable = colors[1] |
|
color_answerable = colors[7] |
|
|
|
|
|
bars1 = ax.bar(index, unanswerable_values, bar_width, label='Unanswerable as Positive', color=color_unanswerable) |
|
bars2 = ax.bar(index + bar_width, answerable_values, bar_width, label='Answerable as Positive', color=color_answerable) |
|
|
|
|
|
for bar in bars1: |
|
ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{bar.get_height():.2f}', |
|
ha='center', va='bottom', color='black', rotation='horizontal') |
|
for bar in bars2: |
|
ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{bar.get_height():.2f}', |
|
ha='center', va='bottom', color='black', rotation='horizontal') |
|
|
|
|
|
ax.set_xticks(index + bar_width / 2) |
|
ax.set_xticklabels(metrics) |
|
|
|
|
|
ax.set_ylim((0, max(unanswerable_values + answerable_values) + 0.1)) |
|
|
|
|
|
ax.set_xlabel("Metrics") |
|
ax.set_ylabel("Performance") |
|
ax.set_title("Performance on SelfAware Dataset") |
|
ax.legend() |
|
|
|
if save: |
|
|
|
plt.tight_layout() |
|
plt.savefig(os.path.join(fig_path, "selfaware_barplot.pdf"), format="pdf") |
|
plt.savefig(os.path.join(fig_path, "selfaware_barplot.png"), format="png") |
|
|
|
|
|
return fig |
|
|
|
def selfaware_cm(self, labels: list, preds: list, fig_path: str = "", save: bool = False): |
|
""" |
|
Create a confusion matrix for the SelfAware dataset. |
|
|
|
Parameters |
|
---------- |
|
labels : list |
|
The true labels. |
|
preds : list |
|
The predicted labels. |
|
fig_path : str |
|
The path to save the figure. |
|
save : bool, optional |
|
Whether to save the figure, by default True. |
|
""" |
|
|
|
|
|
fig, ax = plt.subplots() |
|
|
|
|
|
cm = sns.heatmap(confusion_matrix(labels, preds), annot=True, fmt="d", cmap="Blues", ax=ax) |
|
|
|
|
|
plt.xticks(ticks=[0.5, 1.5], labels=["Answerable", "Unanswerable"]) |
|
plt.yticks(ticks=[0.5, 1.5], labels=["Answerable", "Unanswerable"]) |
|
plt.ylabel("True label") |
|
plt.xlabel("Predicted label") |
|
plt.title("Confusion Matrix on SelfAware dataset.") |
|
|
|
if save: |
|
|
|
plt.tight_layout() |
|
plt.savefig(os.path.join(fig_path, "selfaware_cm.pdf"), format="pdf") |
|
plt.savefig(os.path.join(fig_path, "selfaware_cm.png"), format="png") |
|
|
|
|
|
return fig |
|
|
|
def evaluate_selfaware(self, llm_responses): |
|
|
|
model_name = "princeton-nlp/sup-simcse-roberta-large" |
|
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
model = AutoModel.from_pretrained(model_name).to(device) |
|
|
|
|
|
labels, preds = [], [] |
|
for item in llm_responses: |
|
|
|
labels.append(item["label_unanswerable"]) |
|
|
|
preds.append(self.get_unanswerable(item["response"], model, tokenizer)) |
|
|
|
|
|
result = classification_report(labels, preds, output_dict=True, zero_division=0) |
|
|
|
|
|
|
|
if "True" in result: |
|
result['unanswerable_as_pos'] = result.pop("True") |
|
if "False" in result: |
|
result['answerable_as_pos'] = result.pop('False') |
|
|
|
return result, labels, preds |
|
|
|
|