from multiprocessing import Pool

import numpy as np
import torch
from scipy.stats import gmean
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from eval.vert import get_self_bleu2_geometric, get_auto_bleu2_geometric, run_f
def perplexity(LLM_Output):
    """Compute GPT-2 perplexity of a single generated string via HF `evaluate`."""
    import evaluate  # imported lazily: `evaluate` pulls in heavy dependencies

    ppl_metric = evaluate.load("perplexity", module_type="metric")
    results = ppl_metric.compute(model_id="gpt2", predictions=[LLM_Output])
    return f"Perplexity: {results['mean_perplexity']:.2f}\n"
def vert(LLM_response_arr):
    """Compute Self-BLEU2 / Auto-BLEU2 diversity metrics and their geometric mean (VERT)."""
    terms = [x.strip().split() for x in LLM_response_arr]

    tasks = [
        ('Self-BLEU2-geometric', get_self_bleu2_geometric),
        ('Auto-BLEU2-geometric', get_auto_bleu2_geometric),
    ]
    n_processes = min(16, len(tasks))
    with Pool(n_processes) as pool:
        metrics = pool.map(run_f, [(t[1], terms) for t in tasks])

    metric_arr = []
    report = ""
    for (metric_name, _), metric in zip(tasks, metrics):
        # Mean score and standard error, rescaled to percentages; the standard
        # error is computed but not included in the report.
        mean, sem = np.mean(metric), np.std(metric) / np.sqrt(len(metric))
        mean, sem = [round(100 * x, 2) for x in [mean, sem]]
        metric_arr.append(mean)
        report += f'{metric_name}: {mean}\n'
    # VERT is the geometric mean of the per-metric scores, which are already
    # percent-scaled, so no further rescaling is needed here.
    report += f'VERT: {round(gmean(metric_arr), 2)}\n'
    return report
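
# A minimal usage sketch (hypothetical responses; VERT needs several samples
# before the diversity scores are meaningful):
#
#   samples = ["the cat sat on the mat",
#              "a dog ran in the park",
#              "the cat sat on the mat"]
#   print(vert(samples))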
def bert_score(total_response_arr):
    """Cosine similarity between the dialogue context and the final response,
    using mean-pooled BERT embeddings."""

    def cosine_similarity_context_response(context, response, model, tokenizer):
        # Tokenize and encode both context and response, then move to GPU.
        context_inputs = tokenizer(context, return_tensors="pt", truncation=True)
        response_inputs = tokenizer(response, return_tensors="pt", truncation=True)
        context_inputs = {k: v.cuda() for k, v in context_inputs.items()}
        response_inputs = {k: v.cuda() for k, v in response_inputs.items()}

        # Mean-pool the last hidden states to get one embedding per text.
        with torch.no_grad():
            context_embedding = model(**context_inputs).last_hidden_state.mean(dim=1)
            response_embedding = model(**response_inputs).last_hidden_state.mean(dim=1)

        # Compute cosine similarity between the two pooled embeddings.
        similarity = cosine_similarity(
            context_embedding.cpu().numpy(), response_embedding.cpu().numpy()
        )
        return similarity[0][0]

    bert_model_name = "bert-base-uncased"
    bert_model = AutoModel.from_pretrained(bert_model_name).cuda()
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

    # All turns except the last form the context; the last turn is the response.
    similarity = cosine_similarity_context_response(
        " ".join(total_response_arr[:-1]), total_response_arr[-1],
        bert_model, bert_tokenizer,
    )
    return f"Cosine Similarity: {similarity * 100:.2f}\n"
def DialoGPT_perplexity(user_utterance, response):
    def evaluate_response_with_dialoGPT(context, response, model, tokenizer):
        """
        Evaluate the appropriateness of a response given a context using DialoGPT.

        Args:
            context (str): The dialogue context (previous conversation).
            response (str): The generated response to evaluate.
            model: Pre-trained DialoGPT model.
            tokenizer: Corresponding tokenizer for the DialoGPT model.

        Returns:
            float: Perplexity of the response given the context.
        """
        model.eval()

        # Concatenate context and response, separated by EOS tokens, in the
        # turn format DialoGPT was trained on.
        input_text = context + tokenizer.eos_token + response + tokenizer.eos_token
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        inputs = {k: v.cuda() for k, v in inputs.items()}

        # Language-modeling loss with the inputs serving as their own labels.
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss

        # Perplexity is the exponential of the mean cross-entropy loss.
        return torch.exp(loss).cpu().item()

    # Load the DialoGPT model and tokenizer (choose small/medium/large based
    # on available resources).
    model_name = "microsoft/DialoGPT-medium"
    model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    perplexity = evaluate_response_with_dialoGPT(user_utterance, response, model, tokenizer)
    return f"DialoGPT Perplexity: {perplexity:.2f}\n"