#bibliotecas import pandas as pd import numpy as np import torch from torch import cuda from torch.nn import functional as F #from sklearn.model_selection import train_test_split import transformers from transformers import ( AutoTokenizer, AutoModelForSequenceClassification, ) from sentence_transformers import SentenceTransformer #classes e funcs #parte 1 ########################################################################################################### #parte 1 ########################################################################################################### def convert_label(lista): for x in range(len(lista)): curr = lista[x] lista[x] = 0 if curr =='loss' else 1 if curr == 'hazard' else 2# if curr == 'constraint' else 3 return lista def df_with_pred(labels, predictions, data): lista = [] cont = 0 #predicted = np.argmax(results.logits.cpu(), axis=-1) for test,pred in zip(labels, predictions): lista.append([data.id.iloc[cont],data.req.iloc[cont],test,pred.item()]) cont += 1 return pd.DataFrame(lista, columns=['id','req', 'label', 'pred']) #parte 2 ########################################################################################################### #parte 2 ########################################################################################################### # def organize_predictions_list(predicted, data):#data : ['id','req', 'label', 'pred'] # list_loss = [] # list_hazard = [] # list_constraint = [] # for x in range(len(predicted)): # if(predicted[x] == 0): # list_loss.append([data.id.iloc[x], data.req.iloc[x]]) # elif(predicted[x] == 1): # list_hazard.append([data.id.iloc[x], data.req.iloc[x]]) # elif(predicted[x] == 2): # list_constraint.append([data.id.iloc[x], data.req.iloc[x]]) # return pd.DataFrame(list_loss, columns=['id','req']), pd.DataFrame(list_hazard, columns=['id','req']), pd.DataFrame(list_constraint, columns=['id','req']) def organize_step2_predictions(predictions, list_sentences): list_correct = [] list_incorrect = [] for prediction, sentence in zip(predictions, list_sentences): if prediction == 0: list_correct.append(sentence) else: list_incorrect.append(sentence) return list_correct, list_incorrect def get_incorrect(predicted, data): #data : [id, req] list_incorrect = [] for x in range(len(predicted)): if predicted[x] == 1: list_incorrect.append([data.id.iloc[x],data.req.iloc[x]]) return pd.DataFrame(list_incorrect,columns=['id','req']) #parte 3 ########################################################################################################### #parte 3 ########################################################################################################### def format_examples(df): examples = [] for sentence in df: examples.append([sentence,sentence]) return examples def check_similarity_return(list_incorrect, list_correct, model): embeddings = model.encode(list_correct) for x in range(len(list_incorrect)): id = list_incorrect.id.iloc[x] sentence = list_incorrect.req.iloc[x] sentence = model.encode(sentence) similarity = model.similarity(sentence, embeddings) sim_pair = [] for sim,correct in zip(similarity[0].tolist(), list_correct): sim_pair.append([id, sim, correct[0]]) sim_pair.sort(key=lambda x: x[0]) sim_pair.reverse() return sim_pair[:10] def check_similarity_return2(list_incorrect, list_correct, model): sim_pair = [] embeddings = model.encode(list_correct) for x in range(len(list_incorrect)): id = list_incorrect.id.iloc[x] sentence = list_incorrect.req.iloc[x] sentence = model.encode(sentence) similarity = model.similarity(sentence, embeddings) temp_list = [] for sim,correct in zip(similarity[0].tolist(), list_correct): temp_list.append([id, sim, correct[0]]) temp_list.sort(key=lambda x: x[1]) temp_list.reverse() sim_pair.extend(temp_list[:10]) # print(sim_pair) return sim_pair #parte 4 ########################################################################################################### #parte 4 ########################################################################################################### def list_erro_with_pred(results, data, sub): diff_label = [] cont = 0 predicted = np.argmax(results.logits.cpu(), axis=-1) probabilidade = F.softmax(results.logits.cpu(), dim=-1) for id,req,pred,prob in zip(data.id, data.req, predicted, probabilidade): # print(pred) # print(sub[pred.item()]) # print(prob.tolist()) #diff_label.append([id,req,sub[pred.item()],prob.tolist()]) diff_label.append([id,req,pred.item(),prob.tolist()]) cont+=1 return diff_label ######################################################################## ######################################################################## ######################################################################## ######################################################################## ###########################################