# prediction_multilabel.py (sbic-method2, revision 6cf7ff8)
# Multilabel prediction over CSR report paragraphs: embedding similarity search
# against a labeled corpus, KNN label voting, sigmoid + threshold binarization.
import time
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import util
import os
# Set random seed for reproducibility
torch.manual_seed(1)
# Base directory for raw_data/ and embeddings/ inputs
# (assumes the script is launched from the repository root -- TODO confirm).
path = os.getcwd()
# Load datasets
df_inmemory = pd.read_csv(path + '/raw_data/labeled.csv') # labeled text extracted from 230 CSR GRI reports, 150 International companies, 2017-2021 period
# Paragraphs to classify; latin1 avoids decode errors in the demo file.
df_paragraph = pd.read_csv(path + '/raw_data/prediction_demo.csv', encoding='latin1')
# Load stored embeddings
# NOTE(review): pickle.load executes arbitrary code if the file is untrusted --
# these are assumed to be locally generated artifacts.
with open(path + '/embeddings/embeddings_prediction.pkl', "rb") as f:
    stored_data = pickle.load(f)
pred_embeddings = stored_data['parg_embeddings']  # embeddings of paragraphs to predict
with open(path + '/embeddings/embeddings_labeled.pkl', "rb") as f:
    stored_data = pickle.load(f)
embeddings = stored_data['sent_embeddings']  # embeddings of the labeled corpus
# Cosine-similarity search helper: nearest labeled reports for one query embedding.
def get_top_n_similar_reports(new_report, report_embeddings, top_n=20):
    """Return the top_n labeled reports most similar to one query embedding.

    new_report: a single query embedding, 2-D shape (1, dim) -- TODO confirm.
    report_embeddings: corpus embeddings searched against.
    top_n: number of hits to keep (default 20).
    Returns a DataFrame with 'top_report_ids' (corpus indices) and
    'cosine_similarity' (scores), ordered most-similar first.
    """
    # semantic_search returns one hit list per query; we pass one query.
    hits = util.semantic_search(new_report, report_embeddings, top_k=top_n)[0]
    ids = []
    scores = []
    for hit in hits:
        ids.append(hit['corpus_id'])
        scores.append(hit['score'])
    return pd.DataFrame({'top_report_ids': ids, 'cosine_similarity': scores})
# Perform cosine similarity search for each paragraph embedding.
# Fix: the enumerate() index was never used (and shadowed a later loop variable),
# so iterate the embeddings directly.
test_embeddings = pred_embeddings[:50000]  # cap this run at the first 50k paragraphs
all_predictions = []
start = time.time()
for test_embedding in test_embeddings:
    # semantic_search expects a 2-D query: reshape one embedding to (1, dim).
    result_df = get_top_n_similar_reports(test_embedding.reshape(1, -1), embeddings)
    # Attach the labeled rows to each hit; assumes df_inmemory has an 'index'
    # column matching the corpus ids -- TODO confirm against labeled.csv.
    result = pd.merge(result_df, df_inmemory, left_on='top_report_ids', right_on='index', how='left')
    all_predictions.append(result)
# Outer key = paragraph position, so df_all_predictions.xs(i) recovers one query's hits.
df_all_predictions = pd.concat(all_predictions, keys=range(len(all_predictions)), axis=0)
# Apply K-Nearest Neighbor (KNN) algorithm: for each paragraph, sum the label
# columns of its 12 most similar labeled reports (a vote count per label).
# Fix: the original concatenated the growing `predict` frame inside the loop
# (accidental O(n^2), plus a pandas FutureWarning for concatenating an empty
# frame) and summed rows with a Python-level += loop. We collect one row per
# paragraph and concatenate once, with a vectorized column sum.
top_n = 12
knn_rows = []
for item in range(len(all_predictions)):
    # Hits for paragraph `item`, narrowed to the top_n by similarity.
    k_similar_reports = df_all_predictions.xs(item).nlargest(top_n, ['cosine_similarity'])
    # Columns 8: skip the merge bookkeeping (top_report_ids, cosine_similarity,
    # and the leading metadata columns) and keep only the label columns.
    # skipna=False matches the original 0 + NaN = NaN propagation for unmatched merges.
    votes = k_similar_reports.iloc[:top_n, 8:].sum(axis=0, skipna=False)
    knn_rows.append(pd.DataFrame([votes.values], columns=votes.index))
if knn_rows:
    predict = pd.concat(knn_rows, ignore_index=True)
else:
    # No predictions at all: keep the original empty frame with label columns.
    predict = pd.DataFrame(columns=df_inmemory.columns[6:])
# Apply Sigmoid activation function to the KNN vote counts and binarize:
# a label is assigned when sigmoid(votes) exceeds 0.90.
vote_matrix = predict.to_numpy().astype(float)
scores = torch.sigmoid(torch.tensor(vote_matrix, dtype=torch.float32))
binary_labels = (scores > 0.90).float()
# Save results: align the predicted labels with their source paragraphs.
labels_df = pd.DataFrame(binary_labels.numpy(), columns=predict.columns)
df_results = pd.concat([df_paragraph.iloc[:50000, :].reset_index(), labels_df], axis=1)
df_results.to_csv('df_results_0_50k.csv', index=False)
print(f"Processing completed in {time.time() - start:.2f} seconds.")