import os
import pickle
import time

import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import util

# Set random seed for reproducibility
torch.manual_seed(1)

path = os.getcwd()

# Load datasets:
# - labeled.csv: labeled text extracted from 230 CSR GRI reports
#   (150 international companies, 2017-2021)
# - prediction_demo.csv: paragraphs to be classified
df_inmemory = pd.read_csv(os.path.join(path, 'raw_data', 'labeled.csv'))
df_paragraph = pd.read_csv(os.path.join(path, 'raw_data', 'prediction_demo.csv'), encoding='latin1')

# Load stored embeddings
with open(os.path.join(path, 'embeddings', 'embeddings_prediction.pkl'), "rb") as f:
    stored_data = pickle.load(f)
    pred_embeddings = stored_data['parg_embeddings']  # paragraph embeddings to classify

with open(os.path.join(path, 'embeddings', 'embeddings_labeled.pkl'), "rb") as f:
    stored_data = pickle.load(f)
    embeddings = stored_data['sent_embeddings']  # embeddings of the labeled corpus
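
# Both pickles are assumed to hold 2-D arrays with one embedding vector per
# row, as produced by a sentence-transformers encode() pass; each row is then
# reshaped into a single-row query matrix in the search loop below.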

# Define function for cosine similarity search
def get_top_n_similar_reports(new_report, report_embeddings, top_n=20):
    """Return the top_n most similar labeled entries for a single query
    embedding, as a DataFrame of (corpus id, cosine similarity) pairs."""
    search_hits = util.semantic_search(new_report, report_embeddings, top_k=top_n)
    top_report_ids = [hit['corpus_id'] for hit in search_hits[0]]
    similarity_scores = [hit['score'] for hit in search_hits[0]]

    return pd.DataFrame({'top_report_ids': top_report_ids,
                         'cosine_similarity': similarity_scores})
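
# For reference, util.semantic_search scores by cosine similarity by default
# and returns one hit list per query, e.g. (illustrative values only):
#   [[{'corpus_id': 17, 'score': 0.83}, {'corpus_id': 4, 'score': 0.79}, ...]]
# search_hits[0] above is therefore the hit list for the single query row.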

# Perform cosine similarity search for the first 50,000 paragraphs
# (the chunk saved as 'df_results_0_50k.csv' below)
test_embeddings = pred_embeddings[:50000]
all_predictions = []

start = time.time()
for test_embedding in test_embeddings:
    result_df = get_top_n_similar_reports(test_embedding.reshape(1, -1), embeddings)
    # Attach the labels of every hit; 'index' is the row-id column of labeled.csv
    result = pd.merge(result_df, df_inmemory, left_on='top_report_ids',
                      right_on='index', how='left')
    all_predictions.append(result)

df_all_predictions = pd.concat(all_predictions, keys=range(len(all_predictions)), axis=0)
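# 'keys=' builds a MultiIndex on the concatenated frame, so the 20 hits for
# paragraph i can be recovered below with df_all_predictions.xs(i).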

# Apply K-Nearest Neighbors (KNN): for each paragraph, sum the binary label
# columns of its k most similar labeled entries (k=12 of the 20 retrieved)
top_n = 12
predict = pd.DataFrame(columns=df_inmemory.columns[6:])
for item in range(len(all_predictions)):
    k_similar_reports = df_all_predictions.xs(item).nlargest(top_n, ['cosine_similarity'])
    # The merge prepended 'top_report_ids' and 'cosine_similarity', so
    # columns[8:] here line up with df_inmemory.columns[6:] (the labels)
    result_knn = pd.DataFrame(0, index=[0], columns=k_similar_reports.columns[8:])
    for i in range(top_n):
        result_knn += k_similar_reports.iloc[i, 8:].values
    predict = pd.concat([predict, result_knn], ignore_index=True)
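
# A vectorized sketch of the vote count (assumes the same column alignment):
#   votes = k_similar_reports.iloc[:top_n, 8:].sum()
# This yields a Series rather than a one-row DataFrame, so the concat above
# would need a small adjustment; the explicit loop is kept as written.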

# Apply a sigmoid activation, then threshold at 0.90 for binary multi-label output
sigmoid = nn.Sigmoid()
data_tensor = torch.tensor(predict.to_numpy().astype(float), dtype=torch.float32)
output = sigmoid(data_tensor)
output = (output > 0.90).float()
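# With integer vote counts this threshold means "at least 3 of the 12
# neighbors carry the label": sigmoid(x) > 0.90 iff x > ln(0.9/0.1) ~= 2.197,
# and sigmoid(2) ~= 0.881 while sigmoid(3) ~= 0.953.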

# Save results
output_df = pd.DataFrame(output.numpy(), columns=predict.columns)
df_results = pd.concat([df_paragraph.iloc[:50000, :].reset_index(), output_df], axis=1)
df_results.to_csv('df_results_0_50k.csv', index=False)
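# The '_0_50k' suffix suggests the remaining paragraphs are handled in further
# 50k-row chunks by adjusting the slice bounds and the output filename.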

print(f"Processing completed in {time.time() - start:.2f} seconds.")