import os
import pickle
import time

import pandas as pd
import torch
import torch.nn as nn
from sentence_transformers import util

# Fix the RNG seed for reproducibility.
torch.manual_seed(1)

path = os.getcwd()

# Labeled reports (the retrieval corpus) and the unlabeled paragraphs to tag.
# labeled.csv is expected to carry an explicit 'index' column (the merge key
# used below) and the label columns from position 6 onwards.
df_inmemory = pd.read_csv(os.path.join(path, 'raw_data', 'labeled.csv'))
df_paragraph = pd.read_csv(os.path.join(path, 'raw_data', 'prediction_demo.csv'), encoding='latin1')

# Load the pre-computed sentence embeddings for both sides of the search.
with open(os.path.join(path, 'embeddings', 'embeddings_prediction.pkl'), 'rb') as f:
    pred_embeddings = pickle.load(f)['parg_embeddings']

with open(os.path.join(path, 'embeddings', 'embeddings_labeled.pkl'), 'rb') as f:
    embeddings = pickle.load(f)['sent_embeddings']
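
# Optional sanity check; a minimal sketch assuming both pickles hold 2-D
# arrays/tensors of shape (n_items, embedding_dim).
assert pred_embeddings.shape[1] == embeddings.shape[1], \
    "query and corpus embeddings must have the same dimensionality"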


def get_top_n_similar_reports(new_report, report_embeddings, top_n=20):
    """Return the corpus ids and cosine-similarity scores of the top_n
    reports most similar to new_report."""
    search_hits = util.semantic_search(new_report, report_embeddings, top_k=top_n)
    top_report_ids = [hit['corpus_id'] for hit in search_hits[0]]
    similarity_scores = [hit['score'] for hit in search_hits[0]]
    return pd.DataFrame({'top_report_ids': top_report_ids, 'cosine_similarity': similarity_scores})
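
# Note: util.semantic_search also accepts a 2-D batch of query embeddings, so
# the per-row loop below could in principle become one batched call. A sketch,
# not used here (`batch_hits` is a hypothetical name):
#
#   batch_hits = util.semantic_search(test_embeddings, embeddings, top_k=20)
#   # batch_hits[i] is the list of hit dicts for query i.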


# Retrieve the 20 nearest labeled reports for each of the first 50k paragraphs.
test_embeddings = pred_embeddings[:50000]
all_predictions = []

start = time.time()
for test_embedding in test_embeddings:
    result_df = get_top_n_similar_reports(test_embedding.reshape(1, -1), embeddings)
    # Attach each hit's labels: 'top_report_ids' holds corpus positions, which
    # labeled.csv stores in its 'index' column.
    result = pd.merge(result_df, df_inmemory, left_on='top_report_ids', right_on='index', how='left')
    all_predictions.append(result)

# Stack the per-paragraph results under an outer key (the paragraph position).
df_all_predictions = pd.concat(all_predictions, keys=range(len(all_predictions)), axis=0)

# k-NN label voting: for each paragraph, sum the label vectors of its 12 most
# similar labeled reports. In the merged frame the label columns start at
# position 8 (two search columns plus labeled.csv's six leading columns).
top_n = 12
vote_rows = []
for item in range(len(all_predictions)):
    k_similar_reports = df_all_predictions.xs(item).nlargest(top_n, ['cosine_similarity'])
    vote_rows.append(k_similar_reports.iloc[:, 8:].sum())
predict = pd.DataFrame(vote_rows).reset_index(drop=True)
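
# An equivalent vectorized pass (a sketch, assuming purely numeric label
# columns): rank hits within each outer group, keep the best top_n, and sum.
#
#   predict_alt = (df_all_predictions
#                  .sort_values('cosine_similarity', ascending=False)
#                  .groupby(level=0).head(top_n)
#                  .groupby(level=0)[df_inmemory.columns[6:]]
#                  .sum())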

# Squash the vote counts through a sigmoid and binarize at 0.90.
sigmoid = nn.Sigmoid()
data_tensor = torch.tensor(predict.to_numpy().astype(float), dtype=torch.float32)
output = sigmoid(data_tensor)
output = (output > 0.90).float()
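
# Since the sigmoid is monotonic, this is equivalent to thresholding the raw
# counts: sigmoid(x) > 0.90 iff x > ln(0.90 / 0.10) ≈ 2.197, so with 0/1
# labels a class fires when at least 3 of the 12 neighbours carry it. E.g.:
#
#   output_alt = (data_tensor > torch.log(torch.tensor(9.0))).float()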

# Attach the predicted labels to the first 50k paragraphs and write to disk.
output_df = pd.DataFrame(output.numpy(), columns=predict.columns)
df_results = pd.concat([df_paragraph.iloc[:50000, :].reset_index(), output_df], axis=1)
df_results.to_csv('df_results_0_50k.csv', index=False)

print(f"Processing completed in {time.time() - start:.2f} seconds.")