In [None]:
import pandas as pd
import numpy as np
import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch
from sklearn.metrics import roc_auc_score

In [None]:
# Load the pretrained Stella 1.5B embedding model onto the GPU.
# trust_remote_code=True lets the hub repository's custom model code run.
model = SentenceTransformer(
    'dunzhang/stella_en_1.5b_v5',
    trust_remote_code=True,
    device='cuda',
)


In [None]:
# Load the synthetic cohort-level eligibility checks from a CSV in the working directory.
cohort_checks = pd.read_csv('top_ten_cohorts_checked_synthetic.csv')

In [None]:
# Inspect column names, dtypes and non-null counts of the cohort checks.
cohort_checks.info()

In [None]:
# Binary label: 1 when the llama response contains "Yes!" or "YES!", else 0.
# na=False matters: for a missing response str.contains() yields NaN, and
# np.where() treats NaN as truthy, which would mark unanswered rows as eligible.
cohort_checks['mod_eligibility_result'] = np.where(
    cohort_checks.llama_response.str.contains('Yes!|YES!', na=False), 1, 0
)

In [None]:
# Class balance of the eligibility label that ships with the cohort file.
cohort_checks['eligibility_result'].value_counts()

In [None]:
# Class balance of the label re-derived from the llama response text.
cohort_checks['mod_eligibility_result'].value_counts()

In [None]:
# Load the synthetic patient-level eligibility checks from a CSV in the working directory.
patient_checks = pd.read_csv('top_twenty_patients_checked_synthetic.csv')

In [None]:
# Binary label: 1 when the llama response contains "Yes!" or "YES!", else 0.
# na=False matters: for a missing response str.contains() yields NaN, and
# np.where() treats NaN as truthy, which would mark unanswered rows as eligible.
patient_checks['mod_eligibility_result'] = np.where(
    patient_checks.llama_response.str.contains('Yes!|YES!', na=False), 1, 0
)

In [None]:
# Inspect column names, dtypes and non-null counts of the patient checks.
patient_checks.info()

In [None]:
# Show the original and the re-derived label distributions together (as a tuple).
(
    patient_checks['eligibility_result'].value_counts(),
    patient_checks['mod_eligibility_result'].value_counts(),
)

In [None]:
# Rename the patient-file text columns to 'patient_summary' / 'this_space',
# the column names the later cells read from the combined frame.
patient_checks = patient_checks.rename(
    columns={
        'this_patient': 'patient_summary',
        'space_summary': 'this_space',
    }
)

In [None]:
# Stack patient-level rows on top of cohort-level rows (row-wise concat, the default axis).
# The index is not reset, so row labels from the two inputs may repeat;
# later cells only use positional access.
combined_checks = pd.concat(objs=[patient_checks, cohort_checks])

In [None]:
# Inspect column names, dtypes and non-null counts of the stacked frame.
combined_checks.info()

In [None]:
# Training rows only, excluding any row that lacks a patient summary or a llama response.
train_summaries = (
    combined_checks[combined_checks['split'] == 'train']
    .dropna(subset=['patient_summary', 'llama_response'])
)
# Sanity check: only the 'train' split should remain.
train_summaries['split'].value_counts()

In [None]:
# Class balance of the response-derived label within the training rows.
train_summaries['mod_eligibility_result'].value_counts()

In [None]:
# Inspect column names, dtypes and non-null counts of the filtered training rows.
train_summaries.info()

In [None]:
# MNR (MultipleNegativesRankingLoss) objective.
# Every example is treated as a positive (patient summary, trial space) pair and the
# other pairs in the batch act as negatives, so only eligible pairs may be used here.
# NOTE(review): this filters on `eligibility_result`, while the contrastive objective
# uses `mod_eligibility_result` -- confirm which label is intended.
train_eligibles_only = train_summaries[train_summaries.eligibility_result == 1]

# Build the pairs from the *filtered* frame. Indexing `train_summaries` by position
# here would silently feed the first N unfiltered rows (including ineligible pairs)
# to the loss as positives.
example_list = [
    InputExample(texts=[patient_text, space_text])
    for patient_text, space_text in zip(
        train_eligibles_only.patient_summary, train_eligibles_only.this_space
    )
]

train_eligibles_only_dataloader = DataLoader(example_list, shuffle=True, batch_size=8)
train_eligibles_only_loss = losses.MultipleNegativesRankingLoss(model=model)

In [None]:
# Contrastive objective: every training pair (patient summary, trial space) is used,
# labelled with the response-derived eligibility flag (1 or 0).
contrastive_example_list = [
    InputExample(texts=[patient_text, space_text], label=pair_label)
    for patient_text, space_text, pair_label in zip(
        train_summaries['patient_summary'].to_numpy(),
        train_summaries['this_space'].to_numpy(),
        train_summaries['mod_eligibility_result'].to_numpy(),
    )
]

contrastive_train_loss = losses.OnlineContrastiveLoss(model=model)
contrastive_dataloader = DataLoader(
    contrastive_example_list,
    batch_size=12,
    shuffle=True,
)

In [None]:
#%%capture
# Fine-tune on both objectives together: the labelled contrastive pairs and the
# eligible-only ranking pairs.
model.fit(
    train_objectives=[
        (contrastive_dataloader, contrastive_train_loss),
        (train_eligibles_only_dataloader, train_eligibles_only_loss),
    ],
    epochs=2,
    warmup_steps=100,
)

In [None]:
# Persist the fine-tuned model to a path relative to the working directory.
model.save('reranker_round1.model')

In [None]:
# Reload the fine-tuned model from disk, so the evaluation cells below can run
# from the saved checkpoint.
model = SentenceTransformer(
    'reranker_round1.model',
    trust_remote_code=True,
    device='cuda',
)

In [None]:
# Check the model's ability to perform the initial task of discriminating among diseases
# (evaluated on PHI)


In [None]:
# Load the real eligibility checks. NOTE: this rebinds `cohort_checks`, replacing
# the synthetic frame loaded earlier in the notebook.
cohort_checks = pd.read_csv('../v7/space_specific_eligibility_checks_11-6-24.csv')
# this cohort_checks file is not provided publicly, since it contains PHI/IP

In [None]:
# Keep the rows whose split name contains 'valid', then inspect the result.
validation_set = cohort_checks.loc[cohort_checks['split'].str.contains('valid')]
validation_set.info()


In [None]:
# Drop validation rows that have no patient summary; they cannot be embedded.
validation_set = validation_set.dropna(subset=['patient_summary'])
validation_set.info()

In [None]:

# Embed both sides of every validation pair labelled eligible.
# Row i of each embedding array corresponds to row i of `eligibles_only`.
eligibles_only = validation_set.loc[validation_set['eligibility_result'] == 1]
patient_summary_embeddings = model.encode(list(eligibles_only['patient_summary']))
trial_summary_embeddings = model.encode(list(eligibles_only['this_space']))

In [None]:
# Among patient-to-trial-space candidate matches that pass the llama check, how good
# is the embedding model at discriminating between true and random matches?
import random  # imported here so this cell runs on its own

# Dedicated, seeded generator: the AUC is reproducible across runs and the global
# `random` state is left untouched.
pair_rng = random.Random(0)

n_pairs = trial_summary_embeddings.shape[0]  # needs >= 2 rows to draw a negative
labels = []
similarities = []
for i in range(n_pairs):
    patient_vec = torch.tensor(patient_summary_embeddings[i, :]).unsqueeze(0)
    if pair_rng.choice([0, 1]) == 1:
        # Positive: the patient's own trial space.
        trial_index = i
        labels.append(1.)
    else:
        # Negative: a trial space from a different row. Draw from the n-1 other
        # indices so the "random" pair can never be the true pair itself.
        # NOTE(review): another row may still hold the same trial-space text;
        # such duplicates are not excluded here.
        trial_index = pair_rng.randrange(n_pairs - 1)
        if trial_index >= i:
            trial_index += 1
        labels.append(0.)
    trial_vec = torch.tensor(trial_summary_embeddings[trial_index, :]).unsqueeze(0)
    similarities.append(F.cosine_similarity(patient_vec, trial_vec))

# Each similarity is a 1-element tensor; concatenate into a flat 1-D score vector.
roc_auc_score(labels, torch.cat(similarities).numpy())

In [None]:
# How well do the embeddings separate llama "yes" checks from "no" checks?
# (evaluated on PHI)
# Embed every validation pair this time (not just the eligible ones), as tensors.
patient_summary_embeddings = model.encode(
    list(validation_set['patient_summary']),
    convert_to_tensor=True,
)
trial_summary_embeddings = model.encode(
    list(validation_set['this_space']),
    convert_to_tensor=True,
)

In [None]:
# Cosine similarity between each patient embedding and its paired trial-space
# embedding, moved to the CPU as a NumPy array so scikit-learn can score it.
similarities = (
    F.cosine_similarity(patient_summary_embeddings, trial_summary_embeddings)
    .detach()
    .cpu()
    .numpy()
)
# AUC of the similarity score against the eligibility label.
roc_auc_score(y_true=validation_set['eligibility_result'], y_score=similarities)

In [None]:
# Class balance of the validation labels, for context when reading the AUC above.
validation_set['eligibility_result'].value_counts()