In [None]:
import numpy as np
import pandas as pd
import json
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import torch
import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'


In [None]:
# Load the AWQ INT4 Llama 3.1 70B Instruct checkpoint through vLLM.
# The settings below (2-way tensor parallelism, 0.5 GPU memory utilisation,
# 6000-token maximum model length) are passed to the engine unchanged.
llama = LLM(
    model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4',
    download_dir="../../..",
    max_model_len=6000,
    gpu_memory_utilization=0.5,
    tensor_parallel_size=2,
)

In [None]:
def summarize_trials_multi_cohort(eligibility_texts, llama_model):
    """Extract structured "trial spaces" from clinical trial documents with an LLM.

    One chat prompt (fixed system instructions plus the trial text) is built
    per trial, and all prompts are submitted in a single ``generate`` call.

    Parameters
    ----------
    eligibility_texts : iterable of str
        Trial documents. Each one is string-concatenated into the user
        message, so every element must be a ``str``.
    llama_model
        Model object exposing ``get_tokenizer()`` and
        ``generate(prompts, sampling_params)``; the notebook passes a vLLM
        ``LLM`` instance.

    Returns
    -------
    tuple
        ``(responses, response_texts)``: the objects returned by
        ``llama_model.generate`` and, for each of them, ``outputs[0].text``.
    """
    # Use the tokenizer that belongs to the model passed in. The earlier code
    # read the notebook-level `llama` global here, which ignored the
    # `llama_model` argument and failed when no such global existed.
    tokenizer = llama_model.get_tokenizer()

    # The system instructions are identical for every trial, so build them once.
    system_prompt = """You are an expert clinical oncologist with an encyclopedic knowledge of cancer and its treatments.
 Your job is to review a clinical trial document and extract a list of structured clinical spaces that are eligible for that trial.
 A clinical space is defined as a unique combination of cancer primary site, histology, which treatments a patient must have received, which treatments a patient must not have received, cancer burden (eg presence of metastatic disease), and tumor biomarkers (such as germline or somatic gene mutations or alterations, or protein expression on tumor) that a patient must have or must not have; that renders a patient eligible for the trial.
 Trials often specify that a particular treatment is excluded only if it was given within a short period of time, for example 14 days, one month, etc , prior to trial start. Do not include this type of time-specific treatment eligibility criteria in your output at all.
 Some trials have only one space, while others have several. Do not output a space that contains multiple cancer types and/or histologies. Instead, generate separate spaces for each cancer type/histology combination.
 For biomarkers, if the trial specifies whether the biomarker will be assessed during screening, note that.
 Spell out cancer types; do not abbreviate them. For example, write "non-small cell lung cancer" rather than "NSCLC".
 Structure your output like this, as a list of spaces, with spaces separated by newlines, as below:
 1. Cancer type allowed: . Histology allowed: . Cancer burden allowed: . Prior treatment required: . Prior treatment excluded: . Biomarkers required: . Biomarkers excluded: .
 2. Cancer type allowed: , etc.
 If a particular concept is not mentioned in the trial text, do not include it in your definition of trial space(s).
 """

    prompts = []
    for trial in eligibility_texts:
        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': "Here is a clinical trial document: \n" + trial + "\n" + """Now, generate your list of the trial space(s), formatted as above.
 Do not provide any introductory, explanatory, concluding, or disclaimer text.
 Reminder: Treatment history is an important component of trial space definitions, but treatment history requirements that are described as applying only in a given period of time prior to trial treatment MUST BE IGNORED."""
            },
        ]
        # Render to a plain prompt string (tokenize=False) that ends with the
        # generation prompt, ready to be passed to generate().
        prompts.append(
            tokenizer.apply_chat_template(
                conversation=messages, add_generation_prompt=True, tokenize=False
            )
        )

    responses = llama_model.generate(
        prompts,
        SamplingParams(
            temperature=0.0,
            top_p=0.9,
            max_tokens=3096,
            # Stop on the "<|eot_id|>" token as well as on the tokenizer's EOS token.
            stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")],
        ))

    response_texts = [x.outputs[0].text for x in responses]

    return responses, response_texts

In [None]:
# Load the trial table. Later cells read its `trial_text` column and add a
# `spaces` column to it.
trials = pd.read_csv('./ctgov_cancer_trials.csv')

In [None]:
# Run the extraction over every trial text in one call. The function returns
# a (responses, response_texts) tuple, so `trial_cohorts` is that 2-tuple.
trial_cohorts = summarize_trials_multi_cohort(trials.trial_text.tolist(), llama)

In [None]:
# Element 1 of the returned tuple is the list of generated texts; store it as
# a new column. Assumes one text per row, in row order — TODO confirm that
# generate() returns outputs in prompt order.
trials['spaces'] = trial_cohorts[1]

In [None]:
# Checkpoint the trials together with the raw extracted spaces. `index=False`
# is not passed, so the row index is written too and shows up as an extra
# unnamed column when the file is read back.
trials.to_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')

In [None]:
# Reload the saved trial/space table for inspection. pandas and numpy are
# imported again here, presumably so this cell can be run in a fresh kernel
# without re-running the model cells.
import pandas as pd
import numpy as np
output = pd.read_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')

In [None]:
# Example of a trial and its extracted spaces: show the raw trial text next to
# the model output for one row. The position looks arbitrary; `.iloc` raises
# IndexError if the table has 1000 rows or fewer.
i = 1000
output.trial_text.iloc[i], output.spaces.iloc[i]

In [None]:
# Expand each trial into one row per extracted space. `frames` ends up with
# one DataFrame per trial; each repeats the trial's columns once per space and
# adds `this_space` (the line of text) and `space_number` (0-based position of
# that line among the trial's non-empty lines).
frames = []
for row_pos in range(trials.shape[0]):
    trial_row = trials.iloc[[row_pos]]
    raw_spaces = trial_row['spaces'].iloc[0]

    # A missing (NaN) model output has no .split(); treat it like an empty
    # output, i.e. a trial with no spaces, instead of crashing.
    space_lines = raw_spaces.split("\n") if isinstance(raw_spaces, str) else []
    cohorts = [line for line in space_lines if line != '']

    # Repeat the single row by position. Unlike np.repeat on a DataFrame, which
    # goes through an object ndarray, this keeps every column's dtype.
    frame = trial_row.iloc[[0] * len(cohorts)].reset_index(drop=True)
    frame['this_space'] = cohorts
    frame['space_number'] = frame.index
    frames.append(frame)
 

In [None]:
# Stack the per-trial frames into one table with one row per space.
# `ignore_index` is not passed, so each frame keeps its own index labels and
# labels repeat across trials.
cohort_level_trials = pd.concat(frames, axis=0)

In [None]:
# Column names, non-null counts and dtypes of the one-row-per-space table.
cohort_level_trials.info()

In [None]:
# Count how many line items begin with a digit 1-9 (i.e. look like "1. ...")
# versus anything else.
starts_numbered = cohort_level_trials.this_space.str[0].isin(list('123456789'))
starts_numbered.value_counts()

In [None]:
# Keep only the line items whose first character is a digit 1-9; every other
# line of model output is dropped.
numbered_rows = cohort_level_trials.this_space.str[0].isin(list('123456789'))
cohort_level_trials = cohort_level_trials[numbered_rows]

In [None]:
# Save the one-row-per-space table. `index=False` is not passed, so the index
# is written as the first column.
cohort_level_trials.to_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')

In [None]:
# Read the saved line items back for a quick check.
temp = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')

In [None]:
# Number of distinct space strings across all rows.
temp.this_space.nunique()

In [None]:
# Load the line-item table again under the name `out`, which the following
# cells use. pandas is re-imported, presumably so the notebook can be resumed
# from this cell in a fresh kernel.
import pandas as pd
out = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')

In [None]:
# Column names, non-null counts and dtypes of the reloaded line-item table.
out.info()

In [None]:
# NOTE: this cell and all cells after it need the private DFCI dataset and
# will not run without access to it.

import pandas as pd
# Only the `nct_id` column of this table is used later, to exclude those trials.
dfci_trials = pd.read_csv("../space_specific_eligibility_checks_11-6-24.csv")
dfci_trials.info()

In [None]:
# Drop every line item whose trial ID also appears in the DFCI table.
in_dfci = out.nct_id.isin(dfci_trials.nct_id)
non_dfci_ctgov_trials = out[~in_dfci]

In [None]:
# Column names, non-null counts and dtypes after removing the DFCI trials.
non_dfci_ctgov_trials.info()

In [None]:
# Collapse the line items to one row per trial and keep only the ID column,
# then display how many distinct trials remain.
first_row_per_trial = non_dfci_ctgov_trials.groupby('nct_id').first()
unique_trials = first_row_per_trial.reset_index()[['nct_id']]
unique_trials.shape[0]

In [None]:
# Draw 500 trial IDs with a fixed random_state so the same trials are selected
# on every run. pandas raises ValueError if fewer than 500 IDs are available,
# because sampling is without replacement by default.
unique_trial_sample = unique_trials.nct_id.sample(n=500, random_state=42)

In [None]:
# Keep every line item belonging to a sampled trial, so a sampled trial
# contributes all of its spaces.
sample_spaces = non_dfci_ctgov_trials[non_dfci_ctgov_trials.nct_id.isin(unique_trial_sample)]

In [None]:
# Column names, non-null counts and dtypes of the sampled line items.
sample_spaces.info()

In [None]:
# Write the sampled line items to disk (index included, since index=False is
# not passed).
sample_spaces.to_csv('sample_spaces.csv')