kenlkehl committed on
Commit 2d4cda5 · verified · 1 Parent(s): d3a3d42
0_summarize_ctgov_trials.ipynb ADDED
@@ -0,0 +1,423 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "ee78bb6d-4e3c-4751-b042-12c358d89cac",
7
+ "metadata": {
8
+ "scrolled": true
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "import numpy as np\n",
13
+ "import pandas as pd\n",
14
+ "import json\n",
15
+ "from vllm import LLM, SamplingParams\n",
16
+ "from transformers import AutoTokenizer\n",
17
+ "import torch\n",
18
+ "import os\n",
19
+ "#os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'\n"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "id": "e8eeb339-6aca-4d3f-96fb-24a1caf26b34",
26
+ "metadata": {
27
+ "scrolled": true
28
+ },
29
+ "outputs": [],
30
+ "source": []
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "7129a989-04e9-475d-9260-d1fdb1ab7faa",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, \n",
40
+ " gpu_memory_utilization = 0.5,\n",
41
+ " download_dir = \"../../..\", max_model_len=6000)"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": null,
47
+ "id": "d9d7d1c4-50ed-4614-9855-8e6cc86bbb0e",
48
+ "metadata": {},
49
+ "outputs": [],
50
+ "source": []
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "id": "73897bb9-0738-4446-b332-9b9bf46ad043",
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "def summarize_trials_multi_cohort(eligibility_texts, llama_model):\n",
60
+ "\n",
61
+ " tokenizer = llama.get_tokenizer()\n",
62
+ " prompts = []\n",
63
+ " for trial in eligibility_texts:\n",
64
+ " messages = [\n",
65
+ " {'role':'system', 'content': \"\"\"You are an expert clinical oncologist with an encyclopedic knowledge of cancer and its treatments.\n",
66
+ " Your job is to review a clinical trial document and extract a list of structured clinical spaces that are eligible for that trial.\n",
67
+ " A clinical space is defined as a unique combination of cancer primary site, histology, which treatments a patient must have received, which treatments a patient must not have received, cancer burden (eg presence of metastatic disease), and tumor biomarkers (such as germline or somatic gene mutations or alterations, or protein expression on tumor) that a patient must have or must not have; that renders a patient eligible for the trial.\n",
68
+ " Trials often specify that a particular treatment is excluded only if it was given within a short period of time, for example 14 days, one month, etc , prior to trial start. Do not include this type of time-specific treatment eligibility criteria in your output at all.\n",
69
+ " Some trials have only one space, while others have several. Do not output a space that contains multiple cancer types and/or histologies. Instead, generate separate spaces for each cancer type/histology combination.\n",
70
+ " For biomarkers, if the trial specifies whether the biomarker will be assessed during screening, note that.\n",
71
+ " Spell out cancer types; do not abbreviate them. For example, write \"non-small cell lung cancer\" rather than \"NSCLC\".\n",
72
+ " Structure your output like this, as a list of spaces, with spaces separated by newlines, as below:\n",
73
+ " 1. Cancer type allowed: <cancer_type_allowed>. Histology allowed: <histology_allowed>. Cancer burden allowed: <cancer_burden_allowed>. Prior treatment required: <prior_treatments_requred>. Prior treatment excluded: <prior_treatments_excluded>. Biomarkers required: <biomarkers_required>. Biomarkers excluded: <biomarkers_excluded>.\n",
74
+ " 2. Cancer type allowed: <cancer_type_allowed>, etc.\n",
75
+ " If a particular concept is not mentioned in the trial text, do not include it in your definition of trial space(s).\n",
76
+ " \"\"\"}, \n",
77
+ " \n",
78
+ " {'role':'user', 'content': \"Here is a clinical trial document: \\n\" + trial + \"\\n\" + \"\"\"Now, generate your list of the trial space(s), formatted as above.\n",
79
+ " Do not provide any introductory, explanatory, concluding, or disclaimer text.\n",
80
+ " Reminder: Treatment history is an important component of trial space definitions, but treatment history requirements that are described as applying only in a given period of time prior to trial treatment MUST BE IGNORED.\"\"\"\n",
81
+ " }\n",
82
+ " ]\n",
83
+ " \n",
84
+ " prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))\n",
85
+ " \n",
86
+ "\n",
87
+ " \n",
88
+ " responses = llama_model.generate(\n",
89
+ " prompts, \n",
90
+ " SamplingParams(\n",
91
+ " temperature=0.0,\n",
92
+ " top_p=0.9,\n",
93
+ " max_tokens=3096,\n",
94
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
95
+ " ))\n",
96
+ "\n",
97
+ " response_texts = [x.outputs[0].text for x in responses]\n",
98
+ "\n",
99
+ "\n",
100
+ " return responses, response_texts"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "id": "ca683840-842b-4346-8eef-b66bc52d26af",
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "trials = pd.read_csv('./ctgov_cancer_trials.csv')"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": null,
116
+ "id": "aa51de7d-74e0-4822-b7e1-2c9a3bc31260",
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": []
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "4816dbf0-bd92-4742-912a-477e545e330b",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "trial_cohorts = summarize_trials_multi_cohort(trials.trial_text.tolist(), llama)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "8283c587-c909-4548-804d-4d88b4ed7255",
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "trials['spaces'] = trial_cohorts[1]"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "id": "2ca75bab-7273-4ab0-86cd-1e0373546fce",
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "trials.to_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "id": "0291913f-f3b9-4b39-99ab-954cb7237255",
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": []
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "16563812-6967-4788-a123-0af5fd701ede",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": []
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": null,
170
+ "id": "95776dbe-1a25-44bd-90f8-5c1573b6e92a",
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": [
174
+ "import pandas as pd\n",
175
+ "import numpy as np\n",
176
+ "output = pd.read_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "id": "cf647a1f-5a8c-4958-9032-440806a306d5",
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "# example of a trial and extracted spaces\n",
187
+ "i = 1000\n",
188
+ "output.trial_text.iloc[i], output.spaces.iloc[i]"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": null,
194
+ "id": "9cc06840-5647-4524-a7bf-a1ad53a07b7c",
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "frames = []\n",
199
+ "for i in range(trials.shape[0]):\n",
200
+ " cohorts = pd.Series(trials.iloc[i].spaces.split(\"\\n\"))\n",
201
+ " cohorts = cohorts[~((cohorts.isnull()) | (cohorts == \"\\n\") | (cohorts == ''))].reset_index(drop=True)\n",
202
+ " frame = pd.DataFrame(np.repeat(trials.iloc[[i]], len(cohorts), axis=0), columns=trials.columns)\n",
203
+ " frame['this_space'] = cohorts\n",
204
+ " frame['space_number'] = frame.index\n",
205
+ " frames.append(frame)\n",
206
+ " "
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "541669eb-f92e-49f3-9a36-b6625448c1a4",
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "cohort_level_trials = pd.concat(frames, axis=0)"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "id": "51a04e84-7483-4398-b4a0-d0cdab790609",
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "cohort_level_trials.info()"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": null,
232
+ "id": "648f0e1e-ef81-4983-8f03-1fbdb138f649",
233
+ "metadata": {},
234
+ "outputs": [],
235
+ "source": [
236
+ "cohort_level_trials.this_space.str[0].isin(['1','2','3','4','5','6','7','8','9']).value_counts()"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": null,
242
+ "id": "9ea048c1-c4ef-4202-a9be-a4658c4f1058",
243
+ "metadata": {},
244
+ "outputs": [],
245
+ "source": [
246
+ "cohort_level_trials = cohort_level_trials[cohort_level_trials.this_space.str[0].isin(['1','2','3','4','5','6','7','8','9'])]"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "852aee9d-ad97-4374-932f-6cae378dde2a",
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": []
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": null,
260
+ "id": "00d2220a-627a-4b67-be28-c42561c3c964",
261
+ "metadata": {},
262
+ "outputs": [],
263
+ "source": [
264
+ "cohort_level_trials.to_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": null,
270
+ "id": "a130e909-6629-4408-b1ad-201b319d5e0f",
271
+ "metadata": {},
272
+ "outputs": [],
273
+ "source": [
274
+ "temp = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "id": "ad078444-33e1-4398-92b8-2e7f9f1a4031",
281
+ "metadata": {},
282
+ "outputs": [],
283
+ "source": [
284
+ "temp.this_space.nunique()"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": null,
290
+ "id": "be264ecb-12e7-4fd4-a16b-5a4b2f44d2aa",
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "import pandas as pd\n",
295
+ "out = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": null,
301
+ "id": "d38ca13b-f4c4-47f1-abd6-3289abbd5f64",
302
+ "metadata": {},
303
+ "outputs": [],
304
+ "source": [
305
+ "out.info()"
306
+ ]
307
+ },
308
+ {
309
+ "cell_type": "code",
310
+ "execution_count": null,
311
+ "id": "6849b44d-df0d-464f-bbce-f8fc1f789d3a",
312
+ "metadata": {},
313
+ "outputs": [],
314
+ "source": [
315
+ "# this component and following cells will not run without access to the DFCI private dataset\n",
316
+ "\n",
317
+ "import pandas as pd\n",
318
+ "dfci_trials = pd.read_csv(\"../space_specific_eligibility_checks_11-6-24.csv\")\n",
319
+ "dfci_trials.info()"
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": null,
325
+ "id": "869690c3-2a80-4403-8933-f8f042c4ae35",
326
+ "metadata": {},
327
+ "outputs": [],
328
+ "source": [
329
+ "non_dfci_ctgov_trials = out[~out.nct_id.isin(dfci_trials.nct_id)]"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": null,
335
+ "id": "3d28ed5c-d152-40a0-ab14-4aa748f3f8ee",
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": [
339
+ "non_dfci_ctgov_trials.info()"
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": null,
345
+ "id": "efdaaf5b-edfb-4900-b85b-dde7eb1f92df",
346
+ "metadata": {},
347
+ "outputs": [],
348
+ "source": [
349
+ "unique_trials = non_dfci_ctgov_trials.groupby('nct_id').first().reset_index()[['nct_id']]\n",
350
+ "unique_trials.shape[0]"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "id": "41a63a73-4822-4c1d-820d-389252c0c56f",
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "unique_trial_sample = unique_trials.nct_id.sample(n=500, random_state=42)"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": null,
366
+ "id": "4cbbffe7-72ca-45b4-a11f-bb2d278bcfb7",
367
+ "metadata": {},
368
+ "outputs": [],
369
+ "source": [
370
+ "sample_spaces = non_dfci_ctgov_trials[non_dfci_ctgov_trials.nct_id.isin(unique_trial_sample)]"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "code",
375
+ "execution_count": null,
376
+ "id": "c2639cc4-3472-463c-8519-ce0a9a1d845c",
377
+ "metadata": {},
378
+ "outputs": [],
379
+ "source": [
380
+ "sample_spaces.info()"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": null,
386
+ "id": "bc6def48-cacc-437b-ac19-2af9418821c2",
387
+ "metadata": {},
388
+ "outputs": [],
389
+ "source": [
390
+ "sample_spaces.to_csv('sample_spaces.csv')"
391
+ ]
392
+ },
393
+ {
394
+ "cell_type": "code",
395
+ "execution_count": null,
396
+ "id": "2b2370cf-e2ec-4e54-8dd0-6dde6d0fb041",
397
+ "metadata": {},
398
+ "outputs": [],
399
+ "source": []
400
+ }
401
+ ],
402
+ "metadata": {
403
+ "kernelspec": {
404
+ "display_name": "Python 3 (ipykernel)",
405
+ "language": "python",
406
+ "name": "python3"
407
+ },
408
+ "language_info": {
409
+ "codemirror_mode": {
410
+ "name": "ipython",
411
+ "version": 3
412
+ },
413
+ "file_extension": ".py",
414
+ "mimetype": "text/x-python",
415
+ "name": "python",
416
+ "nbconvert_exporter": "python",
417
+ "pygments_lexer": "ipython3",
418
+ "version": "3.9.18"
419
+ }
420
+ },
421
+ "nbformat": 4,
422
+ "nbformat_minor": 5
423
+ }
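Note on the space-splitting step above: the loop that builds cohort_level_trials repeats each trial row once per newline in its spaces column and then keeps only lines that start with a digit. The following is a minimal, roughly equivalent sketch of that step using pandas explode; the helper name explode_spaces is illustrative and assumes the trials dataframe has nct_id and spaces columns as in this notebook, not that the repository uses this exact code.

import pandas as pd

def explode_spaces(trials: pd.DataFrame) -> pd.DataFrame:
    # split the newline-separated numbered list of spaces into one row per space
    df = trials.copy()
    df["this_space"] = df["spaces"].str.split("\n")
    df = df.explode("this_space", ignore_index=True)
    df["this_space"] = df["this_space"].str.strip()
    # keep only lines that look like numbered space definitions ("1. ...", "2. ...")
    df = df[df["this_space"].str.match(r"^\d", na=False)]
    # number the spaces within each trial, mirroring space_number above
    df["space_number"] = df.groupby("nct_id").cumcount()
    return df.reset_index(drop=True)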
10_train_trialspace_round2.ipynb ADDED
@@ -0,0 +1,443 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "81b83fa8-421d-4be5-b9eb-5892f01fd5b0",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import os\n",
13
+ "#os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'\n",
14
+ "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
15
+ "from torch.utils.data import DataLoader\n",
16
+ "import torch.nn.functional as F\n",
17
+ "import torch\n",
18
+ "from sklearn.metrics import roc_auc_score"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "937cbcda-0cd6-47f7-b52e-17ed2bafce3d",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "model = SentenceTransformer('reranker_round1.model', trust_remote_code=True, device='cuda')\n"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "853e6b86-db0b-4650-b98f-f437987baa5a",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "cohort_checks = pd.read_csv('top_ten_cohorts_checked_synthetic_round2.csv')"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "474950e0-869b-414e-823f-df5ba8e5de92",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "cohort_checks.info()"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "id": "c9835dad-4fc4-4a0e-aba2-d45358edbee9",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "cohort_checks['mod_eligibility_result'] = np.where(cohort_checks.llama_response.str.contains('Yes!|YES!'), 1, 0)"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "id": "413bddbc-f35c-48ec-bcbb-48405bd2c9c9",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "cohort_checks.eligibility_result.value_counts()"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "id": "79c2d994-1e39-41b6-aeb3-962ba3ba5611",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "cohort_checks.mod_eligibility_result.value_counts()"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "id": "9c8e6a20-4513-422c-be6e-3459ce98a2be",
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "patient_checks = pd.read_csv('top_twenty_patients_checked_synthetic_round2.csv')"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "id": "e91074cf-07de-40d1-8bf3-baf825d3f625",
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "patient_checks['mod_eligibility_result'] = np.where(patient_checks.llama_response.str.contains('Yes!|YES!'), 1, 0)"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": null,
104
+ "id": "3a2d45fc-7d92-4a65-aad0-a9ab8f783779",
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "patient_checks.info()"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "id": "0cf7c705-ba6d-4f01-ac2e-88d345fef7f6",
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "patient_checks.eligibility_result.value_counts(), patient_checks.mod_eligibility_result.value_counts()"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "dec4a8c8-c5db-4164-a06c-27fb59782fa5",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "patient_checks = patient_checks.rename(columns={'this_patient':'patient_summary', 'space_summary':'this_space'})"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "0bf55e82-c91d-472f-84ad-74c755e9bf29",
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "combined_checks = pd.concat([patient_checks, cohort_checks], axis=0)"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "id": "ebd07c9c-6263-4005-bfb1-2a8468b76a98",
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "combined_checks.info()"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "id": "49f59429-c9f4-43df-a1b2-750a3c94517a",
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": []
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "e1d0126e-a58d-41ca-ad2f-a2d37bc585ad",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "train_summaries = combined_checks[combined_checks.split=='train']\n",
167
+ "train_summaries = train_summaries[~train_summaries.patient_summary.isnull()]\n",
168
+ "train_summaries = train_summaries[~train_summaries.llama_response.isnull()]\n",
169
+ "train_summaries.split.value_counts()"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "id": "2f6506ed-dcbf-4e0b-8722-6e234c2d4509",
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": [
179
+ "train_summaries.mod_eligibility_result.value_counts()"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "id": "c678a59a-c301-42d9-83dd-511503cee2fb",
186
+ "metadata": {
187
+ "scrolled": true
188
+ },
189
+ "outputs": [],
190
+ "source": [
191
+ "train_summaries.info()"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": null,
197
+ "id": "57932264-103a-413b-9a48-43b7be254ac0",
198
+ "metadata": {},
199
+ "outputs": [],
200
+ "source": [
201
+ "# mll loss\n",
202
+ "train_eligibles_only = train_summaries[train_summaries.eligibility_result == 1]\n",
203
+ "example_list = []\n",
204
+ "for i in range(train_eligibles_only.shape[0]):\n",
205
+ " example_list.append(InputExample(texts=[train_summaries.patient_summary.iloc[i], train_summaries.this_space.iloc[i]]))\n",
206
+ "\n",
207
+ "train_eligibles_only_dataloader = DataLoader(example_list, shuffle=True, batch_size=8)\n",
208
+ "train_eligibles_only_loss = losses.MultipleNegativesRankingLoss(model=model)"
209
+ ]
210
+ },
211
+ {
212
+ "cell_type": "code",
213
+ "execution_count": null,
214
+ "id": "e5482be3-9a13-4ce1-aa8a-429c54bf6be0",
215
+ "metadata": {},
216
+ "outputs": [],
217
+ "source": [
218
+ "# for attempt at contrastive loss\n",
219
+ "# note 'Yes' is considered positive even without !\n",
220
+ "contrastive_example_list = []\n",
221
+ "for i in range(train_summaries.shape[0]):\n",
222
+ " contrastive_example_list.append(InputExample(texts=[train_summaries.patient_summary.iloc[i], train_summaries.this_space.iloc[i]],\n",
223
+ " label=train_summaries.mod_eligibility_result.iloc[i]))\n",
224
+ "\n",
225
+ "contrastive_dataloader = DataLoader(contrastive_example_list, shuffle=True, batch_size=12)\n",
226
+ "contrastive_train_loss = losses.OnlineContrastiveLoss(model=model)"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": null,
232
+ "id": "4e825dae-a5a9-4f87-af35-63ac2d73de33",
233
+ "metadata": {},
234
+ "outputs": [],
235
+ "source": []
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": null,
240
+ "id": "f17ad7a6-8911-4d7d-8495-3e37cb00597d",
241
+ "metadata": {
242
+ "scrolled": true
243
+ },
244
+ "outputs": [],
245
+ "source": [
246
+ "#%%capture\n",
247
+ "model.fit(train_objectives=[(contrastive_dataloader, contrastive_train_loss),\n",
248
+ " (train_eligibles_only_dataloader, train_eligibles_only_loss)], epochs=2, warmup_steps=100)"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": null,
254
+ "id": "c9cb6021-21d8-44bf-b440-980fcdae3b3d",
255
+ "metadata": {},
256
+ "outputs": [],
257
+ "source": [
258
+ "model.save('reranker_round2.model')"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type": "code",
263
+ "execution_count": null,
264
+ "id": "bae79a2e-4357-4c90-ba4c-a08b1206a99d",
265
+ "metadata": {},
266
+ "outputs": [],
267
+ "source": [
268
+ "model = SentenceTransformer('reranker_round2.model', trust_remote_code=True, device='cuda')"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": null,
274
+ "id": "f5517caa-c45b-4b62-ae8d-0af61b61fd25",
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": []
278
+ },
279
+ {
280
+ "cell_type": "code",
281
+ "execution_count": null,
282
+ "id": "c6bfb8f7-ca6b-474b-8ce3-ba5acacb6b6a",
283
+ "metadata": {},
284
+ "outputs": [],
285
+ "source": [
286
+ "# check model's ability to do initial discriminate among diseases task\n",
287
+ "# (on PHI)\n"
288
+ ]
289
+ },
290
+ {
291
+ "cell_type": "code",
292
+ "execution_count": null,
293
+ "id": "4172f6ba-b334-4b83-b73e-d05dad6c05f0",
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
297
+ "# this file is not uploaded, since it contains PHI/IP\n",
298
+ "cohort_checks = pd.read_csv('../v7/space_specific_eligibility_checks_11-6-24.csv')"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": null,
304
+ "id": "d6b25941-0007-4347-9ef3-899f9258542a",
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "validation_set = cohort_checks[cohort_checks.split.str.contains('valid')]\n",
309
+ "validation_set.info()\n"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": null,
315
+ "id": "4b791608-6011-4bf6-914a-9534a08eba5a",
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "validation_set = validation_set[~validation_set.patient_summary.isnull()]\n",
320
+ "validation_set.info()"
321
+ ]
322
+ },
323
+ {
324
+ "cell_type": "code",
325
+ "execution_count": null,
326
+ "id": "479b9905-fcd6-4d37-9b03-7bbbfb88f123",
327
+ "metadata": {},
328
+ "outputs": [],
329
+ "source": [
330
+ "\n",
331
+ "eligibles_only = validation_set[validation_set.eligibility_result == 1]\n",
332
+ "patient_summary_embeddings = model.encode(eligibles_only.patient_summary.tolist())\n",
333
+ "trial_summary_embeddings = model.encode(eligibles_only.this_space.tolist())"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "9b8f3a40-0854-43a5-bd83-a7fe6770f52b",
340
+ "metadata": {},
341
+ "outputs": [],
342
+ "source": [
343
+ "import random\n",
344
+ "labels = []\n",
345
+ "similarities = []\n",
346
+ "for i in range(trial_summary_embeddings.shape[0]):\n",
347
+ " if random.choice([0,1]) == 1:\n",
348
+ " similarities.append(F.cosine_similarity(torch.tensor(patient_summary_embeddings[i,:]).unsqueeze(0), torch.tensor(trial_summary_embeddings[i, :]).unsqueeze(0)))\n",
349
+ " labels.append(1.)\n",
350
+ " else:\n",
351
+ " random_index = random.choice([x for x in range(0,trial_summary_embeddings.shape[0])])\n",
352
+ " similarities.append(F.cosine_similarity(torch.tensor(patient_summary_embeddings[i,:]).unsqueeze(0), torch.tensor(trial_summary_embeddings[random_index, :]).unsqueeze(0)))\n",
353
+ " labels.append(0.)\n",
354
+ "roc_auc_score(labels, np.array([x.numpy() for x in similarities]))"
355
+ ]
356
+ },
357
+ {
358
+ "cell_type": "code",
359
+ "execution_count": null,
360
+ "id": "16dd4634-0389-466d-8257-160ddd2659af",
361
+ "metadata": {},
362
+ "outputs": [],
363
+ "source": [
364
+ "# how good are embeddings at discriminating between llama yes and no checks on original enrollments?\n",
365
+ "# (for PHI)\n",
366
+ "patient_summary_embeddings = model.encode(validation_set.patient_summary.tolist(), convert_to_tensor=True)\n",
367
+ "trial_summary_embeddings = model.encode(validation_set.this_space.tolist(), convert_to_tensor=True)"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": null,
373
+ "id": "5bb0bc89-0b4f-451d-9523-550f7344e4d9",
374
+ "metadata": {},
375
+ "outputs": [],
376
+ "source": [
377
+ "similarities = F.cosine_similarity(patient_summary_embeddings, trial_summary_embeddings).detach().cpu().numpy()\n",
378
+ "roc_auc_score(validation_set.eligibility_result, similarities)"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": null,
384
+ "id": "c6035e62-8d28-49c5-8d0a-049633edd553",
385
+ "metadata": {},
386
+ "outputs": [],
387
+ "source": []
388
+ },
389
+ {
390
+ "cell_type": "code",
391
+ "execution_count": null,
392
+ "id": "453c2f3c-105a-4b71-851c-372bf29d3fe8",
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": []
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "id": "69a3fc1d-86f1-49f7-a93a-54f4748c5dbf",
401
+ "metadata": {},
402
+ "outputs": [],
403
+ "source": []
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": null,
408
+ "id": "23d7d1f4-9f1f-42f6-a366-0e39af8893b2",
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": []
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": null,
416
+ "id": "0c4415d5-d0fd-48ca-b88c-2e244434561d",
417
+ "metadata": {},
418
+ "outputs": [],
419
+ "source": []
420
+ }
421
+ ],
422
+ "metadata": {
423
+ "kernelspec": {
424
+ "display_name": "Python 3 (ipykernel)",
425
+ "language": "python",
426
+ "name": "python3"
427
+ },
428
+ "language_info": {
429
+ "codemirror_mode": {
430
+ "name": "ipython",
431
+ "version": 3
432
+ },
433
+ "file_extension": ".py",
434
+ "mimetype": "text/x-python",
435
+ "name": "python",
436
+ "nbconvert_exporter": "python",
437
+ "pygments_lexer": "ipython3",
438
+ "version": "3.9.18"
439
+ }
440
+ },
441
+ "nbformat": 4,
442
+ "nbformat_minor": 5
443
+ }
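Note on the evaluation cells above: the retrained embedding model is scored by cosine similarity between patient and trial-space embeddings, with an AUROC computed against the Llama eligibility labels. A minimal sketch of that evaluation as a reusable helper is shown below; it assumes an already-loaded SentenceTransformer and a dataframe with patient_summary, this_space, and eligibility_result columns (as in the private DFCI validation file), and the function name eligibility_auroc is illustrative.

import pandas as pd
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from sentence_transformers import SentenceTransformer

def eligibility_auroc(model: SentenceTransformer, df: pd.DataFrame) -> float:
    # encode both sides of each patient/trial-space pair
    patient_emb = model.encode(df.patient_summary.tolist(), convert_to_tensor=True)
    space_emb = model.encode(df.this_space.tolist(), convert_to_tensor=True)
    # row-wise cosine similarity, then AUROC against the 0/1 eligibility labels
    sims = F.cosine_similarity(patient_emb, space_emb).cpu().numpy()
    return roc_auc_score(df.eligibility_result, sims)

# e.g. eligibility_auroc(model, validation_set)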
11_train_roberta_checker.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
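The diff for 11_train_roberta_checker.ipynb is not rendered above. Based on how ./roberta-checker is used in 12_example_patient_query.ipynb (a binary POSITIVE/NEGATIVE text-classification pipeline over a trial space concatenated with a patient summary, roberta-large tokenizer, 512-token inputs), a plausible minimal sketch of such a fine-tune is given below. All names and hyperparameters here are assumptions for illustration, not the repository's actual training code; it also assumes the combined_checks dataframe built in 10_train_trialspace_round2.ipynb.

# Illustrative sketch only; not the contents of 11_train_roberta_checker.ipynb.
import pandas as pd
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)

train = combined_checks[combined_checks.split == 'train'].copy()
train['text'] = train['this_space'] + "\nNow here is the patient summary:" + train['patient_summary']
train['label'] = train['mod_eligibility_result'].astype(int)

tokenizer = AutoTokenizer.from_pretrained('roberta-large')
model = AutoModelForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

def tokenize(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=512)

dataset = Dataset.from_pandas(train[['text', 'label']]).map(tokenize, batched=True)

args = TrainingArguments(output_dir='./roberta-checker', num_train_epochs=2,
                         per_device_train_batch_size=8, learning_rate=1e-5)
Trainer(model=model, args=args, train_dataset=dataset).train()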
12_example_patient_query.ipynb ADDED
@@ -0,0 +1,749 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "da78b443-8f9a-426d-8ac1-2320dc10f1d6",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import os\n",
13
+ "# os.environ['CUDA_VISIBLE_DEVICES'] = '2'\n"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 2,
19
+ "id": "6ed86222-b6a4-4f3c-8dd6-ae6ac2fd7545",
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "trial_spaces = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 3,
29
+ "id": "df199321-94ac-4998-a3ad-bb90705485f9",
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "<class 'pandas.core.frame.DataFrame'>\n",
37
+ "RangeIndex: 38140 entries, 0 to 38139\n",
38
+ "Data columns (total 10 columns):\n",
39
+ " # Column Non-Null Count Dtype \n",
40
+ "--- ------ -------------- ----- \n",
41
+ " 0 Unnamed: 0.1 38140 non-null int64 \n",
42
+ " 1 Unnamed: 0 38140 non-null int64 \n",
43
+ " 2 nct_id 38140 non-null object\n",
44
+ " 3 title 38140 non-null object\n",
45
+ " 4 brief_summary 38140 non-null object\n",
46
+ " 5 eligibility_criteria 38140 non-null object\n",
47
+ " 6 trial_text 38140 non-null object\n",
48
+ " 7 spaces 38140 non-null object\n",
49
+ " 8 this_space 38140 non-null object\n",
50
+ " 9 space_number 38140 non-null int64 \n",
51
+ "dtypes: int64(3), object(7)\n",
52
+ "memory usage: 2.9+ MB\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "trial_spaces.info()"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 4,
63
+ "id": "47b983df-d6f7-41d8-8c98-354f395c098e",
64
+ "metadata": {},
65
+ "outputs": [
66
+ {
67
+ "name": "stderr",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "/homes10/klkehl/miniconda3/envs/vllm2/lib/python3.12/site-packages/sentence_transformers/cross_encoder/CrossEncoder.py:13: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
71
+ " from tqdm.autonotebook import tqdm, trange\n",
72
+ "Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:03<00:00, 1.86s/it]\n"
73
+ ]
74
+ }
75
+ ],
76
+ "source": [
77
+ "from sentence_transformers import SentenceTransformer\n",
78
+ "import torch\n",
79
+ "\n",
80
+ "embedding_model = SentenceTransformer('reranker_round2.model', trust_remote_code=True, device='cuda')"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 5,
86
+ "id": "9ae0aa51-92f4-4e44-9a95-bf76196b1b7b",
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "# only needs to be run once to generate and save trial embeddings\n",
91
+ "\n",
92
+ "# with torch.no_grad():\n",
93
+ "# trial_space_embeddings = embedding_model.encode(trial_spaces.this_space.tolist(), convert_to_tensor=True)\n",
94
+ "\n",
95
+ "# from safetensors.torch import save_file\n",
96
+ "# output_trial_file = {\"space_embeddings\": trial_space_embeddings}\n",
97
+ "# save_file(output_trial_file, \"trial_space_embeddings.safetensors\")\n",
98
+ "\n",
99
+ "# trial_space_embeddings.shape"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": null,
105
+ "id": "a6abc0de-b919-41df-88be-b838e0998f51",
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": []
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 6,
113
+ "id": "e6a25767-1939-48dd-a509-0bb2e3598f06",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "from safetensors import safe_open\n",
118
+ "with safe_open(\"trial_space_embeddings.safetensors\", framework=\"pt\", device=0) as f:\n",
119
+ " trial_space_embeddings = f.get_tensor(\"space_embeddings\")"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 7,
125
+ "id": "f83d55e8-4876-4048-8aa9-8aaf766b6722",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "from transformers import pipeline, AutoTokenizer\n",
130
+ "tokenizer = AutoTokenizer.from_pretrained(\"roberta-large\")\n",
131
+ "\n",
132
+ "pipe = pipeline('text-classification', './roberta-checker', tokenizer=tokenizer, truncation=True, padding='max_length', max_length=512, device='cuda') \n"
133
+ ]
134
+ },
135
+ {
136
+ "cell_type": "code",
137
+ "execution_count": 8,
138
+ "id": "0318b38e-8ee8-493c-a3fa-391142ce9696",
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "patient_summary = \"metastatic lung adenocarcinoma, PD-L1 75%, KRAS G12C mutant, prior pembrolizumab, prior carboplatin/pemetrexed\""
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": 9,
148
+ "id": "6d84d3a6-d965-49f2-ad6c-c8f9571da616",
149
+ "metadata": {},
150
+ "outputs": [],
151
+ "source": [
152
+ "patient_embedding = embedding_model.encode([patient_summary], convert_to_tensor=True)"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "code",
157
+ "execution_count": null,
158
+ "id": "08c76a33-fdd0-4b1c-8a93-3a9e1f8b9bd6",
159
+ "metadata": {},
160
+ "outputs": [],
161
+ "source": []
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": null,
166
+ "id": "d8bf2e6f-fc8a-4f1a-9c68-f0e5fb939f5a",
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": []
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": 10,
174
+ "id": "0b36fc28-28f1-4cba-b4fc-679d457e8da4",
175
+ "metadata": {},
176
+ "outputs": [],
177
+ "source": [
178
+ "import torch.nn.functional as F\n",
179
+ "similarities = F.cosine_similarity(patient_embedding, trial_space_embeddings)"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 11,
185
+ "id": "774db345-8fd4-4a5d-b76e-2aa44123b4f7",
186
+ "metadata": {},
187
+ "outputs": [
188
+ {
189
+ "data": {
190
+ "text/plain": [
191
+ "torch.Size([38140])"
192
+ ]
193
+ },
194
+ "execution_count": 11,
195
+ "metadata": {},
196
+ "output_type": "execute_result"
197
+ }
198
+ ],
199
+ "source": [
200
+ "similarities.shape"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 12,
206
+ "id": "b77b3b0b-a474-4a19-a4b8-2cc0990a3a45",
207
+ "metadata": {},
208
+ "outputs": [],
209
+ "source": [
210
+ "# pull top ten spaces for the patient\n",
211
+ "sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)\n",
212
+ "relevant_spaces = trial_spaces.iloc[sorted_indices[0:10].cpu().numpy()].this_space\n",
213
+ "relevant_nctid = trial_spaces.iloc[sorted_indices[0:10].cpu().numpy()].nct_id\n",
214
+ "relevant_title = trial_spaces.iloc[sorted_indices[0:10].cpu().numpy()].title\n",
215
+ "relevant_brief_summary = trial_spaces.iloc[sorted_indices[0:10].cpu().numpy()].brief_summary\n",
216
+ "relevant_eligibility_criteria = trial_spaces.iloc[sorted_indices[0:10].cpu().numpy()].eligibility_criteria\n",
217
+ "relevant_space_embeddings = trial_space_embeddings[sorted_indices[0:10], :]"
218
+ ]
219
+ },
220
+ {
221
+ "cell_type": "code",
222
+ "execution_count": 13,
223
+ "id": "653c772f-6031-4c35-9978-5f23eb9f9301",
224
+ "metadata": {},
225
+ "outputs": [
226
+ {
227
+ "data": {
228
+ "text/html": [
229
+ "<div>\n",
230
+ "<style scoped>\n",
231
+ " .dataframe tbody tr th:only-of-type {\n",
232
+ " vertical-align: middle;\n",
233
+ " }\n",
234
+ "\n",
235
+ " .dataframe tbody tr th {\n",
236
+ " vertical-align: top;\n",
237
+ " }\n",
238
+ "\n",
239
+ " .dataframe thead th {\n",
240
+ " text-align: right;\n",
241
+ " }\n",
242
+ "</style>\n",
243
+ "<table border=\"1\" class=\"dataframe\">\n",
244
+ " <thead>\n",
245
+ " <tr style=\"text-align: right;\">\n",
246
+ " <th></th>\n",
247
+ " <th>patient_summary</th>\n",
248
+ " <th>this_space</th>\n",
249
+ " <th>nct_id</th>\n",
250
+ " <th>trial_title</th>\n",
251
+ " <th>trial_brief_summary</th>\n",
252
+ " <th>trial_eligibility_criteria</th>\n",
253
+ " <th>pt_trial_pair</th>\n",
254
+ " </tr>\n",
255
+ " </thead>\n",
256
+ " <tbody>\n",
257
+ " <tr>\n",
258
+ " <th>0</th>\n",
259
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
260
+ " <td>5. Cancer type allowed: non-small cell lung ca...</td>\n",
261
+ " <td>NCT06253520</td>\n",
262
+ " <td>A Phase Ib Clinical Trial to Evaluate the Admi...</td>\n",
263
+ " <td>Background:\\n\\nMany cancer cells produce subst...</td>\n",
264
+ " <td>* INCLUSION CRITERIA:\\n* Participants with an ...</td>\n",
265
+ " <td>5. Cancer type allowed: non-small cell lung ca...</td>\n",
266
+ " </tr>\n",
267
+ " <tr>\n",
268
+ " <th>1</th>\n",
269
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
270
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
271
+ " <td>NCT05853575</td>\n",
272
+ " <td>A Randomized Study of Two Dosing Regimens of A...</td>\n",
273
+ " <td>This study will evaluate the efficacy of two d...</td>\n",
274
+ " <td>Key Inclusion Criteria:\\n\\n* Are at least 18 y...</td>\n",
275
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
276
+ " </tr>\n",
277
+ " <tr>\n",
278
+ " <th>2</th>\n",
279
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
280
+ " <td>3. Cancer type allowed: non-small cell lung ca...</td>\n",
281
+ " <td>NCT06128551</td>\n",
282
+ " <td>Phase 1b, Multicenter, Open-Label, Dose Escala...</td>\n",
283
+ " <td>This study is to evaluate the safety, tolerabi...</td>\n",
284
+ " <td>Inclusion Criteria:\\n\\n* 18 years of age\\n* Hi...</td>\n",
285
+ " <td>3. Cancer type allowed: non-small cell lung ca...</td>\n",
286
+ " </tr>\n",
287
+ " <tr>\n",
288
+ " <th>3</th>\n",
289
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
290
+ " <td>1. Cancer type allowed: Non-small cell lung ca...</td>\n",
291
+ " <td>NCT05788926</td>\n",
292
+ " <td>A Phase I Dose-escalation Trial of TG6050 Admi...</td>\n",
293
+ " <td>This is a phase I, open-label, dose-escalation...</td>\n",
294
+ " <td>Inclusion Criteria:\\n\\n1. Signed written infor...</td>\n",
295
+ " <td>1. Cancer type allowed: Non-small cell lung ca...</td>\n",
296
+ " </tr>\n",
297
+ " <tr>\n",
298
+ " <th>4</th>\n",
299
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
300
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
301
+ " <td>NCT05375084</td>\n",
302
+ " <td>A Phase 1 Study of the SHP2 Inhibitor BBP-398 ...</td>\n",
303
+ " <td>This is a Phase 1 study of BBP-398, a SHP2 inh...</td>\n",
304
+ " <td>Key Inclusion Criteria:\\n\\n* Patients must hav...</td>\n",
305
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
306
+ " </tr>\n",
307
+ " </tbody>\n",
308
+ "</table>\n",
309
+ "</div>"
310
+ ],
311
+ "text/plain": [
312
+ " patient_summary \\\n",
313
+ "0 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
314
+ "1 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
315
+ "2 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
316
+ "3 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
317
+ "4 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
318
+ "\n",
319
+ " this_space nct_id \\\n",
320
+ "0 5. Cancer type allowed: non-small cell lung ca... NCT06253520 \n",
321
+ "1 1. Cancer type allowed: non-small cell lung ca... NCT05853575 \n",
322
+ "2 3. Cancer type allowed: non-small cell lung ca... NCT06128551 \n",
323
+ "3 1. Cancer type allowed: Non-small cell lung ca... NCT05788926 \n",
324
+ "4 1. Cancer type allowed: non-small cell lung ca... NCT05375084 \n",
325
+ "\n",
326
+ " trial_title \\\n",
327
+ "0 A Phase Ib Clinical Trial to Evaluate the Admi... \n",
328
+ "1 A Randomized Study of Two Dosing Regimens of A... \n",
329
+ "2 Phase 1b, Multicenter, Open-Label, Dose Escala... \n",
330
+ "3 A Phase I Dose-escalation Trial of TG6050 Admi... \n",
331
+ "4 A Phase 1 Study of the SHP2 Inhibitor BBP-398 ... \n",
332
+ "\n",
333
+ " trial_brief_summary \\\n",
334
+ "0 Background:\\n\\nMany cancer cells produce subst... \n",
335
+ "1 This study will evaluate the efficacy of two d... \n",
336
+ "2 This study is to evaluate the safety, tolerabi... \n",
337
+ "3 This is a phase I, open-label, dose-escalation... \n",
338
+ "4 This is a Phase 1 study of BBP-398, a SHP2 inh... \n",
339
+ "\n",
340
+ " trial_eligibility_criteria \\\n",
341
+ "0 * INCLUSION CRITERIA:\\n* Participants with an ... \n",
342
+ "1 Key Inclusion Criteria:\\n\\n* Are at least 18 y... \n",
343
+ "2 Inclusion Criteria:\\n\\n* 18 years of age\\n* Hi... \n",
344
+ "3 Inclusion Criteria:\\n\\n1. Signed written infor... \n",
345
+ "4 Key Inclusion Criteria:\\n\\n* Patients must hav... \n",
346
+ "\n",
347
+ " pt_trial_pair \n",
348
+ "0 5. Cancer type allowed: non-small cell lung ca... \n",
349
+ "1 1. Cancer type allowed: non-small cell lung ca... \n",
350
+ "2 3. Cancer type allowed: non-small cell lung ca... \n",
351
+ "3 1. Cancer type allowed: Non-small cell lung ca... \n",
352
+ "4 1. Cancer type allowed: non-small cell lung ca... "
353
+ ]
354
+ },
355
+ "execution_count": 13,
356
+ "metadata": {},
357
+ "output_type": "execute_result"
358
+ }
359
+ ],
360
+ "source": [
361
+ "analysis = pd.DataFrame({'patient_summary':patient_summary, 'this_space':relevant_spaces,\n",
362
+ " 'nct_id':relevant_nctid, 'trial_title':relevant_title,\n",
363
+ " 'trial_brief_summary':relevant_brief_summary, 'trial_eligibility_criteria':relevant_eligibility_criteria}).reset_index(drop=True)\n",
364
+ "analysis['pt_trial_pair'] = analysis['this_space'] + \"\\nNow here is the patient summary:\" + analysis['patient_summary']\n",
365
+ "analysis.head()"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": 14,
371
+ "id": "38439e90-59bc-4d8a-bef7-8731966ff015",
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": [
375
+ "pipe = pipeline('text-classification', model='./roberta-checker', device='cuda')"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 15,
381
+ "id": "4d6c761a-1d9f-4890-beb8-c20ba5523f87",
382
+ "metadata": {},
383
+ "outputs": [],
384
+ "source": [
385
+ "classifier_results = pipe(analysis.pt_trial_pair.tolist())\n",
386
+ "analysis['roberta_check_result'] = [x['label'] for x in classifier_results]\n",
387
+ "analysis['roberta_check_score'] = [x['score'] for x in classifier_results]\n"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "id": "d9b9cb51-a054-4950-bef7-220c4378757d",
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": []
397
+ },
398
+ {
399
+ "cell_type": "code",
400
+ "execution_count": 16,
401
+ "id": "86d23c7b-d10a-4efb-bec5-ab95ef6dfb0d",
402
+ "metadata": {},
403
+ "outputs": [
404
+ {
405
+ "data": {
406
+ "text/html": [
407
+ "<div>\n",
408
+ "<style scoped>\n",
409
+ " .dataframe tbody tr th:only-of-type {\n",
410
+ " vertical-align: middle;\n",
411
+ " }\n",
412
+ "\n",
413
+ " .dataframe tbody tr th {\n",
414
+ " vertical-align: top;\n",
415
+ " }\n",
416
+ "\n",
417
+ " .dataframe thead th {\n",
418
+ " text-align: right;\n",
419
+ " }\n",
420
+ "</style>\n",
421
+ "<table border=\"1\" class=\"dataframe\">\n",
422
+ " <thead>\n",
423
+ " <tr style=\"text-align: right;\">\n",
424
+ " <th></th>\n",
425
+ " <th>patient_summary</th>\n",
426
+ " <th>this_space</th>\n",
427
+ " <th>nct_id</th>\n",
428
+ " <th>trial_title</th>\n",
429
+ " <th>trial_brief_summary</th>\n",
430
+ " <th>trial_eligibility_criteria</th>\n",
431
+ " <th>pt_trial_pair</th>\n",
432
+ " <th>roberta_check_result</th>\n",
433
+ " <th>roberta_check_score</th>\n",
434
+ " </tr>\n",
435
+ " </thead>\n",
436
+ " <tbody>\n",
437
+ " <tr>\n",
438
+ " <th>0</th>\n",
439
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
440
+ " <td>5. Cancer type allowed: non-small cell lung ca...</td>\n",
441
+ " <td>NCT06253520</td>\n",
442
+ " <td>A Phase Ib Clinical Trial to Evaluate the Admi...</td>\n",
443
+ " <td>Background:\\n\\nMany cancer cells produce subst...</td>\n",
444
+ " <td>* INCLUSION CRITERIA:\\n* Participants with an ...</td>\n",
445
+ " <td>5. Cancer type allowed: non-small cell lung ca...</td>\n",
446
+ " <td>NEGATIVE</td>\n",
447
+ " <td>0.834101</td>\n",
448
+ " </tr>\n",
449
+ " <tr>\n",
450
+ " <th>1</th>\n",
451
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
452
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
453
+ " <td>NCT05853575</td>\n",
454
+ " <td>A Randomized Study of Two Dosing Regimens of A...</td>\n",
455
+ " <td>This study will evaluate the efficacy of two d...</td>\n",
456
+ " <td>Key Inclusion Criteria:\\n\\n* Are at least 18 y...</td>\n",
457
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
458
+ " <td>POSITIVE</td>\n",
459
+ " <td>0.910206</td>\n",
460
+ " </tr>\n",
461
+ " <tr>\n",
462
+ " <th>2</th>\n",
463
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
464
+ " <td>3. Cancer type allowed: non-small cell lung ca...</td>\n",
465
+ " <td>NCT06128551</td>\n",
466
+ " <td>Phase 1b, Multicenter, Open-Label, Dose Escala...</td>\n",
467
+ " <td>This study is to evaluate the safety, tolerabi...</td>\n",
468
+ " <td>Inclusion Criteria:\\n\\n* 18 years of age\\n* Hi...</td>\n",
469
+ " <td>3. Cancer type allowed: non-small cell lung ca...</td>\n",
470
+ " <td>POSITIVE</td>\n",
471
+ " <td>0.915395</td>\n",
472
+ " </tr>\n",
473
+ " <tr>\n",
474
+ " <th>3</th>\n",
475
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
476
+ " <td>1. Cancer type allowed: Non-small cell lung ca...</td>\n",
477
+ " <td>NCT05788926</td>\n",
478
+ " <td>A Phase I Dose-escalation Trial of TG6050 Admi...</td>\n",
479
+ " <td>This is a phase I, open-label, dose-escalation...</td>\n",
480
+ " <td>Inclusion Criteria:\\n\\n1. Signed written infor...</td>\n",
481
+ " <td>1. Cancer type allowed: Non-small cell lung ca...</td>\n",
482
+ " <td>POSITIVE</td>\n",
483
+ " <td>0.914168</td>\n",
484
+ " </tr>\n",
485
+ " <tr>\n",
486
+ " <th>4</th>\n",
487
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
488
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
489
+ " <td>NCT05375084</td>\n",
490
+ " <td>A Phase 1 Study of the SHP2 Inhibitor BBP-398 ...</td>\n",
491
+ " <td>This is a Phase 1 study of BBP-398, a SHP2 inh...</td>\n",
492
+ " <td>Key Inclusion Criteria:\\n\\n* Patients must hav...</td>\n",
493
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
494
+ " <td>POSITIVE</td>\n",
495
+ " <td>0.877930</td>\n",
496
+ " </tr>\n",
497
+ " <tr>\n",
498
+ " <th>5</th>\n",
499
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
500
+ " <td>2. Cancer type allowed: non-small cell lung ca...</td>\n",
501
+ " <td>NCT06128551</td>\n",
502
+ " <td>Phase 1b, Multicenter, Open-Label, Dose Escala...</td>\n",
503
+ " <td>This study is to evaluate the safety, tolerabi...</td>\n",
504
+ " <td>Inclusion Criteria:\\n\\n* 18 years of age\\n* Hi...</td>\n",
505
+ " <td>2. Cancer type allowed: non-small cell lung ca...</td>\n",
506
+ " <td>POSITIVE</td>\n",
507
+ " <td>0.926033</td>\n",
508
+ " </tr>\n",
509
+ " <tr>\n",
510
+ " <th>6</th>\n",
511
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
512
+ " <td>2. Cancer type allowed: Non-Small Cell Lung Ca...</td>\n",
513
+ " <td>NCT06447662</td>\n",
514
+ " <td>A Phase 1 Open-Label Study of PF-07934040 as a...</td>\n",
515
+ " <td>The purpose of this study is to learn about th...</td>\n",
516
+ " <td>Inclusion Criteria:\\n\\n* Histological or cytol...</td>\n",
517
+ " <td>2. Cancer type allowed: Non-Small Cell Lung Ca...</td>\n",
518
+ " <td>POSITIVE</td>\n",
519
+ " <td>0.506948</td>\n",
520
+ " </tr>\n",
521
+ " <tr>\n",
522
+ " <th>7</th>\n",
523
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
524
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
525
+ " <td>NCT06127940</td>\n",
526
+ " <td>K-SAB Trial - Sotorasib Followed by SBRT to 1-...</td>\n",
527
+ " <td>The goal of this interventional study is to le...</td>\n",
528
+ " <td>Main inclusion criteria:\\n\\n1. Histological or...</td>\n",
529
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
530
+ " <td>POSITIVE</td>\n",
531
+ " <td>0.952771</td>\n",
532
+ " </tr>\n",
533
+ " <tr>\n",
534
+ " <th>8</th>\n",
535
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
536
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
537
+ " <td>NCT06343402</td>\n",
538
+ " <td>A Phase 1a/1b Open-Label Study of BBO-8520 As ...</td>\n",
539
+ " <td>A first in human study to evaluate the safety,...</td>\n",
540
+ " <td>Inclusion Criteria:\\n\\n* Histologically docume...</td>\n",
541
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
542
+ " <td>POSITIVE</td>\n",
543
+ " <td>0.949954</td>\n",
544
+ " </tr>\n",
545
+ " <tr>\n",
546
+ " <th>9</th>\n",
547
+ " <td>metastatic lung adenocarcinoma, PD-L1 75%, KRA...</td>\n",
548
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
549
+ " <td>NCT05815173</td>\n",
550
+ " <td>Phase I/II Study of Ladarixin and Sotorasib in...</td>\n",
551
+ " <td>This is a phase I/II, open-label, study of twi...</td>\n",
552
+ " <td>Inclusion Criteria:\\n\\n* Written informed cons...</td>\n",
553
+ " <td>1. Cancer type allowed: non-small cell lung ca...</td>\n",
554
+ " <td>POSITIVE</td>\n",
555
+ " <td>0.937962</td>\n",
556
+ " </tr>\n",
557
+ " </tbody>\n",
558
+ "</table>\n",
559
+ "</div>"
560
+ ],
561
+ "text/plain": [
562
+ " patient_summary \\\n",
563
+ "0 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
564
+ "1 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
565
+ "2 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
566
+ "3 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
567
+ "4 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
568
+ "5 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
569
+ "6 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
570
+ "7 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
571
+ "8 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
572
+ "9 metastatic lung adenocarcinoma, PD-L1 75%, KRA... \n",
573
+ "\n",
574
+ " this_space nct_id \\\n",
575
+ "0 5. Cancer type allowed: non-small cell lung ca... NCT06253520 \n",
576
+ "1 1. Cancer type allowed: non-small cell lung ca... NCT05853575 \n",
577
+ "2 3. Cancer type allowed: non-small cell lung ca... NCT06128551 \n",
578
+ "3 1. Cancer type allowed: Non-small cell lung ca... NCT05788926 \n",
579
+ "4 1. Cancer type allowed: non-small cell lung ca... NCT05375084 \n",
580
+ "5 2. Cancer type allowed: non-small cell lung ca... NCT06128551 \n",
581
+ "6 2. Cancer type allowed: Non-Small Cell Lung Ca... NCT06447662 \n",
582
+ "7 1. Cancer type allowed: non-small cell lung ca... NCT06127940 \n",
583
+ "8 1. Cancer type allowed: non-small cell lung ca... NCT06343402 \n",
584
+ "9 1. Cancer type allowed: non-small cell lung ca... NCT05815173 \n",
585
+ "\n",
586
+ " trial_title \\\n",
587
+ "0 A Phase Ib Clinical Trial to Evaluate the Admi... \n",
588
+ "1 A Randomized Study of Two Dosing Regimens of A... \n",
589
+ "2 Phase 1b, Multicenter, Open-Label, Dose Escala... \n",
590
+ "3 A Phase I Dose-escalation Trial of TG6050 Admi... \n",
591
+ "4 A Phase 1 Study of the SHP2 Inhibitor BBP-398 ... \n",
592
+ "5 Phase 1b, Multicenter, Open-Label, Dose Escala... \n",
593
+ "6 A Phase 1 Open-Label Study of PF-07934040 as a... \n",
594
+ "7 K-SAB Trial - Sotorasib Followed by SBRT to 1-... \n",
595
+ "8 A Phase 1a/1b Open-Label Study of BBO-8520 As ... \n",
596
+ "9 Phase I/II Study of Ladarixin and Sotorasib in... \n",
597
+ "\n",
598
+ " trial_brief_summary \\\n",
599
+ "0 Background:\\n\\nMany cancer cells produce subst... \n",
600
+ "1 This study will evaluate the efficacy of two d... \n",
601
+ "2 This study is to evaluate the safety, tolerabi... \n",
602
+ "3 This is a phase I, open-label, dose-escalation... \n",
603
+ "4 This is a Phase 1 study of BBP-398, a SHP2 inh... \n",
604
+ "5 This study is to evaluate the safety, tolerabi... \n",
605
+ "6 The purpose of this study is to learn about th... \n",
606
+ "7 The goal of this interventional study is to le... \n",
607
+ "8 A first in human study to evaluate the safety,... \n",
608
+ "9 This is a phase I/II, open-label, study of twi... \n",
609
+ "\n",
610
+ " trial_eligibility_criteria \\\n",
611
+ "0 * INCLUSION CRITERIA:\\n* Participants with an ... \n",
612
+ "1 Key Inclusion Criteria:\\n\\n* Are at least 18 y... \n",
613
+ "2 Inclusion Criteria:\\n\\n* 18 years of age\\n* Hi... \n",
614
+ "3 Inclusion Criteria:\\n\\n1. Signed written infor... \n",
615
+ "4 Key Inclusion Criteria:\\n\\n* Patients must hav... \n",
616
+ "5 Inclusion Criteria:\\n\\n* 18 years of age\\n* Hi... \n",
617
+ "6 Inclusion Criteria:\\n\\n* Histological or cytol... \n",
618
+ "7 Main inclusion criteria:\\n\\n1. Histological or... \n",
619
+ "8 Inclusion Criteria:\\n\\n* Histologically docume... \n",
620
+ "9 Inclusion Criteria:\\n\\n* Written informed cons... \n",
621
+ "\n",
622
+ " pt_trial_pair roberta_check_result \\\n",
623
+ "0 5. Cancer type allowed: non-small cell lung ca... NEGATIVE \n",
624
+ "1 1. Cancer type allowed: non-small cell lung ca... POSITIVE \n",
625
+ "2 3. Cancer type allowed: non-small cell lung ca... POSITIVE \n",
626
+ "3 1. Cancer type allowed: Non-small cell lung ca... POSITIVE \n",
627
+ "4 1. Cancer type allowed: non-small cell lung ca... POSITIVE \n",
628
+ "5 2. Cancer type allowed: non-small cell lung ca... POSITIVE \n",
629
+ "6 2. Cancer type allowed: Non-Small Cell Lung Ca... POSITIVE \n",
630
+ "7 1. Cancer type allowed: non-small cell lung ca... POSITIVE \n",
631
+ "8 1. Cancer type allowed: non-small cell lung ca... POSITIVE \n",
632
+ "9 1. Cancer type allowed: non-small cell lung ca... POSITIVE \n",
633
+ "\n",
634
+ " roberta_check_score \n",
635
+ "0 0.834101 \n",
636
+ "1 0.910206 \n",
637
+ "2 0.915395 \n",
638
+ "3 0.914168 \n",
639
+ "4 0.877930 \n",
640
+ "5 0.926033 \n",
641
+ "6 0.506948 \n",
642
+ "7 0.952771 \n",
643
+ "8 0.949954 \n",
644
+ "9 0.937962 "
645
+ ]
646
+ },
647
+ "execution_count": 16,
648
+ "metadata": {},
649
+ "output_type": "execute_result"
650
+ }
651
+ ],
652
+ "source": [
653
+ "analysis"
654
+ ]
655
+ },
656
+ {
657
+ "cell_type": "code",
658
+ "execution_count": 17,
659
+ "id": "94ccb775-e2da-47cf-a64d-6e017a5bb11f",
660
+ "metadata": {},
661
+ "outputs": [
662
+ {
663
+ "data": {
664
+ "text/plain": [
665
+ "'5. Cancer type allowed: non-small cell lung cancer. Histology allowed: solid cancer. Cancer burden allowed: metastatic disease. Prior treatment required: at least one platinum-based chemotherapy regimen and at least one FDA-approved targeted treatment. Biomarkers required: KRAS G12V or G12D mutation. Biomarkers to be assessed during screening: HLA match.'"
666
+ ]
667
+ },
668
+ "execution_count": 17,
669
+ "metadata": {},
670
+ "output_type": "execute_result"
671
+ }
672
+ ],
673
+ "source": [
674
+ "analysis.this_space.iloc[0]"
675
+ ]
676
+ },
677
+ {
678
+ "cell_type": "code",
679
+ "execution_count": 18,
680
+ "id": "27f3bdda-a893-4439-bb20-d864c0476b23",
681
+ "metadata": {},
682
+ "outputs": [
683
+ {
684
+ "data": {
685
+ "text/plain": [
686
+ "'1. Cancer type allowed: non-small cell lung cancer. Histology allowed: adenocarcinoma, squamous cell carcinoma, large cell carcinoma, and other subtypes of non-small cell lung cancer. Cancer burden allowed: advanced, metastatic. Prior treatment required: chemotherapy that included cisplatin or carboplatin, immune checkpoint inhibitor. Prior treatment excluded: KRAS G12C targeted therapy. Biomarkers required: KRAS G12C mutation.'"
687
+ ]
688
+ },
689
+ "execution_count": 18,
690
+ "metadata": {},
691
+ "output_type": "execute_result"
692
+ }
693
+ ],
694
+ "source": [
695
+ "analysis.this_space.iloc[1]"
696
+ ]
697
+ },
698
+ {
699
+ "cell_type": "code",
700
+ "execution_count": 19,
701
+ "id": "824794ca-1934-47dd-8844-74cf0ea2db75",
702
+ "metadata": {},
703
+ "outputs": [
704
+ {
705
+ "data": {
706
+ "text/plain": [
707
+ "'3. Cancer type allowed: non-small cell lung cancer. Histology allowed: pathologically documented, KRAS G12C-mutated. Cancer burden allowed: advanced or metastatic. Prior treatment required: immunotherapy, chemotherapy. Biomarkers required: KRAS G12C mutation.'"
708
+ ]
709
+ },
710
+ "execution_count": 19,
711
+ "metadata": {},
712
+ "output_type": "execute_result"
713
+ }
714
+ ],
715
+ "source": [
716
+ "analysis.this_space.iloc[2]"
717
+ ]
718
+ },
719
+ {
720
+ "cell_type": "code",
721
+ "execution_count": null,
722
+ "id": "24a06913-e32e-4f14-ba0a-c9cd6ae6c4fd",
723
+ "metadata": {},
724
+ "outputs": [],
725
+ "source": []
726
+ }
727
+ ],
728
+ "metadata": {
729
+ "kernelspec": {
730
+ "display_name": "Python 3 (ipykernel)",
731
+ "language": "python",
732
+ "name": "python3"
733
+ },
734
+ "language_info": {
735
+ "codemirror_mode": {
736
+ "name": "ipython",
737
+ "version": 3
738
+ },
739
+ "file_extension": ".py",
740
+ "mimetype": "text/x-python",
741
+ "name": "python",
742
+ "nbconvert_exporter": "python",
743
+ "pygments_lexer": "ipython3",
744
+ "version": "3.9.18"
745
+ }
746
+ },
747
+ "nbformat": 4,
748
+ "nbformat_minor": 5
749
+ }
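The this_space summaries printed above follow a regular "Field: value." pattern. As a minimal downstream sketch (not part of this commit; the field list and the parse_clinical_space helper are illustrative assumptions), such a summary could be split back into labeled fields like this:

import re

# Field labels observed in the summaries above; assumed, not exhaustive.
FIELDS = [
    "Cancer type allowed", "Histology allowed", "Cancer burden allowed",
    "Prior treatment required", "Prior treatment excluded",
    "Biomarkers required", "Biomarkers to be assessed during screening",
]
FIELD_ALTERNATION = "|".join(re.escape(f) for f in FIELDS)

def parse_clinical_space(space_text):
    """Split one 'clinical space' summary string into a dict of field -> value."""
    text = re.sub(r"^\s*\d+\.\s*", "", space_text)  # drop leading "5. " style numbering
    parsed = {}
    for field in FIELDS:
        # Capture everything after "Field:" up to the next known field label or end of string.
        pattern = re.escape(field) + r":\s*(.*?)(?=(?:" + FIELD_ALTERNATION + r"):|$)"
        match = re.search(pattern, text, flags=re.DOTALL)
        if match:
            parsed[field] = match.group(1).strip().rstrip(".")
    return parsed

# Example, using the first summary shown above:
# parse_clinical_space(analysis.this_space.iloc[0])["Biomarkers required"]
# -> "KRAS G12V or G12D mutation"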
1a_generate_synthetic_imaging_reports.ipynb ADDED
@@ -0,0 +1,183 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "180f9bc1-03cc-4e31-babe-3f6c6ecb0167",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "id": "ef6b0609-695f-4975-9970-f8b8350f953d",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": []
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "f986b468-c428-4ca3-9101-1cbabe6ad73f",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from vllm import LLM, SamplingParams\n",
29
+ "import pandas as pd\n",
30
+ "import numpy as np\n",
31
+ "import torch.nn.functional as F\n",
32
+ "import torch\n",
33
+ "from transformers import AutoTokenizer\n",
34
+ "from transformers import AutoModelForCausalLM\n",
35
+ "import re\n",
36
+ "import os\n",
37
+ "#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "a8138f4e-6f45-4c98-b6b6-19370d53e7ac",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "# llama = LLM(model='meta-llama/Meta-Llama-3.1-8B-Instruct', tensor_parallel_size = 2, \n",
48
+ "# gpu_memory_utilization=0.95,\n",
49
+ "# download_dir = \"../../\", max_model_len=120000)"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "id": "8e0537c8-85cc-4bae-97de-6dd6f70ea5a3",
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, download_dir = \"../meta_ai/\", gpu_memory_utilization=0.80, max_model_len=5000)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "id": "e0394142-a749-4995-abc8-bac884fea671",
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "def generate_synthetic_imaging_reports(num_reports, llama_model):\n",
70
+ "\n",
71
+ " tokenizer = llama_model.get_tokenizer()\n",
72
+ " prompts = []\n",
73
+ " scan_types = np.random.choice(['CT scan', 'MRI', 'Nuclear bone scan', 'PET-CT'], size=num_reports)\n",
74
+ " cancer_types = np.random.choice(['breast', 'non-small cell lung', 'small cell lung', 'colorectal', 'pancreatic', 'urothelial', 'prostate', 'gastric', 'esophageal', 'thymoma', 'thymic carcinoma', 'adrenal', 'ovarian', 'endometrial', 'melanoma', 'renal cell', 'sarcoma', 'head and neck', 'Hodgkin lymphoma', 'Non-Hodgkin lymphoma', 'myeloma', 'acute myeloid leukemia', 'chronic myeloid leukemia', 'acute lymphoblastic leukemia', 'chronic lymphocytic leukemia/lymphoma', 'primary brain tumor'], size=num_reports) \n",
75
+ "\n",
76
+ " for i in range(num_reports):\n",
77
+ "\n",
78
+ " messages = [\n",
79
+ " {'role':'system', 'content': \"\"\"Your job is to generate synthetic imaging reports for hypothetical patients with cancer.\n",
80
+ " You know all there is to know about cancer and its treatments, so be detailed. \n",
81
+ " \"\"\"}, \n",
82
+ "\n",
83
+ "\n",
84
+ " {'role':'user', 'content': \"\"\"Imagine a patient with cancer. \n",
85
+ " The cancer type is \"\"\" + cancer_types[i] + \".\" + \"\"\"\n",
86
+ " Then, generate a very detailed imaging report that might have been written about an imaging study performed for the patient. \n",
87
+ " The patient might have any stage of disease and be at any point along the disease trajectory. Use everything you know about cancer, including epidemiology, treatment, and heterogeneity in disease presentations.\n",
88
+ " The imaging study type is \"\"\" + scan_types[i] + \".\" + \"\"\"\n",
89
+ " The report should include a detailed \"Findings\" section followed by an \"Impression\" section.\n",
90
+ " The report should not include any treatment recommendations.\n",
91
+ " The imaging report should be approximately a full page long.\"\"\"}\n",
92
+ " ]\n",
93
+ " \n",
94
+ " prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))\n",
95
+ " \n",
96
+ "\n",
97
+ " \n",
98
+ " responses = llama_model.generate(\n",
99
+ " prompts, \n",
100
+ " SamplingParams(\n",
101
+ " temperature=1.0,\n",
102
+ " top_p=0.9,\n",
103
+ " max_tokens=4000,\n",
104
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
105
+ " ))\n",
106
+ "\n",
107
+ " response_texts = [x.outputs[0].text for x in responses]\n",
108
+ "\n",
109
+ "\n",
110
+ " return pd.DataFrame({'cancer_type':cancer_types, 'scan_type':scan_types, 'synthetic_imaging_report':response_texts})"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "code",
115
+ "execution_count": null,
116
+ "id": "109f2208-de29-43cf-a831-4609bdab225e",
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "results = generate_synthetic_imaging_reports(10000, llama)"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "id": "443fa19c-8933-41d2-ba04-382902421e08",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "results.synthetic_imaging_report.sample(n=1).iloc[0]"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": null,
136
+ "id": "b40b06f0-69e9-48cc-9766-61d8d26178d6",
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "results.head()"
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "id": "6119a0b4-5a63-4f91-a637-484b5e9dc29c",
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "results.to_csv('synthetic_imaging_reports.csv')"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "id": "71a24b19-5e1c-4c24-b13b-ac04c1e94bd2",
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": []
160
+ }
161
+ ],
162
+ "metadata": {
163
+ "kernelspec": {
164
+ "display_name": "Python 3 (ipykernel)",
165
+ "language": "python",
166
+ "name": "python3"
167
+ },
168
+ "language_info": {
169
+ "codemirror_mode": {
170
+ "name": "ipython",
171
+ "version": 3
172
+ },
173
+ "file_extension": ".py",
174
+ "mimetype": "text/x-python",
175
+ "name": "python",
176
+ "nbconvert_exporter": "python",
177
+ "pygments_lexer": "ipython3",
178
+ "version": "3.12.5"
179
+ }
180
+ },
181
+ "nbformat": 4,
182
+ "nbformat_minor": 5
183
+ }
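The prompt in the notebook above asks every synthetic imaging report for a "Findings" section followed by an "Impression" section. A quick post-hoc check on the saved CSV might look like the sketch below; the check itself is an assumed QC step, not code from this repository (column names match the notebook's output).

import pandas as pd

reports = pd.read_csv('synthetic_imaging_reports.csv')

# Flag reports that contain both required section headers, case-insensitively.
has_findings = reports['synthetic_imaging_report'].str.contains('findings', case=False, na=False)
has_impression = reports['synthetic_imaging_report'].str.contains('impression', case=False, na=False)
reports['well_formed'] = has_findings & has_impression

# Share of well-formed reports overall and by scan type.
print(reports['well_formed'].mean())
print(reports.groupby('scan_type')['well_formed'].mean())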
1b_generate_synthetic_clinical_notes.ipynb ADDED
@@ -0,0 +1,136 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f986b468-c428-4ca3-9101-1cbabe6ad73f",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from vllm import LLM, SamplingParams\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "import torch.nn.functional as F\n",
14
+ "import torch\n",
15
+ "from transformers import AutoTokenizer\n",
16
+ "from transformers import AutoModelForCausalLM\n",
17
+ "import re\n",
18
+ "import os\n",
19
+ "#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "id": "8e0537c8-85cc-4bae-97de-6dd6f70ea5a3",
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, download_dir = \"../meta_ai/\", gpu_memory_utilization=0.80, max_model_len=10000)"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "6119a0b4-5a63-4f91-a637-484b5e9dc29c",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": []
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "71a24b19-5e1c-4c24-b13b-ac04c1e94bd2",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "def generate_synthetic_clinical_notes(num_reports, llama_model):\n",
48
+ "\n",
49
+ " tokenizer = llama_model.get_tokenizer()\n",
50
+ " prompts = []\n",
51
+ " cancer_types = np.random.choice(['breast', 'non-small cell lung', 'small cell lung', 'colorectal', 'pancreatic', 'urothelial', 'prostate', 'gastric', 'esophageal', 'thymoma', 'thymic carcinoma', 'adrenal', 'ovarian', 'endometrial', 'melanoma', 'renal cell', 'sarcoma', 'head and neck', 'Hodgkin lymphoma', 'Non-Hodgkin lymphoma', 'myeloma', 'acute myeloid leukemia', 'chronic myeloid leukemia', 'acute lymphoblastic leukemia', 'chronic lymphocytic leukemia/lymphoma', 'primary brain tumor'], size=num_reports) \n",
52
+ "\n",
53
+ " for i in range(num_reports):\n",
54
+ " messages = [\n",
55
+ " {'role':'system', 'content': \"\"\"Your job is to generate synthetic oncologist clinical progress notes for hypothetical patients with cancer.\n",
56
+ " You know all there is to know about cancer and its treatments, so be detailed. \n",
57
+ " \"\"\"}, \n",
58
+ " \n",
59
+ " {'role':'user', 'content': \"\"\"Imagine a patient with cancer. \n",
60
+ " The cancer type is\"\"\" + cancer_types[i] + \".\" + \"\"\"\n",
61
+ " The patient might have any stage of disease. Use everything you know about cancer, including biomarkers, epidemiology, and heterogeneity in disease presentations.\n",
62
+ " The note might correspond to any point along the disease trajectory, from initial diagnosis to curative intent treatment to palliative intent treatment.\n",
63
+ " The note should include a chief complaint, oncologic history including prior treatments, past medical history/comorbidities, current subjective clinical status and physical exam including vital signs and ECOG performance status, laboratory values, radiology excerpts, and an assessment and plan.\n",
64
+ " The note should should be approximately two pages long. It will not be used for clinical care, so do not include disclaimers.\"\"\"}\n",
65
+ " ]\n",
66
+ " \n",
67
+ " prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))\n",
68
+ " \n",
69
+ "\n",
70
+ " \n",
71
+ " responses = llama_model.generate(\n",
72
+ " prompts, \n",
73
+ " SamplingParams(\n",
74
+ " temperature=1.0,\n",
75
+ " top_p=0.9,\n",
76
+ " max_tokens=4000,\n",
77
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
78
+ " ))\n",
79
+ "\n",
80
+ " response_texts = [x.outputs[0].text for x in responses]\n",
81
+ "\n",
82
+ "\n",
83
+ " return pd.DataFrame({'cancer_type':cancer_types, 'synthetic_clinical_note':response_texts})"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "63642559-3583-4771-88f1-5d4e8b6e033f",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "results = generate_synthetic_clinical_notes(10000, llama)"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "id": "6eeb2f9c-4ca8-424b-b30e-dcee57323fe2",
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "results.to_csv('synthetic_clinical_notes.csv')"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "0470b150-dda1-4900-adb6-a90e5cf74e39",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": []
113
+ }
114
+ ],
115
+ "metadata": {
116
+ "kernelspec": {
117
+ "display_name": "Python 3 (ipykernel)",
118
+ "language": "python",
119
+ "name": "python3"
120
+ },
121
+ "language_info": {
122
+ "codemirror_mode": {
123
+ "name": "ipython",
124
+ "version": 3
125
+ },
126
+ "file_extension": ".py",
127
+ "mimetype": "text/x-python",
128
+ "name": "python",
129
+ "nbconvert_exporter": "python",
130
+ "pygments_lexer": "ipython3",
131
+ "version": "3.12.5"
132
+ }
133
+ },
134
+ "nbformat": 4,
135
+ "nbformat_minor": 5
136
+ }
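Generation above is capped at max_tokens=4000 and the prompt targets roughly two pages per note. A rough length audit of the saved notes, using whitespace word counts as a crude proxy for tokens (an assumption, not part of the pipeline), can flag truncated or runaway outputs before the tagging step:

import pandas as pd

notes = pd.read_csv('synthetic_clinical_notes.csv')

# Rough length proxy: whitespace-delimited word count per note.
notes['n_words'] = notes['synthetic_clinical_note'].str.split().str.len()

# Length distribution overall and by cancer type; extremes suggest truncation or drift.
print(notes['n_words'].describe())
print(notes.groupby('cancer_type')['n_words'].median().sort_values())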
1c_generate_synthetic_path_reports.ipynb ADDED
@@ -0,0 +1,136 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f986b468-c428-4ca3-9101-1cbabe6ad73f",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from vllm import LLM, SamplingParams\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "import torch.nn.functional as F\n",
14
+ "import torch\n",
15
+ "from transformers import AutoTokenizer\n",
16
+ "from transformers import AutoModelForCausalLM\n",
17
+ "import re\n",
18
+ "import os\n",
19
+ "#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "id": "8e0537c8-85cc-4bae-97de-6dd6f70ea5a3",
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, download_dir = \"../meta_ai/\", gpu_memory_utilization=0.80, max_model_len=10000)"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "e0394142-a749-4995-abc8-bac884fea671",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "def generate_synthetic_path_reports(num_reports, llama_model):\n",
40
+ "\n",
41
+ " tokenizer = llama_model.get_tokenizer()\n",
42
+ " prompts = []\n",
43
+ " cancer_types = np.random.choice(['breast', 'non-small cell lung', 'small cell lung', 'colorectal', 'pancreatic', 'urothelial', 'prostate', 'gastric', 'esophageal', 'thymoma', 'thymic carcinoma', 'adrenal', 'ovarian', 'endometrial', 'melanoma', 'renal cell', 'sarcoma', 'head and neck', 'Hodgkin lymphoma', 'Non-Hodgkin lymphoma', 'myeloma', 'acute myeloid leukemia', 'chronic myeloid leukemia', 'acute lymphoblastic leukemia', 'chronic lymphocytic leukemia/lymphoma', 'primary brain tumor'], size=num_reports) \n",
44
+ "\n",
45
+ " for i in range(num_reports):\n",
46
+ " messages = [\n",
47
+ " {'role':'system', 'content': \"\"\"Your job is to generate synthetic pathology reports for hypothetical patients with cancer.\n",
48
+ " You know all there is to know about cancer and its treatments, so be detailed. \n",
49
+ " \"\"\"}, \n",
50
+ " \n",
51
+ " {'role':'user', 'content': \"\"\"Imagine a patient with cancer. \n",
52
+ " The cancer type is\"\"\" + cancer_types[i] + \".\" + \"\"\"\n",
53
+ " Then, generate a very detailed pathology report that might have been written about a specimen collected from the patient. The patient might have any stage of disease. Use everything you know about cancer, including biomarkers, epidemiology, and heterogeneity in disease presentations.\n",
54
+ " The report might be from a cytology specimen, anatomic pathology specimen, genomic sequencing analysis, bone marrow biopsy, flow cytometry, SPEP, etc.\n",
55
+ " The report should not include any treatment recommendations.\n",
56
+ " The pathology report should be approximately a full page long.\"\"\"}\n",
57
+ " ]\n",
58
+ " \n",
59
+ " prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))\n",
60
+ " \n",
61
+ "\n",
62
+ " \n",
63
+ " responses = llama_model.generate(\n",
64
+ " prompts, \n",
65
+ " SamplingParams(\n",
66
+ " temperature=1.0,\n",
67
+ " top_p=0.9,\n",
68
+ " max_tokens=4000,\n",
69
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
70
+ " ))\n",
71
+ "\n",
72
+ " response_texts = [x.outputs[0].text for x in responses]\n",
73
+ "\n",
74
+ "\n",
75
+ " return pd.DataFrame({'cancer_type':cancer_types, 'synthetic_path_report':response_texts})"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "109f2208-de29-43cf-a831-4609bdab225e",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "results = generate_synthetic_path_reports(10000, llama)"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "id": "b40b06f0-69e9-48cc-9766-61d8d26178d6",
92
+ "metadata": {},
93
+ "outputs": [],
94
+ "source": [
95
+ "results.to_csv('synthetic_path_reports.csv')"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "id": "6119a0b4-5a63-4f91-a637-484b5e9dc29c",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": []
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "0470b150-dda1-4900-adb6-a90e5cf74e39",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": []
113
+ }
114
+ ],
115
+ "metadata": {
116
+ "kernelspec": {
117
+ "display_name": "Python 3 (ipykernel)",
118
+ "language": "python",
119
+ "name": "python3"
120
+ },
121
+ "language_info": {
122
+ "codemirror_mode": {
123
+ "name": "ipython",
124
+ "version": 3
125
+ },
126
+ "file_extension": ".py",
127
+ "mimetype": "text/x-python",
128
+ "name": "python",
129
+ "nbconvert_exporter": "python",
130
+ "pygments_lexer": "ipython3",
131
+ "version": "3.12.5"
132
+ }
133
+ },
134
+ "nbformat": 4,
135
+ "nbformat_minor": 5
136
+ }
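Because 10,000 pathology reports are sampled from the same prompt at temperature 1.0, some exact or near-duplicate outputs are plausible. A cheap exact-duplicate check on the saved file is sketched below; it is an illustrative QC step and not part of the notebooks.

import pandas as pd

path_reports = pd.read_csv('synthetic_path_reports.csv')

# Count exact duplicates of the generated report text.
dupes = path_reports.duplicated(subset='synthetic_path_report', keep=False)
print(f"{dupes.sum()} of {len(path_reports)} reports are exact duplicates")

# Optionally keep one copy of each unique report.
deduped = path_reports.drop_duplicates(subset='synthetic_path_report')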
2a_tag_chunks_of_synthetic_notes.ipynb ADDED
@@ -0,0 +1,271 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "a3d6ff53-2176-44aa-8590-ec0aa301342d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from vllm import LLM, SamplingParams\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "import torch.nn.functional as F\n",
14
+ "import torch\n",
15
+ "from transformers import AutoTokenizer\n",
16
+ "from transformers import AutoModelForCausalLM\n",
17
+ "import re\n",
18
+ "import os\n",
19
+ "#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0,2\"\n"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "id": "62669512-19e7-43cd-a518-4572eea700af",
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, \n",
30
+ " gpu_memory_utilization=0.50,\n",
31
+ " download_dir = \"../../\", max_model_len=12000)"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "id": "f87a20c9-fbea-4a09-9ffc-99c64bcd3709",
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": []
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "aa6a0fb4-b22b-4e24-a4c1-55b95182fe60",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": []
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "id": "f32e5e86-2769-4dad-972c-00e6ddddb95a",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "import re\n",
58
+ "def tag_chunks(patient_texts, llama_model):\n",
59
+ " \n",
60
+ "\n",
61
+ "\n",
62
+ " tokenizer = llama_model.get_tokenizer()\n",
63
+ "\n",
64
+ " prompts = []\n",
65
+ " for the_patient in patient_texts:\n",
66
+ " temp_patient = re.sub(\"\\n|\\r\", \" \", the_patient.strip())\n",
67
+ " temp_patient = re.sub(r'\\s+', \" \", temp_patient)\n",
68
+ " sentences = \"<excerpt break>\" + re.sub(\"\\\\. \", \"<excerpt break>\", temp_patient) + \"<excerpt break>\"\n",
69
+ " \n",
70
+ " messages = [{'role':'system', 'content': \"\"\"You are an oncology clinical note data extraction bot.\n",
71
+ " Your job is to review a list of excerpts from a clinical document and extract the excerpts relevant to a list of questions.\n",
72
+ " \"\"\" \n",
73
+ " \n",
74
+ " },\n",
75
+ "\n",
76
+ " {'role':'user', 'content': \"The list of excerpts, separated by <excerpt break>, is: \" + sentences + \n",
77
+ " \"\"\"Now, list the excerpts relevant to any of the following questions.\n",
78
+ " Format your answer as JSON, tagging each excerpt that is relevant to at least one question with each tag to which it is relevant.\n",
79
+ " Here is the list of questions:\n",
80
+ " What type of cancer (primary site and histology) does the patient have? (Tag: cancer_type )\n",
81
+ " What was the stage at diagnosis? (Tag: stage_at_diagnosis)\n",
82
+ " What treatments (including surgery, radiation, or systemic therapy) has the patient received? (Tag: treatment)\n",
83
+ " How widespread is the cancer currently? (Tag: cancer_burden)\n",
84
+ " Is there response to therapy or progressive disease? (Tag: cancer_status)\n",
85
+ " Is the patient experiencing an adverse event of treatment? (Tag: adverse_event)\n",
86
+ " What biomarkers, such as protein expression and genetic mutations/alterations, does the patient's tumor have? (Tag: biomarker)\n",
87
+ " What comorbidities, or diseases other than cancer, does the patient have? (Tag: comorbidity)\n",
88
+ " Here is an example of the output format:\n",
89
+ " [{\"excerpt\": \"80M with metastatic lung adenocarcinoma.\", \"tags\": [\"cancer_type\", \"cancer_burden\"]},\n",
90
+ " {\"excerpt\": \"The tumor was HER2 positive.\", \"tags\": [\"biomarker\"]}\n",
91
+ " ]\n",
92
+ " \n",
93
+ " Do not include excerpts that are not relevant to the questions. \n",
94
+ " Do not abbreviate or alter excerpts that you do include; copy them verbatim from the prompt.\n",
95
+ " Do not add disclaimers or introductory text.\n",
96
+ " If there are no excerpts relevant to the above questions, just output blank JSON {} .\n",
97
+ " \"\"\"}\n",
98
+ " ]\n",
99
+ "\n",
100
+ " prompts.append(messages)\n",
101
+ "\n",
102
+ " long_messages = [x[1]['content'] for x in prompts]\n",
103
+ " trunc_messages = tokenizer.batch_decode([x[-10000:] for x in tokenizer(long_messages, add_special_tokens=False).input_ids])\n",
104
+ "\n",
105
+ " newprompts = []\n",
106
+ " for i, messages in enumerate(prompts):\n",
107
+ " messages[1]['content'] = trunc_messages[i]\n",
108
+ " template_prompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)\n",
109
+ " newprompts.append(template_prompt)\n",
110
+ " \n",
111
+ "\n",
112
+ " \n",
113
+ " responses = llama_model.generate(\n",
114
+ " newprompts, \n",
115
+ " SamplingParams(\n",
116
+ " temperature=0.0,\n",
117
+ " top_p=0.2,\n",
118
+ " max_tokens=5000,\n",
119
+ " repetition_penalty=1.2,\n",
120
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
121
+ " ))\n",
122
+ "\n",
123
+ " response_texts = [x.outputs[0].text for x in responses]\n",
124
+ "\n",
125
+ "\n",
126
+ " return responses, response_texts\n",
127
+ " "
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "id": "fd501ab8-65de-4098-9019-ead68ab9cd5e",
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": []
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "8a070d00-9a45-4360-a38f-ceed8a9360e1",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": []
145
+ },
146
+ {
147
+ "cell_type": "code",
148
+ "execution_count": null,
149
+ "id": "064eef80-feae-407b-b2cd-ad7aa115c0de",
150
+ "metadata": {},
151
+ "outputs": [],
152
+ "source": [
153
+ "# pull in our synthetic notes\n",
154
+ "imaging = pd.read_csv('synthetic_imaging_reports.csv').rename(columns={'synthetic_imaging_report':'text'})\n",
155
+ "medonc = pd.read_csv('synthetic_clinical_notes.csv').rename(columns={'synthetic_clinical_note':'text'})\n",
156
+ "path = pd.read_csv('synthetic_path_reports.csv').rename(columns={'synthetic_path_report':'text'})\n"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "088f4db2-ef4e-45e6-bbf9-77cd94224e94",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "all_reports = pd.concat([imaging, medonc, path], axis=0)\n"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "id": "246cbc95-8130-493b-9422-9b204d0a381b",
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "all_reports.info()"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "id": "1119b5e8-d8fc-416f-9d72-e3d5a5608066",
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": []
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": null,
190
+ "id": "3f9bece9-fa57-4bdc-a9e5-53f6b4f61caa",
191
+ "metadata": {},
192
+ "outputs": [],
193
+ "source": [
194
+ "response = tag_chunks(all_reports.sample(n=2).text.tolist(), llama)"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "id": "bee4a4e1-bf6a-4e9a-98da-828b5ddea3c2",
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "response[1]"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "id": "d0e8390b-ef71-4c49-8cef-6e565d28fe56",
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": []
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": null,
218
+ "id": "18b1b677-9798-4172-809e-e763c6f30121",
219
+ "metadata": {
220
+ "scrolled": true
221
+ },
222
+ "outputs": [],
223
+ "source": [
224
+ "output_datasets = []\n",
225
+ "for i in range(0, all_reports.shape[0], 5000):\n",
226
+ " output_dataset = all_reports.iloc[i:(i+5000)]\n",
227
+ " output_dataset['llm_output'] = tag_chunks(output_dataset.text.tolist(), llama)[1]\n",
228
+ " output_datasets.append(output_dataset)\n",
229
+ " fileout = pd.concat(output_datasets, axis=0)\n",
230
+ " fileout.to_parquet('tagged_chunks.parquet')"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": null,
236
+ "id": "92a37f90-f1fe-4a6c-85fb-cc2941456b0d",
237
+ "metadata": {},
238
+ "outputs": [],
239
+ "source": []
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": null,
244
+ "id": "8a47e9f4-be48-41c8-a3ce-a876c92e2961",
245
+ "metadata": {},
246
+ "outputs": [],
247
+ "source": []
248
+ }
249
+ ],
250
+ "metadata": {
251
+ "kernelspec": {
252
+ "display_name": "Python 3 (ipykernel)",
253
+ "language": "python",
254
+ "name": "python3"
255
+ },
256
+ "language_info": {
257
+ "codemirror_mode": {
258
+ "name": "ipython",
259
+ "version": 3
260
+ },
261
+ "file_extension": ".py",
262
+ "mimetype": "text/x-python",
263
+ "name": "python",
264
+ "nbconvert_exporter": "python",
265
+ "pygments_lexer": "ipython3",
266
+ "version": "3.9.18"
267
+ }
268
+ },
269
+ "nbformat": 4,
270
+ "nbformat_minor": 5
271
+ }
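The tag_chunks function above chunks each document by collapsing whitespace and replacing sentence-ending ". " with an <excerpt break> marker before prompting the model. The standalone illustration below mirrors that splitting step on a made-up two-sentence note (the text is adapted from the prompt's own few-shot example); the downstream tagging notebook recovers the same chunks with the same regexes.

import re

note = "80M with metastatic lung adenocarcinoma.\nThe tumor was HER2 positive. ECOG PS 1."

# Same normalization and splitting as in tag_chunks.
text = re.sub("\n|\r", " ", note.strip())
text = re.sub(r"\s+", " ", text)
excerpts = "<excerpt break>" + re.sub("\\. ", "<excerpt break>", text) + "<excerpt break>"

print(excerpts.split("<excerpt break>"))
# ['', '80M with metastatic lung adenocarcinoma', 'The tumor was HER2 positive', 'ECOG PS 1.', '']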
2b_train_tiny_bert_tagger.ipynb ADDED
@@ -0,0 +1,706 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "f8fe89a3-56d5-4a58-9e2f-0a236c4a8409",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import numpy as np\n",
11
+ "import pandas as pd\n",
12
+ "import re\n",
13
+ "import json\n",
14
+ "import os\n",
15
+ "import torch\n",
16
+ "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
17
+ "os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n"
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "code",
22
+ "execution_count": null,
23
+ "id": "76a8061d-f35e-4d11-83c6-bfc46d68130b",
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "summarized_notes = pd.read_parquet('./tagged_chunks.parquet')"
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "id": "20bb4efc-4ca1-4179-8f0a-e89d80123358",
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": []
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": null,
41
+ "id": "a75ae543-ecca-4598-91f2-938998086801",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "summarized_notes.info()"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "fdeaf0de-0462-4e1d-b6b2-cd8e10c76ceb",
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "def generate_rowwise_chunk_labels(original_note, llm_output, valid_tags_list):\n",
56
+ " valid_tags_array = np.array(valid_tags_list)\n",
57
+ " chunks = re.sub(\"\\n|\\r\", \" \", original_note.strip())\n",
58
+ " chunks = re.sub(r'\\s+', \" \", chunks)\n",
59
+ " chunks = \"<excerpt break>\" + re.sub(\"\\\\. \", \"<excerpt break>\", chunks) + \"<excerpt break>\"\n",
60
+ " chunks = pd.Series(chunks.split(\"<excerpt break>\")).str.strip()\n",
61
+ " chunks = chunks[chunks != '']\n",
62
+ " chunk_frame = pd.DataFrame({'excerpt':chunks})\n",
63
+ " tag_dict = {}\n",
64
+ " try:\n",
65
+ " json_output = pd.DataFrame.from_records(json.loads(llm_output))\n",
66
+ " json_output['tags'] = json_output['tags'].astype(str).str.strip(\"[|]\")\n",
67
+ "\n",
68
+ " chunk_frame = pd.merge(chunk_frame, json_output, on='excerpt', how='left')\n",
69
+ " chunk_frame['is_tagged'] = np.where(chunk_frame.tags.isnull(), 0, 1)\n",
70
+ " chunk_frame['tags'] = np.where(chunk_frame.tags.isnull(), \"\", chunk_frame.tags)\n",
71
+ " chunk_frame['good_json'] = 1\n",
72
+ " for tag in valid_tags_array[valid_tags_array != 'is_tagged'].tolist():\n",
73
+ " chunk_frame[tag] = np.where(chunk_frame.tags.str.contains(tag), 1, 0)\n",
74
+ " except:\n",
75
+ " chunk_frame['tags'] = \"\"\n",
76
+ " chunk_frame['is_tagged'] = 0\n",
77
+ " chunk_frame['good_json'] = 0\n",
78
+ " for tag in valid_tags_array[valid_tags_array != 'is_tagged'].tolist():\n",
79
+ " chunk_frame[tag] = 0\n",
80
+ " return chunk_frame"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": null,
86
+ "id": "aaeaf6a9-aa16-4770-9b36-b8f4c7bb2327",
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "summarized_notes.info()"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "id": "f00b0c00-c215-4254-8555-28aac4e0796c",
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "\n",
101
+ "valid_tags_list = ['is_tagged','cancer_type','stage_at_diagnosis','treatment','cancer_burden','cancer_status','adverse_event','comorbidity','biomarker']"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "id": "bd1ff0a9-dfcf-4ffb-9aa9-2dec616bb5fd",
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "summarized_notes.shape[0]"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "id": "8a75ed17-01c8-4187-9c10-d32f601cd81b",
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "outputs = []\n",
122
+ "for i in range(summarized_notes.shape[0]):\n",
123
+ " out = generate_rowwise_chunk_labels(summarized_notes.text.iloc[i], summarized_notes.llm_output.iloc[i], valid_tags_list)\n",
124
+ " try:\n",
125
+ " if out['good_json'].iloc[0] == 1:\n",
126
+ " outputs.append(out)\n",
127
+ " except:\n",
128
+ " pass\n"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "0aa51527-77ff-4938-8699-c80941da2a3b",
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "excerpts=pd.concat(outputs, axis=0)"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "id": "30a3b8a3-6948-4ec7-ae5c-308f9b1af03e",
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "excerpts.shape"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "id": "722d6d3d-ceec-4795-b41f-d6afd8d86890",
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": []
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "5dbb8472-1f46-4e2e-a0ce-bc5ea28fc6f9",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": []
166
+ },
167
+ {
168
+ "cell_type": "code",
169
+ "execution_count": null,
170
+ "id": "bb556c78-f370-4134-9a64-a8d5263b4e48",
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": [
174
+ "from torch.utils import data\n",
175
+ "from transformers import AutoTokenizer\n",
176
+ "\n",
177
+ "class TagDataset(data.Dataset):\n",
178
+ " def __init__(self, pandas_dataset, valid_tags_list):\n",
179
+ " self.data = pandas_dataset.copy().reset_index(drop=True)\n",
180
+ " self.indices = self.data.index.unique()\n",
181
+ " self.tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny', max_length=128, truncation_side='left') \n",
182
+ " self.valid_tags_list = valid_tags_list\n",
183
+ " \n",
184
+ " def __len__(self):\n",
185
+ " # how many notes in the dataset\n",
186
+ " return len(self.indices)\n",
187
+ " \n",
188
+ " def __getitem__(self, index):\n",
189
+ " # get data for notes corresponding to indices passed\n",
190
+ " this_index = self.indices[index]\n",
191
+ " pand = self.data.loc[this_index, :]\n",
192
+ " \n",
193
+ " encoded = self.tokenizer(pand['excerpt'], padding='max_length', max_length=128, truncation=True)\n",
194
+ "\n",
195
+ " x_text_tensor = torch.tensor(encoded.input_ids, dtype=torch.long)\n",
196
+ " x_attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)\n",
197
+ " y_labels = torch.tensor([torch.tensor(pand[label], dtype=torch.float32) for label in self.valid_tags_list])\n",
198
+ " \n",
199
+ "\n",
200
+ " return x_text_tensor, x_attention_mask, y_labels\n",
201
+ " \n",
202
+ " "
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": null,
208
+ "id": "9917bfdf-03f9-4c8f-a201-cef9d4744522",
209
+ "metadata": {},
210
+ "outputs": [],
211
+ "source": [
212
+ "temp_dataset = TagDataset(excerpts.head(10), valid_tags_list)\n",
213
+ "temp_data = data.DataLoader(temp_dataset, shuffle=False, batch_size=1)"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "8368008f-2a20-44a1-bf0b-41f35a04e66e",
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "temp_iter = iter(temp_data)\n",
224
+ "result = next(temp_iter)\n",
225
+ "result[0].shape, result[1].shape, torch.unbind(result[2], dim=1)[0].shape"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "a75a6e89-8dd5-45db-b5ce-315eca7e409f",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "excerpts.info()"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": null,
241
+ "id": "b2812618-20f7-4c10-b0e0-7a904ed6f9c3",
242
+ "metadata": {},
243
+ "outputs": [],
244
+ "source": [
245
+ "valid_tags_list"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": null,
251
+ "id": "cb47772b-30da-4b70-959e-87bc57af94fc",
252
+ "metadata": {},
253
+ "outputs": [],
254
+ "source": [
255
+ "\n",
256
+ "from torch.nn import functional as F\n",
257
+ "import torch.nn as nn\n",
258
+ "from torch.utils.data import DataLoader\n",
259
+ "from torch.nn import LSTM, Linear, Embedding, Conv1d, MaxPool1d, GRU, LSTMCell, Dropout, Module, Sequential, ReLU\n",
260
+ "from transformers import AutoModel\n",
261
+ " \n",
262
+ "class TagModel(nn.Module):\n",
263
+ "\n",
264
+ " def __init__(self, num_tags, device):\n",
265
+ " super(TagModel, self).__init__()\n",
266
+ " \n",
267
+ " self.bert = AutoModel.from_pretrained('prajjwal1/bert-tiny').to(device)\n",
268
+ "\n",
269
+ " #self.prediction_head = Sequential(Linear(128, 128), ReLU(), Linear(128,1)).to(device)\n",
270
+ " self.prediction_heads = nn.ModuleList([Sequential(Linear(128, 128), ReLU(), Linear(128,1)).to(device) for x in range(0, num_tags)])\n",
271
+ " \n",
272
+ "\n",
273
+ " def forward(self, x_text_tensor, x_attention_mask):\n",
274
+ " # x should be tuple of input IDs, then attention mask\n",
275
+ " \n",
276
+ " main = self.bert(x_text_tensor, x_attention_mask)\n",
277
+ " main = main.last_hidden_state[:,0,:].squeeze(1)\n",
278
+ "\n",
279
+ " outputs = [x(main) for x in self.prediction_heads]\n",
280
+ " #outputs = [self.prediction_head(main)] \n",
281
+ "\n",
282
+ " return outputs\n"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": null,
288
+ "id": "e99f1867-5d53-48ba-b6c8-89b5470085f9",
289
+ "metadata": {},
290
+ "outputs": [],
291
+ "source": [
292
+ "# train loop\n",
293
+ "from transformers import get_scheduler\n",
294
+ "from torch.optim import AdamW, Adam\n",
295
+ "#, get_linear_schedule_with_warmup\n",
296
+ "\n",
297
+ "\n",
298
+ "def train_model(model, num_epochs, num_tags, trainloader, validloader=None, device='cuda'):\n",
299
+ " \n",
300
+ " \n",
301
+ "\n",
302
+ " optimizer = AdamW(model.parameters(), lr=5e-5)\n",
303
+ " num_training_steps = num_epochs * len(trainloader)\n",
304
+ " lr_scheduler = get_scheduler(\n",
305
+ " name=\"linear\", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)\n",
306
+ "\n",
307
+ " model.to(device)\n",
308
+ " \n",
309
+ " for epoch in range(num_epochs): \n",
310
+ " running_train_losses = [0.0 for i in range(num_tags)]\n",
311
+ " mean_train_losses = [0.0 for i in range(num_tags)]\n",
312
+ " \n",
313
+ " running_valid_losses = [0.0 for i in range(num_tags)]\n",
314
+ " mean_valid_losses = [0.0 for i in range(num_tags)]\n",
315
+ "\n",
316
+ " num_train_batches = len(trainloader)\n",
317
+ " \n",
318
+ " model.train()\n",
319
+ " \n",
320
+ " for i, batch in enumerate(trainloader, 0):\n",
321
+ " input_ids = batch[0].to(device)\n",
322
+ " input_masks = batch[1].to(device)\n",
323
+ "\n",
324
+ " \n",
325
+ " optimizer.zero_grad()\n",
326
+ " \n",
327
+ " outputs_true = torch.unbind(batch[2].to(device), dim=1)\n",
328
+ " \n",
329
+ " outputs_pred = model(input_ids, input_masks)\n",
330
+ " \n",
331
+ " \n",
332
+ " losses = [F.binary_cross_entropy_with_logits(outputs_pred[x].squeeze(1), outputs_true[x]) for x in range(num_tags)]\n",
333
+ " \n",
334
+ " total_loss = 0.0\n",
335
+ " for j in range(num_tags):\n",
336
+ " total_loss = total_loss + losses[j]\n",
337
+ "\n",
338
+ " \n",
339
+ " total_loss.backward()\n",
340
+ " optimizer.step()\n",
341
+ " lr_scheduler.step()\n",
342
+ " \n",
343
+ " \n",
344
+ " for j in range(num_tags):\n",
345
+ " running_train_losses[j] += losses[j].detach().cpu().numpy()\n",
346
+ " mean_train_losses[j] = running_train_losses[j] / (i+1)\n",
347
+ "\n",
348
+ " if i % 10 == 0:\n",
349
+ " print('Training Epoch: ' + str(epoch+1) + ', batch: ' + str(i + 1) + '/' + str(num_train_batches) + ' this_loss:' + str(total_loss.detach().cpu().numpy()) +', train losses: ' + str([str(x) + ': ' + str(mean_train_losses[x]) + \", \" for x in range(num_tags)]), end='\\r', flush=True)\n",
350
+ " \n",
351
+ " print('\\n')\n",
352
+ " # eval on valid\n",
353
+ " \n",
354
+ " if validloader is not None:\n",
355
+ " num_valid_batches = len(validloader)\n",
356
+ " model.eval()\n",
357
+ " \n",
358
+ " for i, batch in enumerate(validloader, 0):\n",
359
+ " input_ids = batch[0].to(device)\n",
360
+ " input_masks = batch[1].to(device)\n",
361
+ "\n",
362
+ "\n",
363
+ " outputs_true = torch.unbind(batch[2].to(device), dim=1)\n",
364
+ "\n",
365
+ " outputs_pred = model(input_ids, input_masks)\n",
366
+ "\n",
367
+ " losses = [F.binary_cross_entropy_with_logits(outputs_pred[x].squeeze(1), outputs_true[x]) for x in range(num_tags)]\n",
368
+ "\n",
369
+ " total_loss = 0.0\n",
370
+ " for j in range(num_tags):\n",
371
+ " total_loss = total_loss + losses[j]\n",
372
+ "\n",
373
+ " for j in range(num_tags):\n",
374
+ " running_valid_losses[j] += losses[j].detach().cpu().numpy()\n",
375
+ "\n",
376
+ " \n",
377
+ " for j in range(num_tags):\n",
378
+ " mean_valid_losses[j] = running_valid_losses[j] / (i+1)\n",
379
+ " \n",
380
+ "\n",
381
+ " \n",
382
+ " print('Validation Epoch: ' + str(epoch+1) + ', batch: ' + str(i + 1) + '/' + str(num_valid_batches) + ', valid losses: ' + str([str(x) + ': ' + str(mean_valid_losses[x]) + \", \" for x in range(num_tags)]), end='\\r', flush=True)\n",
383
+ " print('\\n')\n",
384
+ "\n",
385
+ " "
386
+ ]
387
+ },
388
+ {
389
+ "cell_type": "code",
390
+ "execution_count": null,
391
+ "id": "98c1aa95-e339-4e89-9c67-0215b399b7a5",
392
+ "metadata": {},
393
+ "outputs": [],
394
+ "source": [
395
+ "import torch\n",
396
+ "temp = torch.tensor([0]).to('cuda')"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": null,
402
+ "id": "7c408392-94f7-46eb-9400-05f28327db95",
403
+ "metadata": {},
404
+ "outputs": [],
405
+ "source": [
406
+ "# actual training code, commented out after model was trained\n",
407
+ "training = excerpts\n",
408
+ "\n",
409
+ "themodel = TagModel(len(valid_tags_list), device='cuda')\n",
410
+ "trainloader = data.DataLoader(TagDataset(training.reset_index(drop=True), valid_tags_list), batch_size=64, num_workers=4, shuffle=True)\n",
411
+ "#validloader = data.DataLoader(TagDataset(validation.reset_index(drop=True), valid_tags_list), batch_size=64, num_workers=4, shuffle=True)\n",
412
+ "train_model(themodel, 5, len(valid_tags_list), trainloader, validloader=None, device='cuda')"
413
+ ]
414
+ },
415
+ {
416
+ "cell_type": "code",
417
+ "execution_count": null,
418
+ "id": "b223e6d9-dad2-4180-8832-9b5ef713ef52",
419
+ "metadata": {},
420
+ "outputs": [],
421
+ "source": [
422
+ "torch.save(themodel.state_dict(), './tiny_bert_tagger_synthetic.pt')"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": null,
428
+ "id": "19dafc4b-f952-444f-b857-3c6743cc871d",
429
+ "metadata": {},
430
+ "outputs": [],
431
+ "source": [
432
+ "# pull PHI notes from v7 for validation\n",
433
+ "phi_summarized_notes = pd.read_parquet(\"../v7/tagged_chunks_enrolled_pt_reports.parquet\")"
434
+ ]
435
+ },
436
+ {
437
+ "cell_type": "code",
438
+ "execution_count": null,
439
+ "id": "bf9cab2b-eeb8-4665-a63f-382950a8c089",
440
+ "metadata": {},
441
+ "outputs": [],
442
+ "source": [
443
+ "validation = phi_summarized_notes[phi_summarized_notes.split.str.contains(\"valid\")]"
444
+ ]
445
+ },
446
+ {
447
+ "cell_type": "code",
448
+ "execution_count": null,
449
+ "id": "7cc4f54c-906c-4309-b900-4fbc6a3db9e5",
450
+ "metadata": {},
451
+ "outputs": [],
452
+ "source": [
453
+ "val_outputs = []\n",
454
+ "for i in range(validation.shape[0]):\n",
455
+ " out = generate_rowwise_chunk_labels(validation.text.iloc[i], validation.llm_output.iloc[i], valid_tags_list)\n",
456
+ " try:\n",
457
+ " if out['good_json'].iloc[0] == 1:\n",
458
+ " val_outputs.append(out)\n",
459
+ " except:\n",
460
+ " pass\n"
461
+ ]
462
+ },
463
+ {
464
+ "cell_type": "code",
465
+ "execution_count": null,
466
+ "id": "76b29328-5254-4b32-8732-981a8696e7cb",
467
+ "metadata": {},
468
+ "outputs": [],
469
+ "source": [
470
+ "val_outputs = pd.concat(val_outputs, axis=0)"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "id": "a19b22ed-0811-40e7-8a60-f2092d002808",
477
+ "metadata": {},
478
+ "outputs": [],
479
+ "source": [
480
+ "val_outputs.is_tagged.value_counts()"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "code",
485
+ "execution_count": null,
486
+ "id": "6e3a0623-fa86-4c9c-9b7a-dd6cf631d781",
487
+ "metadata": {},
488
+ "outputs": [],
489
+ "source": []
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": null,
494
+ "id": "621e9ff7-e155-403d-b1ab-96b8b1af2ad3",
495
+ "metadata": {},
496
+ "outputs": [],
497
+ "source": []
498
+ },
499
+ {
500
+ "cell_type": "code",
501
+ "execution_count": null,
502
+ "id": "cf0c2a8f-6fa0-450f-9a3b-ee79c1f3ace8",
503
+ "metadata": {},
504
+ "outputs": [],
505
+ "source": [
506
+ "num_valid_tags = len(valid_tags_list)\n",
507
+ "themodel = TagModel(num_valid_tags, 'cuda')\n",
508
+ "themodel.load_state_dict(torch.load('./tiny_bert_tagger_synthetic.pt'))\n",
509
+ "themodel.to('cuda')\n",
510
+ "\n",
511
+ "themodel.eval()"
512
+ ]
513
+ },
514
+ {
515
+ "cell_type": "code",
516
+ "execution_count": null,
517
+ "id": "a051ea9f-f934-4f05-801e-f470276e6225",
518
+ "metadata": {},
519
+ "outputs": [],
520
+ "source": [
521
+ "# write out actual PHI validation dataset\n",
522
+ "\n",
523
+ "\n",
524
+ "num_valid_tags = len(valid_tags_list)\n",
525
+ "val_output = val_outputs.reset_index(drop=True)\n",
526
+ "no_shuffle_valid_dataset = data.DataLoader(TagDataset(val_outputs, valid_tags_list), batch_size=32, shuffle=False, num_workers=0)\n",
527
+ "\n",
528
+ "output_true_lists = [[] for x in range(num_valid_tags)]\n",
529
+ "output_prediction_lists = [[] for x in range(num_valid_tags)]\n",
530
+ "for batch in no_shuffle_valid_dataset:\n",
531
+ " x_text_ids = batch[0].to('cuda')\n",
532
+ " x_attention_mask = batch[1].to('cuda')\n",
533
+ " label_list = torch.unbind(batch[2], axis=1)\n",
534
+ " with torch.no_grad():\n",
535
+ " predictions = themodel(x_text_ids, x_attention_mask)\n",
536
+ "\n",
537
+ "\n",
538
+ " predictions = themodel(x_text_ids, x_attention_mask)\n",
539
+ "\n",
540
+ " \n",
541
+ " for j in range(num_valid_tags):\n",
542
+ " output_true_lists[j].append(label_list[j].detach().cpu().numpy())\n",
543
+ " output_prediction_lists[j].append(predictions[j].squeeze(1).detach().cpu().numpy())\n",
544
+ "\n",
545
+ "output_true_lists = [np.concatenate(x) for x in output_true_lists] \n",
546
+ "output_prediction_lists = [np.concatenate(x) for x in output_prediction_lists]\n",
547
+ "\n",
548
+ "\n",
549
+ "output_validation = val_outputs.copy()\n",
550
+ "for x in range(num_valid_tags):\n",
551
+ " output_validation['outcome_' + str(x) + '_logit'] = output_prediction_lists[x]\n",
552
+ " output_validation['truth_' + str(x)] = output_true_lists[x]\n"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "code",
557
+ "execution_count": null,
558
+ "id": "999ca687-5a27-4718-8d73-f6dcc9d5800d",
559
+ "metadata": {},
560
+ "outputs": [],
561
+ "source": []
562
+ },
563
+ {
564
+ "cell_type": "code",
565
+ "execution_count": null,
566
+ "id": "986def2b-29c0-4807-9a30-e56b21ec535d",
567
+ "metadata": {},
568
+ "outputs": [],
569
+ "source": [
570
+ "from sklearn.metrics import roc_auc_score\n"
571
+ ]
572
+ },
573
+ {
574
+ "cell_type": "code",
575
+ "execution_count": null,
576
+ "id": "d395df39-cd74-4805-ac8d-e145a91e5bfe",
577
+ "metadata": {},
578
+ "outputs": [],
579
+ "source": [
580
+ "valid_tags_list"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "code",
585
+ "execution_count": null,
586
+ "id": "cfee9187-8d6e-4741-ac92-cb6c996a1bfb",
587
+ "metadata": {},
588
+ "outputs": [],
589
+ "source": [
590
+ "pd.Series(output_true_lists[0]).value_counts()"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "execution_count": null,
596
+ "id": "1c6e6939-b217-4821-af67-4a8c54786172",
597
+ "metadata": {},
598
+ "outputs": [],
599
+ "source": [
600
+ "# PHI valset metric AUROCs wer:e\n",
601
+ "# 0.8596166833121998\n",
602
+ "# 0.9855369622435224\n",
603
+ "# 0.9771353402092497\n",
604
+ "# 0.9645691289367063\n",
605
+ "# 0.9389302197266838\n",
606
+ "# 0.9564117372864042\n",
607
+ "# 0.9452735452766257\n",
608
+ "# 0.9234540539394782\n",
609
+ "# 0.9863098212461762"
610
+ ]
611
+ },
612
+ {
613
+ "cell_type": "code",
614
+ "execution_count": null,
615
+ "id": "19623b7f-9485-4973-ac94-52d8c64aa43b",
616
+ "metadata": {},
617
+ "outputs": [],
618
+ "source": []
619
+ },
620
+ {
621
+ "cell_type": "code",
622
+ "execution_count": null,
623
+ "id": "433f9bfa-8468-4914-9a1b-a555887f5367",
624
+ "metadata": {},
625
+ "outputs": [],
626
+ "source": [
627
+ "[print(roc_auc_score(output_true_lists[x], output_prediction_lists[x])) for x in range(num_valid_tags)]"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "code",
632
+ "execution_count": null,
633
+ "id": "90c5d6ee-7d1e-4e38-957d-f7378cc6c891",
634
+ "metadata": {},
635
+ "outputs": [],
636
+ "source": [
637
+ "from utils_102023 import eval_model"
638
+ ]
639
+ },
640
+ {
641
+ "cell_type": "code",
642
+ "execution_count": null,
643
+ "id": "98da87ab-d0e1-4610-a7c9-7d47a0d54e8a",
644
+ "metadata": {},
645
+ "outputs": [],
646
+ "source": [
647
+ "best_f1_thresholds = [eval_model(output_prediction_lists[x], output_true_lists[x], graph=False) for x in range(num_valid_tags)]"
648
+ ]
649
+ },
650
+ {
651
+ "cell_type": "code",
652
+ "execution_count": null,
653
+ "id": "01dfb257-62d8-412d-a446-59d81111b2a5",
654
+ "metadata": {},
655
+ "outputs": [],
656
+ "source": [
657
+ "best_f1_thresholds"
658
+ ]
659
+ },
660
+ {
661
+ "cell_type": "code",
662
+ "execution_count": null,
663
+ "id": "afbcbe3d-d261-4f96-80e5-912ac664bf0c",
664
+ "metadata": {},
665
+ "outputs": [],
666
+ "source": []
667
+ },
668
+ {
669
+ "cell_type": "code",
670
+ "execution_count": null,
671
+ "id": "d8b41e47-a901-40b9-84b2-dcdd46af0e89",
672
+ "metadata": {},
673
+ "outputs": [],
674
+ "source": []
675
+ },
676
+ {
677
+ "cell_type": "code",
678
+ "execution_count": null,
679
+ "id": "fe2d639d-568b-47cf-894e-e0ffffbf24e8",
680
+ "metadata": {},
681
+ "outputs": [],
682
+ "source": []
683
+ }
684
+ ],
685
+ "metadata": {
686
+ "kernelspec": {
687
+ "display_name": "Python 3 (ipykernel)",
688
+ "language": "python",
689
+ "name": "python3"
690
+ },
691
+ "language_info": {
692
+ "codemirror_mode": {
693
+ "name": "ipython",
694
+ "version": 3
695
+ },
696
+ "file_extension": ".py",
697
+ "mimetype": "text/x-python",
698
+ "name": "python",
699
+ "nbconvert_exporter": "python",
700
+ "pygments_lexer": "ipython3",
701
+ "version": "3.9.18"
702
+ }
703
+ },
704
+ "nbformat": 4,
705
+ "nbformat_minor": 5
706
+ }
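After training and threshold selection above, applying the tiny BERT tagger to new excerpts might look like the sketch below. It reuses TagModel, valid_tags_list, and the saved checkpoint from this notebook; the sigmoid-plus-threshold step and the assumption that best_f1_thresholds holds one probability cutoff per tag are illustrative, since eval_model's exact return format is not shown here.

import torch
from transformers import AutoTokenizer

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Reload the trained tagger (TagModel and valid_tags_list are defined in this notebook).
model = TagModel(len(valid_tags_list), device)
model.load_state_dict(torch.load('./tiny_bert_tagger_synthetic.pt', map_location=device))
model.to(device)
model.eval()

tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-tiny')

def tag_excerpt(excerpt, thresholds):
    """Return the tags whose predicted probability meets the per-tag threshold."""
    encoded = tokenizer(excerpt, padding='max_length', max_length=128,
                        truncation=True, return_tensors='pt')
    with torch.no_grad():
        logits = model(encoded.input_ids.to(device), encoded.attention_mask.to(device))
    probs = [torch.sigmoid(logit).item() for logit in logits]
    return [tag for tag, p, t in zip(valid_tags_list, probs, thresholds) if p >= t]

# Example call, assuming best_f1_thresholds is a list of probability cutoffs:
# tag_excerpt("The tumor was HER2 positive.", best_f1_thresholds)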
3_generate_synthetic_full_patient_histories.ipynb ADDED
@@ -0,0 +1,148 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "180f9bc1-03cc-4e31-babe-3f6c6ecb0167",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "id": "ef6b0609-695f-4975-9970-f8b8350f953d",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": []
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "f986b468-c428-4ca3-9101-1cbabe6ad73f",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "from vllm import LLM, SamplingParams\n",
29
+ "import pandas as pd\n",
30
+ "import numpy as np\n",
31
+ "import torch.nn.functional as F\n",
32
+ "import torch\n",
33
+ "from transformers import AutoTokenizer\n",
34
+ "from transformers import AutoModelForCausalLM\n",
35
+ "import re\n",
36
+ "import os\n",
37
+ "#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "8e0537c8-85cc-4bae-97de-6dd6f70ea5a3",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, download_dir = \"../meta_ai/\", gpu_memory_utilization=0.75, max_model_len=20000)"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "id": "e0394142-a749-4995-abc8-bac884fea671",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "def generate_synthetic_histories(num_histories, llama_model):\n",
58
+ " np.random.seed(42)\n",
59
+ " tokenizer = llama_model.get_tokenizer()\n",
60
+ " prompts = []\n",
61
+ " cancer_types = np.random.choice(['breast', 'non-small cell lung', 'small cell lung', 'colorectal', 'pancreatic', 'urothelial', 'prostate', 'gastric', 'esophageal', 'thymoma', 'thymic carcinoma', 'adrenal', 'ovarian', 'endometrial', 'melanoma', 'renal cell', 'sarcoma', 'head and neck', 'Hodgkin lymphoma', 'Non-Hodgkin lymphoma', 'myeloma', 'acute myeloid leukemia', 'chronic myeloid leukemia', 'acute lymphoblastic leukemia', 'chronic lymphocytic leukemia/lymphoma', 'primary brain tumor'], size=num_histories) \n",
62
+ " splits = np.random.choice(['train','val','test'], size=num_histories, p=[0.8,0.1,0.1])\n",
63
+ " for i in range(num_histories):\n",
64
+ " messages = [\n",
65
+ " {'role':'system', 'content': \"\"\"Your job is to generate synthetic clinical histories for hypothetical patients with cancer.\n",
66
+ " You know all there is to know about cancer and its treatments, so be detailed.\n",
67
+ " The histories should be presented in chronological order as a sequence of events. Each event should begin with a date, and should then include some new development, such as a diagnosis, treatment, adverse event, progression, response to therapy, biomarker ascertainment, symptom burden, recurrence events, and so on.\n",
68
+ " \n",
69
+ " \"\"\"}, \n",
70
+ " \n",
71
+ " {'role':'user', 'content': \"\"\"Imagine a patient with cancer. \n",
72
+ " The cancer type is \"\"\" + cancer_types[i] + \"\"\".\n",
73
+ " Then, generate a very detailed synthetic clinical history for the patient. The patient might have any stage of disease. Use everything you know about cancer, including epidemiology, treatment options, outcomes, and heterogeneity in disease trajectories.\n",
74
+ " Do not mention transitions to hospice or death events.\n",
75
+ " Do not start with any demographics; just launch into the chronological history. Phrase it in the past tense. Dates should be in mm/dd/yyyy format. Output should be plain text, not Markdown.\n",
76
+ " The history should be approximately two pages long.\"\"\"}\n",
77
+ " ]\n",
78
+ " \n",
79
+ " prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))\n",
80
+ " \n",
81
+ "\n",
82
+ " \n",
83
+ " responses = llama_model.generate(\n",
84
+ " prompts, \n",
85
+ " SamplingParams(\n",
86
+ " temperature=1.0,\n",
87
+ " top_p=0.9,\n",
88
+ " max_tokens=5000,\n",
89
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
90
+ " ))\n",
91
+ "\n",
92
+ " response_texts = [x.outputs[0].text for x in responses]\n",
93
+ "\n",
94
+ "\n",
95
+ " return pd.DataFrame({'split':splits, 'cancer_type':cancer_types, 'patient_long_text':response_texts})"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "id": "109f2208-de29-43cf-a831-4609bdab225e",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "results = generate_synthetic_histories(30000, llama)"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "id": "6119a0b4-5a63-4f91-a637-484b5e9dc29c",
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "results.to_csv('synthetic_histories_11-22-24.csv')"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": null,
121
+ "id": "71a24b19-5e1c-4c24-b13b-ac04c1e94bd2",
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": []
125
+ }
126
+ ],
127
+ "metadata": {
128
+ "kernelspec": {
129
+ "display_name": "Python 3 (ipykernel)",
130
+ "language": "python",
131
+ "name": "python3"
132
+ },
133
+ "language_info": {
134
+ "codemirror_mode": {
135
+ "name": "ipython",
136
+ "version": 3
137
+ },
138
+ "file_extension": ".py",
139
+ "mimetype": "text/x-python",
140
+ "name": "python",
141
+ "nbconvert_exporter": "python",
142
+ "pygments_lexer": "ipython3",
143
+ "version": "3.9.18"
144
+ }
145
+ },
146
+ "nbformat": 4,
147
+ "nbformat_minor": 5
148
+ }
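Note on 3_generate_synthetic_full_patient_histories.ipynb above: generate_synthetic_histories builds one chat-templated prompt per history and generates all of them in a single vLLM batch. A minimal, hypothetical smoke test (assuming the llama object and the function defined in this notebook are already in memory) before committing to the full 30,000-history run:

# Hypothetical smoke test; `llama` and generate_synthetic_histories come from the notebook above.
sample = generate_synthetic_histories(5, llama)      # 5 histories instead of 30,000
print(sample.cancer_type.value_counts())             # which cancer types were sampled
print(sample.patient_long_text.iloc[0][:500])        # eyeball the start of one generated history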
4_summarize_synthetic_histories.ipynb ADDED
@@ -0,0 +1,276 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "a3d6ff53-2176-44aa-8590-ec0aa301342d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from vllm import LLM, SamplingParams\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "import torch.nn.functional as F\n",
14
+ "import torch\n",
15
+ "from transformers import AutoTokenizer\n",
16
+ "from transformers import AutoModelForCausalLM\n",
17
+ "import re\n",
18
+ "import os\n",
19
+ "#os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"1\"\n"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "id": "8a070d00-9a45-4360-a38f-ceed8a9360e1",
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": []
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "id": "0f407048-0eb3-439a-8257-3cb6881ac784",
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "import pandas as pd\n",
38
+ "synthetic_histories = pd.read_csv('synthetic_histories_11-22-24.csv')"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "9bc40636-2325-4664-afc3-833b58fe7ba0",
45
+ "metadata": {
46
+ "scrolled": true
47
+ },
48
+ "outputs": [],
49
+ "source": [
50
+ "synthetic_histories.info()"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": null,
56
+ "id": "a9b4cae4-d46d-4a80-841c-8c8f08915b90",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": []
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "id": "ca2b0678-119e-47a7-9a72-28685e97559d",
65
+ "metadata": {
66
+ "scrolled": true
67
+ },
68
+ "outputs": [],
69
+ "source": [
70
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, download_dir = \"../../\", gpu_memory_utilization=0.90, max_model_len=120000)"
71
+ ]
72
+ },
73
+ {
74
+ "cell_type": "code",
75
+ "execution_count": null,
76
+ "id": "f19be1ca-334c-4285-b8b7-0c9fbc83d0d4",
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": []
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "id": "02b9f891-4b50-4b64-9954-8481056cba79",
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "def summarize_patients(patient_texts, llama_model):\n",
89
+ " \n",
90
+ "\n",
91
+ " prompts = []\n",
92
+ "\n",
93
+ " tokenizer = llama_model.get_tokenizer()\n",
94
+ "\n",
95
+ " prompts = []\n",
96
+ " for the_patient in patient_texts:\n",
97
+ "\n",
98
+ "\n",
99
+ " \n",
100
+ " messages = [{'role':'system', 'content': \"\"\"You are an experienced clinical oncology history summarization bot.\n",
101
+ " Your job is to construct a summary of the cancer history for a patient based on an excerpt of the patient's electronic health record. The text in the excerpt is provided in chronological order. \n",
102
+ " Document the cancer type/primary site (eg breast cancer, lung cancer, etc); histology (eg adenocarcinoma, squamous carcinoma, etc); current extent (localized, advanced, metastatic, etc); biomarkers (genomic results, protein expression, etc); and treatment history (surgery, radiation, chemotherapy/targeted therapy/immunotherapy, etc, including start and stop dates and best response if known).\n",
103
+ " Do not consider localized basal cell or squamous carcinomas of the skin, or colon polyps, to be cancers for your purposes.\n",
104
+ " Do not include the patient's name, but do include relevant dates whenever documented, including dates of diagnosis and start/stop dates of each treatment.\n",
105
+ " If a patient has a history of more than one cancer, document the cancers one at a time.\n",
106
+ " \"\"\"}, \n",
107
+ " {'role':'user', 'content': \"The excerpt is:\\n\" + the_patient + \"\"\"Now, write your summary. Do not add preceding text before the abstraction, and do not add notes or commentary afterwards. This will not be used for clinical care, so do not write any disclaimers or cautionary notes.\"\"\"}\n",
108
+ "\n",
109
+ " ]\n",
110
+ " \n",
111
+ "\n",
112
+ "\n",
113
+ " prompts.append(messages)\n",
114
+ "\n",
115
+ " long_messages = [x[1]['content'] for x in prompts]\n",
116
+ " trunc_messages = tokenizer.batch_decode([x[-115000:] for x in tokenizer(long_messages, add_special_tokens=False).input_ids])\n",
117
+ "\n",
118
+ " newprompts = []\n",
119
+ " for i, messages in enumerate(prompts):\n",
120
+ " messages[1]['content'] = trunc_messages[i]\n",
121
+ " template_prompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)\n",
122
+ " newprompts.append(template_prompt)\n",
123
+ " \n",
124
+ "\n",
125
+ " \n",
126
+ " responses = llama_model.generate(\n",
127
+ " newprompts, \n",
128
+ " SamplingParams(\n",
129
+ " temperature=0.0,\n",
130
+ " top_p=0.2,\n",
131
+ " max_tokens=4096,\n",
132
+ " repetition_penalty=1.2,\n",
133
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
134
+ " ))\n",
135
+ "\n",
136
+ " response_texts = [x.outputs[0].text for x in responses]\n",
137
+ "\n",
138
+ "\n",
139
+ " return responses, response_texts\n",
140
+ " "
141
+ ]
142
+ },
143
+ {
144
+ "cell_type": "code",
145
+ "execution_count": null,
146
+ "id": "69bc8576-e6d7-452f-b6b0-15df7f4c8922",
147
+ "metadata": {},
148
+ "outputs": [],
149
+ "source": [
150
+ "synthetic_histories.info()"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "id": "bd443d34-c5db-414e-9892-eec368ef7ad6",
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": [
160
+ "# example summary generation for one synthetic patient\n",
161
+ "patient_summaries = summarize_patients(synthetic_histories.patient_long_text.iloc[10025:10026].tolist(), llama)"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": null,
167
+ "id": "6b5f0b1a-6df4-4d32-9072-efb4136df070",
168
+ "metadata": {},
169
+ "outputs": [],
170
+ "source": [
171
+ "patient_summaries[1]"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": null,
177
+ "id": "dabd98af-947e-40c0-aea8-7805bb5b1c3c",
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": []
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "id": "74b2a972-9271-4ed2-9c2c-5ec5793e8650",
186
+ "metadata": {},
187
+ "outputs": [],
188
+ "source": [
189
+ "patient_summaries = summarize_patients(synthetic_histories.patient_long_text.tolist(), llama)"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "e6b772c9-c4dd-45c2-8a4a-9e5c17d25e2c",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "output = synthetic_histories.copy()\n",
200
+ "output['patient_summary'] = patient_summaries[1]\n",
201
+ "output.to_parquet('synthetic_pt_summaries_11-22-24.parquet')"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": null,
207
+ "id": "d30bf018-e135-40be-b636-0ba17acf8e61",
208
+ "metadata": {},
209
+ "outputs": [],
210
+ "source": [
211
+ "import pandas as pd"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "id": "9f9ed498-4927-46a1-a23e-bf9f3a0cc544",
218
+ "metadata": {},
219
+ "outputs": [],
220
+ "source": [
221
+ "output = pd.read_parquet('synthetic_pt_summaries_11-22-24.parquet')"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": null,
227
+ "id": "a6e1e9ce-e984-458b-881c-a99e3336e6c6",
228
+ "metadata": {},
229
+ "outputs": [],
230
+ "source": [
231
+ "output.info()"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "id": "5baf640d-1a6d-447e-84c2-d09a2a94a65a",
238
+ "metadata": {},
239
+ "outputs": [],
240
+ "source": [
241
+ "output.patient_summary.sample(n=1).iloc[0]"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": null,
247
+ "id": "633ab065-8620-4519-af61-d9e76849cbdf",
248
+ "metadata": {},
249
+ "outputs": [],
250
+ "source": [
251
+ "output['patient_summary'].str.contains(\"Lung\").value_counts()"
252
+ ]
253
+ }
254
+ ],
255
+ "metadata": {
256
+ "kernelspec": {
257
+ "display_name": "Python 3 (ipykernel)",
258
+ "language": "python",
259
+ "name": "python3"
260
+ },
261
+ "language_info": {
262
+ "codemirror_mode": {
263
+ "name": "ipython",
264
+ "version": 3
265
+ },
266
+ "file_extension": ".py",
267
+ "mimetype": "text/x-python",
268
+ "name": "python",
269
+ "nbconvert_exporter": "python",
270
+ "pygments_lexer": "ipython3",
271
+ "version": "3.9.18"
272
+ }
273
+ },
274
+ "nbformat": 4,
275
+ "nbformat_minor": 5
276
+ }
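Note on 4_summarize_synthetic_histories.ipynb above: summarize_patients keeps only the last 115,000 tokens of each history so the templated prompt stays inside the 120,000-token context window. A minimal sketch of that truncation step in isolation, assuming the tokenizer that ships with the same quantized Llama repo; the input text is a placeholder:

from transformers import AutoTokenizer

# Assumed tokenizer repo (same as the model loaded above); the document text is illustrative only.
tok = AutoTokenizer.from_pretrained("hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4")
docs = ["(very long patient history goes here)"]
max_tokens = 115_000
ids = tok(docs, add_special_tokens=False).input_ids            # token ids per document
truncated = tok.batch_decode([x[-max_tokens:] for x in ids])   # keep only the final max_tokens tokens
print(truncated[0][:100])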
5a_make_top_10_cohorts_llama_check_list.ipynb ADDED
@@ -0,0 +1,232 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "07d99d54-84c8-4531-8951-133dfaf64c1e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import os\n",
13
+ "#os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
14
+ "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
15
+ "from torch.utils.data import DataLoader\n",
16
+ "import torch.nn.functional as F\n",
17
+ "import torch\n",
18
+ "from sklearn.metrics import roc_auc_score"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "b7a22e15-81dd-4ab4-b7df-1d8da8992685",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": []
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "3553d4f3-c23c-4d21-94ad-da2bcd31a63d",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "test_spaces = pd.read_csv('test_spaces.csv')\n",
37
+ "test_spaces.info()"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "e4757eb7-4a69-4578-9020-075e960817ce",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "import pandas as pd\n",
48
+ "dfci_trials = pd.read_csv(\"../v7/space_specific_eligibility_checks_11-6-24.csv\")\n",
49
+ "# this will not run out of the box, because the dfci trials file is not included in the upload, since it contains PHI/IP\n",
50
+ "\n",
51
+ "other_trials = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')\n",
52
+ "other_trials = other_trials[~other_trials.nct_id.isin(test_spaces.nct_id)]\n",
53
+ "other_trials = other_trials[~other_trials.nct_id.isin(dfci_trials.nct_id)]\n",
54
+ "\n",
55
+ "unique_trials = other_trials.groupby('nct_id').first().reset_index()[['nct_id', 'this_space']]\n",
56
+ "unique_trials.shape[0]\n",
57
+ "\n",
58
+ "unique_trial_sample = unique_trials.nct_id.sample(n=500, random_state=42)\n",
59
+ "\n",
60
+ "valid_spaces = unique_trials[unique_trials.nct_id.isin(unique_trial_sample)]\n",
61
+ "\n",
62
+ "valid_spaces.to_csv('valid_spaces.csv')\n",
63
+ "\n"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "id": "8d6b5654-2f5f-4caa-82ff-b02a9007dc49",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "train_spaces = unique_trials[~unique_trials.nct_id.isin(valid_spaces.nct_id)]"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "id": "01e0eb5b-9282-48b8-9917-3147fbf25730",
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "train_spaces.nct_id.isin(valid_spaces.nct_id).value_counts()"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "ed9139ac-a5f2-4791-a3a4-fd2127b7af0c",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "train_spaces.info()"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "id": "cfa78d0d-3de1-48f7-a01e-f7d81a6a5b3f",
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "patients = pd.read_parquet('synthetic_pt_summaries_11-23-24.parquet')\n",
104
+ "patients = patients[patients.split == 'train'][['patient_summary','split']]"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "id": "fcba54bc-aba3-4d02-b874-c8275badf015",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "patients.info()"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "id": "399e05a5-7a58-4362-a002-0bce62a348ac",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "train_unique_patient_summaries = patients.patient_summary.unique().tolist()\n",
125
+ "print(len(train_unique_patient_summaries))\n",
126
+ "train_unique_spaces = train_spaces.this_space.unique().tolist()\n",
127
+ "print(len(train_unique_spaces))"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "id": "c37f1137-41cf-4a6d-8099-7bed5838a1ee",
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": []
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "06e7e0f2-5382-4507-908e-5ccf92b4beae",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "embedding_model = SentenceTransformer('dunzhang/stella_en_1.5B_v5', trust_remote_code=True, device='cuda')"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "id": "77659362-cf51-4906-9e12-5d1e55440b25",
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "with torch.no_grad():\n",
156
+ " train_unique_patient_embeddings = embedding_model.encode(train_unique_patient_summaries, convert_to_tensor=True, prompt_name = \"s2s_query\")"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "a0bb8926-3fa8-49bc-b507-6cc36d998600",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "with torch.no_grad():\n",
167
+ " train_unique_space_embeddings = embedding_model.encode(train_unique_spaces, convert_to_tensor=True, prompt_name = \"s2s_query\")"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "bf259f29-4934-41e8-9b3a-87f09d0ef52e",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "output_list = []\n",
178
+ "train_unique_space_series = pd.Series(train_unique_spaces)\n",
179
+ "for i, patient_summary in enumerate(train_unique_patient_summaries):\n",
180
+ " patient_embedding = train_unique_patient_embeddings[i, :]\n",
181
+ " similarities = F.cosine_similarity(patient_embedding, train_unique_space_embeddings)\n",
182
+ " sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)\n",
183
+ " relevant_spaces = train_unique_space_series.iloc[sorted_indices[0:10].cpu().numpy()]\n",
184
+ " output = pd.DataFrame({'patient_summary':patient_summary, 'this_space':relevant_spaces})\n",
185
+ " output_list.append(output)\n",
186
+ "\n",
187
+ "train_output = pd.concat(output_list, axis=0).reset_index(drop=True)\n",
188
+ "train_output['patient_summary'] = train_output.patient_summary.str.strip()\n",
189
+ "train_output['split'] = 'train'"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "52c926cc-d481-4c43-8288-7d1620c1f06f",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "train_output.to_csv('top_ten_cohorts_tocheck_synthetic.csv')"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "id": "0e0e18c1-bfb6-4c20-9aa4-257f0fb0424c",
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": []
209
+ }
210
+ ],
211
+ "metadata": {
212
+ "kernelspec": {
213
+ "display_name": "Python 3 (ipykernel)",
214
+ "language": "python",
215
+ "name": "python3"
216
+ },
217
+ "language_info": {
218
+ "codemirror_mode": {
219
+ "name": "ipython",
220
+ "version": 3
221
+ },
222
+ "file_extension": ".py",
223
+ "mimetype": "text/x-python",
224
+ "name": "python",
225
+ "nbconvert_exporter": "python",
226
+ "pygments_lexer": "ipython3",
227
+ "version": "3.9.18"
228
+ }
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 5
232
+ }
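Note on 5a_make_top_10_cohorts_llama_check_list.ipynb above: the retrieval loop is a top-k nearest-neighbor search over sentence embeddings. A minimal sketch of the same cosine-similarity / top-10 pattern on random stand-in vectors (no SentenceTransformer needed):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
patient_emb = torch.randn(3, 16)    # stand-ins for patient summary embeddings
space_emb = torch.randn(100, 16)    # stand-ins for trial space embeddings

for i in range(patient_emb.shape[0]):
    sims = F.cosine_similarity(patient_emb[i].unsqueeze(0), space_emb)  # similarity to every space
    top_vals, top_idx = torch.topk(sims, k=10)                          # ten most similar spaces
    print(i, top_idx.tolist())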
5b_check_top10_cohorts_synthetic.ipynb ADDED
@@ -0,0 +1,187 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "a3d6ff53-2176-44aa-8590-ec0aa301342d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from vllm import LLM, SamplingParams\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "import torch.nn.functional as F\n",
14
+ "import torch\n",
15
+ "from transformers import AutoTokenizer\n",
16
+ "from transformers import AutoModelForCausalLM\n",
17
+ "import re\n",
18
+ "import os\n"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "62669512-19e7-43cd-a518-4572eea700af",
25
+ "metadata": {
26
+ "scrolled": true
27
+ },
28
+ "outputs": [],
29
+ "source": [
30
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, \n",
31
+ " gpu_memory_utilization=0.80,\n",
32
+ " download_dir = \"../../\", max_model_len=5000)"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "16bca2af-0cf4-41f2-ae28-d2c669a1af21",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "def ask_about_trials_loosely(patient_summaries, trial_summaries, llama_model):\n",
43
+ "\n",
44
+ " tokenizer = llama_model.get_tokenizer()\n",
45
+ "\n",
46
+ " prompts = []\n",
47
+ "\n",
48
+ " for patient_summary, trial_summary in zip(patient_summaries, trial_summaries):\n",
49
+ " messages = [{'role':'system', 'content': \"\"\"You are a brilliant oncologist with encyclopedic knowledge about cancer and its treatment. \n",
50
+ " Your job is to evaluate whether a given clinical trial is a reasonable consideration for a patient, given a clinical trial summary and a patient summary.\\n\"\"\"}, \n",
51
+ " {'role':'user', 'content': \"Here is a summary of the clinical trial:\\n\" + trial_summary + \"\\nHere is a summary of the patient:\\n\" + patient_summary + \"\"\"\n",
52
+ "Base your judgment on whether the patient generally fits the cancer type(s), cancer burden, prior treatment(s), and biomarker criteria specified for the trial.\n",
53
+ "You do not have to determine if the patient is actually eligible; instead please just evaluate whether it is reasonable for the trial to be considered further by the patient's oncologist.\n",
54
+ "Some trials have biomarker requirements that are not assessed until formal eligibility screening begins; please ignore these requirements.\n",
55
+ "Reason step by step, then answer the question \"Is this trial a reasonable consideration for this patient?\" with a one-word \"Yes!\" or \"No!\" answer.\n",
56
+ "Make sure to include the exclamation point in your final one-word answer.\"\"\"}]\n",
57
+ "\n",
58
+ " \n",
59
+ " prompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)\n",
60
+ " prompts.append(prompt)\n",
61
+ " \n",
62
+ " responses = llama_model.generate(\n",
63
+ " prompts, \n",
64
+ " SamplingParams(\n",
65
+ " temperature=0.0,\n",
66
+ " top_p=0.2,\n",
67
+ " max_tokens=2048,\n",
68
+ " repetition_penalty=1.2,\n",
69
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
70
+ " ))\n",
71
+ "\n",
72
+ " response_texts = [x.outputs[0].text for x in responses]\n",
73
+ "\n",
74
+ " eligibility_results = []\n",
75
+ "\n",
76
+ " for response_text in response_texts:\n",
77
+ " if (\"Yes!\" in response_text) or (\"YES!\" in response_text):\n",
78
+ " eligibility_results.append(1.0)\n",
79
+ " else:\n",
80
+ " eligibility_results.append(0.0)\n",
81
+ " \n",
82
+ " return responses, response_texts, eligibility_results\n",
83
+ " \n",
84
+ "\n",
85
+ " \n",
86
+ "\n"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "id": "2ce4cce6-3833-451a-98c6-d7f4c7b948c6",
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "patient_cohort_candidates = pd.read_csv('top_ten_cohorts_tocheck_synthetic.csv')"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "id": "dbb846d2-20e3-4361-b69b-82aa31c1f789",
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": []
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "id": "1fb1aa2f-6c28-4a4e-a4e1-bfd69d0b39a1",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "patient_cohort_candidates.info()"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "id": "d30bf018-e135-40be-b636-0ba17acf8e61",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "%%capture\n",
125
+ "output_list = []\n",
126
+ "batch_list = []\n",
127
+ "\n",
128
+ "num_in_batch = 0\n",
129
+ "\n",
130
+ "for i in range(0, patient_cohort_candidates.shape[0]):\n",
131
+ " \n",
132
+ " batch_list.append(patient_cohort_candidates.iloc[[i]])\n",
133
+ " num_in_batch += 1\n",
134
+ " \n",
135
+ " if (num_in_batch == 500) or (i == (patient_cohort_candidates.shape[0] - 1)):\n",
136
+ "\n",
137
+ " output = pd.concat(batch_list, axis=0)\n",
138
+ " _, output['llama_response'], output['eligibility_result'] = ask_about_trials_loosely(output['patient_summary'].tolist(), output['this_space'].astype(str).tolist(), llama)\n",
139
+ "\n",
140
+ " output_list.append(output)\n",
141
+ " num_in_batch = 0\n",
142
+ " batch_list = []\n",
143
+ " \n",
144
+ " if (len(output_list) > 0 and (i % 500 == 0)) or (i == (patient_cohort_candidates.shape[0] - 1)):\n",
145
+ " output_file = pd.concat(output_list, axis=0)\n",
146
+ " output_file.to_csv('top_ten_cohorts_checked_synthetic.csv')\n"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": null,
152
+ "id": "91534c0e-4873-4eda-9a69-53660a84b4df",
153
+ "metadata": {},
154
+ "outputs": [],
155
+ "source": []
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "id": "eaebffcc-4b62-4ab6-a077-69a6e4340773",
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": []
164
+ }
165
+ ],
166
+ "metadata": {
167
+ "kernelspec": {
168
+ "display_name": "Python 3 (ipykernel)",
169
+ "language": "python",
170
+ "name": "python3"
171
+ },
172
+ "language_info": {
173
+ "codemirror_mode": {
174
+ "name": "ipython",
175
+ "version": 3
176
+ },
177
+ "file_extension": ".py",
178
+ "mimetype": "text/x-python",
179
+ "name": "python",
180
+ "nbconvert_exporter": "python",
181
+ "pygments_lexer": "ipython3",
182
+ "version": "3.9.18"
183
+ }
184
+ },
185
+ "nbformat": 4,
186
+ "nbformat_minor": 5
187
+ }
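Note on 5b_check_top10_cohorts_synthetic.ipynb above: the Llama screening call relies on a string convention for the final answer, and anything without an explicit affirmative is counted as a negative. A minimal sketch of that parsing rule:

def parse_yes_no(response_text: str) -> float:
    # Mirrors ask_about_trials_loosely: only an explicit "Yes!" (or "YES!") counts as a match.
    return 1.0 if ("Yes!" in response_text) or ("YES!" in response_text) else 0.0

print(parse_yes_no("...histology and prior therapy fit the trial. Yes!"))   # 1.0
print(parse_yes_no("...the required biomarker is absent. No!"))             # 0.0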
6a_make_top_10_cohorts_llama_check_list.ipynb ADDED
@@ -0,0 +1,233 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "07d99d54-84c8-4531-8951-133dfaf64c1e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import os\n",
13
+ "#os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
14
+ "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
15
+ "from torch.utils.data import DataLoader\n",
16
+ "import torch.nn.functional as F\n",
17
+ "import torch\n",
18
+ "from sklearn.metrics import roc_auc_score"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "b7a22e15-81dd-4ab4-b7df-1d8da8992685",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": []
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "3553d4f3-c23c-4d21-94ad-da2bcd31a63d",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "test_spaces = pd.read_csv('test_spaces.csv')\n",
37
+ "test_spaces.info()"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "e4757eb7-4a69-4578-9020-075e960817ce",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "import pandas as pd\n",
48
+ "dfci_trials = pd.read_csv(\"../v7/space_specific_eligibility_checks_11-6-24.csv\")\n",
49
+ "# this will not run out of the box, because dfci_trials file was not included in upload, since it contains PHI\n",
50
+ "\n",
51
+ "other_trials = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')\n",
52
+ "other_trials = other_trials[~other_trials.nct_id.isin(test_spaces.nct_id)]\n",
53
+ "other_trials = other_trials[~other_trials.nct_id.isin(dfci_trials.nct_id)]\n",
54
+ "\n",
55
+ "unique_trials = other_trials.groupby('nct_id').first().reset_index()[['nct_id', 'this_space']]\n",
56
+ "unique_trials.shape[0]\n",
57
+ "\n",
58
+ "unique_trial_sample = unique_trials.nct_id.sample(n=500, random_state=42)\n",
59
+ "\n",
60
+ "valid_spaces = unique_trials[unique_trials.nct_id.isin(unique_trial_sample)]\n",
61
+ "\n",
62
+ "valid_spaces.to_csv('valid_spaces.csv')\n",
63
+ "\n"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "id": "8d6b5654-2f5f-4caa-82ff-b02a9007dc49",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "train_spaces = unique_trials[~unique_trials.nct_id.isin(valid_spaces.nct_id)]"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "id": "01e0eb5b-9282-48b8-9917-3147fbf25730",
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "train_spaces.nct_id.isin(valid_spaces.nct_id).value_counts()"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "ed9139ac-a5f2-4791-a3a4-fd2127b7af0c",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "train_spaces.info()"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "id": "cfa78d0d-3de1-48f7-a01e-f7d81a6a5b3f",
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "patients = pd.read_parquet('synthetic_pt_summaries_11-23-24.parquet')\n",
104
+ "\n",
105
+ "patients = patients[patients.split == 'train'][['patient_summary','split']]"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "id": "fcba54bc-aba3-4d02-b874-c8275badf015",
112
+ "metadata": {},
113
+ "outputs": [],
114
+ "source": [
115
+ "patients.info()"
116
+ ]
117
+ },
118
+ {
119
+ "cell_type": "code",
120
+ "execution_count": null,
121
+ "id": "399e05a5-7a58-4362-a002-0bce62a348ac",
122
+ "metadata": {},
123
+ "outputs": [],
124
+ "source": [
125
+ "train_unique_patient_summaries = patients.patient_summary.unique().tolist()\n",
126
+ "print(len(train_unique_patient_summaries))\n",
127
+ "train_unique_spaces = train_spaces.this_space.unique().tolist()\n",
128
+ "print(len(train_unique_spaces))"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "c37f1137-41cf-4a6d-8099-7bed5838a1ee",
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": []
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": null,
142
+ "id": "06e7e0f2-5382-4507-908e-5ccf92b4beae",
143
+ "metadata": {},
144
+ "outputs": [],
145
+ "source": [
146
+ "embedding_model = SentenceTransformer('dunzhang/stella_en_1.5B_v5', trust_remote_code=True, device='cuda')"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": null,
152
+ "id": "77659362-cf51-4906-9e12-5d1e55440b25",
153
+ "metadata": {},
154
+ "outputs": [],
155
+ "source": [
156
+ "with torch.no_grad():\n",
157
+ " train_unique_patient_embeddings = embedding_model.encode(train_unique_patient_summaries, convert_to_tensor=True, prompt_name = \"s2s_query\")"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "id": "a0bb8926-3fa8-49bc-b507-6cc36d998600",
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "with torch.no_grad():\n",
168
+ " train_unique_space_embeddings = embedding_model.encode(train_unique_spaces, convert_to_tensor=True, prompt_name = \"s2s_query\")"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "id": "bf259f29-4934-41e8-9b3a-87f09d0ef52e",
175
+ "metadata": {},
176
+ "outputs": [],
177
+ "source": [
178
+ "output_list = []\n",
179
+ "train_unique_patient_series = pd.Series(train_unique_patient_summaries)\n",
180
+ "for i, space_summary in enumerate(train_unique_spaces):\n",
181
+ " space_embedding = train_unique_space_embeddings[i, :]\n",
182
+ " similarities = F.cosine_similarity(space_embedding, train_unique_patient_embeddings)\n",
183
+ " sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)\n",
184
+ " relevant_patients = train_unique_patient_series.iloc[sorted_indices[0:20].cpu().numpy()]\n",
185
+ " output = pd.DataFrame({'space_summary':space_summary, 'this_patient':relevant_patients})\n",
186
+ " output_list.append(output)\n",
187
+ "\n",
188
+ "train_output = pd.concat(output_list, axis=0).reset_index(drop=True)\n",
189
+ "train_output['space_summary'] = train_output.space_summary.str.strip()\n",
190
+ "train_output['split'] = 'train'\n"
191
+ ]
192
+ },
193
+ {
194
+ "cell_type": "code",
195
+ "execution_count": null,
196
+ "id": "52c926cc-d481-4c43-8288-7d1620c1f06f",
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "train_output.to_csv('top_twenty_patients_tocheck_synthetic.csv')"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": null,
206
+ "id": "0e0e18c1-bfb6-4c20-9aa4-257f0fb0424c",
207
+ "metadata": {},
208
+ "outputs": [],
209
+ "source": []
210
+ }
211
+ ],
212
+ "metadata": {
213
+ "kernelspec": {
214
+ "display_name": "Python 3 (ipykernel)",
215
+ "language": "python",
216
+ "name": "python3"
217
+ },
218
+ "language_info": {
219
+ "codemirror_mode": {
220
+ "name": "ipython",
221
+ "version": 3
222
+ },
223
+ "file_extension": ".py",
224
+ "mimetype": "text/x-python",
225
+ "name": "python",
226
+ "nbconvert_exporter": "python",
227
+ "pygments_lexer": "ipython3",
228
+ "version": "3.9.18"
229
+ }
230
+ },
231
+ "nbformat": 4,
232
+ "nbformat_minor": 5
233
+ }
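Note on 6a_make_top_10_cohorts_llama_check_list.ipynb above: this notebook runs the retrieval in the opposite direction, scoring each trial space against every patient embedding and keeping the top 20. An equivalent, hypothetical vectorized sketch (stand-in tensors) that builds the whole similarity matrix at once:

import torch
import torch.nn.functional as F

space_emb = F.normalize(torch.randn(5, 16), dim=-1)       # stand-ins for trial space embeddings
patient_emb = F.normalize(torch.randn(200, 16), dim=-1)   # stand-ins for patient embeddings

sim_matrix = space_emb @ patient_emb.T                    # cosine similarities after normalization
top_vals, top_idx = torch.topk(sim_matrix, k=20, dim=1)   # 20 most similar patients per space
print(top_idx.shape)                                      # torch.Size([5, 20])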
6b_check_top20_patient_matches_synthetic.ipynb ADDED
@@ -0,0 +1,191 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "a3d6ff53-2176-44aa-8590-ec0aa301342d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from vllm import LLM, SamplingParams\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "import torch.nn.functional as F\n",
14
+ "import torch\n",
15
+ "from transformers import AutoTokenizer\n",
16
+ "from transformers import AutoModelForCausalLM\n",
17
+ "import re\n",
18
+ "import os\n",
19
+ "os.environ['CUDA_VISIBLE_DEVICES'] = '0,2'\n",
20
+ "\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "id": "62669512-19e7-43cd-a518-4572eea700af",
27
+ "metadata": {
28
+ "scrolled": true
29
+ },
30
+ "outputs": [],
31
+ "source": [
32
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, \n",
33
+ " gpu_memory_utilization=0.50,\n",
34
+ " download_dir = \"../../\", max_model_len=5000)"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "id": "16bca2af-0cf4-41f2-ae28-d2c669a1af21",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "def ask_about_trials_loosely(patient_summaries, trial_summaries, llama_model):\n",
45
+ "\n",
46
+ " tokenizer = llama_model.get_tokenizer()\n",
47
+ "\n",
48
+ " prompts = []\n",
49
+ "\n",
50
+ " for patient_summary, trial_summary in zip(patient_summaries, trial_summaries):\n",
51
+ " messages = [{'role':'system', 'content': \"\"\"You are a brilliant oncologist with encyclopedic knowledge about cancer and its treatment. \n",
52
+ " Your job is to evaluate whether a given clinical trial is a reasonable consideration for a patient, given a clinical trial summary and a patient summary.\\n\"\"\"}, \n",
53
+ " {'role':'user', 'content': \"Here is a summary of the clinical trial:\\n\" + trial_summary + \"\\nHere is a summary of the patient:\\n\" + patient_summary + \"\"\"\n",
54
+ "Base your judgment on whether the patient generally fits the cancer type(s), cancer burden, prior treatment(s), and biomarker criteria specified for the trial.\n",
55
+ "You do not have to determine if the patient is actually eligible; instead please just evaluate whether it is reasonable for the trial to be considered further by the patient's oncologist.\n",
56
+ "Some trials have biomarker requirements that are not assessed until formal eligibility screening begins; please ignore these requirements.\n",
57
+ "Reason step by step, then answer the question \"Is this trial a reasonable consideration for this patient?\" with a one-word \"Yes!\" or \"No!\" answer.\n",
58
+ "Make sure to include the exclamation point in your final one-word answer.\"\"\"}]\n",
59
+ "\n",
60
+ " \n",
61
+ " prompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)\n",
62
+ " prompts.append(prompt)\n",
63
+ " \n",
64
+ " responses = llama_model.generate(\n",
65
+ " prompts, \n",
66
+ " SamplingParams(\n",
67
+ " temperature=0.0,\n",
68
+ " top_p=0.2,\n",
69
+ " max_tokens=2048,\n",
70
+ " repetition_penalty=1.2,\n",
71
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
72
+ " ))\n",
73
+ "\n",
74
+ " response_texts = [x.outputs[0].text for x in responses]\n",
75
+ "\n",
76
+ " eligibility_results = []\n",
77
+ "\n",
78
+ " for response_text in response_texts:\n",
79
+ " if (\"Yes!\" in response_text) or (\"YES!\" in response_text):\n",
80
+ " eligibility_results.append(1.0)\n",
81
+ " else:\n",
82
+ " eligibility_results.append(0.0)\n",
83
+ " \n",
84
+ " return responses, response_texts, eligibility_results\n",
85
+ " \n",
86
+ "\n",
87
+ " \n",
88
+ "\n"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "id": "2ce4cce6-3833-451a-98c6-d7f4c7b948c6",
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "patient_cohort_candidates = pd.read_csv('top_twenty_patients_tocheck_synthetic.csv')"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": null,
104
+ "id": "dbb846d2-20e3-4361-b69b-82aa31c1f789",
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "patient_cohort_candidates = patient_cohort_candidates.rename(columns={'this_patient':'patient_summary', 'space_summary':'this_space'})"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "id": "1fb1aa2f-6c28-4a4e-a4e1-bfd69d0b39a1",
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "patient_cohort_candidates.info()"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "d30bf018-e135-40be-b636-0ba17acf8e61",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "%%capture\n",
129
+ "output_list = []\n",
130
+ "batch_list = []\n",
131
+ "\n",
132
+ "num_in_batch = 0\n",
133
+ "\n",
134
+ "for i in range(0, patient_cohort_candidates.shape[0]):\n",
135
+ " \n",
136
+ " batch_list.append(patient_cohort_candidates.iloc[[i]])\n",
137
+ " num_in_batch += 1\n",
138
+ " \n",
139
+ " if (num_in_batch == 500) or (i == (patient_cohort_candidates.shape[0] - 1)):\n",
140
+ "\n",
141
+ " output = pd.concat(batch_list, axis=0)\n",
142
+ " _, output['llama_response'], output['eligibility_result'] = ask_about_trials_loosely(output['patient_summary'].tolist(), output['this_space'].astype(str).tolist(), llama)\n",
143
+ "\n",
144
+ " output_list.append(output)\n",
145
+ " num_in_batch = 0\n",
146
+ " batch_list = []\n",
147
+ " \n",
148
+ " if (len(output_list) > 0 and (i % 500 == 0)) or (i == (patient_cohort_candidates.shape[0] - 1)):\n",
149
+ " output_file = pd.concat(output_list, axis=0)\n",
150
+ " output_file.to_csv('top_twenty_patients_checked_synthetic.csv')\n"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "id": "91534c0e-4873-4eda-9a69-53660a84b4df",
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": []
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "id": "eaebffcc-4b62-4ab6-a077-69a6e4340773",
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": []
168
+ }
169
+ ],
170
+ "metadata": {
171
+ "kernelspec": {
172
+ "display_name": "Python 3 (ipykernel)",
173
+ "language": "python",
174
+ "name": "python3"
175
+ },
176
+ "language_info": {
177
+ "codemirror_mode": {
178
+ "name": "ipython",
179
+ "version": 3
180
+ },
181
+ "file_extension": ".py",
182
+ "mimetype": "text/x-python",
183
+ "name": "python",
184
+ "nbconvert_exporter": "python",
185
+ "pygments_lexer": "ipython3",
186
+ "version": "3.12.5"
187
+ }
188
+ },
189
+ "nbformat": 4,
190
+ "nbformat_minor": 5
191
+ }
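Note on 6b_check_top20_patient_matches_synthetic.ipynb above: the driver loop batches the LLM calls in chunks of 500 rows and periodically rewrites a checkpoint CSV as it goes. A toy sketch of the same chunk-and-checkpoint pattern (the doubling step stands in for the LLM call):

import pandas as pd

df = pd.DataFrame({"x": range(1203)})               # toy rows standing in for candidate pairs
chunks = []
for start in range(0, len(df), 500):
    batch = df.iloc[start:start + 500].copy()
    batch["y"] = batch["x"] * 2                     # stand-in for the per-batch LLM call
    chunks.append(batch)
    pd.concat(chunks).to_csv("checkpoint.csv")      # overwrite the checkpoint after each batch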
7_train_trialspace_round1.ipynb ADDED
@@ -0,0 +1,451 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "81b83fa8-421d-4be5-b9eb-5892f01fd5b0",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import os\n",
13
+ "#os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'\n",
14
+ "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
15
+ "from torch.utils.data import DataLoader\n",
16
+ "import torch.nn.functional as F\n",
17
+ "import torch\n",
18
+ "from sklearn.metrics import roc_auc_score"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "937cbcda-0cd6-47f7-b52e-17ed2bafce3d",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "model = SentenceTransformer('dunzhang/stella_en_1.5b_v5', trust_remote_code=True, device='cuda')\n"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "id": "853e6b86-db0b-4650-b98f-f437987baa5a",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "cohort_checks = pd.read_csv('top_ten_cohorts_checked_synthetic.csv')"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "id": "474950e0-869b-414e-823f-df5ba8e5de92",
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "cohort_checks.info()"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": null,
54
+ "id": "c9835dad-4fc4-4a0e-aba2-d45358edbee9",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "cohort_checks['mod_eligibility_result'] = np.where(cohort_checks.llama_response.str.contains('Yes!|YES!'), 1, 0)"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "id": "413bddbc-f35c-48ec-bcbb-48405bd2c9c9",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "cohort_checks.eligibility_result.value_counts()"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "id": "79c2d994-1e39-41b6-aeb3-962ba3ba5611",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "cohort_checks.mod_eligibility_result.value_counts()"
79
+ ]
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "id": "9c8e6a20-4513-422c-be6e-3459ce98a2be",
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "patient_checks = pd.read_csv('top_twenty_patients_checked_synthetic.csv')"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "id": "e91074cf-07de-40d1-8bf3-baf825d3f625",
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "patient_checks['mod_eligibility_result'] = np.where(patient_checks.llama_response.str.contains('Yes!|YES!'), 1, 0)"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": null,
104
+ "id": "3a2d45fc-7d92-4a65-aad0-a9ab8f783779",
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "patient_checks.info()"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "id": "0cf7c705-ba6d-4f01-ac2e-88d345fef7f6",
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "patient_checks.eligibility_result.value_counts(), patient_checks.mod_eligibility_result.value_counts()"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "dec4a8c8-c5db-4164-a06c-27fb59782fa5",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "patient_checks = patient_checks.rename(columns={'this_patient':'patient_summary', 'space_summary':'this_space'})"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "id": "0bf55e82-c91d-472f-84ad-74c755e9bf29",
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "combined_checks = pd.concat([patient_checks, cohort_checks], axis=0)"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": null,
144
+ "id": "ebd07c9c-6263-4005-bfb1-2a8468b76a98",
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "combined_checks.info()"
149
+ ]
150
+ },
151
+ {
152
+ "cell_type": "code",
153
+ "execution_count": null,
154
+ "id": "49f59429-c9f4-43df-a1b2-750a3c94517a",
155
+ "metadata": {},
156
+ "outputs": [],
157
+ "source": []
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "e1d0126e-a58d-41ca-ad2f-a2d37bc585ad",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "train_summaries = combined_checks[combined_checks.split=='train']\n",
167
+ "train_summaries = train_summaries[~train_summaries.patient_summary.isnull()]\n",
168
+ "train_summaries = train_summaries[~train_summaries.llama_response.isnull()]\n",
169
+ "train_summaries.split.value_counts()"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "id": "2f6506ed-dcbf-4e0b-8722-6e234c2d4509",
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": [
179
+ "train_summaries.mod_eligibility_result.value_counts()"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "id": "c678a59a-c301-42d9-83dd-511503cee2fb",
186
+ "metadata": {},
187
+ "outputs": [],
188
+ "source": [
189
+ "train_summaries.info()"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "57932264-103a-413b-9a48-43b7be254ac0",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "# mll loss\n",
200
+ "train_eligibles_only = train_summaries[train_summaries.eligibility_result == 1]\n",
201
+ "example_list = []\n",
202
+ "for i in range(train_eligibles_only.shape[0]):\n",
203
+ " example_list.append(InputExample(texts=[train_summaries.patient_summary.iloc[i], train_summaries.this_space.iloc[i]]))\n",
204
+ "\n",
205
+ "train_eligibles_only_dataloader = DataLoader(example_list, shuffle=True, batch_size=8)\n",
206
+ "train_eligibles_only_loss = losses.MultipleNegativesRankingLoss(model=model)"
207
+ ]
208
+ },
209
+ {
210
+ "cell_type": "code",
211
+ "execution_count": null,
212
+ "id": "e5482be3-9a13-4ce1-aa8a-429c54bf6be0",
213
+ "metadata": {},
214
+ "outputs": [],
215
+ "source": [
216
+ "# for attempt at contrastive loss\n",
217
+ "contrastive_example_list = []\n",
218
+ "for i in range(train_summaries.shape[0]):\n",
219
+ " contrastive_example_list.append(InputExample(texts=[train_summaries.patient_summary.iloc[i], train_summaries.this_space.iloc[i]],\n",
220
+ " label=train_summaries.mod_eligibility_result.iloc[i]))\n",
221
+ "\n",
222
+ "contrastive_dataloader = DataLoader(contrastive_example_list, shuffle=True, batch_size=12)\n",
223
+ "contrastive_train_loss = losses.OnlineContrastiveLoss(model=model)"
224
+ ]
225
+ },
226
+ {
227
+ "cell_type": "code",
228
+ "execution_count": null,
229
+ "id": "4e825dae-a5a9-4f87-af35-63ac2d73de33",
230
+ "metadata": {},
231
+ "outputs": [],
232
+ "source": []
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": null,
237
+ "id": "f17ad7a6-8911-4d7d-8495-3e37cb00597d",
238
+ "metadata": {
239
+ "scrolled": true
240
+ },
241
+ "outputs": [],
242
+ "source": [
243
+ "#%%capture\n",
244
+ "model.fit(train_objectives=[(contrastive_dataloader, contrastive_train_loss),\n",
245
+ " (train_eligibles_only_dataloader, train_eligibles_only_loss)], epochs=2, warmup_steps=100)"
246
+ ]
247
+ },
248
+ {
249
+ "cell_type": "code",
250
+ "execution_count": null,
251
+ "id": "c9cb6021-21d8-44bf-b440-980fcdae3b3d",
252
+ "metadata": {},
253
+ "outputs": [],
254
+ "source": [
255
+ "model.save('reranker_round1.model')"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "code",
260
+ "execution_count": null,
261
+ "id": "bae79a2e-4357-4c90-ba4c-a08b1206a99d",
262
+ "metadata": {},
263
+ "outputs": [],
264
+ "source": [
265
+ "model = SentenceTransformer('reranker_round1.model', trust_remote_code=True, device='cuda')"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": null,
271
+ "id": "f5517caa-c45b-4b62-ae8d-0af61b61fd25",
272
+ "metadata": {},
273
+ "outputs": [],
274
+ "source": []
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": null,
279
+ "id": "c6bfb8f7-ca6b-474b-8ce3-ba5acacb6b6a",
280
+ "metadata": {},
281
+ "outputs": [],
282
+ "source": [
283
+ "# check model's ability to do initial discriminate among diseases task\n",
284
+ "# (on PHI)\n"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": null,
290
+ "id": "4172f6ba-b334-4b83-b73e-d05dad6c05f0",
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "cohort_checks = pd.read_csv('../v7/space_specific_eligibility_checks_11-6-24.csv')\n",
295
+ "# this cohort_checks file is not provided publicly, since it contains PHI/IP"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": null,
301
+ "id": "d6b25941-0007-4347-9ef3-899f9258542a",
302
+ "metadata": {},
303
+ "outputs": [],
304
+ "source": [
305
+ "validation_set = cohort_checks[cohort_checks.split.str.contains('valid')]\n",
306
+ "validation_set.info()\n"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "id": "4b791608-6011-4bf6-914a-9534a08eba5a",
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": [
316
+ "validation_set = validation_set[~validation_set.patient_summary.isnull()]\n",
317
+ "validation_set.info()"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": null,
323
+ "id": "479b9905-fcd6-4d37-9b03-7bbbfb88f123",
324
+ "metadata": {},
325
+ "outputs": [],
326
+ "source": [
327
+ "\n",
328
+ "eligibles_only = validation_set[validation_set.eligibility_result == 1]\n",
329
+ "patient_summary_embeddings = model.encode(eligibles_only.patient_summary.tolist())\n",
330
+ "trial_summary_embeddings = model.encode(eligibles_only.this_space.tolist())"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "code",
335
+ "execution_count": null,
336
+ "id": "9b8f3a40-0854-43a5-bd83-a7fe6770f52b",
337
+ "metadata": {},
338
+ "outputs": [],
339
+ "source": [
340
+ "# among patient to trial space candidate matches that pass llama checks, how good is TrialSpace at discriminating between true and random matches?\n",
341
+ "import random\n",
342
+ "labels = []\n",
343
+ "similarities = []\n",
344
+ "for i in range(trial_summary_embeddings.shape[0]):\n",
345
+ " if random.choice([0,1]) == 1:\n",
346
+ " similarities.append(F.cosine_similarity(torch.tensor(patient_summary_embeddings[i,:]).unsqueeze(0), torch.tensor(trial_summary_embeddings[i, :]).unsqueeze(0)))\n",
347
+ " labels.append(1.)\n",
348
+ " else:\n",
349
+ " random_index = random.choice([x for x in range(0,trial_summary_embeddings.shape[0])])\n",
350
+ " similarities.append(F.cosine_similarity(torch.tensor(patient_summary_embeddings[i,:]).unsqueeze(0), torch.tensor(trial_summary_embeddings[random_index, :]).unsqueeze(0)))\n",
351
+ " labels.append(0.)\n",
352
+ "roc_auc_score(labels, np.array([x.numpy() for x in similarities]))"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": null,
358
+ "id": "16dd4634-0389-466d-8257-160ddd2659af",
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "# how good are embeddings at discriminating between llama yes and no checks?\n",
363
+ "# (on PHI)\n",
364
+ "patient_summary_embeddings = model.encode(validation_set.patient_summary.tolist(), convert_to_tensor=True)\n",
365
+ "trial_summary_embeddings = model.encode(validation_set.this_space.tolist(), convert_to_tensor=True)"
366
+ ]
367
+ },
368
+ {
369
+ "cell_type": "code",
370
+ "execution_count": null,
371
+ "id": "5bb0bc89-0b4f-451d-9523-550f7344e4d9",
372
+ "metadata": {},
373
+ "outputs": [],
374
+ "source": [
375
+ "similarities = F.cosine_similarity(patient_summary_embeddings, trial_summary_embeddings).detach().cpu().numpy()\n",
376
+ "roc_auc_score(validation_set.eligibility_result, similarities)"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "id": "c6035e62-8d28-49c5-8d0a-049633edd553",
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": []
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": null,
390
+ "id": "4f587899-0101-4d81-91a7-f8ef72be949f",
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "validation_set.eligibility_result.value_counts()"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "id": "453c2f3c-105a-4b71-851c-372bf29d3fe8",
401
+ "metadata": {},
402
+ "outputs": [],
403
+ "source": []
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": null,
408
+ "id": "69a3fc1d-86f1-49f7-a93a-54f4748c5dbf",
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": []
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": null,
416
+ "id": "23d7d1f4-9f1f-42f6-a366-0e39af8893b2",
417
+ "metadata": {},
418
+ "outputs": [],
419
+ "source": []
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": null,
424
+ "id": "0c4415d5-d0fd-48ca-b88c-2e244434561d",
425
+ "metadata": {},
426
+ "outputs": [],
427
+ "source": []
428
+ }
429
+ ],
430
+ "metadata": {
431
+ "kernelspec": {
432
+ "display_name": "Python 3 (ipykernel)",
433
+ "language": "python",
434
+ "name": "python3"
435
+ },
436
+ "language_info": {
437
+ "codemirror_mode": {
438
+ "name": "ipython",
439
+ "version": 3
440
+ },
441
+ "file_extension": ".py",
442
+ "mimetype": "text/x-python",
443
+ "name": "python",
444
+ "nbconvert_exporter": "python",
445
+ "pygments_lexer": "ipython3",
446
+ "version": "3.9.18"
447
+ }
448
+ },
449
+ "nbformat": 4,
450
+ "nbformat_minor": 5
451
+ }
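Note on 7_train_trialspace_round1.ipynb above: the validation cells score each patient/trial-space pair by the cosine similarity of their embeddings and summarize discrimination with ROC AUC. A minimal sketch of that metric on random stand-in embeddings and labels (random inputs should land near 0.5; the fine-tuned encoder is expected to score well above that):

import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score

torch.manual_seed(0)
patient_emb = torch.randn(50, 16)                     # stand-ins for patient embeddings
trial_emb = torch.randn(50, 16)                       # stand-ins for paired trial-space embeddings
labels = np.random.RandomState(0).randint(0, 2, 50)   # stand-in eligibility labels

sims = F.cosine_similarity(patient_emb, trial_emb).numpy()
print(roc_auc_score(labels, sims))                    # ~0.5 on random data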
8a_make_top_10_cohorts_llama_check_list_round2.ipynb ADDED
@@ -0,0 +1,232 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "07d99d54-84c8-4531-8951-133dfaf64c1e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import os\n",
13
+ "#os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
14
+ "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
15
+ "from torch.utils.data import DataLoader\n",
16
+ "import torch.nn.functional as F\n",
17
+ "import torch\n",
18
+ "from sklearn.metrics import roc_auc_score"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "b7a22e15-81dd-4ab4-b7df-1d8da8992685",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": []
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "3553d4f3-c23c-4d21-94ad-da2bcd31a63d",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "test_spaces = pd.read_csv('test_spaces.csv')\n",
37
+ "test_spaces.info()"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "e4757eb7-4a69-4578-9020-075e960817ce",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "import pandas as pd\n",
48
+ "dfci_trials = pd.read_csv(\"../v7/space_specific_eligibility_checks_11-6-24.csv\")\n",
49
+ "# this dfci_trials file is not provided for public use, since it contains PHI, so this will not run out of the box\n",
50
+ "\n",
51
+ "other_trials = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')\n",
52
+ "other_trials = other_trials[~other_trials.nct_id.isin(test_spaces.nct_id)]\n",
53
+ "other_trials = other_trials[~other_trials.nct_id.isin(dfci_trials.nct_id)]\n",
54
+ "\n",
55
+ "unique_trials = other_trials.groupby('nct_id').first().reset_index()[['nct_id', 'this_space']]\n",
56
+ "unique_trials.shape[0]\n",
57
+ "\n",
58
+ "unique_trial_sample = unique_trials.nct_id.sample(n=500, random_state=42)\n",
59
+ "\n",
60
+ "valid_spaces = unique_trials[unique_trials.nct_id.isin(unique_trial_sample)]\n",
61
+ "\n",
62
+ "valid_spaces.to_csv('valid_spaces.csv')\n",
63
+ "\n"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "id": "8d6b5654-2f5f-4caa-82ff-b02a9007dc49",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "train_spaces = unique_trials[~unique_trials.nct_id.isin(valid_spaces.nct_id)]"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "id": "01e0eb5b-9282-48b8-9917-3147fbf25730",
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "train_spaces.nct_id.isin(valid_spaces.nct_id).value_counts()"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "ed9139ac-a5f2-4791-a3a4-fd2127b7af0c",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "train_spaces.info()"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "id": "cfa78d0d-3de1-48f7-a01e-f7d81a6a5b3f",
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "patients = pd.read_parquet('synthetic_pt_summaries_11-23-24.parquet')\n",
104
+ "patients = patients[patients.split == 'train'][['patient_summary','split']]"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "id": "fcba54bc-aba3-4d02-b874-c8275badf015",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "patients.info()"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "id": "399e05a5-7a58-4362-a002-0bce62a348ac",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "train_unique_patient_summaries = patients.patient_summary.unique().tolist()\n",
125
+ "print(len(train_unique_patient_summaries))\n",
126
+ "train_unique_spaces = train_spaces.this_space.unique().tolist()\n",
127
+ "print(len(train_unique_spaces))"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "id": "c37f1137-41cf-4a6d-8099-7bed5838a1ee",
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": []
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "06e7e0f2-5382-4507-908e-5ccf92b4beae",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "embedding_model = SentenceTransformer('reranker_round1.model', trust_remote_code=True, device='cuda')"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "id": "77659362-cf51-4906-9e12-5d1e55440b25",
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "with torch.no_grad():\n",
156
+ " train_unique_patient_embeddings = embedding_model.encode(train_unique_patient_summaries, convert_to_tensor=True, prompt_name = \"s2s_query\")"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "a0bb8926-3fa8-49bc-b507-6cc36d998600",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "with torch.no_grad():\n",
167
+ " train_unique_space_embeddings = embedding_model.encode(train_unique_spaces, convert_to_tensor=True, prompt_name = \"s2s_query\")"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "bf259f29-4934-41e8-9b3a-87f09d0ef52e",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "output_list = []\n",
178
+ "train_unique_space_series = pd.Series(train_unique_spaces)\n",
179
+ "for i, patient_summary in enumerate(train_unique_patient_summaries):\n",
180
+ " patient_embedding = train_unique_patient_embeddings[i, :]\n",
181
+ " similarities = F.cosine_similarity(patient_embedding, train_unique_space_embeddings)\n",
182
+ " sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)\n",
183
+ " relevant_spaces = train_unique_space_series.iloc[sorted_indices[0:10].cpu().numpy()]\n",
184
+ " output = pd.DataFrame({'patient_summary':patient_summary, 'this_space':relevant_spaces})\n",
185
+ " output_list.append(output)\n",
186
+ "\n",
187
+ "train_output = pd.concat(output_list, axis=0).reset_index(drop=True)\n",
188
+ "train_output['patient_summary'] = train_output.patient_summary.str.strip()\n",
189
+ "train_output['split'] = 'train'"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "52c926cc-d481-4c43-8288-7d1620c1f06f",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "train_output.to_csv('top_ten_cohorts_tocheck_synthetic_round2.csv')"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "id": "0e0e18c1-bfb6-4c20-9aa4-257f0fb0424c",
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": []
209
+ }
210
+ ],
211
+ "metadata": {
212
+ "kernelspec": {
213
+ "display_name": "Python 3 (ipykernel)",
214
+ "language": "python",
215
+ "name": "python3"
216
+ },
217
+ "language_info": {
218
+ "codemirror_mode": {
219
+ "name": "ipython",
220
+ "version": 3
221
+ },
222
+ "file_extension": ".py",
223
+ "mimetype": "text/x-python",
224
+ "name": "python",
225
+ "nbconvert_exporter": "python",
226
+ "pygments_lexer": "ipython3",
227
+ "version": "3.9.18"
228
+ }
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 5
232
+ }
8b_check_top10_cohorts_synthetic_round2.ipynb ADDED
@@ -0,0 +1,187 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "a3d6ff53-2176-44aa-8590-ec0aa301342d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from vllm import LLM, SamplingParams\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "import torch.nn.functional as F\n",
14
+ "import torch\n",
15
+ "from transformers import AutoTokenizer\n",
16
+ "from transformers import AutoModelForCausalLM\n",
17
+ "import re\n",
18
+ "import os\n"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "62669512-19e7-43cd-a518-4572eea700af",
25
+ "metadata": {
26
+ "scrolled": true
27
+ },
28
+ "outputs": [],
29
+ "source": [
30
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, \n",
31
+ " gpu_memory_utilization=0.90,\n",
32
+ " download_dir = \"../../\", max_model_len=5000)"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "id": "16bca2af-0cf4-41f2-ae28-d2c669a1af21",
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "def ask_about_trials_loosely(patient_summaries, trial_summaries, llama_model):\n",
43
+ "\n",
44
+ " tokenizer = llama_model.get_tokenizer()\n",
45
+ "\n",
46
+ " prompts = []\n",
47
+ "\n",
48
+ " for patient_summary, trial_summary in zip(patient_summaries, trial_summaries):\n",
49
+ " messages = [{'role':'system', 'content': \"\"\"You are a brilliant oncologist with encyclopedic knowledge about cancer and its treatment. \n",
50
+ " Your job is to evaluate whether a given clinical trial is a reasonable consideration for a patient, given a clinical trial summary and a patient summary.\\n\"\"\"}, \n",
51
+ " {'role':'user', 'content': \"Here is a summary of the clinical trial:\\n\" + trial_summary + \"\\nHere is a summary of the patient:\\n\" + patient_summary + \"\"\"\n",
52
+ "Base your judgment on whether the patient generally fits the cancer type(s), cancer burden, prior treatment(s), and biomarker criteria specified for the trial.\n",
53
+ "You do not have to determine if the patient is actually eligible; instead please just evaluate whether it is reasonable for the trial to be considered further by the patient's oncologist.\n",
54
+ "Some trials have biomarker requirements that are not assessed until formal eligibility screening begins; please ignore these requirements.\n",
55
+ "Reason step by step, then answer the question \"Is this trial a reasonable consideration for this patient?\" with a one-word \"Yes!\" or \"No!\" answer.\n",
56
+ "Make sure to include the exclamation point in your final one-word answer.\"\"\"}]\n",
57
+ "\n",
58
+ " \n",
59
+ " prompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)\n",
60
+ " prompts.append(prompt)\n",
61
+ " \n",
62
+ " responses = llama_model.generate(\n",
63
+ " prompts, \n",
64
+ " SamplingParams(\n",
65
+ " temperature=0.0,\n",
66
+ " top_p=0.2,\n",
67
+ " max_tokens=2048,\n",
68
+ " repetition_penalty=1.2,\n",
69
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
70
+ " ))\n",
71
+ "\n",
72
+ " response_texts = [x.outputs[0].text for x in responses]\n",
73
+ "\n",
74
+ " eligibility_results = []\n",
75
+ "\n",
76
+ " for response_text in response_texts:\n",
77
+ " if (\"Yes!\" in response_text) or (\"YES!\" in response_text):\n",
78
+ " eligibility_results.append(1.0)\n",
79
+ " else:\n",
80
+ " eligibility_results.append(0.0)\n",
81
+ " \n",
82
+ " return responses, response_texts, eligibility_results\n",
83
+ " \n",
84
+ "\n",
85
+ " \n",
86
+ "\n"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "id": "2ce4cce6-3833-451a-98c6-d7f4c7b948c6",
93
+ "metadata": {},
94
+ "outputs": [],
95
+ "source": [
96
+ "patient_cohort_candidates = pd.read_csv('top_ten_cohorts_tocheck_synthetic_round2.csv')"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "id": "dbb846d2-20e3-4361-b69b-82aa31c1f789",
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": []
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "id": "1fb1aa2f-6c28-4a4e-a4e1-bfd69d0b39a1",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "patient_cohort_candidates.info()"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "id": "d30bf018-e135-40be-b636-0ba17acf8e61",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "%%capture\n",
125
+ "output_list = []\n",
126
+ "batch_list = []\n",
127
+ "\n",
128
+ "num_in_batch = 0\n",
129
+ "\n",
130
+ "for i in range(0, patient_cohort_candidates.shape[0]):\n",
131
+ " \n",
132
+ " batch_list.append(patient_cohort_candidates.iloc[[i]])\n",
133
+ " num_in_batch += 1\n",
134
+ " \n",
135
+ " if (num_in_batch == 500) or (i == (patient_cohort_candidates.shape[0] - 1)):\n",
136
+ "\n",
137
+ " output = pd.concat(batch_list, axis=0)\n",
138
+ " _, output['llama_response'], output['eligibility_result'] = ask_about_trials_loosely(output['patient_summary'].tolist(), output['this_space'].astype(str).tolist(), llama)\n",
139
+ "\n",
140
+ " output_list.append(output)\n",
141
+ " num_in_batch = 0\n",
142
+ " batch_list = []\n",
143
+ " \n",
144
+ " if (len(output_list) > 0 and (i % 500 == 0)) or (i == (patient_cohort_candidates.shape[0] - 1)):\n",
145
+ " output_file = pd.concat(output_list, axis=0)\n",
146
+ " output_file.to_csv('top_ten_cohorts_checked_synthetic_round2.csv')\n"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": null,
152
+ "id": "91534c0e-4873-4eda-9a69-53660a84b4df",
153
+ "metadata": {},
154
+ "outputs": [],
155
+ "source": []
156
+ },
157
+ {
158
+ "cell_type": "code",
159
+ "execution_count": null,
160
+ "id": "eaebffcc-4b62-4ab6-a077-69a6e4340773",
161
+ "metadata": {},
162
+ "outputs": [],
163
+ "source": []
164
+ }
165
+ ],
166
+ "metadata": {
167
+ "kernelspec": {
168
+ "display_name": "Python 3 (ipykernel)",
169
+ "language": "python",
170
+ "name": "python3"
171
+ },
172
+ "language_info": {
173
+ "codemirror_mode": {
174
+ "name": "ipython",
175
+ "version": 3
176
+ },
177
+ "file_extension": ".py",
178
+ "mimetype": "text/x-python",
179
+ "name": "python",
180
+ "nbconvert_exporter": "python",
181
+ "pygments_lexer": "ipython3",
182
+ "version": "3.9.18"
183
+ }
184
+ },
185
+ "nbformat": 4,
186
+ "nbformat_minor": 5
187
+ }
9a_make_top_20_patients_llama_check_list_round2.ipynb ADDED
@@ -0,0 +1,232 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "07d99d54-84c8-4531-8951-133dfaf64c1e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import os\n",
13
+ "os.environ['CUDA_VISIBLE_DEVICES'] = '1'\n",
14
+ "from sentence_transformers import SentenceTransformer, InputExample, losses\n",
15
+ "from torch.utils.data import DataLoader\n",
16
+ "import torch.nn.functional as F\n",
17
+ "import torch\n",
18
+ "from sklearn.metrics import roc_auc_score"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "b7a22e15-81dd-4ab4-b7df-1d8da8992685",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": []
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "id": "3553d4f3-c23c-4d21-94ad-da2bcd31a63d",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "test_spaces = pd.read_csv('test_spaces.csv')\n",
37
+ "test_spaces.info()"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "e4757eb7-4a69-4578-9020-075e960817ce",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "import pandas as pd\n",
48
+ "dfci_trials = pd.read_csv(\"../v7/space_specific_eligibility_checks_11-6-24.csv\")\n",
49
+ "# this dfci_trials file is not provided publicly, since it contains PHI, so this cell and dependent cells will not run out of the box\n",
50
+ "\n",
51
+ "other_trials = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')\n",
52
+ "other_trials = other_trials[~other_trials.nct_id.isin(test_spaces.nct_id)]\n",
53
+ "other_trials = other_trials[~other_trials.nct_id.isin(dfci_trials.nct_id)]\n",
54
+ "\n",
55
+ "unique_trials = other_trials.groupby('nct_id').first().reset_index()[['nct_id', 'this_space']]\n",
56
+ "unique_trials.shape[0]\n",
57
+ "\n",
58
+ "unique_trial_sample = unique_trials.nct_id.sample(n=500, random_state=42)\n",
59
+ "\n",
60
+ "valid_spaces = unique_trials[unique_trials.nct_id.isin(unique_trial_sample)]\n",
61
+ "\n",
62
+ "valid_spaces.to_csv('valid_spaces.csv')\n",
63
+ "\n"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": null,
69
+ "id": "8d6b5654-2f5f-4caa-82ff-b02a9007dc49",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "train_spaces = unique_trials[~unique_trials.nct_id.isin(valid_spaces.nct_id)]"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "id": "01e0eb5b-9282-48b8-9917-3147fbf25730",
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "train_spaces.nct_id.isin(valid_spaces.nct_id).value_counts()"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "code",
88
+ "execution_count": null,
89
+ "id": "ed9139ac-a5f2-4791-a3a4-fd2127b7af0c",
90
+ "metadata": {},
91
+ "outputs": [],
92
+ "source": [
93
+ "train_spaces.info()"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "id": "cfa78d0d-3de1-48f7-a01e-f7d81a6a5b3f",
100
+ "metadata": {},
101
+ "outputs": [],
102
+ "source": [
103
+ "patients = pd.read_parquet('synthetic_pt_summaries_11-23-24.parquet')\n",
104
+ "patients = patients[patients.split == 'train'][['patient_summary','split']]"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "id": "fcba54bc-aba3-4d02-b874-c8275badf015",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "patients.info()"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": null,
120
+ "id": "399e05a5-7a58-4362-a002-0bce62a348ac",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "train_unique_patient_summaries = patients.patient_summary.unique().tolist()\n",
125
+ "print(len(train_unique_patient_summaries))\n",
126
+ "train_unique_spaces = train_spaces.this_space.unique().tolist()\n",
127
+ "print(len(train_unique_spaces))"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "id": "c37f1137-41cf-4a6d-8099-7bed5838a1ee",
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": []
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "06e7e0f2-5382-4507-908e-5ccf92b4beae",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "embedding_model = SentenceTransformer('reranker_round1.model', trust_remote_code=True, device='cuda')"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "id": "77659362-cf51-4906-9e12-5d1e55440b25",
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "with torch.no_grad():\n",
156
+ " train_unique_patient_embeddings = embedding_model.encode(train_unique_patient_summaries, convert_to_tensor=True, prompt_name = \"s2s_query\")"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "a0bb8926-3fa8-49bc-b507-6cc36d998600",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "with torch.no_grad():\n",
167
+ " train_unique_space_embeddings = embedding_model.encode(train_unique_spaces, convert_to_tensor=True, prompt_name = \"s2s_query\")"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "id": "bf259f29-4934-41e8-9b3a-87f09d0ef52e",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "output_list = []\n",
178
+ "train_unique_patient_series = pd.Series(train_unique_patient_summaries)\n",
179
+ "for i, space_summary in enumerate(train_unique_spaces):\n",
180
+ " space_embedding = train_unique_space_embeddings[i, :]\n",
181
+ " similarities = F.cosine_similarity(space_embedding, train_unique_patient_embeddings)\n",
182
+ " sorted_similarities, sorted_indices = torch.sort(similarities, descending=True)\n",
183
+ " relevant_patients = train_unique_patient_series.iloc[sorted_indices[0:20].cpu().numpy()]\n",
184
+ " output = pd.DataFrame({'space_summary':space_summary, 'this_patient':relevant_patients})\n",
185
+ " output_list.append(output)\n",
186
+ "\n",
187
+ "train_output = pd.concat(output_list, axis=0).reset_index(drop=True)\n",
188
+ "train_output['space_summary'] = train_output.space_summary.str.strip()\n",
189
+ "train_output['split'] = 'train'\n"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "52c926cc-d481-4c43-8288-7d1620c1f06f",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "train_output.to_csv('top_twenty_patients_tocheck_synthetic_round2.csv')"
200
+ ]
201
+ },
202
+ {
203
+ "cell_type": "code",
204
+ "execution_count": null,
205
+ "id": "0e0e18c1-bfb6-4c20-9aa4-257f0fb0424c",
206
+ "metadata": {},
207
+ "outputs": [],
208
+ "source": []
209
+ }
210
+ ],
211
+ "metadata": {
212
+ "kernelspec": {
213
+ "display_name": "Python 3 (ipykernel)",
214
+ "language": "python",
215
+ "name": "python3"
216
+ },
217
+ "language_info": {
218
+ "codemirror_mode": {
219
+ "name": "ipython",
220
+ "version": 3
221
+ },
222
+ "file_extension": ".py",
223
+ "mimetype": "text/x-python",
224
+ "name": "python",
225
+ "nbconvert_exporter": "python",
226
+ "pygments_lexer": "ipython3",
227
+ "version": "3.9.18"
228
+ }
229
+ },
230
+ "nbformat": 4,
231
+ "nbformat_minor": 5
232
+ }
9b_check_top20_patients_synthetic_round2.ipynb ADDED
@@ -0,0 +1,191 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "a3d6ff53-2176-44aa-8590-ec0aa301342d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from vllm import LLM, SamplingParams\n",
11
+ "import pandas as pd\n",
12
+ "import numpy as np\n",
13
+ "import torch.nn.functional as F\n",
14
+ "import torch\n",
15
+ "from transformers import AutoTokenizer\n",
16
+ "from transformers import AutoModelForCausalLM\n",
17
+ "import re\n",
18
+ "import os\n",
19
+ "#os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3'\n",
20
+ "\n"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": null,
26
+ "id": "62669512-19e7-43cd-a518-4572eea700af",
27
+ "metadata": {
28
+ "scrolled": true
29
+ },
30
+ "outputs": [],
31
+ "source": [
32
+ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 4, \n",
33
+ " gpu_memory_utilization=0.20,\n",
34
+ " download_dir = \"../../\", max_model_len=5000)"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "id": "16bca2af-0cf4-41f2-ae28-d2c669a1af21",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "def ask_about_trials_loosely(patient_summaries, trial_summaries, llama_model):\n",
45
+ "\n",
46
+ " tokenizer = llama_model.get_tokenizer()\n",
47
+ "\n",
48
+ " prompts = []\n",
49
+ "\n",
50
+ " for patient_summary, trial_summary in zip(patient_summaries, trial_summaries):\n",
51
+ " messages = [{'role':'system', 'content': \"\"\"You are a brilliant oncologist with encyclopedic knowledge about cancer and its treatment. \n",
52
+ " Your job is to evaluate whether a given clinical trial is a reasonable consideration for a patient, given a clinical trial summary and a patient summary.\\n\"\"\"}, \n",
53
+ " {'role':'user', 'content': \"Here is a summary of the clinical trial:\\n\" + trial_summary + \"\\nHere is a summary of the patient:\\n\" + patient_summary + \"\"\"\n",
54
+ "Base your judgment on whether the patient generally fits the cancer type(s), cancer burden, prior treatment(s), and biomarker criteria specified for the trial.\n",
55
+ "You do not have to determine if the patient is actually eligible; instead please just evaluate whether it is reasonable for the trial to be considered further by the patient's oncologist.\n",
56
+ "Some trials have biomarker requirements that are not assessed until formal eligibility screening begins; please ignore these requirements.\n",
57
+ "Reason step by step, then answer the question \"Is this trial a reasonable consideration for this patient?\" with a one-word \"Yes!\" or \"No!\" answer.\n",
58
+ "Make sure to include the exclamation point in your final one-word answer.\"\"\"}]\n",
59
+ "\n",
60
+ " \n",
61
+ " prompt = tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False)\n",
62
+ " prompts.append(prompt)\n",
63
+ " \n",
64
+ " responses = llama_model.generate(\n",
65
+ " prompts, \n",
66
+ " SamplingParams(\n",
67
+ " temperature=0.0,\n",
68
+ " top_p=0.2,\n",
69
+ " max_tokens=2048,\n",
70
+ " repetition_penalty=1.2,\n",
71
+ " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n",
72
+ " ))\n",
73
+ "\n",
74
+ " response_texts = [x.outputs[0].text for x in responses]\n",
75
+ "\n",
76
+ " eligibility_results = []\n",
77
+ "\n",
78
+ " for response_text in response_texts:\n",
79
+ " if (\"Yes!\" in response_text) or (\"YES!\" in response_text):\n",
80
+ " eligibility_results.append(1.0)\n",
81
+ " else:\n",
82
+ " eligibility_results.append(0.0)\n",
83
+ " \n",
84
+ " return responses, response_texts, eligibility_results\n",
85
+ " \n",
86
+ "\n",
87
+ " \n",
88
+ "\n"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "id": "2ce4cce6-3833-451a-98c6-d7f4c7b948c6",
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "patient_cohort_candidates = pd.read_csv('top_twenty_patients_tocheck_synthetic_round2.csv')"
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": null,
104
+ "id": "dbb846d2-20e3-4361-b69b-82aa31c1f789",
105
+ "metadata": {},
106
+ "outputs": [],
107
+ "source": [
108
+ "patient_cohort_candidates = patient_cohort_candidates.rename(columns={'this_patient':'patient_summary', 'space_summary':'this_space'})"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": null,
114
+ "id": "1fb1aa2f-6c28-4a4e-a4e1-bfd69d0b39a1",
115
+ "metadata": {},
116
+ "outputs": [],
117
+ "source": [
118
+ "patient_cohort_candidates.info()"
119
+ ]
120
+ },
121
+ {
122
+ "cell_type": "code",
123
+ "execution_count": null,
124
+ "id": "d30bf018-e135-40be-b636-0ba17acf8e61",
125
+ "metadata": {},
126
+ "outputs": [],
127
+ "source": [
128
+ "%%capture\n",
129
+ "output_list = []\n",
130
+ "batch_list = []\n",
131
+ "\n",
132
+ "num_in_batch = 0\n",
133
+ "\n",
134
+ "for i in range(0, patient_cohort_candidates.shape[0]):\n",
135
+ " \n",
136
+ " batch_list.append(patient_cohort_candidates.iloc[[i]])\n",
137
+ " num_in_batch += 1\n",
138
+ " \n",
139
+ " if (num_in_batch == 500) or (i == (patient_cohort_candidates.shape[0] - 1)):\n",
140
+ "\n",
141
+ " output = pd.concat(batch_list, axis=0)\n",
142
+ " _, output['llama_response'], output['eligibility_result'] = ask_about_trials_loosely(output['patient_summary'].tolist(), output['this_space'].astype(str).tolist(), llama)\n",
143
+ "\n",
144
+ " output_list.append(output)\n",
145
+ " num_in_batch = 0\n",
146
+ " batch_list = []\n",
147
+ " \n",
148
+ " if (len(output_list) > 0 and (i % 500 == 0)) or (i == (patient_cohort_candidates.shape[0] - 1)):\n",
149
+ " output_file = pd.concat(output_list, axis=0)\n",
150
+ " output_file.to_csv('top_twenty_patients_checked_synthetic_round2.csv')\n"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "id": "91534c0e-4873-4eda-9a69-53660a84b4df",
157
+ "metadata": {},
158
+ "outputs": [],
159
+ "source": []
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "id": "eaebffcc-4b62-4ab6-a077-69a6e4340773",
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": []
168
+ }
169
+ ],
170
+ "metadata": {
171
+ "kernelspec": {
172
+ "display_name": "Python 3 (ipykernel)",
173
+ "language": "python",
174
+ "name": "python3"
175
+ },
176
+ "language_info": {
177
+ "codemirror_mode": {
178
+ "name": "ipython",
179
+ "version": 3
180
+ },
181
+ "file_extension": ".py",
182
+ "mimetype": "text/x-python",
183
+ "name": "python",
184
+ "nbconvert_exporter": "python",
185
+ "pygments_lexer": "ipython3",
186
+ "version": "3.9.18"
187
+ }
188
+ },
189
+ "nbformat": 4,
190
+ "nbformat_minor": 5
191
+ }