{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "ee78bb6d-4e3c-4751-b042-12c358d89cac", "metadata": { "scrolled": true }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import json\n", "from vllm import LLM, SamplingParams\n", "from transformers import AutoTokenizer\n", "import torch\n", "import os\n", "#os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e8eeb339-6aca-4d3f-96fb-24a1caf26b34", "metadata": { "scrolled": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "7129a989-04e9-475d-9260-d1fdb1ab7faa", "metadata": {}, "outputs": [], "source": [ "llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, \n", " gpu_memory_utilization = 0.5,\n", " download_dir = \"../../..\", max_model_len=6000)" ] }, { "cell_type": "code", "execution_count": null, "id": "d9d7d1c4-50ed-4614-9855-8e6cc86bbb0e", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "73897bb9-0738-4446-b332-9b9bf46ad043", "metadata": {}, "outputs": [], "source": [ "def summarize_trials_multi_cohort(eligibility_texts, llama_model):\n", "\n", " tokenizer = llama.get_tokenizer()\n", " prompts = []\n", " for trial in eligibility_texts:\n", " messages = [\n", " {'role':'system', 'content': \"\"\"You are an expert clinical oncologist with an encyclopedic knowledge of cancer and its treatments.\n", " Your job is to review a clinical trial document and extract a list of structured clinical spaces that are eligible for that trial.\n", " A clinical space is defined as a unique combination of cancer primary site, histology, which treatments a patient must have received, which treatments a patient must not have received, cancer burden (eg presence of metastatic disease), and tumor biomarkers (such as germline or somatic gene mutations or alterations, or protein expression on tumor) that a patient must have or must not have; that renders a patient eligible for the trial.\n", " Trials often specify that a particular treatment is excluded only if it was given within a short period of time, for example 14 days, one month, etc , prior to trial start. Do not include this type of time-specific treatment eligibility criteria in your output at all.\n", " Some trials have only one space, while others have several. Do not output a space that contains multiple cancer types and/or histologies. Instead, generate separate spaces for each cancer type/histology combination.\n", " For biomarkers, if the trial specifies whether the biomarker will be assessed during screening, note that.\n", " Spell out cancer types; do not abbreviate them. For example, write \"non-small cell lung cancer\" rather than \"NSCLC\".\n", " Structure your output like this, as a list of spaces, with spaces separated by newlines, as below:\n", " 1. Cancer type allowed: . Histology allowed: . Cancer burden allowed: . Prior treatment required: . Prior treatment excluded: . Biomarkers required: . Biomarkers excluded: .\n", " 2. Cancer type allowed: , etc.\n", " If a particular concept is not mentioned in the trial text, do not include it in your definition of trial space(s).\n", " \"\"\"}, \n", " \n", " {'role':'user', 'content': \"Here is a clinical trial document: \\n\" + trial + \"\\n\" + \"\"\"Now, generate your list of the trial space(s), formatted as above.\n", " Do not provide any introductory, explanatory, concluding, or disclaimer text.\n", " Reminder: Treatment history is an important component of trial space definitions, but treatment history requirements that are described as applying only in a given period of time prior to trial treatment MUST BE IGNORED.\"\"\"\n", " }\n", " ]\n", " \n", " prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))\n", " \n", "\n", " \n", " responses = llama_model.generate(\n", " prompts, \n", " SamplingParams(\n", " temperature=0.0,\n", " top_p=0.9,\n", " max_tokens=3096,\n", " stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(\"<|eot_id|>\")], # KEYPOINT HERE\n", " ))\n", "\n", " response_texts = [x.outputs[0].text for x in responses]\n", "\n", "\n", " return responses, response_texts" ] }, { "cell_type": "code", "execution_count": null, "id": "ca683840-842b-4346-8eef-b66bc52d26af", "metadata": {}, "outputs": [], "source": [ "trials = pd.read_csv('./ctgov_cancer_trials.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "aa51de7d-74e0-4822-b7e1-2c9a3bc31260", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "4816dbf0-bd92-4742-912a-477e545e330b", "metadata": {}, "outputs": [], "source": [ "trial_cohorts = summarize_trials_multi_cohort(trials.trial_text.tolist(), llama)" ] }, { "cell_type": "code", "execution_count": null, "id": "8283c587-c909-4548-804d-4d88b4ed7255", "metadata": {}, "outputs": [], "source": [ "trials['spaces'] = trial_cohorts[1]" ] }, { "cell_type": "code", "execution_count": null, "id": "2ca75bab-7273-4ab0-86cd-1e0373546fce", "metadata": {}, "outputs": [], "source": [ "trials.to_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "0291913f-f3b9-4b39-99ab-954cb7237255", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "16563812-6967-4788-a123-0af5fd701ede", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "95776dbe-1a25-44bd-90f8-5c1573b6e92a", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "output = pd.read_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "cf647a1f-5a8c-4958-9032-440806a306d5", "metadata": {}, "outputs": [], "source": [ "# example of a trial and extracted spaces\n", "i = 1000\n", "output.trial_text.iloc[i], output.spaces.iloc[i]" ] }, { "cell_type": "code", "execution_count": null, "id": "9cc06840-5647-4524-a7bf-a1ad53a07b7c", "metadata": {}, "outputs": [], "source": [ "frames = []\n", "for i in range(trials.shape[0]):\n", " cohorts = pd.Series(trials.iloc[i].spaces.split(\"\\n\"))\n", " cohorts = cohorts[~((cohorts.isnull()) | (cohorts == \"\\n\") | (cohorts == ''))].reset_index(drop=True)\n", " frame = pd.DataFrame(np.repeat(trials.iloc[[i]], len(cohorts), axis=0), columns=trials.columns)\n", " frame['this_space'] = cohorts\n", " frame['space_number'] = frame.index\n", " frames.append(frame)\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "541669eb-f92e-49f3-9a36-b6625448c1a4", "metadata": {}, "outputs": [], "source": [ "cohort_level_trials = pd.concat(frames, axis=0)" ] }, { "cell_type": "code", "execution_count": null, "id": "51a04e84-7483-4398-b4a0-d0cdab790609", "metadata": {}, "outputs": [], "source": [ "cohort_level_trials.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "648f0e1e-ef81-4983-8f03-1fbdb138f649", "metadata": {}, "outputs": [], "source": [ "cohort_level_trials.this_space.str[0].isin(['1','2','3','4','5','6','7','8','9']).value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "9ea048c1-c4ef-4202-a9be-a4658c4f1058", "metadata": {}, "outputs": [], "source": [ "cohort_level_trials = cohort_level_trials[cohort_level_trials.this_space.str[0].isin(['1','2','3','4','5','6','7','8','9'])]" ] }, { "cell_type": "code", "execution_count": null, "id": "852aee9d-ad97-4374-932f-6cae378dde2a", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "00d2220a-627a-4b67-be28-c42561c3c964", "metadata": {}, "outputs": [], "source": [ "cohort_level_trials.to_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "a130e909-6629-4408-b1ad-201b319d5e0f", "metadata": {}, "outputs": [], "source": [ "temp = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "ad078444-33e1-4398-92b8-2e7f9f1a4031", "metadata": {}, "outputs": [], "source": [ "temp.this_space.nunique()" ] }, { "cell_type": "code", "execution_count": null, "id": "be264ecb-12e7-4fd4-a16b-5a4b2f44d2aa", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "out = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "d38ca13b-f4c4-47f1-abd6-3289abbd5f64", "metadata": {}, "outputs": [], "source": [ "out.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "6849b44d-df0d-464f-bbce-f8fc1f789d3a", "metadata": {}, "outputs": [], "source": [ "# this component and following cells will not run without access to the DFCI private dataset\n", "\n", "import pandas as pd\n", "dfci_trials = pd.read_csv(\"../space_specific_eligibility_checks_11-6-24.csv\")\n", "dfci_trials.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "869690c3-2a80-4403-8933-f8f042c4ae35", "metadata": {}, "outputs": [], "source": [ "non_dfci_ctgov_trials = out[~out.nct_id.isin(dfci_trials.nct_id)]" ] }, { "cell_type": "code", "execution_count": null, "id": "3d28ed5c-d152-40a0-ab14-4aa748f3f8ee", "metadata": {}, "outputs": [], "source": [ "non_dfci_ctgov_trials.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "efdaaf5b-edfb-4900-b85b-dde7eb1f92df", "metadata": {}, "outputs": [], "source": [ "unique_trials = non_dfci_ctgov_trials.groupby('nct_id').first().reset_index()[['nct_id']]\n", "unique_trials.shape[0]" ] }, { "cell_type": "code", "execution_count": null, "id": "41a63a73-4822-4c1d-820d-389252c0c56f", "metadata": {}, "outputs": [], "source": [ "unique_trial_sample = unique_trials.nct_id.sample(n=500, random_state=42)" ] }, { "cell_type": "code", "execution_count": null, "id": "4cbbffe7-72ca-45b4-a11f-bb2d278bcfb7", "metadata": {}, "outputs": [], "source": [ "sample_spaces = non_dfci_ctgov_trials[non_dfci_ctgov_trials.nct_id.isin(unique_trial_sample)]" ] }, { "cell_type": "code", "execution_count": null, "id": "c2639cc4-3472-463c-8519-ce0a9a1d845c", "metadata": {}, "outputs": [], "source": [ "sample_spaces.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "bc6def48-cacc-437b-ac19-2af9418821c2", "metadata": {}, "outputs": [], "source": [ "sample_spaces.to_csv('sample_spaces.csv')" ] }, { "cell_type": "code", "execution_count": null, "id": "2b2370cf-e2ec-4e54-8dd0-6dde6d0fb041", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.18" } }, "nbformat": 4, "nbformat_minor": 5 }