# DAEDRA: Determining Adverse Event Disposition for Regulatory Affairs

DAEDRA is a language model intended to predict the disposition (outcome) of an adverse event based on the text of the event report. Intended to be used to classify reports in passive reporting systems, it is trained on the [VAERS](https://vaers.hhs.gov/) dataset, which contains reports of adverse events following vaccination in the United States.

In [1]:
# %pip install accelerate -U

In [1]:
%pip install transformers datasets shap watermark wandb scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import torch
import os
from typing import List
from sklearn.metrics import f1_score, accuracy_score, classification_report
from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments, pipeline
from datasets import load_dataset, Dataset, DatasetDict
from pyarrow import Table
import shap
import wandb
from skmultilearn.problem_transform import LabelPowerset

os.environ["TOKENIZERS_PARALLELISM"] = "false"

%load_ext watermark

  from .autonotebook import tqdm as notebook_tqdm
2024-01-28 15:09:42.856486: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-28 15:09:43.818179: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-01-28 15:09:43.818307: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device: str = 'cuda' if torch.cuda.is_available() else 'cpu'

# Random seed constant for the notebook.
SEED: int = 42

# Per-device batch size and number of epochs handed to TrainingArguments;
# BATCH_SIZE is also the batch size of the evaluation DataLoader.
BATCH_SIZE: int = 16
EPOCHS: int = 3
# Hugging Face checkpoint from which both the tokenizer and the model are loaded.
model_ckpt: str = "distilbert-base-uncased"

# Outcome labels the model predicts. The list length sets the size of the
# classification head, and the order is used to map label indices back to names.
CLASS_NAMES: List[str] = ["DIED",
                          "ER_VISIT",
                          "HOSPITAL",
                          "OFC_VISIT",
                          #"X_STAY",      # pruned
                          #"DISABLE",     # pruned
                          #"D_PRESENTED"  # pruned
                          ]




# WandB configuration
os.environ["WANDB_PROJECT"] = "DAEDRA model training"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints
os.environ["WANDB_NOTEBOOK_NAME"] = "DAEDRA.ipynb"

In [4]:
%watermark --iversion

shap   : 0.44.1
logging: 0.5.1.2
pandas : 2.0.2
numpy  : 1.23.5
torch  : 1.12.0
wandb  : 0.16.2
re     : 2.2.1



In [5]:
!nvidia-smi

Sun Jan 28 15:09:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-PCIE-16GB           Off | 00000001:00:00.0 Off |                  Off |
| N/A   30C    P0              38W / 250W |      4MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE-16GB           Off | 00000002:00:0

## Loading the data set

In [7]:
# Load the VAERS outcomes dataset (train / test / val splits) from the Hugging Face Hub.
dataset = load_dataset("chrisvoncsefalvay/vaers-outcomes")

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 1270444
    })
    test: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 272238
    })
    val: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 272238
    })
})

In [9]:
# Fraction of every split that is kept for this run; the W&B tagging cells
# treat a value of 1.0 as "full sample".
SUBSAMPLING: float = 0.1

In [10]:
def minisample(ds: DatasetDict, fraction: float, seed: "int | None" = None) -> DatasetDict:
    """Return a random subsample of every split in *ds*.

    Each split is shuffled and its first ``round(len(split) * fraction)`` rows
    are kept, so the relative sizes of the splits are preserved.

    Args:
        ds: The dataset dictionary to subsample.
        fraction: Share of each split to keep. Values above 1.0 keep the
            whole split.
        seed: Seed for the shuffle. When ``None``, the notebook-wide ``SEED``
            is used so that repeated runs draw the same subsample.

    Returns:
        A new ``DatasetDict`` with the same split names as *ds*.

    Raises:
        ValueError: If *fraction* is negative.
    """
    if fraction < 0:
        raise ValueError(f"fraction must be non-negative, got {fraction!r}")

    if seed is None:
        seed = SEED

    res = DatasetDict()

    for name, split in ds.items():
        # Clamp so that fractions above 1.0 select the whole split rather
        # than indexing past its end.
        keep = min(len(split), round(len(split) * fraction))
        # select() reuses the underlying Arrow data through an index mapping
        # instead of materialising the slice as a Python dict.
        res[name] = split.shuffle(seed=seed).select(range(keep))

    return res

In [11]:
# Replace the full dataset with the subsample; all later cells work on this reduced copy.
dataset = minisample(dataset, SUBSAMPLING)

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 127044
    })
    test: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 27224
    })
    val: Dataset({
        features: ['id', 'text', 'labels'],
        num_rows: 27224
    })
})

We prune things down to the first four keys: `DIED`, `ER_VISIT`, `HOSPITAL`, `OFC_VISIT`.

In [13]:
# Keep only the leading label columns that correspond to CLASS_NAMES, so the
# label width always matches the size of the classification head.
n_labels = len(CLASS_NAMES)

ds = DatasetDict()

for split in ["test", "train", "val"]:
    part = dataset[split]
    # Rebuild the split as a fresh Arrow table with the truncated label rows.
    tab = Table.from_arrays(
        [part["id"], part["text"], [row[:n_labels] for row in part["labels"]]],
        names=["id", "text", "labels"],
    )
    ds[split] = Dataset(tab)

dataset = ds

### Tokenisation and encoding

In [14]:
# Tokenizer matching the checkpoint that the model is later loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [15]:
def tokenize_and_encode(examples):
    """Tokenise the ``text`` field of *examples* with truncation enabled.

    Returns whatever the module-level ``tokenizer`` produces for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [16]:
# Every column except "labels" is dropped after tokenisation, leaving the
# tokenizer outputs plus the labels in the encoded dataset.
cols = dataset["train"].column_names
cols.remove("labels")
# batched=True hands tokenize_and_encode a batch of examples per call.
ds_enc = dataset.map(tokenize_and_encode, batched=True, remove_columns=cols)

Map: 100%|██████████| 27224/27224 [00:10<00:00, 2638.52 examples/s]
Map: 100%|██████████| 127044/127044 [00:48<00:00, 2633.40 examples/s]
Map: 100%|██████████| 27224/27224 [00:10<00:00, 2613.19 examples/s]


### Training

In [17]:
class MultiLabelTrainer(Trainer):
    """Trainer that scores each label independently with binary cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the multi-label loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch dictionary; its ``"labels"`` entry is removed before
                the remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted because newer ``Trainer`` versions
                pass it as a keyword; unused here, since the loss is a plain
                mean over all label entries.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Read the label count from the trainer's own model rather than the
        # ``model`` argument.
        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [18]:
# One output logit per entry in CLASS_NAMES. Place the model on the device
# selected at the top of the notebook instead of assuming CUDA is present.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=len(CLASS_NAMES)).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
def accuracy_threshold(y_pred, y_true, threshold=.5, sigmoid=True):
    """Return the fraction of label entries predicted correctly.

    Args:
        y_pred: NumPy array of scores (treated as logits when *sigmoid* is true).
        y_true: NumPy array of ground-truth labels, same shape as *y_pred*.
        threshold: A score strictly above this value counts as a positive.
        sigmoid: Whether to pass the scores through a sigmoid first.

    Returns:
        The mean element-wise agreement as a Python float.
    """
    scores = torch.from_numpy(y_pred)
    targets = torch.from_numpy(y_true).bool()

    if sigmoid:
        scores = torch.sigmoid(scores)

    agreement = (scores > threshold) == targets
    return agreement.float().mean().item()

In [20]:
def compute_metrics(eval_pred):
    """Turn a ``(predictions, labels)`` pair into the metrics dict for the trainer.

    The only metric reported is the thresholded accuracy, computed by
    ``accuracy_threshold`` with its default settings.
    """
    raw_scores, targets = eval_pred
    score = accuracy_threshold(raw_scores, targets)
    return {'accuracy_thresh': score}

In [21]:
# Training configuration. Evaluation runs once per epoch, and metrics are
# reported to Weights & Biases.
args = TrainingArguments(
    output_dir="vaers",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=.01,
    logging_steps=1,  # log at every optimisation step
    seed=SEED,  # tie the trainer's RNG to the notebook-wide seed
    run_name="daedra-training",
    report_to=["wandb"]
)

In [22]:
# Build the custom trainer. Note the split roles: the "test" split is used for
# evaluation during training, while the "val" split is held back for the
# manual evaluation at the end of the notebook.
multi_label_trainer = MultiLabelTrainer(
    model, 
    args, 
    train_dataset=ds_enc["train"], 
    eval_dataset=ds_enc["test"], 
    compute_metrics=compute_metrics, 
    # tokenize_and_encode does not pad; presumably the tokenizer is passed so
    # the trainer can pad each batch itself -- TODO confirm.
    tokenizer=tokenizer
)

In [23]:
# Tag the run with the subsampling fraction so that runs on reduced data can
# be told apart from full-sample runs.
wandb_tag: List[str] = (
    [f"subsample-{SUBSAMPLING}"] if SUBSAMPLING != 1.0 else ["full_sample"]
)

wandb.init(name="init_evaluation_run", tags=wandb_tag, magic=True)

# Baseline evaluation of the untrained classification head. Close the W&B run
# even when evaluation fails or is interrupted, so no run is left open in the
# kernel; a failure is recorded with a non-zero exit code.
try:
    multi_label_trainer.evaluate()
except BaseException:
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mchrisvoncsefalvay[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
eval/accuracy_thresh,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/global_step,▁

0,1
eval/accuracy_thresh,0.55198
eval/loss,0.68442
eval/runtime,105.0436
eval/samples_per_second,259.168
eval/steps_per_second,8.101
train/global_step,0.0


In [None]:
# Tag the run with the subsampling fraction so that runs on reduced data can
# be told apart from full-sample runs.
wandb_tag: List[str] = (
    [f"subsample-{SUBSAMPLING}"] if SUBSAMPLING != 1.0 else ["full_sample"]
)

wandb.init(name="daedra_training_run", tags=wandb_tag, magic=True)

# Fine-tune the model. Close the W&B run even when training fails or is
# interrupted, so no run is left open in the kernel; a failure is recorded
# with a non-zero exit code.
try:
    multi_label_trainer.train()
except BaseException:
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

Epoch,Training Loss,Validation Loss


[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-500)... Done. 15.6s
[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-1000)... Done. 22.7s
[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-1500)... Done. 14.0s
[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-2000)... Done. 15.2s
[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-2500)... Done. 14.0s
[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-3000)... Done. 12.4s
[34m[1mwandb[0m: Adding directory to artifact (./vaers/checkpoint-3500)... Done. 13.4s


### Evaluation

We instantiate a classifier `pipeline` and push it to CUDA.

In [None]:
# Wrap the fine-tuned model in a text-classification pipeline on the device
# chosen at the top of the notebook (CUDA when available, otherwise CPU).
classifier = pipeline("text-classification",
                      model,
                      tokenizer=tokenizer,
                      device=device)

We use the same tokenizer used for training to tokenize/encode the validation set.

In [None]:
# Despite the `test_` prefix, this encodes the "val" split.
# Every text is padded/truncated to one common length so the encodings can be
# turned into rectangular tensors in the next cell.
test_encodings = tokenizer.batch_encode_plus(dataset["val"]["text"], 
                                             max_length=None, 
                                             padding='max_length', 
                                             # requested because the next cell builds a tensor from them
                                             return_token_type_ids=True, 
                                             truncation=True)

We wrap the encodings and labels in a `TensorDataset` and put it into a `DataLoader`, so that we can then run the model over the validation set in batches.

In [None]:
# Bundle the encoded "val" split into one dataset. Each item is the tuple
# (input_ids, attention_mask, labels, token_type_ids); the inference loop
# unpacks batches in this order.
test_data = torch.utils.data.TensorDataset(torch.tensor(test_encodings['input_ids']), 
                                           torch.tensor(test_encodings['attention_mask']), 
                                           torch.tensor(ds_enc["val"]["labels"]), 
                                           torch.tensor(test_encodings['token_type_ids']))
# Sequential sampling keeps the predictions in the same order as the dataset.
test_dataloader = torch.utils.data.DataLoader(test_data, 
                                              sampler=torch.utils.data.SequentialSampler(test_data), 
                                              batch_size=BATCH_SIZE)

In [None]:
# Switch to inference mode (disables dropout).
model.eval()

logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

for batch in test_dataloader:
    # Batch layout follows the TensorDataset: ids, attention mask, labels,
    # token type ids (the latter are not fed to the model).
    b_input_ids, b_input_mask, b_labels, _ = batch

    # Only the model inputs go to the device; ids and labels stay on the CPU
    # so the collected results do not pin GPU memory.
    with torch.no_grad():
        outs = model(b_input_ids.to(device),
                     attention_mask=b_input_mask.to(device))

    b_logit_pred = outs[0].detach().cpu()

    # logit_preds keeps one array per batch; the other collections are
    # flattened to one entry per example by extending row by row.
    logit_preds.append(b_logit_pred.numpy())
    pred_labels.extend(torch.sigmoid(b_logit_pred).numpy())
    true_labels.extend(b_labels.numpy())
    tokenized_texts.extend(b_input_ids)

# Converting per-example label rows to boolean arrays
true_bools = [tl == 1 for tl in true_labels]
pred_bools = [pl > 0.50 for pl in pred_labels]

We create a classification report:

In [None]:
# true_bools / pred_bools hold one boolean row per example, so scikit-learn
# scores them as multi-label targets. These numbers are computed on the "val"
# split, even though the printed labels say "Test".
# Micro-averaged F1 over all label entries.
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools, average='micro'))
# accuracy_score is given whole label rows here -- presumably exact-match
# (subset) accuracy; confirm against the scikit-learn docs.
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools), '\n')
# Per-label precision / recall / F1, with rows named after CLASS_NAMES.
clf_report = classification_report(true_bools, pred_bools, target_names=CLASS_NAMES)
print(clf_report)

Finally, we render a 'head to head' comparison table that maps each text prediction to actual and predicted labels.

In [None]:
# Lookup table from label position to label name, in CLASS_NAMES order
idx2label = dict(enumerate(CLASS_NAMES))

In [None]:
# For every example, list the positions of the labels that are switched on.
true_label_idxs = [np.where(row)[0].flatten().tolist() for row in true_bools]
pred_label_idxs = [np.where(row)[0].flatten().tolist() for row in pred_bools]

In [None]:
# Translate label positions into label names. An example with no active
# labels maps to a fresh empty list, so the text lists never share list
# objects with the index lists.
true_label_texts = [[idx2label[idx] for idx in idxs] for idxs in true_label_idxs]
pred_label_texts = [[idx2label[idx] for idx in idxs] for idxs in pred_label_idxs]

In [None]:
# Turn the stored token ids back into text, dropping special tokens and
# leaving tokenisation spacing untouched.
decode_kwargs = {"skip_special_tokens": True,
                 "clean_up_tokenization_spaces": False}
symptom_texts = [tokenizer.decode(token_ids, **decode_kwargs)
                 for token_ids in tokenized_texts]

In [None]:
# One row per evaluated report: the decoded text next to its true and
# predicted label names.
comparisons_df = pd.DataFrame({'symptom_text': symptom_texts, 
                               'true_labels': true_label_texts, 
                               'pred_labels':pred_label_texts})
# Persist the table to the working directory, then display it in the notebook.
comparisons_df.to_csv('comparisons.csv')
comparisons_df