# Dataset processing

This notebook processes the raw CSV exports from VAERS into Hugging Face datasets. End users should not generally need to run it.

In [15]:
import pandas as pd
import datasets
import glob
import tqdm.notebook as tqdm
from sklearn.model_selection import train_test_split
from typing import Tuple
from datetime import datetime

# Opt in to pandas' future behaviour, in which `replace`/`fillna` no longer
# silently downcast object columns (avoids the related FutureWarning in `recode`).
pd.set_option('future.no_silent_downcasting', True)

In [16]:
# Hugging Face Hub repository that the finished DatasetDict is pushed to.
HF_URL: str = "chrisvoncsefalvay/vaers-outcomes"

# Outcome flag columns. They are recoded to 0/1 and, in this order, form the
# first entries of every label vector.
FLAG_COLUMNS: list = ["DIED", "ER_VISIT", "HOSPITAL", "OFC_VISIT", "X_STAY", "DISABLE"]
# Demographic columns; kept for stratifying the split, not exported to the dataset.
DEMOGRAPHIC_COLUMNS: list = ["AGE_YRS", "SEX"]
# Columns computed in this notebook; appended after FLAG_COLUMNS in the label vector.
DERIVED_COLUMNS: list = ["D_PRESENTED"]
# Record identifier column (exported as "id").
ID_COLUMNS: list = ["VAERS_ID"]
# Free-text column (exported as "text").
TEXT_COLUMNS: list = ["SYMPTOM_TEXT"]

# Share of all records held OUT of the training set (test and validation combined).
TEST_TRAIN_FRACTION: float = 0.3
# Share of the held-out records that goes to validation; the remainder is the test set.
TRAIN_VAL_FRACTION: float = 0.5

## Reading data files

In [17]:
def read_aggregate(pattern: str, data_dir: str = "../data") -> pd.DataFrame:
    """Read every CSV matching ``pattern`` in ``data_dir`` into one DataFrame.

    Args:
        pattern: Glob pattern for the file names, e.g. ``"*VAERSDATA.csv"``.
        data_dir: Directory that is searched. Defaults to ``"../data"``.

    Returns:
        The row-wise concatenation of all matching files, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If no file matches the pattern.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # keeps the row order of the result stable from run to run.
    files = sorted(glob.glob(f"{data_dir}/{pattern}"))
    if not files:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate".
        raise ValueError(f"No files matching '{data_dir}/{pattern}' were found.")

    dfs = []
    for file in tqdm.tqdm(files):
        dfs.append(pd.read_csv(file, encoding="latin-1", low_memory=False))

    res = pd.concat(dfs, ignore_index=True)

    print(f"Processed {len(dfs)} files for a total of {len(res)} records.")

    return res

In [18]:
# Load every file under ../data whose name matches *VAERSDATA.csv into a single frame.
data = read_aggregate("*VAERSDATA.csv")

  0%|          | 0/1 [00:00<?, ?it/s]

Processed 1 files for a total of 105726 records.


In [19]:
# Narrow the frame to the columns used downstream. ER_ED_VISIT is carried along
# only until `recode` folds it into ER_VISIT.
_keep: list = [*ID_COLUMNS, *DEMOGRAPHIC_COLUMNS, *TEXT_COLUMNS, *FLAG_COLUMNS, "ER_ED_VISIT"]
data = data.loc[:, _keep]

## Recoding

We recode as follows:

* For the outcome flags, `NaN` is recoded as `0` and `Y` is recoded as `1`.
* `ER_VISIT` and `ER_ED_VISIT` are coalesced into a single column called `ER_VISIT` that is `1`-valued if either is `1`-valued, otherwise it is `0`-valued. This is to manage the renaming of the column in the VAERS data.
* Records whose symptom text is `NaN` are dropped.

In [20]:
def recode(df: pd.DataFrame) -> pd.DataFrame:
    """Recode the outcome flags to 0/1 and drop records without symptom text.

    The input frame is left untouched; all changes are made on a copy.

    Args:
        df: Frame containing every column in ``FLAG_COLUMNS`` plus
            ``ER_ED_VISIT`` and ``SYMPTOM_TEXT``.

    Returns:
        A new frame in which
        * each flag column is an integer, with ``"Y"`` mapped to 1 and
          missing values mapped to 0;
        * ``ER_VISIT`` is 1 if either ``ER_VISIT`` or ``ER_ED_VISIT`` was set,
          and ``ER_ED_VISIT`` itself is removed;
        * rows whose ``SYMPTOM_TEXT`` is missing are removed.
    """
    # Copy first: the column assignments below would otherwise write the
    # recoded values into the caller's frame as a side effect.
    df = df.copy()

    for column in FLAG_COLUMNS + ["ER_ED_VISIT"]:
        df[column] = df[column].replace("Y", 1).fillna(0).astype(int)

    # Row-wise maximum of two 0/1 columns is their logical OR.
    df['ER_VISIT'] = df[['ER_VISIT', 'ER_ED_VISIT']].max(axis=1)

    df = df.drop(columns=['ER_ED_VISIT'])

    df = df.dropna(subset=['SYMPTOM_TEXT'])

    return df

In [21]:
# Rebinds `data` to the recoded frame. ER_ED_VISIT no longer exists afterwards,
# so this cell cannot be run a second time without reloading the data first.
data = recode(data)

## Derived fields

We create the derived field `D_PRESENTED`. This is to provide a shorthand for patients who present in any way: ER, hospitalisation, office visit. It also comprises patients whose hospital stay is extended (`X_STAY`) as this is typically the consequence of presenting.

In [22]:
# The row-wise maximum of 0/1 columns is their logical OR, so the flag is set
# as soon as any one of the four presentation columns is set.
data['D_PRESENTED'] = data.loc[:, ['ER_VISIT', 'HOSPITAL', 'OFC_VISIT', 'X_STAY']].max(axis="columns")

## Test/train/validate split

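We do a stratified split by age quintile and sex into train, test and validation sets. With the constants defined above, 70% of the records go to the training set and 15% each to the test and validation sets.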

In [23]:
def stratified_split(df: pd.DataFrame, test_train_fraction: float, train_val_fraction: float, random_state: int = None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split ``df`` into train, test and validation frames, stratified by sex and age quintile.

    The input frame is not modified; the helper column used for
    stratification exists only on an internal copy.

    Args:
        df: Frame with an ``AGE_YRS`` and a ``SEX`` column.
        test_train_fraction: Share of all rows that is held out of the
            training set (test and validation combined).
        train_val_fraction: Share of the held-out rows that becomes the
            validation set; the remainder becomes the test set.
        random_state: Seed forwarded to both ``train_test_split`` calls.
            ``None`` gives a different split on every call.

    Returns:
        ``(train, test, val)`` -- note the order -- each with the same
        columns as ``df``.
    """
    # Rows with a missing age get a NaN quintile, which is stringified below
    # and therefore ends up in a stratum of its own per sex.
    age_quintile = pd.qcut(df['AGE_YRS'], 5, labels=False)

    # `assign` returns a new frame, so the caller's `df` does not gain the
    # helper column as a side effect.
    stratified = df.assign(
        STRATIFICATION_VARIABLE=df['SEX'].astype(str) + "_" + age_quintile.astype(str)
    )

    # First cut: `train_size` sizes the FIRST returned frame, which here is the
    # held-out part; the second returned frame is the actual training set.
    holdout, train = train_test_split(stratified, train_size=test_train_fraction, random_state=random_state, stratify=stratified.STRATIFICATION_VARIABLE)

    # Second cut: divide the held-out rows into validation and test.
    val, test = train_test_split(holdout, train_size=train_val_fraction, random_state=random_state, stratify=holdout.STRATIFICATION_VARIABLE)

    train = train.drop(columns="STRATIFICATION_VARIABLE")
    val = val.drop(columns="STRATIFICATION_VARIABLE")
    test = test.drop(columns="STRATIFICATION_VARIABLE")

    return train, test, val

In [24]:
# Fixed seed so that re-running the notebook regenerates exactly the same split
# (without it, every run publishes a differently shuffled dataset).
SPLIT_RANDOM_STATE: int = 42

train, test, val = stratified_split(data, TEST_TRAIN_FRACTION, TRAIN_VAL_FRACTION, random_state=SPLIT_RANDOM_STATE)

## Converting to labels

In [25]:
def convert_to_dataset(df: pd.DataFrame) -> datasets.Dataset:
    """Turn one split into a Hugging Face ``Dataset`` with ``id``, ``text`` and ``labels`` columns.

    ``labels`` holds, per record, the values of ``FLAG_COLUMNS`` followed by
    ``DERIVED_COLUMNS`` as a list of floats. The label order is printed on
    every call.
    """
    label_columns = FLAG_COLUMNS + DERIVED_COLUMNS
    selected = df.loc[:, ID_COLUMNS + TEXT_COLUMNS + label_columns]

    # One float vector per record: multilabel classification with
    # BCEWithLogitsLoss needs float targets.
    label_vectors = selected[label_columns].values.astype(float).tolist()

    print(f"Building dataset with the following label order: {' '.join(FLAG_COLUMNS + DERIVED_COLUMNS)}")

    # Replace the individual flag columns with the combined vector, then
    # switch to the column names used in the published dataset.
    frame = (
        selected
        .drop(columns=label_columns)
        .assign(labels=label_vectors)
        .rename(columns={"SYMPTOM_TEXT": "text", "VAERS_ID": "id"})
    )

    return datasets.Dataset.from_pandas(frame, preserve_index=False)

In [26]:
# Assemble the three splits under their Hub split names, in train/test/val order.
ds = datasets.DatasetDict({
    split_name: convert_to_dataset(split_frame)
    for split_name, split_frame in (("train", train), ("test", test), ("val", val))
})

Building dataset with the following label order: DIED ER_VISIT HOSPITAL OFC_VISIT X_STAY DISABLE D_PRESENTED
Building dataset with the following label order: DIED ER_VISIT HOSPITAL OFC_VISIT X_STAY DISABLE D_PRESENTED
Building dataset with the following label order: DIED ER_VISIT HOSPITAL OFC_VISIT X_STAY DISABLE D_PRESENTED


## Saving to Huggingface Hub

In [27]:
# The commit message records the total record count and the local time of the push.
commit_message = (
    f"Data set commit of {sum(len(split) for split in (train, test, val))} records of VAERS data "
    f"at {datetime.now().isoformat()}."
)

# create_pr=True opens a pull request on the Hub repository instead of
# committing to its main branch directly.
ds.push_to_hub(
    HF_URL,
    commit_message=commit_message,
    create_pr=True,
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/94.0 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chrisvoncsefalvay/vaers-outcomes/commit/65fa5129a0b1eb64f8fdd1aca5490965810e4ddb', commit_message='Data set commit of 105238 records of VAERS data at 2024-01-27T15:28:40.206686.', commit_description='', oid='65fa5129a0b1eb64f8fdd1aca5490965810e4ddb', pr_url='https://huggingface.co/datasets/chrisvoncsefalvay/vaers-outcomes/discussions/1', pr_revision='refs/pr/1', pr_num=1)