NLP EXERCISE: TEXT-CLASSIFICATION ON A DRUG DATASET.

DATA WRANGLING, FINE-TUNING AND PUSHING THE MODEL TO THE HUGGING-FACE HUB

In [None]:
#let's install the necessary libraries from Hugging Face
!pip install datasets evaluate transformers[sentencepiece] transformers[torch]

In [2]:
from huggingface_hub import notebook_login
#authenticate into the huggin face
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
#We download the data and unzip the datasets
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2024-04-30 14:10:46--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip’

drugsCom_raw.zip        [      <=>           ]  41.00M  37.7MB/s    in 1.1s    

2024-04-30 14:10:47 (37.7 MB/s) - ‘drugsCom_raw.zip’ saved [42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [4]:
from datasets import load_dataset

#save the data into a csv file with both splits
data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

drug_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [5]:
# Randomize the order of the rows within each split
def randomize_dataset(dataset):
    for split in dataset.keys():
        shuffled_dataset = dataset[split].shuffle()
        dataset[split] = shuffled_dataset
    return dataset

drug_dataset = randomize_dataset(drug_dataset)

In [6]:
#rename columns
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [7]:
#remove rows where the condition feature has no data
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

In [8]:
#turn it into lowercase text
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

drug_dataset = drug_dataset.map(lowercase_condition)

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [9]:
# Function to filter out the unwanted string 'users found', because we find that some of the conditions include that weird string sequence
def filter_unwanted_string(example):
    # Return True if the string is not found in the 'condition' field
    return 'users found' not in example['condition']

# Apply the filter function to each split
drug_dataset = drug_dataset.filter(filter_unwanted_string)

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

In [10]:
#create a feature called 'review length' because we will later remove reviews that are too short
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)

Map:   0%|          | 0/159498 [00:00<?, ? examples/s]

Map:   0%|          | 0/53200 [00:00<?, ? examples/s]

In [11]:
#keep only reviews with at least 16 letters
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 15)
print(drug_dataset.num_rows)

Filter:   0%|          | 0/159498 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53200 [00:00<?, ? examples/s]

{'train': 150391, 'test': 50146}


In [12]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 150391
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 50146
    })
})

In [13]:
import html

#remove html characters from the review feature
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True)

Map:   0%|          | 0/150391 [00:00<?, ? examples/s]

Map:   0%|          | 0/50146 [00:00<?, ? examples/s]

In [14]:
#create three splits (train, validation and test) with what we have
drug_clean = new_drug_dataset["train"].train_test_split(train_size=0.8, seed=42)

# Rename the default "test" split to "validation"
drug_clean["validation"] = drug_clean.pop("test")

# Add the "test" set to our `DatasetDict`
drug_clean["test"] = new_drug_dataset["test"]

drug_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 120312
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 30079
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 50146
    })
})

In [15]:
drug_clean['train'].features

{'patient_id': Value(dtype='int64', id=None),
 'drugName': Value(dtype='string', id=None),
 'condition': Value(dtype='string', id=None),
 'review': Value(dtype='string', id=None),
 'rating': Value(dtype='float64', id=None),
 'date': Value(dtype='string', id=None),
 'usefulCount': Value(dtype='int64', id=None),
 'review_length': Value(dtype='int64', id=None)}

In [16]:
from transformers import AutoTokenizer, DataCollatorWithPadding

#let's tokenize the review feature, which is the one we'll use as predictive feature
raw_datasets = drug_clean
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["review"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_pandas = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/120312 [00:00<?, ? examples/s]

Map:   0%|          | 0/30079 [00:00<?, ? examples/s]

Map:   0%|          | 0/50146 [00:00<?, ? examples/s]

Map:   0%|          | 0/120312 [00:00<?, ? examples/s]

In [17]:
#let's remove the unwanted columns and change the name of the target feature

tokenized_datasets= tokenized_datasets.remove_columns(['patient_id', 'rating', 'drugName', 'date', 'usefulCount', 'review_length', 'review'])
tokenized_datasets= tokenized_datasets.rename_column(
    original_column_name="condition", new_column_name="labels"
)

tokenized_pandas= tokenized_pandas.remove_columns(['patient_id', 'rating', 'drugName', 'date', 'usefulCount', 'review_length', 'review'])
tokenized_pandas= tokenized_pandas.rename_column(
    original_column_name="condition", new_column_name="labels"
)

In [18]:
import pandas as pd
tokenized_pandas.set_format('pandas')

# Calculate label frequencies
label_counts_train = tokenized_pandas['train']['labels'].value_counts()

# Sort frequencies and select only the 50 most frequent labels
sorted_labels = label_counts_train.index[:100]

# Subset the dataset
for split in tokenized_datasets.keys():
    tokenized_datasets[split] = tokenized_datasets[split].filter(lambda example: example['labels'] in sorted_labels)

Filter:   0%|          | 0/120312 [00:00<?, ? examples/s]

Filter:   0%|          | 0/30079 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50146 [00:00<?, ? examples/s]

In [19]:
#let's change the string labels to integers for the target feature (recode the labels feature)
# Identify unique labels
unique_labels = set()
for split in tokenized_datasets.keys():
    unique_labels.update(tokenized_datasets[split]['labels'])

# Create a mapping from labels to integers
label_to_int = {label: idx for idx, label in enumerate(unique_labels)}

# Apply the mapping to convert labels to integers
def label_to_int_mapping(example):
    example['labels'] = label_to_int[example['labels']]
    return example

# Update datasets with the new integer labels
tokenized_datasets = tokenized_datasets.map(label_to_int_mapping)


Map:   0%|          | 0/107120 [00:00<?, ? examples/s]

Map:   0%|          | 0/26702 [00:00<?, ? examples/s]

Map:   0%|          | 0/44617 [00:00<?, ? examples/s]

In [22]:
# Print each label with its corresponding integer value
label_dict = {}
for label, integer in label_to_int.items():
    label_dict[integer] = label

print(label_dict)

{0: 'multiple sclerosis', 1: 'overactive bladde', 2: 'hyperhidrosis', 3: 'ibromyalgia', 4: 'menstrual disorders', 5: 'hypogonadism, male', 6: 'rosacea', 7: 'muscle spasm', 8: 'high blood pressure', 9: 'epilepsy', 10: 'psoriatic arthritis', 11: 'post traumatic stress disorde', 12: 'smoking cessation', 13: 'not listed / othe', 14: 'herpes simplex', 15: 'opiate dependence', 16: 'social anxiety disorde', 17: 'urticaria', 18: 'allergic rhinitis', 19: 'polycystic ovary syndrome', 20: 'obsessive compulsive disorde', 21: 'depression', 22: 'migraine prevention', 23: 'neuropathic pain', 24: 'ankylosing spondylitis', 25: 'skin or soft tissue infection', 26: 'constipation, drug induced', 27: 'obesity', 28: 'vaginal yeast infection', 29: 'osteoarthritis', 30: 'restless legs syndrome', 31: 'plaque psoriasis', 32: 'panic disorde', 33: 'abnormal uterine bleeding', 34: 'adhd', 35: 'high cholesterol', 36: 'diabetes, type 2', 37: 'anxiety and stress', 38: 'asthma, maintenance', 39: 'pneumonia', 40: 'schi

In [23]:
from evaluate import load
#let's define a comput_metrics function that will return accuracy, precision, recall and f1 for our training
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)

    # Load metrics with zero_division parameter set to 1
    metrics = {
        'accuracy': load('accuracy'),
        'precision': load('precision', config={'zero_division': 1}),
        'recall': load('recall', config={'zero_division': 1}),
        'f1': load('f1', config={'zero_division': 1})
    }

    # Compute each metric, omitting the 'average' argument for accuracy
    results = {}
    for metric_name, metric in metrics.items():
        if metric_name == 'accuracy':
            result = metric.compute(predictions=preds, references=labels)
            results[metric_name] = result['accuracy']
        else:
            result = metric.compute(predictions=preds, references=labels, average='weighted')
            results[metric_name] = result[metric_name]

    return results

In [24]:
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

#let's set the training arguments for the model
training_args = TrainingArguments("test-trainer")

#let's set the training arguments and model, and the trainer arguments below
training_args = TrainingArguments("bert-drug-review-to-condition", push_to_hub=True, save_strategy="epoch", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(label_to_int.items()))

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
import numpy as np
#let's prepare the metrics, getting first the predictions of the trainer
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

  _warn_prf(average, modifier, msg_start, len(result))


(26702, 100) (26702,)


In [26]:
import numpy as np
#given that we get all the logs from the trainer, we'll select the label with the highest likelihood with an argmax function
preds = np.argmax(predictions.predictions, axis=-1)

In [27]:
#let's train the model right away
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8469,0.827508,0.767283,0.768594,0.767283,0.755066
2,0.6319,0.689465,0.809415,0.808975,0.809415,0.797826
3,0.4116,0.667829,0.837578,0.832482,0.837578,0.831665


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=40170, training_loss=0.7559129410606727, metrics={'train_runtime': 4322.1673, 'train_samples_per_second': 74.352, 'train_steps_per_second': 9.294, 'total_flos': 3.0660538964049024e+16, 'train_loss': 0.7559129410606727, 'epoch': 3.0})