# Processing the data (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
#let's install the necessary libraries from Hugging Face
!pip install datasets evaluate transformers[sentencepiece] transformers[torch]

In [137]:
from huggingface_hub import notebook_login
#authenticate into the huggin face
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [107]:
#We download the data and unzip the datasets
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2024-04-28 15:34:17--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip.1’

drugsCom_raw.zip.1      [     <=>            ]  41.00M  40.1MB/s    in 1.0s    

2024-04-28 15:34:18 (40.1 MB/s) - ‘drugsCom_raw.zip.1’ saved [42989872]

Archive:  drugsCom_raw.zip
replace drugsComTest_raw.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: drugsComTest_raw.tsv    
replace drugsComTrain_raw.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: drugsComTrain_raw.tsv   


In [108]:
from datasets import load_dataset

#save the data into a csv file with both splits
data_files = {"train": "drugsComTrain_raw.tsv", "test": "drugsComTest_raw.tsv"}
# \t is the tab character in Python
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [109]:
#rename columns
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [110]:
#remove rows where the condition feature has no data
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)

In [111]:
#turn it into lowercase text
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

drug_dataset = drug_dataset.map(lowercase_condition)

In [112]:
#create a feature called 'review length' because we will later remove reviews that are too short
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)

In [113]:
#keep only reviews with at least 16 letters
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 15)
print(drug_dataset.num_rows)

{'train': 151236, 'test': 50396}


In [114]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 151236
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 50396
    })
})

In [115]:
import html

#remove html characters from the review feature
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True)

In [116]:
#create three splits (train, validation and test) with what we have
drug_clean = new_drug_dataset["train"].train_test_split(train_size=0.8, seed=42)

# Rename the default "test" split to "validation"
drug_clean["validation"] = drug_clean.pop("test")

# Add the "test" set to our `DatasetDict`
drug_clean["test"] = new_drug_dataset["test"]

drug_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 120988
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 30248
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 50396
    })
})

In [117]:
drug_clean['train'].features

{'patient_id': Value(dtype='int64', id=None),
 'drugName': Value(dtype='string', id=None),
 'condition': Value(dtype='string', id=None),
 'review': Value(dtype='string', id=None),
 'rating': Value(dtype='float64', id=None),
 'date': Value(dtype='string', id=None),
 'usefulCount': Value(dtype='int64', id=None),
 'review_length': Value(dtype='int64', id=None)}

In [118]:
from transformers import AutoTokenizer, DataCollatorWithPadding

#let's tokenize the review feature, which is the one we'll use as predictive feature
raw_datasets = drug_clean
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["review"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/120988 [00:00<?, ? examples/s]

Map:   0%|          | 0/30248 [00:00<?, ? examples/s]

Map:   0%|          | 0/50396 [00:00<?, ? examples/s]

In [119]:
#let's remove the unwanted columns and change the name of the target feature

tokenized_datasets= tokenized_datasets.remove_columns(['patient_id', 'rating', 'drugName', 'date', 'usefulCount', 'review_length', 'review'])
tokenized_datasets= tokenized_datasets.rename_column(
    original_column_name="condition", new_column_name="labels"
)

tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120988
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30248
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50396
    })
})

In [120]:
#let's change the string labels to integers for the target feature (recode the labels feature)
# Identify unique labels
unique_labels = set()
for split in tokenized_datasets.keys():
    unique_labels.update(tokenized_datasets[split]['labels'])

# Create a mapping from labels to integers
label_to_int = {label: idx for idx, label in enumerate(unique_labels)}

# Apply the mapping to convert labels to integers
def label_to_int_mapping(example):
    example['labels'] = label_to_int[example['labels']]
    return example

# Update datasets with the new integer labels
tokenized_datasets = tokenized_datasets.map(label_to_int_mapping)


Map:   0%|          | 0/120988 [00:00<?, ? examples/s]

Map:   0%|          | 0/30248 [00:00<?, ? examples/s]

Map:   0%|          | 0/50396 [00:00<?, ? examples/s]

In [121]:
#Given that the original dataset has more than 800 labels and this is only an exercise, let's simplify the training by selecting only the first 21 labels
# Define a filter function
def filter_labels(example):
    # Return True for rows where 'labels' value is between 0 and 20
    return 0 <= example['labels'] <= 20

# Apply the filter function to each split in the dataset
tokenized_datasets = tokenized_datasets.filter(filter_labels)

Filter:   0%|          | 0/120988 [00:00<?, ? examples/s]

Filter:   0%|          | 0/30248 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50396 [00:00<?, ? examples/s]

In [122]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 904
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 253
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 369
    })
})

In [138]:
from transformers import TrainingArguments
#let's set the training arguments for the model
training_args = TrainingArguments("test-trainer")

#let's set the training arguments and model, and the trainer arguments below
training_args = TrainingArguments("bert-drug-review-to-condition", push_to_hub=True, save_strategy="epoch", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=21)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [139]:
#let's prepare the metrics, getting first the predictions of the trainer
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [140]:
import numpy as np
#given that we get all the logs from the trainer, we'll select the label with the highest likelihood with an argmax function
preds = np.argmax(predictions.predictions, axis=-1)

In [141]:
from evaluate import load
#let's define a comput_metrics function that will return accuracy, precision, recall and f1 for our training
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)

    # Load metrics with zero_division parameter set to 1
    metrics = {
        'accuracy': load('accuracy'),
        'precision': load('precision', config={'zero_division': 1}),
        'recall': load('recall', config={'zero_division': 1}),
        'f1': load('f1', config={'zero_division': 1})
    }

    # Compute each metric, omitting the 'average' argument for accuracy
    results = {}
    for metric_name, metric in metrics.items():
        if metric_name == 'accuracy':
            result = metric.compute(predictions=preds, references=labels)
            results[metric_name] = result['accuracy']
        else:
            result = metric.compute(predictions=preds, references=labels, average='weighted')
            results[metric_name] = result[metric_name]

    return results

In [142]:
#let's train the model right away
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.137511,0.774704,0.730148,0.774704,0.744985
2,No log,0.559548,0.885375,0.867484,0.885375,0.872829
3,No log,0.430789,0.920949,0.906086,0.920949,0.910576


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=339, training_loss=1.01418788313514, metrics={'train_runtime': 138.9845, 'train_samples_per_second': 19.513, 'train_steps_per_second': 2.439, 'total_flos': 245512156913712.0, 'train_loss': 1.01418788313514, 'epoch': 3.0})

In [143]:
#it's good. Let's push the model to the hugging face hub
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/Marcuswas/bert-drug-review-to-condition/commit/d6f99808a6fad08574389e154a44694bbd24ad5d', commit_message='End of training', commit_description='', oid='d6f99808a6fad08574389e154a44694bbd24ad5d', pr_url=None, pr_revision=None, pr_num=None)