## Installing Necessary Libraries

In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes einops wandb
!pip install datasets==2.16.0

Collecting datasets==2.16.0
  Obtaining dependency information for datasets==2.16.0 from https://files.pythonhosted.org/packages/a0/93/da8a22a292e51ab76f969eb87bda8fd70cc3963b4dd71f67bb92a70a7992/datasets-2.16.0-py3-none-any.whl.metadata
  Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.16.0)
  Obtaining dependency information for pyarrow-hotfix from https://files.pythonhosted.org/packages/e4/f4/9ec2222f5f5f8ea04f66f184caafd991a39c8782e31f5b0266f101cb68ca/pyarrow_hotfix-0.6-py3-none-any.whl.metadata
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting fsspec[http]<=2023.10.0,>=2023.1.0 (from datasets==2.16.0)
  Obtaining dependency information for fsspec[http]<=2023.10.0,>=2023.1.0 from https://files.pythonhosted.org/packages/e8/f6/3eccfb530aac90ad1301c582da228e4763f19e719ac8200752a4841b0b2d/fsspec-2023.10.0-py3-none-any.whl.metadata
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Down

# Dataset details
Instacart Data can be downloaded from [here](https://www.kaggle.com/competitions/instacart-market-basket-analysis/data). We just need product & department csv files

In [2]:
import pandas as pd
df_product = pd.read_csv("/kaggle/input/llft-datasets/departments.csv")
df_dept = pd.read_csv('/kaggle/input/llft-datasets/products.csv')

In [3]:
df_joined = pd.merge(df_product, df_dept, on = ['department_id'])
df_joined['text'] = df_joined.apply(lambda row: row['product_name'] + " ->: " + row['department'], axis = 1)

In [4]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df_joined, test_size=0.2, random_state=42)



In [5]:
train_df.head(10)

Unnamed: 0,department_id,department,product_id,product_name,aisle_id,text
7361,4,produce,38057,Spicy Organic Microgreens,83,Spicy Organic Microgreens ->: produce
32534,15,canned goods,32461,Spaghetti Cool Shapes Star Wars,59,Spaghetti Cool Shapes Star Wars ->: canned goods
1457,1,frozen,17540,Burnt Sugar Vanilla Ice Cream,37,Burnt Sugar Vanilla Ice Cream ->: frozen
5201,3,bakery,22335,Sliced Italian Bread,112,Sliced Italian Bread ->: bakery
38539,17,household,29431,Air Effects Value Pack Hawaiian Aloha,101,Air Effects Value Pack Hawaiian Aloha ->: hous...
18377,11,personal care,9014,Mint Mouthwash,20,Mint Mouthwash ->: personal care
28627,13,pantry,37330,Pure Cane Washed Raw Sugar,17,Pure Cane Washed Raw Sugar ->: pantry
30283,14,breakfast,11892,Mixed Berry BelVita Bites,48,Mixed Berry BelVita Bites ->: breakfast
32030,15,canned goods,20544,Peeled Whole Tomatoes,81,Peeled Whole Tomatoes ->: canned goods
28655,13,pantry,37534,Unprocessed Wheat Bran,17,Unprocessed Wheat Bran ->: pantry


In [6]:
test_df.head(10)

Unnamed: 0,department_id,department,product_id,product_name,aisle_id,text
33626,16,dairy eggs,5570,Coffee Rich Original Non-Dairy Creamer,53,Coffee Rich Original Non-Dairy Creamer ->: dai...
18192,11,personal care,7582,Cold Snap,11,Cold Snap ->: personal care
47099,19,snacks,49614,Sandies Pecan Shortbread Cookies,61,Sandies Pecan Shortbread Cookies ->: snacks
48183,20,deli,39968,Miso Soup,1,Miso Soup ->: deli
22197,11,personal care,37799,Body Envy Volumizing Shampoo,22,Body Envy Volumizing Shampoo ->: personal care
31573,15,canned goods,9984,Bean Salad,81,Bean Salad ->: canned goods
45362,19,snacks,35795,Mexican Restaurant Style Corn Tortilla Chips,107,Mexican Restaurant Style Corn Tortilla Chips -...
14131,7,beverages,47666,Grapefruit No Sugar Added 100% Juice,98,Grapefruit No Sugar Added 100% Juice ->: bever...
26903,13,pantry,21534,"Salsa, Diablo, Hot",51,"Salsa, Diablo, Hot ->: pantry"
39417,17,household,43688,Toilet Bowl Cleaner with Lime & Rust Remover,114,Toilet Bowl Cleaner with Lime & Rust Remover -...


In [7]:
from datasets import Dataset,DatasetDict
train_dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df),
})

## Loading the model

In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

config.json:   0%|          | 0.00/626 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/14 [00:00<?, ?it/s]

model-00001-of-00014.safetensors:   0%|          | 0.00/981M [00:00<?, ?B/s]

model-00002-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00003-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00004-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00005-of-00014.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00006-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00007-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00008-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00009-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00010-of-00014.safetensors:   0%|          | 0.00/944M [00:00<?, ?B/s]

model-00011-of-00014.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

model-00012-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00013-of-00014.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

model-00014-of-00014.safetensors:   0%|          | 0.00/847M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Let's also load the tokenizer below

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/676 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Below we will load the configuration file in order to create the LoRA model. According to QLoRA paper, it is important to consider all linear layers in the transformer block for maximum performance.

In [10]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():

        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [11]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [12]:
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","v_proj"]
)

## Loading the trainer

Here we will use the [`SFTTrainer` from TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer) that gives a wrapper around transformers `Trainer` to easily fine-tune models on instruction based datasets using PEFT adapters. Let's first load the training arguments below.

In [13]:
from transformers import TrainingArguments

output_dir = "results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 120
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

Then finally pass everthing to the trainer

In [14]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset_dict['train'],
    # train_dataset=data['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    # dataset_text_field="prediction",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

Map:   0%|          | 0/39750 [00:00<?, ? examples/s]



## Train the model

Now let's train the model! Simply call `trainer.train()`

In [15]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,4.6888
2,3.0755
3,4.1088
4,5.8947
5,4.0705
6,4.1923
7,6.4053
8,4.3635
9,4.2534
10,4.2418


TrainOutput(global_step=120, training_loss=2.617857338984807, metrics={'train_runtime': 691.1477, 'train_samples_per_second': 2.778, 'train_steps_per_second': 0.174, 'total_flos': 1214487435018240.0, 'train_loss': 2.617857338984807, 'epoch': 0.05})

In [16]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

peft_model_dir = "/kaggle/working/results/checkpoint-120"
trained_model = AutoPeftModelForCausalLM.from_pretrained(
    peft_model_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
# Merge LoRA and base model
merged_model = trained_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")

# push merged model to the hub
merged_model.push_to_hub("llama_ft")
tokenizer.push_to_hub("llama_ft")

Loading checkpoint shards:   0%|          | 0/14 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sweetpablo/llama_ft/commit/6189c17b05b7e4e1dd91e1ed2c3ebe48d088509c', commit_message='Upload tokenizer', commit_description='', oid='6189c17b05b7e4e1dd91e1ed2c3ebe48d088509c', pr_url=None, pr_revision=None, pr_num=None)