Crystalcareai committed
Commit 0b98bc4 · verified · 1 Parent(s): 0b25161

Update schedulefree.py

Files changed (1):
  1. schedulefree.py (+59 -62)
schedulefree.py CHANGED
@@ -1,12 +1,12 @@
 import signal
 import sys
+import torch
 from datasets import load_dataset
-from transformers import TrainingArguments
+from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM
 from trl import SFTTrainer
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from peft import LoraConfig
-from schedulefree import AdamWScheduleFree
+
+# Importing Sophia optimizer
+from sophia import SophiaG

 # Signal handler function
 def signal_handler(sig, frame):
@@ -16,88 +16,85 @@ def signal_handler(sig, frame):
 # Register signal handler
 signal.signal(signal.SIGINT, signal_handler)

-dataset = load_dataset("Crystalcareai/Orca-Reka")['train']
+# Load the dataset
+dataset = load_dataset("Crystalcareai/Orca-Reka", split="train")
+model_id = "./outkannn"
+tokenizer_id = model_id
+tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
+tokenizer.padding_side = 'right'
+
+# Formatting function for the dataset
+alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

-def chatml_format(example):
-    """Format the dataset for training, accounting for empty columns."""
-    return {
-        "instruction": example['instruction'] if 'instruction' in example else " \n",
-        "input": example['input'] if 'input' in example else " \n",
-        "system": example['system'] if 'system' in example else " \n",
-        "output": example['output'] if 'output' in example else " \n",
-    }
+### Instruction:
+{}

-# Format dataset
-dataset = dataset.map(chatml_format, remove_columns=dataset.column_names)
+### Input:
+{}

-# Load model and tokenizer
+### Response:
+{}"""
+def formatting_prompts_func(examples):
+    instructions = examples["instruction"]
+    inputs = examples["input"]
+    outputs = examples["output"]
+    texts = []
+    EOS_TOKEN = tokenizer.eos_token
+    for instruction, input, output in zip(instructions, inputs, outputs):
+        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
+        texts.append(text)
+    return {"text": texts}
+
+# Process and map the formatting function
+dataset = dataset.map(formatting_prompts_func, batched=True)
+
+# Load model
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    attn_implementation="flash_attention_2",
     torch_dtype=torch.bfloat16,
-)
-tokenizer = AutoTokenizer.from_pretrained(model)
-tokenizer.padding_side = 'right'  # to prevent warnings
-
-peft_config = LoraConfig(
-    lora_alpha=16,
-    lora_dropout=0.05,
-    r=32,
-    bias="none",
-    target_modules=[
-        "0.w1",
-        "0.w2",
-        "0.w3",
-        "q_proj",
-        "v_proj",
-        "k_proj",
-        "o_proj"
-    ],
-    task_type="CAUSAL_LM",
-    use_dora=False,  # Enable Dora method
+    trust_remote_code=True
 )

+# Define training arguments
 args = TrainingArguments(
-    output_dir="./out",                     # directory to save and repository id
-    num_train_epochs=3,                     # number of training epochs
-    per_device_train_batch_size=4,          # batch size per device during training
-    gradient_checkpointing=True,            # use gradient checkpointing to save memory
-    optim="adamw_hf",
+    output_dir="./out",
+    num_train_epochs=3,
+    per_device_train_batch_size=4,
+    gradient_checkpointing=True,
     logging_steps=2,
     save_strategy="steps",
     save_steps=300,
-    bf16=True,                              # use bfloat16 precision
-    tf32=True,                              # use tf32 precision
-    ### peft specific arguments ###
-    learning_rate=2e-4,
-    max_grad_norm=0.3,
+    bf16=True,
+    tf32=True,
+    learning_rate=1e-4,
+    max_grad_norm=0.1,
     warmup_ratio=0.00,
-    lr_scheduler_type="constant",
-    report_to="wandb",
-    push_to_hub=False,
-    # push model to hub
+    lr_scheduler_type="cosine",
+    push_to_hub=False
 )

-max_seq_length = 2048  # max sequence length for model and packing of the dataset
+max_seq_length = 2048

-# Create the schedulefree optimizer
-optimizer = AdamWScheduleFree(model.parameters(), lr=args.learning_rate, beta=0.9)
+# Custom Trainer Class
+class CustomTrainer(SFTTrainer):
+    def create_optimizer(self):
+        # Override to use SophiaG optimizer
+        self.optimizer = SophiaG(self.model.parameters(), lr=self.args.learning_rate, betas=(0.965, 0.99), rho=0.01, weight_decay=0.1)

-trainer = SFTTrainer(
+# Trainer configuration
+trainer = CustomTrainer(
     model=model,
     args=args,
     train_dataset=dataset,
-    ### peft specific arguments ###
-    peft_config=peft_config,
     max_seq_length=max_seq_length,
     tokenizer=tokenizer,
+    dataset_text_field="output",
     packing=False,
-    optimizers=(optimizer, None),  # Pass the schedulefree optimizer
 )

-# start training, the model will be automatically saved to the hub and the output directory
+# Start training
 trainer.train()

-# save model
+# Save model
 trainer.save_model()
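
Note on the new optimizer hook: the commit overrides create_optimizer to plug in SophiaG. Below is a minimal sketch (not part of the commit) of the same override with an explicit return value, since the base transformers Trainer.create_optimizer sets and returns self.optimizer; the class name SophiaSFTTrainer is illustrative only, and the sketch assumes sophia.py from the upstream Sophia repository is importable, as the commit does.

# Sketch only: same idea as the commit's CustomTrainer, with an explicit return.
from trl import SFTTrainer
from sophia import SophiaG  # assumption: upstream Sophia repo's sophia.py is on the path


class SophiaSFTTrainer(SFTTrainer):  # hypothetical name, not from the commit
    def create_optimizer(self):
        # Build SophiaG once, mirroring the hyperparameters used in the commit,
        # and keep the base Trainer contract of returning the optimizer.
        if self.optimizer is None:
            self.optimizer = SophiaG(
                self.model.parameters(),
                lr=self.args.learning_rate,
                betas=(0.965, 0.99),
                rho=0.01,
                weight_decay=0.1,
            )
        return self.optimizer

Reading the learning rate from self.args, as the commit does, keeps TrainingArguments as the single source of truth for lr instead of hard-coding it in the optimizer.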