Crystalcareai committed on
Commit
a7a47b9
·
verified ·
1 Parent(s): 07a9f15

Update train-h100-sharegpt-sft.py

Browse files
Files changed (1) hide show
  1. train-h100-sharegpt-sft.py +44 -44
train-h100-sharegpt-sft.py CHANGED
@@ -15,12 +15,12 @@ random_seed = 42
15
  torch.manual_seed(random_seed)
16
  random.seed(random_seed)
17
 
18
- dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(3000))
19
 
20
 
21
- n_ahead_talk_global = 2
22
  n_passes_global = 1
23
- n_ahead_global = 8
24
  # n_examples = 1000
25
  # full_batch_size = 8
26
 
@@ -96,15 +96,15 @@ def model_init(params):
96
  model.train()
97
  return model
98
 
99
- max_seq_length = 8192
100
  run_id = int(time.time())
101
  training_args = TrainingArguments(
102
  output_dir="./out",
103
  num_train_epochs=1,
104
  per_device_train_batch_size=1,
105
  gradient_checkpointing=False,
106
- gradient_accumulation_steps=1,
107
- optim="lion_32bit",
108
  logging_steps=1,
109
  save_strategy="steps",
110
  save_steps=100,
@@ -114,74 +114,74 @@ training_args = TrainingArguments(
114
  bf16=True,
115
 
116
  tf32=True,
117
- learning_rate=1e-07,
118
  max_grad_norm=0,
119
  warmup_steps=20,
120
- lr_scheduler_type="constant",
121
  push_to_hub=False,
122
  report_to="wandb"
123
  )
124
 
125
  peft_config = LoraConfig(
126
  r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
127
- target_modules =["up_proj", "down_proj", "gate_proj"],
128
  lora_alpha = 32,
129
  lora_dropout = 0, # Supports any, but = 0 is optimized
130
  bias = "none",
131
- use_dora=False,
132
  task_type="CAUSAL_LM"
133
  )
134
 
135
  torch.autograd.set_detect_anomaly(True)
136
 
137
- class CustomSFTTrainer(SFTTrainer):
138
- def __init__(self, *args, **kwargs):
139
- super().__init__(*args, **kwargs)
140
- self.beta = 0.9 # momentum factor
141
- self.clip_factor = 1.0 # clipping factor
142
- self.moving_avg = 0.0
143
 
144
- def training_step(self, model, inputs):
145
- model.train()
146
- inputs = self._prepare_inputs(inputs)
147
 
148
- outputs = model(**inputs)
149
- loss = outputs.loss if isinstance(outputs, dict) else outputs[0]
150
 
151
- if self.args.gradient_accumulation_steps > 1:
152
- loss = loss / self.args.gradient_accumulation_steps
153
 
154
- loss.backward()
155
 
156
- # Compute gradients and their norm
157
- grad_norm = torch.sqrt(sum(p.grad.data.norm().to(model.device)**2 for p in model.parameters() if p.grad is not None))
158
 
159
- # Update moving average and apply gradient clipping
160
- if self.state.global_step == 0:
161
- self.moving_avg = grad_norm
162
- else:
163
- self.moving_avg = self.beta * self.moving_avg + (1 - self.beta) * grad_norm
164
 
165
- if grad_norm > self.clip_factor * self.moving_avg:
166
- clip_coef = (self.clip_factor * self.moving_avg / grad_norm).item()
167
- for param in model.parameters():
168
- if param.grad is not None:
169
- param.grad.data.mul_(clip_coef)
170
 
171
- if (self.state.global_step + 1) % self.args.gradient_accumulation_steps == 0:
172
- self.optimizer.step()
173
- self.lr_scheduler.step()
174
- model.zero_grad()
175
- self.state.global_step += 1
176
 
177
- # Return the loss as a Tensor
178
- return loss
179
 
180
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
181
 
182
  model = model_init(None)
183
 
184
- trainer = CustomSFTTrainer(
185
  model=model,
186
  args=training_args,
187
  train_dataset=dataset,
 
15
  torch.manual_seed(random_seed)
16
  random.seed(random_seed)
17
 
18
+ dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(1000))
19
 
20
 
21
+ n_ahead_talk_global = 4
22
  n_passes_global = 1
23
+ n_ahead_global = 4
24
  # n_examples = 1000
25
  # full_batch_size = 8
26
 
 
96
  model.train()
97
  return model
98
 
99
+ max_seq_length = 1024
100
  run_id = int(time.time())
101
  training_args = TrainingArguments(
102
  output_dir="./out",
103
  num_train_epochs=1,
104
  per_device_train_batch_size=1,
105
  gradient_checkpointing=False,
106
+ gradient_accumulation_steps=8,
107
+ optim="adamw_torch_fused",
108
  logging_steps=1,
109
  save_strategy="steps",
110
  save_steps=100,
 
114
  bf16=True,
115
 
116
  tf32=True,
117
+ learning_rate=2e-10,
118
  max_grad_norm=0,
119
  warmup_steps=20,
120
+ lr_scheduler_type="cosine",
121
  push_to_hub=False,
122
  report_to="wandb"
123
  )
124
 
125
  peft_config = LoraConfig(
126
  r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
127
+ target_modules =["q_proj", "v_proj"],
128
  lora_alpha = 32,
129
  lora_dropout = 0, # Supports any, but = 0 is optimized
130
  bias = "none",
131
+ use_dora=True,
132
  task_type="CAUSAL_LM"
133
  )
134
 
135
  torch.autograd.set_detect_anomaly(True)
136
 
137
+ # class CustomSFTTrainer(SFTTrainer):
138
+ # def __init__(self, *args, **kwargs):
139
+ # super().__init__(*args, **kwargs)
140
+ # self.beta = 0.9 # momentum factor
141
+ # self.clip_factor = 1.0 # clipping factor
142
+ # self.moving_avg = 0.0
143
 
144
+ # def training_step(self, model, inputs):
145
+ # model.train()
146
+ # inputs = self._prepare_inputs(inputs)
147
 
148
+ # outputs = model(**inputs)
149
+ # loss = outputs.loss if isinstance(outputs, dict) else outputs[0]
150
 
151
+ # if self.args.gradient_accumulation_steps > 1:
152
+ # loss = loss / self.args.gradient_accumulation_steps
153
 
154
+ # loss.backward()
155
 
156
+ # # Compute gradients and their norm
157
+ # grad_norm = torch.sqrt(sum(p.grad.data.norm().to(model.device)**2 for p in model.parameters() if p.grad is not None))
158
 
159
+ # # Update moving average and apply gradient clipping
160
+ # if self.state.global_step == 0:
161
+ # self.moving_avg = grad_norm
162
+ # else:
163
+ # self.moving_avg = self.beta * self.moving_avg + (1 - self.beta) * grad_norm
164
 
165
+ # if grad_norm > self.clip_factor * self.moving_avg:
166
+ # clip_coef = (self.clip_factor * self.moving_avg / grad_norm).item()
167
+ # for param in model.parameters():
168
+ # if param.grad is not None:
169
+ # param.grad.data.mul_(clip_coef)
170
 
171
+ # if (self.state.global_step + 1) % self.args.gradient_accumulation_steps == 0:
172
+ # self.optimizer.step()
173
+ # self.lr_scheduler.step()
174
+ # model.zero_grad()
175
+ # self.state.global_step += 1
176
 
177
+ # # Return the loss as a Tensor
178
+ # return loss
179
 
180
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
181
 
182
  model = model_init(None)
183
 
184
+ trainer = SFTTrainer(
185
  model=model,
186
  args=training_args,
187
  train_dataset=dataset,