Crystalcareai
committed on
Update train-h100-sharegpt-sft.py
Browse files - train-h100-sharegpt-sft.py +44 -44
train-h100-sharegpt-sft.py
CHANGED
@@ -15,12 +15,12 @@ random_seed = 42
|
|
15 |
torch.manual_seed(random_seed)
|
16 |
random.seed(random_seed)
|
17 |
|
18 |
-
dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(
|
19 |
|
20 |
|
21 |
-
n_ahead_talk_global =
|
22 |
n_passes_global = 1
|
23 |
-
n_ahead_global =
|
24 |
# n_examples = 1000
|
25 |
# full_batch_size = 8
|
26 |
|
@@ -96,15 +96,15 @@ def model_init(params):
|
|
96 |
model.train()
|
97 |
return model
|
98 |
|
99 |
-
max_seq_length =
|
100 |
run_id = int(time.time())
|
101 |
training_args = TrainingArguments(
|
102 |
output_dir="./out",
|
103 |
num_train_epochs=1,
|
104 |
per_device_train_batch_size=1,
|
105 |
gradient_checkpointing=False,
|
106 |
-
gradient_accumulation_steps=
|
107 |
-
optim="
|
108 |
logging_steps=1,
|
109 |
save_strategy="steps",
|
110 |
save_steps=100,
|
@@ -114,74 +114,74 @@ training_args = TrainingArguments(
|
|
114 |
bf16=True,
|
115 |
|
116 |
tf32=True,
|
117 |
-
learning_rate=
|
118 |
max_grad_norm=0,
|
119 |
warmup_steps=20,
|
120 |
-
lr_scheduler_type="
|
121 |
push_to_hub=False,
|
122 |
report_to="wandb"
|
123 |
)
|
124 |
|
125 |
peft_config = LoraConfig(
|
126 |
r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
|
127 |
-
target_modules =["
|
128 |
lora_alpha = 32,
|
129 |
lora_dropout = 0, # Supports any, but = 0 is optimized
|
130 |
bias = "none",
|
131 |
-
use_dora=
|
132 |
task_type="CAUSAL_LM"
|
133 |
)
|
134 |
|
135 |
torch.autograd.set_detect_anomaly(True)
|
136 |
|
137 |
-
class CustomSFTTrainer(SFTTrainer):
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
|
148 |
-
|
149 |
-
|
150 |
|
151 |
-
|
152 |
-
|
153 |
|
154 |
-
|
155 |
|
156 |
-
|
157 |
-
|
158 |
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
|
177 |
-
|
178 |
-
|
179 |
|
180 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
181 |
|
182 |
model = model_init(None)
|
183 |
|
184 |
-
trainer =
|
185 |
model=model,
|
186 |
args=training_args,
|
187 |
train_dataset=dataset,
|
|
|
15 |
torch.manual_seed(random_seed)
|
16 |
random.seed(random_seed)
|
17 |
|
18 |
+
dataset = load_dataset("HuggingFaceH4/orca-math-word-problems-200k", split="train_sft").select(range(1000))
|
19 |
|
20 |
|
21 |
+
n_ahead_talk_global = 4
|
22 |
n_passes_global = 1
|
23 |
+
n_ahead_global = 4
|
24 |
# n_examples = 1000
|
25 |
# full_batch_size = 8
|
26 |
|
|
|
96 |
model.train()
|
97 |
return model
|
98 |
|
99 |
+
max_seq_length = 1024
|
100 |
run_id = int(time.time())
|
101 |
training_args = TrainingArguments(
|
102 |
output_dir="./out",
|
103 |
num_train_epochs=1,
|
104 |
per_device_train_batch_size=1,
|
105 |
gradient_checkpointing=False,
|
106 |
+
gradient_accumulation_steps=8,
|
107 |
+
optim="adamw_torch_fused",
|
108 |
logging_steps=1,
|
109 |
save_strategy="steps",
|
110 |
save_steps=100,
|
|
|
114 |
bf16=True,
|
115 |
|
116 |
tf32=True,
|
117 |
+
learning_rate=2e-10,
|
118 |
max_grad_norm=0,
|
119 |
warmup_steps=20,
|
120 |
+
lr_scheduler_type="cosine",
|
121 |
push_to_hub=False,
|
122 |
report_to="wandb"
|
123 |
)
|
124 |
|
125 |
peft_config = LoraConfig(
|
126 |
r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
|
127 |
+
target_modules =["q_proj", "v_proj"],
|
128 |
lora_alpha = 32,
|
129 |
lora_dropout = 0, # Supports any, but = 0 is optimized
|
130 |
bias = "none",
|
131 |
+
use_dora=True,
|
132 |
task_type="CAUSAL_LM"
|
133 |
)
|
134 |
|
135 |
torch.autograd.set_detect_anomaly(True)
|
136 |
|
137 |
+
# class CustomSFTTrainer(SFTTrainer):
|
138 |
+
# def __init__(self, *args, **kwargs):
|
139 |
+
# super().__init__(*args, **kwargs)
|
140 |
+
# self.beta = 0.9 # momentum factor
|
141 |
+
# self.clip_factor = 1.0 # clipping factor
|
142 |
+
# self.moving_avg = 0.0
|
143 |
|
144 |
+
# def training_step(self, model, inputs):
|
145 |
+
# model.train()
|
146 |
+
# inputs = self._prepare_inputs(inputs)
|
147 |
|
148 |
+
# outputs = model(**inputs)
|
149 |
+
# loss = outputs.loss if isinstance(outputs, dict) else outputs[0]
|
150 |
|
151 |
+
# if self.args.gradient_accumulation_steps > 1:
|
152 |
+
# loss = loss / self.args.gradient_accumulation_steps
|
153 |
|
154 |
+
# loss.backward()
|
155 |
|
156 |
+
# # Compute gradients and their norm
|
157 |
+
# grad_norm = torch.sqrt(sum(p.grad.data.norm().to(model.device)**2 for p in model.parameters() if p.grad is not None))
|
158 |
|
159 |
+
# # Update moving average and apply gradient clipping
|
160 |
+
# if self.state.global_step == 0:
|
161 |
+
# self.moving_avg = grad_norm
|
162 |
+
# else:
|
163 |
+
# self.moving_avg = self.beta * self.moving_avg + (1 - self.beta) * grad_norm
|
164 |
|
165 |
+
# if grad_norm > self.clip_factor * self.moving_avg:
|
166 |
+
# clip_coef = (self.clip_factor * self.moving_avg / grad_norm).item()
|
167 |
+
# for param in model.parameters():
|
168 |
+
# if param.grad is not None:
|
169 |
+
# param.grad.data.mul_(clip_coef)
|
170 |
|
171 |
+
# if (self.state.global_step + 1) % self.args.gradient_accumulation_steps == 0:
|
172 |
+
# self.optimizer.step()
|
173 |
+
# self.lr_scheduler.step()
|
174 |
+
# model.zero_grad()
|
175 |
+
# self.state.global_step += 1
|
176 |
|
177 |
+
# # Return the loss as a Tensor
|
178 |
+
# return loss
|
179 |
|
180 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
181 |
|
182 |
model = model_init(None)
|
183 |
|
184 |
+
trainer = SFTTrainer(
|
185 |
model=model,
|
186 |
args=training_args,
|
187 |
train_dataset=dataset,
|