mtasic85 committed on
Commit
bbec1b1
1 Parent(s): 4dc0a8b

pretrain model

Browse files
Files changed (1) hide show
  1. scripts/pretrain-model.yaml +8 -6
scripts/pretrain-model.yaml CHANGED
@@ -72,13 +72,15 @@ train:
72
  micro_batch_size: 16
73
 
74
  # Number of iterations with learning rate warmup active (type: int, default: 2000)
75
- lr_warmup_steps: 2000
 
76
 
77
  # Number of epochs to train on (type: Optional[int], default: null)
78
  epochs:
79
 
80
  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
81
- max_tokens: 21260243688 # 5187956 * 2049 * 2
 
82
 
83
  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
84
  max_steps:
@@ -121,15 +123,15 @@ optimizer:
121
 
122
  init_args:
123
  # (type: float, default: 0.001)
124
- lr: 1e-2
125
 
126
  # (type: float, default: 0.01)
127
- weight_decay: 0.1
128
 
129
  # (type: tuple, default: (0.9,0.999))
130
  betas:
131
  - 0.9
132
- - 0.95
133
 
134
  # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
135
  devices: auto
@@ -145,4 +147,4 @@ tokenizer_dir: "../"
145
  logger_name: "wandb"
146
 
147
  # The random seed to use for reproducibility. (type: int, default: 42)
148
- seed: 42
 
72
  micro_batch_size: 16
73
 
74
  # Number of iterations with learning rate warmup active (type: int, default: 2000)
75
+ # lr_warmup_steps: 2000
76
+ lr_warmup_steps: 10
77
 
78
  # Number of epochs to train on (type: Optional[int], default: null)
79
  epochs:
80
 
81
  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
82
+ # max_tokens: 21260243688 # 5187956 * 2049 * 2
83
+ max_tokens: 10630121844 # 5187956 * 2049 * 1
84
 
85
  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
86
  max_steps:
 
123
 
124
  init_args:
125
  # (type: float, default: 0.001)
126
+ lr: 1e-3
127
 
128
  # (type: float, default: 0.01)
129
+ weight_decay: 1e-2
130
 
131
  # (type: tuple, default: (0.9,0.999))
132
  betas:
133
  - 0.9
134
+ - 0.999
135
 
136
  # How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
137
  devices: auto
 
147
  logger_name: "wandb"
148
 
149
  # The random seed to use for reproducibility. (type: int, default: 42)
150
+ seed: 23