rdiehlmartinez committed on
Commit 8d16789 · 1 Parent(s): 59653b3

Saving Training Config -- Step 0

Files changed (1)
  1. training_config.yaml +74 -0
training_config.yaml ADDED
@@ -0,0 +1,74 @@
+checkpointing:
+  checkpoints_dir: checkpoints
+  evaluation:
+    eval_results_dir: eval_results
+  fabric_checkpoint_dir: fabric_state
+  fabric_checkpoint_filename: checkpoint.pt
+  hf_checkpoint:
+    collection_slug: null
+    repo_id: pico-lm/pico-decoder-large
+  learning_dynamics:
+    batch_size: 128
+    eval_data: pico-lm/pretokenized-paloma-tinsy
+    layer_suffixes:
+    - attention.v_proj
+    - attention.o_proj
+    - swiglu.w_2
+    sequence_idx: -1
+  learning_dynamics_dir: learning_dynamics
+  logs_dir: logs
+  run_name: pico-decoder-large-1
+  runs_dir: runs
+  save_every_n_steps: 1000
+  save_to_hf: true
+  training:
+    auto_resume: true
+data:
+  dataloader:
+    batch_size: 1024
+  dataset:
+    name: pico-lm/pretokenized-dolma
+  tokenizer:
+    name: allenai/OLMo-7B-0724-hf
+    vocab_size: 50304
+evaluation:
+  metrics:
+  - paloma
+  paloma:
+    batch_size: 16
+    dataset_name: pico-lm/pretokenized-paloma-tinsy
+    dataset_split: val
+    max_length: 2048
+model:
+  activation_hidden_dim: 6144
+  attention_n_heads: 12
+  attention_n_kv_heads: 4
+  batch_size: 1024
+  d_model: 1536
+  max_seq_len: 2048
+  model_type: pico_decoder
+  n_layers: 12
+  norm_eps: 1.0e-06
+  position_emb_theta: 10000.0
+  vocab_size: 50304
+monitoring:
+  logging:
+    log_every_n_steps: 100
+    log_level: INFO
+  save_to_wandb: true
+  wandb:
+    entity: pico-lm
+    project: pico-decoder
+training:
+  fabric:
+    accelerator: cuda
+    num_devices: 4
+    num_nodes: 4
+    precision: bf16-mixed
+  max_steps: 200000
+  optimization:
+    gradient_accumulation_steps: 8
+    lr: 0.0003
+    lr_scheduler: linear_with_warmup
+    lr_warmup_steps: 2500
+    optimizer: adamw
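
For context, a minimal sketch of reading this config back in Python with PyYAML; the local file name and the use of yaml.safe_load are illustrative assumptions, not the loading code used by pico-lm:

```python
# Sketch: load the committed training config and pull out a few values.
# Assumes the file has been downloaded locally as training_config.yaml;
# pico-lm's own tooling may construct its config objects differently.
import yaml

with open("training_config.yaml") as f:
    config = yaml.safe_load(f)

print(config["model"]["d_model"])                     # 1536
print(config["model"]["n_layers"])                    # 12
print(config["training"]["optimization"]["lr"])       # 0.0003
print(config["checkpointing"]["save_every_n_steps"])  # 1000
```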