“Sara committed on
Commit
45522b4
·
1 Parent(s): b5b589a

adding model files

Browse files
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/generation_config.json → generation_config.json RENAMED
File without changes
last-checkpoint/added_tokens.json DELETED
@@ -1,11 +0,0 @@
1
- {
2
- "</headline>": 32008,
3
- "</scratchpad>": 32006,
4
- "<</SYS>>": 32004,
5
- "<<SYS>>": 32003,
6
- "<headline>": 32007,
7
- "<pad>": 32000,
8
- "<scratchpad>": 32005,
9
- "[/INST]": 32002,
10
- "[INST]": 32001
11
- }
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "_name_or_path": "meta-llama/Llama-2-7b-hf",
3
- "architectures": [
4
- "LlamaForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "bos_token_id": 1,
9
- "eos_token_id": 2,
10
- "hidden_act": "silu",
11
- "hidden_size": 4096,
12
- "initializer_range": 0.02,
13
- "intermediate_size": 11008,
14
- "max_position_embeddings": 4096,
15
- "model_type": "llama",
16
- "num_attention_heads": 32,
17
- "num_hidden_layers": 32,
18
- "num_key_value_heads": 32,
19
- "pretraining_tp": 1,
20
- "rms_norm_eps": 1e-05,
21
- "rope_scaling": null,
22
- "rope_theta": 10000.0,
23
- "tie_word_embeddings": false,
24
- "torch_dtype": "float32",
25
- "transformers_version": "4.40.2",
26
- "use_cache": true,
27
- "vocab_size": 32016
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/special_tokens_map.json DELETED
@@ -1,82 +0,0 @@
1
- {
2
- "additional_special_tokens": [
3
- {
4
- "content": "[INST]",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": true
9
- },
10
- {
11
- "content": "[/INST]",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": true
16
- },
17
- {
18
- "content": "<<SYS>>",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": true
23
- },
24
- {
25
- "content": "<</SYS>>",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": true
30
- },
31
- {
32
- "content": "<scratchpad>",
33
- "lstrip": false,
34
- "normalized": false,
35
- "rstrip": false,
36
- "single_word": true
37
- },
38
- {
39
- "content": "</scratchpad>",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": false,
43
- "single_word": true
44
- },
45
- {
46
- "content": "<headline>",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": true
51
- },
52
- {
53
- "content": "</headline>",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": false,
57
- "single_word": true
58
- }
59
- ],
60
- "bos_token": {
61
- "content": "<s>",
62
- "lstrip": false,
63
- "normalized": false,
64
- "rstrip": false,
65
- "single_word": false
66
- },
67
- "eos_token": {
68
- "content": "</s>",
69
- "lstrip": false,
70
- "normalized": false,
71
- "rstrip": false,
72
- "single_word": false
73
- },
74
- "pad_token": "<pad>",
75
- "unk_token": {
76
- "content": "<unk>",
77
- "lstrip": false,
78
- "normalized": false,
79
- "rstrip": false,
80
- "single_word": false
81
- }
82
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
- size 499723
 
 
 
 
last-checkpoint/tokenizer_config.json DELETED
@@ -1,124 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "32000": {
30
- "content": "<pad>",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": true
36
- },
37
- "32001": {
38
- "content": "[INST]",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": true,
43
- "special": true
44
- },
45
- "32002": {
46
- "content": "[/INST]",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": true,
51
- "special": true
52
- },
53
- "32003": {
54
- "content": "<<SYS>>",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": true,
59
- "special": true
60
- },
61
- "32004": {
62
- "content": "<</SYS>>",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": true,
67
- "special": true
68
- },
69
- "32005": {
70
- "content": "<scratchpad>",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": true,
75
- "special": true
76
- },
77
- "32006": {
78
- "content": "</scratchpad>",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": true,
83
- "special": true
84
- },
85
- "32007": {
86
- "content": "<headline>",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": true,
91
- "special": true
92
- },
93
- "32008": {
94
- "content": "</headline>",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": true,
99
- "special": true
100
- }
101
- },
102
- "additional_special_tokens": [
103
- "[INST]",
104
- "[/INST]",
105
- "<<SYS>>",
106
- "<</SYS>>",
107
- "<scratchpad>",
108
- "</scratchpad>",
109
- "<headline>",
110
- "</headline>"
111
- ],
112
- "bos_token": "<s>",
113
- "clean_up_tokenization_spaces": false,
114
- "eos_token": "</s>",
115
- "legacy": false,
116
- "model_max_length": 1000000000000000019884624838656,
117
- "pad_to_multiple_of": 8,
118
- "pad_token": "<pad>",
119
- "padding_side": "left",
120
- "sp_model_kwargs": {},
121
- "tokenizer_class": "LlamaTokenizer",
122
- "unk_token": "<unk>",
123
- "use_default_system_prompt": false
124
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
last-checkpoint/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:676922c8c88a048a8c76fcb1420a476fbd64daf190286d2f67fca7597684e8c6
3
- size 5176
 
 
 
 
last-checkpoint/model-00001-of-00006.safetensors → model-00001-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a93d426f85b1f0cf9314700f2b678255f6f690251de2e2b2362564b11572c4c
3
  size 4840658560
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:899d935b5c5a49d9246d93bb12f53fb2a5d65c9908cc6913af4e648924301803
3
  size 4840658560
last-checkpoint/model-00002-of-00006.safetensors → model-00002-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83453b65b109be28de860fd5576e94fd1a18525ebafed114feee2d191e01364d
3
  size 4857206856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b447174ff33cd1c0d7a82b337d96f938df8c838d65d6f9fde9fd7985fc02cd93
3
  size 4857206856
last-checkpoint/model-00003-of-00006.safetensors → model-00003-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4594c4acfd9fe032f5f5f93557f1b0576aa808f283fc499a5ca67955dca82c84
3
  size 4857206904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd870d340d98656a432d2d32968441795a4c6b494ed4ea44a770d0edc35c3865
3
  size 4857206904
last-checkpoint/model-00004-of-00006.safetensors → model-00004-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:96a5d953b5e97f4d4354fe24efe8378b7eee246457ddae26ac336c462b00b556
3
  size 4857206904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d7dfc7b648b3aa287c824b7feecc82122461caa52be5de3d18dcdd6975ecbbc
3
  size 4857206904
last-checkpoint/model-00005-of-00006.safetensors → model-00005-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3caed05f52f25c6f7efa8ab04593f866cd644bb133dbedadb84affefd9ce1b83
3
  size 4857206904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d28f9e42e36012953b0ad80e3029216d6abc5eb46b9cc829c31ea903fb7a731
3
  size 4857206904
last-checkpoint/model-00006-of-00006.safetensors → model-00006-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b23439a29dd32c57e3a728b59c7c69a0689fa50c6c95f587e340e2742ac498ea
3
  size 2684734256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79f8b6e521feebdc5b2b7367c01fd851c314151387cf42e49264cd3b76a86b2b
3
  size 2684734256
last-checkpoint/model.safetensors.index.json → model.safetensors.index.json RENAMED
File without changes
last-checkpoint/trainer_state.json → trainer_state.json RENAMED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 9.6,
5
  "eval_steps": 50,
6
- "global_step": 4800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -847,606 +847,6 @@
847
  "eval_samples_per_second": 69.235,
848
  "eval_steps_per_second": 3.551,
849
  "step": 2800
850
- },
851
- {
852
- "epoch": 5.7,
853
- "grad_norm": 1.1381618976593018,
854
- "learning_rate": 9.302435262558748e-06,
855
- "loss": 0.0708,
856
- "step": 2850
857
- },
858
- {
859
- "epoch": 5.7,
860
- "eval_loss": 1.0073773860931396,
861
- "eval_runtime": 2.4353,
862
- "eval_samples_per_second": 64.058,
863
- "eval_steps_per_second": 3.285,
864
- "step": 2850
865
- },
866
- {
867
- "epoch": 5.8,
868
- "grad_norm": 1.1022274494171143,
869
- "learning_rate": 8.954715367323468e-06,
870
- "loss": 0.0708,
871
- "step": 2900
872
- },
873
- {
874
- "epoch": 5.8,
875
- "eval_loss": 1.0213180780410767,
876
- "eval_runtime": 2.8636,
877
- "eval_samples_per_second": 54.478,
878
- "eval_steps_per_second": 2.794,
879
- "step": 2900
880
- },
881
- {
882
- "epoch": 5.9,
883
- "grad_norm": 0.8953984975814819,
884
- "learning_rate": 8.60826899039935e-06,
885
- "loss": 0.0733,
886
- "step": 2950
887
- },
888
- {
889
- "epoch": 5.9,
890
- "eval_loss": 1.0069844722747803,
891
- "eval_runtime": 3.3874,
892
- "eval_samples_per_second": 46.053,
893
- "eval_steps_per_second": 2.362,
894
- "step": 2950
895
- },
896
- {
897
- "epoch": 6.0,
898
- "grad_norm": 1.0139652490615845,
899
- "learning_rate": 8.263518223330698e-06,
900
- "loss": 0.0741,
901
- "step": 3000
902
- },
903
- {
904
- "epoch": 6.0,
905
- "eval_loss": 0.9869266152381897,
906
- "eval_runtime": 2.4224,
907
- "eval_samples_per_second": 64.4,
908
- "eval_steps_per_second": 3.303,
909
- "step": 3000
910
- },
911
- {
912
- "epoch": 6.1,
913
- "grad_norm": 0.40148624777793884,
914
- "learning_rate": 7.92088309182241e-06,
915
- "loss": 0.0523,
916
- "step": 3050
917
- },
918
- {
919
- "epoch": 6.1,
920
- "eval_loss": 1.042676329612732,
921
- "eval_runtime": 2.2655,
922
- "eval_samples_per_second": 68.859,
923
- "eval_steps_per_second": 3.531,
924
- "step": 3050
925
- },
926
- {
927
- "epoch": 6.2,
928
- "grad_norm": 0.7412729859352112,
929
- "learning_rate": 7.580781044003324e-06,
930
- "loss": 0.0517,
931
- "step": 3100
932
- },
933
- {
934
- "epoch": 6.2,
935
- "eval_loss": 1.0524495840072632,
936
- "eval_runtime": 2.2722,
937
- "eval_samples_per_second": 68.656,
938
- "eval_steps_per_second": 3.521,
939
- "step": 3100
940
- },
941
- {
942
- "epoch": 6.3,
943
- "grad_norm": 0.6903051137924194,
944
- "learning_rate": 7.243626441830009e-06,
945
- "loss": 0.0532,
946
- "step": 3150
947
- },
948
- {
949
- "epoch": 6.3,
950
- "eval_loss": 1.0479012727737427,
951
- "eval_runtime": 2.2573,
952
- "eval_samples_per_second": 69.11,
953
- "eval_steps_per_second": 3.544,
954
- "step": 3150
955
- },
956
- {
957
- "epoch": 6.4,
958
- "grad_norm": 0.6340187788009644,
959
- "learning_rate": 6.909830056250527e-06,
960
- "loss": 0.0532,
961
- "step": 3200
962
- },
963
- {
964
- "epoch": 6.4,
965
- "eval_loss": 1.0606757402420044,
966
- "eval_runtime": 2.2583,
967
- "eval_samples_per_second": 69.08,
968
- "eval_steps_per_second": 3.543,
969
- "step": 3200
970
- },
971
- {
972
- "epoch": 6.5,
973
- "grad_norm": 0.6936119794845581,
974
- "learning_rate": 6.579798566743314e-06,
975
- "loss": 0.0531,
976
- "step": 3250
977
- },
978
- {
979
- "epoch": 6.5,
980
- "eval_loss": 1.06425142288208,
981
- "eval_runtime": 2.9676,
982
- "eval_samples_per_second": 52.568,
983
- "eval_steps_per_second": 2.696,
984
- "step": 3250
985
- },
986
- {
987
- "epoch": 6.6,
988
- "grad_norm": 0.7855706810951233,
989
- "learning_rate": 6.25393406584088e-06,
990
- "loss": 0.0513,
991
- "step": 3300
992
- },
993
- {
994
- "epoch": 6.6,
995
- "eval_loss": 1.0764490365982056,
996
- "eval_runtime": 2.7705,
997
- "eval_samples_per_second": 56.307,
998
- "eval_steps_per_second": 2.888,
999
- "step": 3300
1000
- },
1001
- {
1002
- "epoch": 6.7,
1003
- "grad_norm": 0.43579840660095215,
1004
- "learning_rate": 5.932633569242e-06,
1005
- "loss": 0.0532,
1006
- "step": 3350
1007
- },
1008
- {
1009
- "epoch": 6.7,
1010
- "eval_loss": 1.0692789554595947,
1011
- "eval_runtime": 2.6435,
1012
- "eval_samples_per_second": 59.012,
1013
- "eval_steps_per_second": 3.026,
1014
- "step": 3350
1015
- },
1016
- {
1017
- "epoch": 6.8,
1018
- "grad_norm": 0.5169562101364136,
1019
- "learning_rate": 5.616288532109225e-06,
1020
- "loss": 0.0525,
1021
- "step": 3400
1022
- },
1023
- {
1024
- "epoch": 6.8,
1025
- "eval_loss": 1.0697879791259766,
1026
- "eval_runtime": 2.3878,
1027
- "eval_samples_per_second": 65.333,
1028
- "eval_steps_per_second": 3.35,
1029
- "step": 3400
1030
- },
1031
- {
1032
- "epoch": 6.9,
1033
- "grad_norm": 0.35735023021698,
1034
- "learning_rate": 5.305284372141095e-06,
1035
- "loss": 0.0529,
1036
- "step": 3450
1037
- },
1038
- {
1039
- "epoch": 6.9,
1040
- "eval_loss": 1.059874176979065,
1041
- "eval_runtime": 2.2721,
1042
- "eval_samples_per_second": 68.66,
1043
- "eval_steps_per_second": 3.521,
1044
- "step": 3450
1045
- },
1046
- {
1047
- "epoch": 7.0,
1048
- "grad_norm": 0.4535483717918396,
1049
- "learning_rate": 5.000000000000003e-06,
1050
- "loss": 0.0521,
1051
- "step": 3500
1052
- },
1053
- {
1054
- "epoch": 7.0,
1055
- "eval_loss": 1.064403772354126,
1056
- "eval_runtime": 2.2621,
1057
- "eval_samples_per_second": 68.964,
1058
- "eval_steps_per_second": 3.537,
1059
- "step": 3500
1060
- },
1061
- {
1062
- "epoch": 7.1,
1063
- "grad_norm": 0.3609411418437958,
1064
- "learning_rate": 4.700807357667953e-06,
1065
- "loss": 0.045,
1066
- "step": 3550
1067
- },
1068
- {
1069
- "epoch": 7.1,
1070
- "eval_loss": 1.0945632457733154,
1071
- "eval_runtime": 2.2531,
1072
- "eval_samples_per_second": 69.238,
1073
- "eval_steps_per_second": 3.551,
1074
- "step": 3550
1075
- },
1076
- {
1077
- "epoch": 7.2,
1078
- "grad_norm": 0.7414348721504211,
1079
- "learning_rate": 4.408070965292534e-06,
1080
- "loss": 0.0442,
1081
- "step": 3600
1082
- },
1083
- {
1084
- "epoch": 7.2,
1085
- "eval_loss": 1.1062610149383545,
1086
- "eval_runtime": 2.2531,
1087
- "eval_samples_per_second": 69.237,
1088
- "eval_steps_per_second": 3.551,
1089
- "step": 3600
1090
- },
1091
- {
1092
- "epoch": 7.3,
1093
- "grad_norm": 0.40675053000450134,
1094
- "learning_rate": 4.12214747707527e-06,
1095
- "loss": 0.0459,
1096
- "step": 3650
1097
- },
1098
- {
1099
- "epoch": 7.3,
1100
- "eval_loss": 1.1099870204925537,
1101
- "eval_runtime": 2.4464,
1102
- "eval_samples_per_second": 63.767,
1103
- "eval_steps_per_second": 3.27,
1104
- "step": 3650
1105
- },
1106
- {
1107
- "epoch": 7.4,
1108
- "grad_norm": 0.33660000562667847,
1109
- "learning_rate": 3.8433852467434175e-06,
1110
- "loss": 0.046,
1111
- "step": 3700
1112
- },
1113
- {
1114
- "epoch": 7.4,
1115
- "eval_loss": 1.1118826866149902,
1116
- "eval_runtime": 2.4409,
1117
- "eval_samples_per_second": 63.912,
1118
- "eval_steps_per_second": 3.278,
1119
- "step": 3700
1120
- },
1121
- {
1122
- "epoch": 7.5,
1123
- "grad_norm": 0.23761674761772156,
1124
- "learning_rate": 3.5721239031346067e-06,
1125
- "loss": 0.0457,
1126
- "step": 3750
1127
- },
1128
- {
1129
- "epoch": 7.5,
1130
- "eval_loss": 1.1216858625411987,
1131
- "eval_runtime": 3.2036,
1132
- "eval_samples_per_second": 48.696,
1133
- "eval_steps_per_second": 2.497,
1134
- "step": 3750
1135
- },
1136
- {
1137
- "epoch": 7.6,
1138
- "grad_norm": 0.8762012124061584,
1139
- "learning_rate": 3.308693936411421e-06,
1140
- "loss": 0.0469,
1141
- "step": 3800
1142
- },
1143
- {
1144
- "epoch": 7.6,
1145
- "eval_loss": 1.1176018714904785,
1146
- "eval_runtime": 2.7692,
1147
- "eval_samples_per_second": 56.334,
1148
- "eval_steps_per_second": 2.889,
1149
- "step": 3800
1150
- },
1151
- {
1152
- "epoch": 7.7,
1153
- "grad_norm": 0.4060334861278534,
1154
- "learning_rate": 3.0534162954100264e-06,
1155
- "loss": 0.0462,
1156
- "step": 3850
1157
- },
1158
- {
1159
- "epoch": 7.7,
1160
- "eval_loss": 1.1231276988983154,
1161
- "eval_runtime": 2.2766,
1162
- "eval_samples_per_second": 68.525,
1163
- "eval_steps_per_second": 3.514,
1164
- "step": 3850
1165
- },
1166
- {
1167
- "epoch": 7.8,
1168
- "grad_norm": 0.42169174551963806,
1169
- "learning_rate": 2.8066019966134907e-06,
1170
- "loss": 0.0452,
1171
- "step": 3900
1172
- },
1173
- {
1174
- "epoch": 7.8,
1175
- "eval_loss": 1.1297779083251953,
1176
- "eval_runtime": 2.2629,
1177
- "eval_samples_per_second": 68.938,
1178
- "eval_steps_per_second": 3.535,
1179
- "step": 3900
1180
- },
1181
- {
1182
- "epoch": 7.9,
1183
- "grad_norm": 0.3084222674369812,
1184
- "learning_rate": 2.5685517452260566e-06,
1185
- "loss": 0.0458,
1186
- "step": 3950
1187
- },
1188
- {
1189
- "epoch": 7.9,
1190
- "eval_loss": 1.1398730278015137,
1191
- "eval_runtime": 2.2543,
1192
- "eval_samples_per_second": 69.2,
1193
- "eval_steps_per_second": 3.549,
1194
- "step": 3950
1195
- },
1196
- {
1197
- "epoch": 8.0,
1198
- "grad_norm": 0.36014610528945923,
1199
- "learning_rate": 2.339555568810221e-06,
1200
- "loss": 0.046,
1201
- "step": 4000
1202
- },
1203
- {
1204
- "epoch": 8.0,
1205
- "eval_loss": 1.1379703283309937,
1206
- "eval_runtime": 2.2541,
1207
- "eval_samples_per_second": 69.207,
1208
- "eval_steps_per_second": 3.549,
1209
- "step": 4000
1210
- },
1211
- {
1212
- "epoch": 8.1,
1213
- "grad_norm": 0.28584274649620056,
1214
- "learning_rate": 2.119892463932781e-06,
1215
- "loss": 0.0416,
1216
- "step": 4050
1217
- },
1218
- {
1219
- "epoch": 8.1,
1220
- "eval_loss": 1.1522297859191895,
1221
- "eval_runtime": 3.0804,
1222
- "eval_samples_per_second": 50.643,
1223
- "eval_steps_per_second": 2.597,
1224
- "step": 4050
1225
- },
1226
- {
1227
- "epoch": 8.2,
1228
- "grad_norm": 0.5283023118972778,
1229
- "learning_rate": 1.9098300562505266e-06,
1230
- "loss": 0.0425,
1231
- "step": 4100
1232
- },
1233
- {
1234
- "epoch": 8.2,
1235
- "eval_loss": 1.1616079807281494,
1236
- "eval_runtime": 2.3831,
1237
- "eval_samples_per_second": 65.462,
1238
- "eval_steps_per_second": 3.357,
1239
- "step": 4100
1240
- },
1241
- {
1242
- "epoch": 8.3,
1243
- "grad_norm": 0.3108871877193451,
1244
- "learning_rate": 1.709624274449584e-06,
1245
- "loss": 0.0422,
1246
- "step": 4150
1247
- },
1248
- {
1249
- "epoch": 8.3,
1250
- "eval_loss": 1.1679902076721191,
1251
- "eval_runtime": 3.5663,
1252
- "eval_samples_per_second": 43.742,
1253
- "eval_steps_per_second": 2.243,
1254
- "step": 4150
1255
- },
1256
- {
1257
- "epoch": 8.4,
1258
- "grad_norm": 0.5419384837150574,
1259
- "learning_rate": 1.5195190384357405e-06,
1260
- "loss": 0.0431,
1261
- "step": 4200
1262
- },
1263
- {
1264
- "epoch": 8.4,
1265
- "eval_loss": 1.1734154224395752,
1266
- "eval_runtime": 2.7016,
1267
- "eval_samples_per_second": 57.743,
1268
- "eval_steps_per_second": 2.961,
1269
- "step": 4200
1270
- },
1271
- {
1272
- "epoch": 8.5,
1273
- "grad_norm": 0.598882257938385,
1274
- "learning_rate": 1.339745962155613e-06,
1275
- "loss": 0.0425,
1276
- "step": 4250
1277
- },
1278
- {
1279
- "epoch": 8.5,
1280
- "eval_loss": 1.177977204322815,
1281
- "eval_runtime": 2.2591,
1282
- "eval_samples_per_second": 69.054,
1283
- "eval_steps_per_second": 3.541,
1284
- "step": 4250
1285
- },
1286
- {
1287
- "epoch": 8.6,
1288
- "grad_norm": 0.3328254520893097,
1289
- "learning_rate": 1.1705240714107301e-06,
1290
- "loss": 0.0425,
1291
- "step": 4300
1292
- },
1293
- {
1294
- "epoch": 8.6,
1295
- "eval_loss": 1.1803592443466187,
1296
- "eval_runtime": 2.2622,
1297
- "eval_samples_per_second": 68.96,
1298
- "eval_steps_per_second": 3.536,
1299
- "step": 4300
1300
- },
1301
- {
1302
- "epoch": 8.7,
1303
- "grad_norm": 0.29016920924186707,
1304
- "learning_rate": 1.012059537008332e-06,
1305
- "loss": 0.0423,
1306
- "step": 4350
1307
- },
1308
- {
1309
- "epoch": 8.7,
1310
- "eval_loss": 1.1833291053771973,
1311
- "eval_runtime": 2.2562,
1312
- "eval_samples_per_second": 69.142,
1313
- "eval_steps_per_second": 3.546,
1314
- "step": 4350
1315
- },
1316
- {
1317
- "epoch": 8.8,
1318
- "grad_norm": 0.29944583773612976,
1319
- "learning_rate": 8.645454235739903e-07,
1320
- "loss": 0.0432,
1321
- "step": 4400
1322
- },
1323
- {
1324
- "epoch": 8.8,
1325
- "eval_loss": 1.1854578256607056,
1326
- "eval_runtime": 2.2563,
1327
- "eval_samples_per_second": 69.141,
1328
- "eval_steps_per_second": 3.546,
1329
- "step": 4400
1330
- },
1331
- {
1332
- "epoch": 8.9,
1333
- "grad_norm": 0.34581369161605835,
1334
- "learning_rate": 7.281614543321269e-07,
1335
- "loss": 0.0439,
1336
- "step": 4450
1337
- },
1338
- {
1339
- "epoch": 8.9,
1340
- "eval_loss": 1.1898452043533325,
1341
- "eval_runtime": 2.428,
1342
- "eval_samples_per_second": 64.25,
1343
- "eval_steps_per_second": 3.295,
1344
- "step": 4450
1345
- },
1346
- {
1347
- "epoch": 9.0,
1348
- "grad_norm": 0.6652135252952576,
1349
- "learning_rate": 6.030737921409169e-07,
1350
- "loss": 0.0433,
1351
- "step": 4500
1352
- },
1353
- {
1354
- "epoch": 9.0,
1355
- "eval_loss": 1.1917668581008911,
1356
- "eval_runtime": 3.4892,
1357
- "eval_samples_per_second": 44.71,
1358
- "eval_steps_per_second": 2.293,
1359
- "step": 4500
1360
- },
1361
- {
1362
- "epoch": 9.1,
1363
- "grad_norm": 0.6365101337432861,
1364
- "learning_rate": 4.894348370484648e-07,
1365
- "loss": 0.0403,
1366
- "step": 4550
1367
- },
1368
- {
1369
- "epoch": 9.1,
1370
- "eval_loss": 1.1966166496276855,
1371
- "eval_runtime": 2.4825,
1372
- "eval_samples_per_second": 62.841,
1373
- "eval_steps_per_second": 3.223,
1374
- "step": 4550
1375
- },
1376
- {
1377
- "epoch": 9.2,
1378
- "grad_norm": 0.33352887630462646,
1379
- "learning_rate": 3.8738304061681107e-07,
1380
- "loss": 0.0423,
1381
- "step": 4600
1382
- },
1383
- {
1384
- "epoch": 9.2,
1385
- "eval_loss": 1.2006295919418335,
1386
- "eval_runtime": 2.9462,
1387
- "eval_samples_per_second": 52.95,
1388
- "eval_steps_per_second": 2.715,
1389
- "step": 4600
1390
- },
1391
- {
1392
- "epoch": 9.3,
1393
- "grad_norm": 0.3310016691684723,
1394
- "learning_rate": 2.970427372400353e-07,
1395
- "loss": 0.0402,
1396
- "step": 4650
1397
- },
1398
- {
1399
- "epoch": 9.3,
1400
- "eval_loss": 1.2032084465026855,
1401
- "eval_runtime": 2.2539,
1402
- "eval_samples_per_second": 69.214,
1403
- "eval_steps_per_second": 3.549,
1404
- "step": 4650
1405
- },
1406
- {
1407
- "epoch": 9.4,
1408
- "grad_norm": 0.25688716769218445,
1409
- "learning_rate": 2.1852399266194312e-07,
1410
- "loss": 0.0408,
1411
- "step": 4700
1412
- },
1413
- {
1414
- "epoch": 9.4,
1415
- "eval_loss": 1.2052472829818726,
1416
- "eval_runtime": 2.2671,
1417
- "eval_samples_per_second": 68.809,
1418
- "eval_steps_per_second": 3.529,
1419
- "step": 4700
1420
- },
1421
- {
1422
- "epoch": 9.5,
1423
- "grad_norm": 0.20443540811538696,
1424
- "learning_rate": 1.519224698779198e-07,
1425
- "loss": 0.0402,
1426
- "step": 4750
1427
- },
1428
- {
1429
- "epoch": 9.5,
1430
- "eval_loss": 1.2061351537704468,
1431
- "eval_runtime": 2.2561,
1432
- "eval_samples_per_second": 69.145,
1433
- "eval_steps_per_second": 3.546,
1434
- "step": 4750
1435
- },
1436
- {
1437
- "epoch": 9.6,
1438
- "grad_norm": 0.30436694622039795,
1439
- "learning_rate": 9.731931258429638e-08,
1440
- "loss": 0.0414,
1441
- "step": 4800
1442
- },
1443
- {
1444
- "epoch": 9.6,
1445
- "eval_loss": 1.2072834968566895,
1446
- "eval_runtime": 2.2552,
1447
- "eval_samples_per_second": 69.173,
1448
- "eval_steps_per_second": 3.547,
1449
- "step": 4800
1450
  }
1451
  ],
1452
  "logging_steps": 50,
@@ -1454,7 +854,7 @@
1454
  "num_input_tokens_seen": 0,
1455
  "num_train_epochs": 10,
1456
  "save_steps": 400,
1457
- "total_flos": 2.318720979561349e+17,
1458
  "train_batch_size": 4,
1459
  "trial_name": null,
1460
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.6,
5
  "eval_steps": 50,
6
+ "global_step": 2800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
847
  "eval_samples_per_second": 69.235,
848
  "eval_steps_per_second": 3.551,
849
  "step": 2800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
850
  }
851
  ],
852
  "logging_steps": 50,
 
854
  "num_input_tokens_seen": 0,
855
  "num_train_epochs": 10,
856
  "save_steps": 400,
857
+ "total_flos": 1.3524716052545536e+17,
858
  "train_batch_size": 4,
859
  "trial_name": null,
860
  "trial_params": null
training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:676922c8c88a048a8c76fcb1420a476fbd64daf190286d2f67fca7597684e8c6
3
- size 5176