End of training

Browse files

Files changed (7) hide show

README.md +2 -1
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1417 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: top_18_ranking_stackexchange
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
 # top_18_ranking_stackexchange
-This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.8025

 base_model: meta-llama/Meta-Llama-3.1-8B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: top_18_ranking_stackexchange
 # top_18_ranking_stackexchange
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) on the mlfoundations-dev/top_18_ranking_stackexchange dataset.
 It achieves the following results on the evaluation set:
 - Loss: 0.8025

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.9963285024154587,
+    "eval_loss": 0.8025317192077637,
+    "eval_runtime": 692.9086,
+    "eval_samples_per_second": 25.156,
+    "eval_steps_per_second": 0.394,
+    "total_flos": 3246012802007040.0,
+    "train_loss": 0.7871413270263357,
+    "train_runtime": 114590.6996,
+    "train_samples_per_second": 8.67,
+    "train_steps_per_second": 0.017
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.9963285024154587,
+    "eval_loss": 0.8025317192077637,
+    "eval_runtime": 692.9086,
+    "eval_samples_per_second": 25.156,
+    "eval_steps_per_second": 0.394
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.9963285024154587,
+    "total_flos": 3246012802007040.0,
+    "train_loss": 0.7871413270263357,
+    "train_runtime": 114590.6996,
+    "train_samples_per_second": 8.67,
+    "train_steps_per_second": 0.017
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1417 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9963285024154587,
+  "eval_steps": 500,
+  "global_step": 1938,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.015458937198067632,
+      "grad_norm": 16.53294779967864,
+      "learning_rate": 5e-06,
+      "loss": 1.0792,
+      "step": 10
+    },
+    {
+      "epoch": 0.030917874396135265,
+      "grad_norm": 2.649325227711683,
+      "learning_rate": 5e-06,
+      "loss": 0.9837,
+      "step": 20
+    },
+    {
+      "epoch": 0.0463768115942029,
+      "grad_norm": 8.284136228811613,
+      "learning_rate": 5e-06,
+      "loss": 0.9474,
+      "step": 30
+    },
+    {
+      "epoch": 0.06183574879227053,
+      "grad_norm": 2.524509268818302,
+      "learning_rate": 5e-06,
+      "loss": 0.9301,
+      "step": 40
+    },
+    {
+      "epoch": 0.07729468599033816,
+      "grad_norm": 1.5326850741391629,
+      "learning_rate": 5e-06,
+      "loss": 0.9155,
+      "step": 50
+    },
+    {
+      "epoch": 0.0927536231884058,
+      "grad_norm": 1.0781869595697948,
+      "learning_rate": 5e-06,
+      "loss": 0.9037,
+      "step": 60
+    },
+    {
+      "epoch": 0.10821256038647344,
+      "grad_norm": 0.8943372564864077,
+      "learning_rate": 5e-06,
+      "loss": 0.8967,
+      "step": 70
+    },
+    {
+      "epoch": 0.12367149758454106,
+      "grad_norm": 0.8868808075595438,
+      "learning_rate": 5e-06,
+      "loss": 0.8867,
+      "step": 80
+    },
+    {
+      "epoch": 0.1391304347826087,
+      "grad_norm": 0.6755190824568067,
+      "learning_rate": 5e-06,
+      "loss": 0.8792,
+      "step": 90
+    },
+    {
+      "epoch": 0.15458937198067632,
+      "grad_norm": 0.6359486667796315,
+      "learning_rate": 5e-06,
+      "loss": 0.8769,
+      "step": 100
+    },
+    {
+      "epoch": 0.17004830917874397,
+      "grad_norm": 0.6299873717513695,
+      "learning_rate": 5e-06,
+      "loss": 0.8717,
+      "step": 110
+    },
+    {
+      "epoch": 0.1855072463768116,
+      "grad_norm": 0.5832655638295026,
+      "learning_rate": 5e-06,
+      "loss": 0.8671,
+      "step": 120
+    },
+    {
+      "epoch": 0.20096618357487922,
+      "grad_norm": 0.5586732401198556,
+      "learning_rate": 5e-06,
+      "loss": 0.8642,
+      "step": 130
+    },
+    {
+      "epoch": 0.21642512077294687,
+      "grad_norm": 0.6759403203697444,
+      "learning_rate": 5e-06,
+      "loss": 0.863,
+      "step": 140
+    },
+    {
+      "epoch": 0.2318840579710145,
+      "grad_norm": 0.5234393167054423,
+      "learning_rate": 5e-06,
+      "loss": 0.861,
+      "step": 150
+    },
+    {
+      "epoch": 0.24734299516908212,
+      "grad_norm": 0.5405697501688478,
+      "learning_rate": 5e-06,
+      "loss": 0.864,
+      "step": 160
+    },
+    {
+      "epoch": 0.26280193236714977,
+      "grad_norm": 0.5609477995758988,
+      "learning_rate": 5e-06,
+      "loss": 0.8528,
+      "step": 170
+    },
+    {
+      "epoch": 0.2782608695652174,
+      "grad_norm": 0.648827500892738,
+      "learning_rate": 5e-06,
+      "loss": 0.857,
+      "step": 180
+    },
+    {
+      "epoch": 0.293719806763285,
+      "grad_norm": 0.6627079853918527,
+      "learning_rate": 5e-06,
+      "loss": 0.8511,
+      "step": 190
+    },
+    {
+      "epoch": 0.30917874396135264,
+      "grad_norm": 0.6915034637639949,
+      "learning_rate": 5e-06,
+      "loss": 0.8485,
+      "step": 200
+    },
+    {
+      "epoch": 0.32463768115942027,
+      "grad_norm": 0.6366893171242987,
+      "learning_rate": 5e-06,
+      "loss": 0.8462,
+      "step": 210
+    },
+    {
+      "epoch": 0.34009661835748795,
+      "grad_norm": 0.5070708715215638,
+      "learning_rate": 5e-06,
+      "loss": 0.8446,
+      "step": 220
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.5871716320705027,
+      "learning_rate": 5e-06,
+      "loss": 0.8445,
+      "step": 230
+    },
+    {
+      "epoch": 0.3710144927536232,
+      "grad_norm": 0.7730510671832644,
+      "learning_rate": 5e-06,
+      "loss": 0.8497,
+      "step": 240
+    },
+    {
+      "epoch": 0.3864734299516908,
+      "grad_norm": 0.7142598054734569,
+      "learning_rate": 5e-06,
+      "loss": 0.8438,
+      "step": 250
+    },
+    {
+      "epoch": 0.40193236714975844,
+      "grad_norm": 0.5417070693738956,
+      "learning_rate": 5e-06,
+      "loss": 0.8476,
+      "step": 260
+    },
+    {
+      "epoch": 0.41739130434782606,
+      "grad_norm": 0.6163533652451042,
+      "learning_rate": 5e-06,
+      "loss": 0.8442,
+      "step": 270
+    },
+    {
+      "epoch": 0.43285024154589374,
+      "grad_norm": 0.5863440757370957,
+      "learning_rate": 5e-06,
+      "loss": 0.8384,
+      "step": 280
+    },
+    {
+      "epoch": 0.44830917874396137,
+      "grad_norm": 0.6841438022567938,
+      "learning_rate": 5e-06,
+      "loss": 0.8416,
+      "step": 290
+    },
+    {
+      "epoch": 0.463768115942029,
+      "grad_norm": 0.5830887688966261,
+      "learning_rate": 5e-06,
+      "loss": 0.8363,
+      "step": 300
+    },
+    {
+      "epoch": 0.4792270531400966,
+      "grad_norm": 0.7565147804535631,
+      "learning_rate": 5e-06,
+      "loss": 0.8391,
+      "step": 310
+    },
+    {
+      "epoch": 0.49468599033816424,
+      "grad_norm": 0.5377296055296723,
+      "learning_rate": 5e-06,
+      "loss": 0.8384,
+      "step": 320
+    },
+    {
+      "epoch": 0.5101449275362319,
+      "grad_norm": 0.6313225350990711,
+      "learning_rate": 5e-06,
+      "loss": 0.8344,
+      "step": 330
+    },
+    {
+      "epoch": 0.5256038647342995,
+      "grad_norm": 0.6011085474374273,
+      "learning_rate": 5e-06,
+      "loss": 0.8352,
+      "step": 340
+    },
+    {
+      "epoch": 0.5410628019323671,
+      "grad_norm": 0.6313718464958992,
+      "learning_rate": 5e-06,
+      "loss": 0.8319,
+      "step": 350
+    },
+    {
+      "epoch": 0.5565217391304348,
+      "grad_norm": 0.5602186929251594,
+      "learning_rate": 5e-06,
+      "loss": 0.8305,
+      "step": 360
+    },
+    {
+      "epoch": 0.5719806763285025,
+      "grad_norm": 0.5682159814727703,
+      "learning_rate": 5e-06,
+      "loss": 0.8283,
+      "step": 370
+    },
+    {
+      "epoch": 0.58743961352657,
+      "grad_norm": 0.5718556832144389,
+      "learning_rate": 5e-06,
+      "loss": 0.8341,
+      "step": 380
+    },
+    {
+      "epoch": 0.6028985507246377,
+      "grad_norm": 0.546768373760242,
+      "learning_rate": 5e-06,
+      "loss": 0.8291,
+      "step": 390
+    },
+    {
+      "epoch": 0.6183574879227053,
+      "grad_norm": 0.5993423126379529,
+      "learning_rate": 5e-06,
+      "loss": 0.8361,
+      "step": 400
+    },
+    {
+      "epoch": 0.633816425120773,
+      "grad_norm": 0.5491459229199431,
+      "learning_rate": 5e-06,
+      "loss": 0.8291,
+      "step": 410
+    },
+    {
+      "epoch": 0.6492753623188405,
+      "grad_norm": 0.5168339143544802,
+      "learning_rate": 5e-06,
+      "loss": 0.83,
+      "step": 420
+    },
+    {
+      "epoch": 0.6647342995169082,
+      "grad_norm": 0.5210184948415354,
+      "learning_rate": 5e-06,
+      "loss": 0.8237,
+      "step": 430
+    },
+    {
+      "epoch": 0.6801932367149759,
+      "grad_norm": 0.5424122263787127,
+      "learning_rate": 5e-06,
+      "loss": 0.8228,
+      "step": 440
+    },
+    {
+      "epoch": 0.6956521739130435,
+      "grad_norm": 0.5637417843194678,
+      "learning_rate": 5e-06,
+      "loss": 0.829,
+      "step": 450
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.4743888435043112,
+      "learning_rate": 5e-06,
+      "loss": 0.8221,
+      "step": 460
+    },
+    {
+      "epoch": 0.7265700483091787,
+      "grad_norm": 0.5224626427829437,
+      "learning_rate": 5e-06,
+      "loss": 0.8254,
+      "step": 470
+    },
+    {
+      "epoch": 0.7420289855072464,
+      "grad_norm": 0.6508696496425336,
+      "learning_rate": 5e-06,
+      "loss": 0.8264,
+      "step": 480
+    },
+    {
+      "epoch": 0.7574879227053141,
+      "grad_norm": 0.5904629481154552,
+      "learning_rate": 5e-06,
+      "loss": 0.8295,
+      "step": 490
+    },
+    {
+      "epoch": 0.7729468599033816,
+      "grad_norm": 0.574388340778744,
+      "learning_rate": 5e-06,
+      "loss": 0.8223,
+      "step": 500
+    },
+    {
+      "epoch": 0.7884057971014493,
+      "grad_norm": 0.5353158279457252,
+      "learning_rate": 5e-06,
+      "loss": 0.8257,
+      "step": 510
+    },
+    {
+      "epoch": 0.8038647342995169,
+      "grad_norm": 0.5821234180105461,
+      "learning_rate": 5e-06,
+      "loss": 0.8265,
+      "step": 520
+    },
+    {
+      "epoch": 0.8193236714975846,
+      "grad_norm": 0.5068894400573091,
+      "learning_rate": 5e-06,
+      "loss": 0.8224,
+      "step": 530
+    },
+    {
+      "epoch": 0.8347826086956521,
+      "grad_norm": 0.5375828449207095,
+      "learning_rate": 5e-06,
+      "loss": 0.821,
+      "step": 540
+    },
+    {
+      "epoch": 0.8502415458937198,
+      "grad_norm": 0.5410980089439529,
+      "learning_rate": 5e-06,
+      "loss": 0.8271,
+      "step": 550
+    },
+    {
+      "epoch": 0.8657004830917875,
+      "grad_norm": 0.5953566167479901,
+      "learning_rate": 5e-06,
+      "loss": 0.8168,
+      "step": 560
+    },
+    {
+      "epoch": 0.881159420289855,
+      "grad_norm": 0.6599677690788177,
+      "learning_rate": 5e-06,
+      "loss": 0.8234,
+      "step": 570
+    },
+    {
+      "epoch": 0.8966183574879227,
+      "grad_norm": 0.6548750016255851,
+      "learning_rate": 5e-06,
+      "loss": 0.8198,
+      "step": 580
+    },
+    {
+      "epoch": 0.9120772946859903,
+      "grad_norm": 0.6314610392042965,
+      "learning_rate": 5e-06,
+      "loss": 0.8169,
+      "step": 590
+    },
+    {
+      "epoch": 0.927536231884058,
+      "grad_norm": 0.4920135279723095,
+      "learning_rate": 5e-06,
+      "loss": 0.8225,
+      "step": 600
+    },
+    {
+      "epoch": 0.9429951690821256,
+      "grad_norm": 0.5279136309564921,
+      "learning_rate": 5e-06,
+      "loss": 0.8188,
+      "step": 610
+    },
+    {
+      "epoch": 0.9584541062801932,
+      "grad_norm": 0.5531816471285108,
+      "learning_rate": 5e-06,
+      "loss": 0.8217,
+      "step": 620
+    },
+    {
+      "epoch": 0.9739130434782609,
+      "grad_norm": 0.5924801838934433,
+      "learning_rate": 5e-06,
+      "loss": 0.8221,
+      "step": 630
+    },
+    {
+      "epoch": 0.9893719806763285,
+      "grad_norm": 0.5117042192273262,
+      "learning_rate": 5e-06,
+      "loss": 0.8188,
+      "step": 640
+    },
+    {
+      "epoch": 0.9986473429951691,
+      "eval_loss": 0.8183467984199524,
+      "eval_runtime": 686.568,
+      "eval_samples_per_second": 25.389,
+      "eval_steps_per_second": 0.398,
+      "step": 646
+    },
+    {
+      "epoch": 1.0050241545893719,
+      "grad_norm": 0.6925909025947767,
+      "learning_rate": 5e-06,
+      "loss": 0.8235,
+      "step": 650
+    },
+    {
+      "epoch": 1.0204830917874397,
+      "grad_norm": 0.6132973767615294,
+      "learning_rate": 5e-06,
+      "loss": 0.7771,
+      "step": 660
+    },
+    {
+      "epoch": 1.0359420289855072,
+      "grad_norm": 0.6797848846411009,
+      "learning_rate": 5e-06,
+      "loss": 0.7802,
+      "step": 670
+    },
+    {
+      "epoch": 1.0514009661835748,
+      "grad_norm": 0.5295735808202817,
+      "learning_rate": 5e-06,
+      "loss": 0.7777,
+      "step": 680
+    },
+    {
+      "epoch": 1.0668599033816426,
+      "grad_norm": 0.5271721002677758,
+      "learning_rate": 5e-06,
+      "loss": 0.7751,
+      "step": 690
+    },
+    {
+      "epoch": 1.0823188405797102,
+      "grad_norm": 0.47521281338293253,
+      "learning_rate": 5e-06,
+      "loss": 0.7808,
+      "step": 700
+    },
+    {
+      "epoch": 1.0977777777777777,
+      "grad_norm": 0.5201403409762577,
+      "learning_rate": 5e-06,
+      "loss": 0.7769,
+      "step": 710
+    },
+    {
+      "epoch": 1.1132367149758453,
+      "grad_norm": 0.5374055398678584,
+      "learning_rate": 5e-06,
+      "loss": 0.7775,
+      "step": 720
+    },
+    {
+      "epoch": 1.128695652173913,
+      "grad_norm": 0.520683864449963,
+      "learning_rate": 5e-06,
+      "loss": 0.7787,
+      "step": 730
+    },
+    {
+      "epoch": 1.1441545893719807,
+      "grad_norm": 0.5406489528118505,
+      "learning_rate": 5e-06,
+      "loss": 0.7816,
+      "step": 740
+    },
+    {
+      "epoch": 1.1596135265700482,
+      "grad_norm": 0.585881797178412,
+      "learning_rate": 5e-06,
+      "loss": 0.7811,
+      "step": 750
+    },
+    {
+      "epoch": 1.175072463768116,
+      "grad_norm": 0.5490222258224376,
+      "learning_rate": 5e-06,
+      "loss": 0.7763,
+      "step": 760
+    },
+    {
+      "epoch": 1.1905314009661836,
+      "grad_norm": 0.6049557272461074,
+      "learning_rate": 5e-06,
+      "loss": 0.7821,
+      "step": 770
+    },
+    {
+      "epoch": 1.2059903381642512,
+      "grad_norm": 0.6287813068938076,
+      "learning_rate": 5e-06,
+      "loss": 0.7771,
+      "step": 780
+    },
+    {
+      "epoch": 1.221449275362319,
+      "grad_norm": 0.5791771698431348,
+      "learning_rate": 5e-06,
+      "loss": 0.7832,
+      "step": 790
+    },
+    {
+      "epoch": 1.2369082125603865,
+      "grad_norm": 0.552647068072239,
+      "learning_rate": 5e-06,
+      "loss": 0.7795,
+      "step": 800
+    },
+    {
+      "epoch": 1.252367149758454,
+      "grad_norm": 0.48953182542010515,
+      "learning_rate": 5e-06,
+      "loss": 0.7767,
+      "step": 810
+    },
+    {
+      "epoch": 1.2678260869565217,
+      "grad_norm": 0.5809037976182655,
+      "learning_rate": 5e-06,
+      "loss": 0.7784,
+      "step": 820
+    },
+    {
+      "epoch": 1.2832850241545894,
+      "grad_norm": 0.49664609280994976,
+      "learning_rate": 5e-06,
+      "loss": 0.7765,
+      "step": 830
+    },
+    {
+      "epoch": 1.298743961352657,
+      "grad_norm": 0.5514267021897065,
+      "learning_rate": 5e-06,
+      "loss": 0.7791,
+      "step": 840
+    },
+    {
+      "epoch": 1.3142028985507246,
+      "grad_norm": 0.6174163379347436,
+      "learning_rate": 5e-06,
+      "loss": 0.7775,
+      "step": 850
+    },
+    {
+      "epoch": 1.3296618357487922,
+      "grad_norm": 0.5893029009867757,
+      "learning_rate": 5e-06,
+      "loss": 0.7743,
+      "step": 860
+    },
+    {
+      "epoch": 1.34512077294686,
+      "grad_norm": 0.5884521899466931,
+      "learning_rate": 5e-06,
+      "loss": 0.7768,
+      "step": 870
+    },
+    {
+      "epoch": 1.3605797101449275,
+      "grad_norm": 0.526781782612563,
+      "learning_rate": 5e-06,
+      "loss": 0.773,
+      "step": 880
+    },
+    {
+      "epoch": 1.376038647342995,
+      "grad_norm": 0.5133541303046,
+      "learning_rate": 5e-06,
+      "loss": 0.774,
+      "step": 890
+    },
+    {
+      "epoch": 1.3914975845410629,
+      "grad_norm": 0.5514217537787884,
+      "learning_rate": 5e-06,
+      "loss": 0.7802,
+      "step": 900
+    },
+    {
+      "epoch": 1.4069565217391304,
+      "grad_norm": 0.5829849669974898,
+      "learning_rate": 5e-06,
+      "loss": 0.7787,
+      "step": 910
+    },
+    {
+      "epoch": 1.422415458937198,
+      "grad_norm": 0.6099035981973764,
+      "learning_rate": 5e-06,
+      "loss": 0.7738,
+      "step": 920
+    },
+    {
+      "epoch": 1.4378743961352658,
+      "grad_norm": 0.4767884324426242,
+      "learning_rate": 5e-06,
+      "loss": 0.7773,
+      "step": 930
+    },
+    {
+      "epoch": 1.4533333333333334,
+      "grad_norm": 0.5611337081061908,
+      "learning_rate": 5e-06,
+      "loss": 0.7767,
+      "step": 940
+    },
+    {
+      "epoch": 1.468792270531401,
+      "grad_norm": 0.47930773858272085,
+      "learning_rate": 5e-06,
+      "loss": 0.7765,
+      "step": 950
+    },
+    {
+      "epoch": 1.4842512077294687,
+      "grad_norm": 0.498168257215718,
+      "learning_rate": 5e-06,
+      "loss": 0.7728,
+      "step": 960
+    },
+    {
+      "epoch": 1.4997101449275363,
+      "grad_norm": 0.5576989172567428,
+      "learning_rate": 5e-06,
+      "loss": 0.7777,
+      "step": 970
+    },
+    {
+      "epoch": 1.5151690821256039,
+      "grad_norm": 0.5873903650866855,
+      "learning_rate": 5e-06,
+      "loss": 0.7747,
+      "step": 980
+    },
+    {
+      "epoch": 1.5306280193236717,
+      "grad_norm": 0.5564865473674926,
+      "learning_rate": 5e-06,
+      "loss": 0.7786,
+      "step": 990
+    },
+    {
+      "epoch": 1.546086956521739,
+      "grad_norm": 0.6746662280932265,
+      "learning_rate": 5e-06,
+      "loss": 0.7823,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5615458937198068,
+      "grad_norm": 0.550553366091711,
+      "learning_rate": 5e-06,
+      "loss": 0.7704,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5770048309178744,
+      "grad_norm": 0.555996816403915,
+      "learning_rate": 5e-06,
+      "loss": 0.7758,
+      "step": 1020
+    },
+    {
+      "epoch": 1.592463768115942,
+      "grad_norm": 0.5621088990135378,
+      "learning_rate": 5e-06,
+      "loss": 0.7751,
+      "step": 1030
+    },
+    {
+      "epoch": 1.6079227053140097,
+      "grad_norm": 0.4672348676970037,
+      "learning_rate": 5e-06,
+      "loss": 0.7742,
+      "step": 1040
+    },
+    {
+      "epoch": 1.6233816425120773,
+      "grad_norm": 0.49112359521062937,
+      "learning_rate": 5e-06,
+      "loss": 0.777,
+      "step": 1050
+    },
+    {
+      "epoch": 1.6388405797101449,
+      "grad_norm": 0.5517626252028611,
+      "learning_rate": 5e-06,
+      "loss": 0.7757,
+      "step": 1060
+    },
+    {
+      "epoch": 1.6542995169082126,
+      "grad_norm": 0.5518129870744243,
+      "learning_rate": 5e-06,
+      "loss": 0.7744,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6697584541062802,
+      "grad_norm": 0.685405898117341,
+      "learning_rate": 5e-06,
+      "loss": 0.7753,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6852173913043478,
+      "grad_norm": 0.5720673042328214,
+      "learning_rate": 5e-06,
+      "loss": 0.7753,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7006763285024156,
+      "grad_norm": 0.4690028175072265,
+      "learning_rate": 5e-06,
+      "loss": 0.774,
+      "step": 1100
+    },
+    {
+      "epoch": 1.7161352657004831,
+      "grad_norm": 0.55568178811657,
+      "learning_rate": 5e-06,
+      "loss": 0.7772,
+      "step": 1110
+    },
+    {
+      "epoch": 1.7315942028985507,
+      "grad_norm": 0.5185570493500619,
+      "learning_rate": 5e-06,
+      "loss": 0.781,
+      "step": 1120
+    },
+    {
+      "epoch": 1.7470531400966185,
+      "grad_norm": 0.5292299708932318,
+      "learning_rate": 5e-06,
+      "loss": 0.7749,
+      "step": 1130
+    },
+    {
+      "epoch": 1.7625120772946858,
+      "grad_norm": 0.5884058161213621,
+      "learning_rate": 5e-06,
+      "loss": 0.7719,
+      "step": 1140
+    },
+    {
+      "epoch": 1.7779710144927536,
+      "grad_norm": 0.5072506431239099,
+      "learning_rate": 5e-06,
+      "loss": 0.7753,
+      "step": 1150
+    },
+    {
+      "epoch": 1.7934299516908214,
+      "grad_norm": 0.5551938392960334,
+      "learning_rate": 5e-06,
+      "loss": 0.7777,
+      "step": 1160
+    },
+    {
+      "epoch": 1.8088888888888888,
+      "grad_norm": 0.5566487078925049,
+      "learning_rate": 5e-06,
+      "loss": 0.7774,
+      "step": 1170
+    },
+    {
+      "epoch": 1.8243478260869566,
+      "grad_norm": 0.4749917546235466,
+      "learning_rate": 5e-06,
+      "loss": 0.7734,
+      "step": 1180
+    },
+    {
+      "epoch": 1.8398067632850241,
+      "grad_norm": 0.5022635709311233,
+      "learning_rate": 5e-06,
+      "loss": 0.7743,
+      "step": 1190
+    },
+    {
+      "epoch": 1.8552657004830917,
+      "grad_norm": 0.5442982810099344,
+      "learning_rate": 5e-06,
+      "loss": 0.7728,
+      "step": 1200
+    },
+    {
+      "epoch": 1.8707246376811595,
+      "grad_norm": 0.5155014433123901,
+      "learning_rate": 5e-06,
+      "loss": 0.774,
+      "step": 1210
+    },
+    {
+      "epoch": 1.886183574879227,
+      "grad_norm": 0.5934285413681538,
+      "learning_rate": 5e-06,
+      "loss": 0.7746,
+      "step": 1220
+    },
+    {
+      "epoch": 1.9016425120772946,
+      "grad_norm": 0.5260175972638601,
+      "learning_rate": 5e-06,
+      "loss": 0.7693,
+      "step": 1230
+    },
+    {
+      "epoch": 1.9171014492753624,
+      "grad_norm": 0.515080447493818,
+      "learning_rate": 5e-06,
+      "loss": 0.7717,
+      "step": 1240
+    },
+    {
+      "epoch": 1.93256038647343,
+      "grad_norm": 0.6011160845737209,
+      "learning_rate": 5e-06,
+      "loss": 0.7754,
+      "step": 1250
+    },
+    {
+      "epoch": 1.9480193236714975,
+      "grad_norm": 0.46061302659355685,
+      "learning_rate": 5e-06,
+      "loss": 0.7755,
+      "step": 1260
+    },
+    {
+      "epoch": 1.9634782608695653,
+      "grad_norm": 0.46039484020056154,
+      "learning_rate": 5e-06,
+      "loss": 0.7722,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9789371980676327,
+      "grad_norm": 0.5658493454639554,
+      "learning_rate": 5e-06,
+      "loss": 0.7755,
+      "step": 1280
+    },
+    {
+      "epoch": 1.9943961352657005,
+      "grad_norm": 0.5908199178180503,
+      "learning_rate": 5e-06,
+      "loss": 0.7709,
+      "step": 1290
+    },
+    {
+      "epoch": 1.9990338164251207,
+      "eval_loss": 0.8051349520683289,
+      "eval_runtime": 690.0891,
+      "eval_samples_per_second": 25.259,
+      "eval_steps_per_second": 0.396,
+      "step": 1293
+    },
+    {
+      "epoch": 2.0100483091787438,
+      "grad_norm": 0.6943407863572525,
+      "learning_rate": 5e-06,
+      "loss": 0.7604,
+      "step": 1300
+    },
+    {
+      "epoch": 2.0255072463768116,
+      "grad_norm": 0.5840764739328596,
+      "learning_rate": 5e-06,
+      "loss": 0.7299,
+      "step": 1310
+    },
+    {
+      "epoch": 2.0409661835748794,
+      "grad_norm": 0.645835170219903,
+      "learning_rate": 5e-06,
+      "loss": 0.7254,
+      "step": 1320
+    },
+    {
+      "epoch": 2.0564251207729467,
+      "grad_norm": 0.6967100498978297,
+      "learning_rate": 5e-06,
+      "loss": 0.7312,
+      "step": 1330
+    },
+    {
+      "epoch": 2.0718840579710145,
+      "grad_norm": 0.5424795093750152,
+      "learning_rate": 5e-06,
+      "loss": 0.7283,
+      "step": 1340
+    },
+    {
+      "epoch": 2.0873429951690823,
+      "grad_norm": 0.5651081335517218,
+      "learning_rate": 5e-06,
+      "loss": 0.7322,
+      "step": 1350
+    },
+    {
+      "epoch": 2.1028019323671496,
+      "grad_norm": 0.5793019251125064,
+      "learning_rate": 5e-06,
+      "loss": 0.7317,
+      "step": 1360
+    },
+    {
+      "epoch": 2.1182608695652174,
+      "grad_norm": 0.5653295937261641,
+      "learning_rate": 5e-06,
+      "loss": 0.7331,
+      "step": 1370
+    },
+    {
+      "epoch": 2.133719806763285,
+      "grad_norm": 0.6945092784765529,
+      "learning_rate": 5e-06,
+      "loss": 0.7346,
+      "step": 1380
+    },
+    {
+      "epoch": 2.1491787439613526,
+      "grad_norm": 0.5795163218543443,
+      "learning_rate": 5e-06,
+      "loss": 0.7336,
+      "step": 1390
+    },
+    {
+      "epoch": 2.1646376811594203,
+      "grad_norm": 0.5922357321216497,
+      "learning_rate": 5e-06,
+      "loss": 0.7299,
+      "step": 1400
+    },
+    {
+      "epoch": 2.1800966183574877,
+      "grad_norm": 0.5570557796263025,
+      "learning_rate": 5e-06,
+      "loss": 0.7333,
+      "step": 1410
+    },
+    {
+      "epoch": 2.1955555555555555,
+      "grad_norm": 0.5392312450784695,
+      "learning_rate": 5e-06,
+      "loss": 0.7371,
+      "step": 1420
+    },
+    {
+      "epoch": 2.2110144927536233,
+      "grad_norm": 0.569063560563541,
+      "learning_rate": 5e-06,
+      "loss": 0.7314,
+      "step": 1430
+    },
+    {
+      "epoch": 2.2264734299516906,
+      "grad_norm": 0.6107660118171969,
+      "learning_rate": 5e-06,
+      "loss": 0.7322,
+      "step": 1440
+    },
+    {
+      "epoch": 2.2419323671497584,
+      "grad_norm": 0.6566517138097786,
+      "learning_rate": 5e-06,
+      "loss": 0.7356,
+      "step": 1450
+    },
+    {
+      "epoch": 2.257391304347826,
+      "grad_norm": 0.5806353609910259,
+      "learning_rate": 5e-06,
+      "loss": 0.7418,
+      "step": 1460
+    },
+    {
+      "epoch": 2.2728502415458935,
+      "grad_norm": 0.544246667709765,
+      "learning_rate": 5e-06,
+      "loss": 0.7319,
+      "step": 1470
+    },
+    {
+      "epoch": 2.2883091787439613,
+      "grad_norm": 0.5424208252581,
+      "learning_rate": 5e-06,
+      "loss": 0.7332,
+      "step": 1480
+    },
+    {
+      "epoch": 2.303768115942029,
+      "grad_norm": 0.5380434193955503,
+      "learning_rate": 5e-06,
+      "loss": 0.7342,
+      "step": 1490
+    },
+    {
+      "epoch": 2.3192270531400965,
+      "grad_norm": 0.5919093406358342,
+      "learning_rate": 5e-06,
+      "loss": 0.7345,
+      "step": 1500
+    },
+    {
+      "epoch": 2.3346859903381643,
+      "grad_norm": 0.5815232359700448,
+      "learning_rate": 5e-06,
+      "loss": 0.7357,
+      "step": 1510
+    },
+    {
+      "epoch": 2.350144927536232,
+      "grad_norm": 0.6561512544812266,
+      "learning_rate": 5e-06,
+      "loss": 0.7339,
+      "step": 1520
+    },
+    {
+      "epoch": 2.3656038647342994,
+      "grad_norm": 0.5328952220385875,
+      "learning_rate": 5e-06,
+      "loss": 0.7297,
+      "step": 1530
+    },
+    {
+      "epoch": 2.381062801932367,
+      "grad_norm": 0.5216733576185124,
+      "learning_rate": 5e-06,
+      "loss": 0.7298,
+      "step": 1540
+    },
+    {
+      "epoch": 2.396521739130435,
+      "grad_norm": 0.6063067814678768,
+      "learning_rate": 5e-06,
+      "loss": 0.7368,
+      "step": 1550
+    },
+    {
+      "epoch": 2.4119806763285023,
+      "grad_norm": 0.5818602690123681,
+      "learning_rate": 5e-06,
+      "loss": 0.7353,
+      "step": 1560
+    },
+    {
+      "epoch": 2.42743961352657,
+      "grad_norm": 0.5913577701518534,
+      "learning_rate": 5e-06,
+      "loss": 0.7338,
+      "step": 1570
+    },
+    {
+      "epoch": 2.442898550724638,
+      "grad_norm": 0.5527497540961946,
+      "learning_rate": 5e-06,
+      "loss": 0.7329,
+      "step": 1580
+    },
+    {
+      "epoch": 2.4583574879227053,
+      "grad_norm": 0.6737570445790982,
+      "learning_rate": 5e-06,
+      "loss": 0.7367,
+      "step": 1590
+    },
+    {
+      "epoch": 2.473816425120773,
+      "grad_norm": 0.6619470586787684,
+      "learning_rate": 5e-06,
+      "loss": 0.733,
+      "step": 1600
+    },
+    {
+      "epoch": 2.4892753623188404,
+      "grad_norm": 0.4750068577638755,
+      "learning_rate": 5e-06,
+      "loss": 0.7375,
+      "step": 1610
+    },
+    {
+      "epoch": 2.504734299516908,
+      "grad_norm": 0.6847743909506772,
+      "learning_rate": 5e-06,
+      "loss": 0.7374,
+      "step": 1620
+    },
+    {
+      "epoch": 2.520193236714976,
+      "grad_norm": 0.5239840846624293,
+      "learning_rate": 5e-06,
+      "loss": 0.7311,
+      "step": 1630
+    },
+    {
+      "epoch": 2.5356521739130433,
+      "grad_norm": 0.4721718835375596,
+      "learning_rate": 5e-06,
+      "loss": 0.7308,
+      "step": 1640
+    },
+    {
+      "epoch": 2.551111111111111,
+      "grad_norm": 0.51093602092176,
+      "learning_rate": 5e-06,
+      "loss": 0.7337,
+      "step": 1650
+    },
+    {
+      "epoch": 2.566570048309179,
+      "grad_norm": 0.5517386015611798,
+      "learning_rate": 5e-06,
+      "loss": 0.7318,
+      "step": 1660
+    },
+    {
+      "epoch": 2.5820289855072462,
+      "grad_norm": 0.6326674619813268,
+      "learning_rate": 5e-06,
+      "loss": 0.736,
+      "step": 1670
+    },
+    {
+      "epoch": 2.597487922705314,
+      "grad_norm": 0.5232840712675151,
+      "learning_rate": 5e-06,
+      "loss": 0.7325,
+      "step": 1680
+    },
+    {
+      "epoch": 2.6129468599033814,
+      "grad_norm": 0.4969751533645812,
+      "learning_rate": 5e-06,
+      "loss": 0.7375,
+      "step": 1690
+    },
+    {
+      "epoch": 2.628405797101449,
+      "grad_norm": 0.49538430512331766,
+      "learning_rate": 5e-06,
+      "loss": 0.7366,
+      "step": 1700
+    },
+    {
+      "epoch": 2.643864734299517,
+      "grad_norm": 0.6208865012276192,
+      "learning_rate": 5e-06,
+      "loss": 0.7372,
+      "step": 1710
+    },
+    {
+      "epoch": 2.6593236714975843,
+      "grad_norm": 0.5276942120485377,
+      "learning_rate": 5e-06,
+      "loss": 0.7351,
+      "step": 1720
+    },
+    {
+      "epoch": 2.674782608695652,
+      "grad_norm": 0.570808842039396,
+      "learning_rate": 5e-06,
+      "loss": 0.7384,
+      "step": 1730
+    },
+    {
+      "epoch": 2.69024154589372,
+      "grad_norm": 0.5214638213365278,
+      "learning_rate": 5e-06,
+      "loss": 0.7361,
+      "step": 1740
+    },
+    {
+      "epoch": 2.7057004830917872,
+      "grad_norm": 0.5190586781651014,
+      "learning_rate": 5e-06,
+      "loss": 0.7309,
+      "step": 1750
+    },
+    {
+      "epoch": 2.721159420289855,
+      "grad_norm": 0.5317230869170978,
+      "learning_rate": 5e-06,
+      "loss": 0.7296,
+      "step": 1760
+    },
+    {
+      "epoch": 2.736618357487923,
+      "grad_norm": 0.5917255596181432,
+      "learning_rate": 5e-06,
+      "loss": 0.7406,
+      "step": 1770
+    },
+    {
+      "epoch": 2.75207729468599,
+      "grad_norm": 0.49202576322983876,
+      "learning_rate": 5e-06,
+      "loss": 0.7324,
+      "step": 1780
+    },
+    {
+      "epoch": 2.767536231884058,
+      "grad_norm": 0.5594574654106287,
+      "learning_rate": 5e-06,
+      "loss": 0.7331,
+      "step": 1790
+    },
+    {
+      "epoch": 2.7829951690821257,
+      "grad_norm": 0.6198580773466541,
+      "learning_rate": 5e-06,
+      "loss": 0.7372,
+      "step": 1800
+    },
+    {
+      "epoch": 2.798454106280193,
+      "grad_norm": 0.5740394274550438,
+      "learning_rate": 5e-06,
+      "loss": 0.7359,
+      "step": 1810
+    },
+    {
+      "epoch": 2.813913043478261,
+      "grad_norm": 0.5501912428656768,
+      "learning_rate": 5e-06,
+      "loss": 0.7384,
+      "step": 1820
+    },
+    {
+      "epoch": 2.8293719806763287,
+      "grad_norm": 0.5104778986757859,
+      "learning_rate": 5e-06,
+      "loss": 0.7324,
+      "step": 1830
+    },
+    {
+      "epoch": 2.844830917874396,
+      "grad_norm": 0.5395220598812313,
+      "learning_rate": 5e-06,
+      "loss": 0.736,
+      "step": 1840
+    },
+    {
+      "epoch": 2.860289855072464,
+      "grad_norm": 0.6030104859258091,
+      "learning_rate": 5e-06,
+      "loss": 0.7327,
+      "step": 1850
+    },
+    {
+      "epoch": 2.8757487922705316,
+      "grad_norm": 0.556906171705928,
+      "learning_rate": 5e-06,
+      "loss": 0.7374,
+      "step": 1860
+    },
+    {
+      "epoch": 2.891207729468599,
+      "grad_norm": 0.6174821846225631,
+      "learning_rate": 5e-06,
+      "loss": 0.7351,
+      "step": 1870
+    },
+    {
+      "epoch": 2.9066666666666667,
+      "grad_norm": 0.5078906232420815,
+      "learning_rate": 5e-06,
+      "loss": 0.7326,
+      "step": 1880
+    },
+    {
+      "epoch": 2.9221256038647345,
+      "grad_norm": 0.6177111487230912,
+      "learning_rate": 5e-06,
+      "loss": 0.7321,
+      "step": 1890
+    },
+    {
+      "epoch": 2.937584541062802,
+      "grad_norm": 0.5520929737500946,
+      "learning_rate": 5e-06,
+      "loss": 0.7395,
+      "step": 1900
+    },
+    {
+      "epoch": 2.9530434782608697,
+      "grad_norm": 0.5185834378400617,
+      "learning_rate": 5e-06,
+      "loss": 0.7368,
+      "step": 1910
+    },
+    {
+      "epoch": 2.9685024154589374,
+      "grad_norm": 0.5204851978024219,
+      "learning_rate": 5e-06,
+      "loss": 0.7339,
+      "step": 1920
+    },
+    {
+      "epoch": 2.983961352657005,
+      "grad_norm": 0.5807949438616106,
+      "learning_rate": 5e-06,
+      "loss": 0.7386,
+      "step": 1930
+    },
+    {
+      "epoch": 2.9963285024154587,
+      "eval_loss": 0.8025317192077637,
+      "eval_runtime": 693.9451,
+      "eval_samples_per_second": 25.119,
+      "eval_steps_per_second": 0.393,
+      "step": 1938
+    },
+    {
+      "epoch": 2.9963285024154587,
+      "step": 1938,
+      "total_flos": 3246012802007040.0,
+      "train_loss": 0.7871413270263357,
+      "train_runtime": 114590.6996,
+      "train_samples_per_second": 8.67,
+      "train_steps_per_second": 0.017
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 1938,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3246012802007040.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed