diff --git a/.ipynb_checkpoints/Untitled-checkpoint.ipynb b/.ipynb_checkpoints/Untitled-checkpoint.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..363fcab7ed6e9634e198cf5555ceb88932c9a245 --- /dev/null +++ b/.ipynb_checkpoints/Untitled-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/PT/.ipynb_checkpoints/trainer_log-checkpoint.jsonl b/PT/.ipynb_checkpoints/trainer_log-checkpoint.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41e1a08e24a916eb1d8fe30e21ff61df365c1f76 --- /dev/null +++ b/PT/.ipynb_checkpoints/trainer_log-checkpoint.jsonl @@ -0,0 +1,582 @@ +{"current_steps": 5, "total_steps": 2766, "loss": 2.0025, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009999919374161553, "epoch": 0.01, "percentage": 0.18, "elapsed_time": "0:00:18", "remaining_time": "2:48:35"} +{"current_steps": 10, "total_steps": 2766, "loss": 1.7737, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009999677499246417, "epoch": 0.01, "percentage": 0.36, "elapsed_time": "0:00:35", "remaining_time": "2:44:54"} +{"current_steps": 15, "total_steps": 2766, "loss": 1.7391, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009999274383055143, "epoch": 0.02, "percentage": 0.54, "elapsed_time": "0:00:53", "remaining_time": "2:43:29"} +{"current_steps": 20, "total_steps": 2766, "loss": 1.7959, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009998710038588363, "epoch": 0.02, "percentage": 0.72, "elapsed_time": "0:01:11", "remaining_time": "2:42:37"} +{"current_steps": 25, "total_steps": 2766, "loss": 1.713, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009997984484046375, "epoch": 0.03, "percentage": 0.9, "elapsed_time": "0:01:28", "remaining_time": "2:41:59"} +{"current_steps": 30, "total_steps": 2766, "loss": 1.6441, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009997097742828556, "epoch": 0.03, "percentage": 1.08, "elapsed_time": "0:01:46", "remaining_time": "2:41:28"} +{"current_steps": 35, "total_steps": 2766, "loss": 1.704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009996049843532607, "epoch": 0.04, "percentage": 1.27, "elapsed_time": "0:02:03", "remaining_time": "2:41:01"} +{"current_steps": 40, "total_steps": 2766, "loss": 1.6532, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009994840819953633, "epoch": 0.04, "percentage": 1.45, "elapsed_time": "0:02:21", "remaining_time": "2:40:36"} +{"current_steps": 45, "total_steps": 2766, "loss": 1.6791, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009993470711083048, "epoch": 0.05, "percentage": 1.63, "elapsed_time": "0:02:38", "remaining_time": "2:40:13"} +{"current_steps": 50, "total_steps": 2766, "loss": 1.6465, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009991939561107325, "epoch": 0.05, "percentage": 1.81, "elapsed_time": "0:02:56", "remaining_time": "2:39:51"} +{"current_steps": 55, "total_steps": 2766, "loss": 1.6511, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000999024741940656, "epoch": 0.06, "percentage": 1.99, "elapsed_time": "0:03:14", "remaining_time": "2:39:30"} +{"current_steps": 60, "total_steps": 2766, "loss": 1.6727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009988394340552898, "epoch": 0.07, "percentage": 2.17, "elapsed_time": "0:03:31", "remaining_time": "2:39:09"} +{"current_steps": 65, "total_steps": 2766, "loss": 1.6653, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009986380384308746, "epoch": 0.07, "percentage": 2.35, "elapsed_time": "0:03:49", "remaining_time": "2:38:49"} +{"current_steps": 70, "total_steps": 2766, "loss": 1.6339, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009984205615624873, "epoch": 0.08, "percentage": 2.53, "elapsed_time": "0:04:06", "remaining_time": "2:38:29"} +{"current_steps": 75, "total_steps": 2766, "loss": 1.5562, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009981870104638294, "epoch": 0.08, "percentage": 2.71, "elapsed_time": "0:04:24", "remaining_time": "2:38:09"} +{"current_steps": 80, "total_steps": 2766, "loss": 1.6291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009979373926670028, "epoch": 0.09, "percentage": 2.89, "elapsed_time": "0:04:42", "remaining_time": "2:37:50"} +{"current_steps": 85, "total_steps": 2766, "loss": 1.625, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009976717162222645, "epoch": 0.09, "percentage": 3.07, "elapsed_time": "0:04:59", "remaining_time": "2:37:31"} +{"current_steps": 90, "total_steps": 2766, "loss": 1.6008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009973899896977695, "epoch": 0.1, "percentage": 3.25, "elapsed_time": "0:05:17", "remaining_time": "2:37:12"} +{"current_steps": 95, "total_steps": 2766, "loss": 1.6821, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000997092222179292, "epoch": 0.1, "percentage": 3.43, "elapsed_time": "0:05:34", "remaining_time": "2:36:54"} +{"current_steps": 100, "total_steps": 2766, "loss": 1.582, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009967784232699352, "epoch": 0.11, "percentage": 3.62, "elapsed_time": "0:05:52", "remaining_time": "2:36:35"} +{"current_steps": 100, "total_steps": 2766, "loss": null, "eval_loss": 1.6186352968215942, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.11, "percentage": 3.62, "elapsed_time": "0:05:52", "remaining_time": "2:36:35"} +{"current_steps": 105, "total_steps": 2766, "loss": 1.5769, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009964486030898186, "epoch": 0.11, "percentage": 3.8, "elapsed_time": "0:06:20", "remaining_time": "2:40:48"} +{"current_steps": 110, "total_steps": 2766, "loss": 1.5868, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009961027722757538, "epoch": 0.12, "percentage": 3.98, "elapsed_time": "0:06:38", "remaining_time": "2:40:17"} +{"current_steps": 115, "total_steps": 2766, "loss": 1.5601, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009957409419809006, "epoch": 0.12, "percentage": 4.16, "elapsed_time": "0:06:55", "remaining_time": "2:39:47"} +{"current_steps": 120, "total_steps": 2766, "loss": 1.6061, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000995363123874407, "epoch": 0.13, "percentage": 4.34, "elapsed_time": "0:07:13", "remaining_time": "2:39:18"} +{"current_steps": 125, "total_steps": 2766, "loss": 1.6073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009949693301410341, "epoch": 0.14, "percentage": 4.52, "elapsed_time": "0:07:31", "remaining_time": "2:38:50"} +{"current_steps": 130, "total_steps": 2766, "loss": 1.4998, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009945595734807615, "epoch": 0.14, "percentage": 4.7, "elapsed_time": "0:07:48", "remaining_time": "2:38:22"} +{"current_steps": 135, "total_steps": 2766, "loss": 1.5295, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009941338671083794, "epoch": 0.15, "percentage": 4.88, "elapsed_time": "0:08:06", "remaining_time": "2:37:56"} +{"current_steps": 140, "total_steps": 2766, "loss": 1.5418, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009936922247530606, "epoch": 0.15, "percentage": 5.06, "elapsed_time": "0:08:23", "remaining_time": "2:37:30"} +{"current_steps": 145, "total_steps": 2766, "loss": 1.554, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009932346606579192, "epoch": 0.16, "percentage": 5.24, "elapsed_time": "0:08:41", "remaining_time": "2:37:04"} +{"current_steps": 150, "total_steps": 2766, "loss": 1.5509, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009927611895795513, "epoch": 0.16, "percentage": 5.42, "elapsed_time": "0:08:58", "remaining_time": "2:36:40"} +{"current_steps": 155, "total_steps": 2766, "loss": 1.6123, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009922718267875571, "epoch": 0.17, "percentage": 5.6, "elapsed_time": "0:09:16", "remaining_time": "2:36:15"} +{"current_steps": 160, "total_steps": 2766, "loss": 1.6267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009917665880640515, "epoch": 0.17, "percentage": 5.78, "elapsed_time": "0:09:34", "remaining_time": "2:35:51"} +{"current_steps": 165, "total_steps": 2766, "loss": 1.6116, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009912454897031524, "epoch": 0.18, "percentage": 5.97, "elapsed_time": "0:09:51", "remaining_time": "2:35:28"} +{"current_steps": 170, "total_steps": 2766, "loss": 1.5618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009907085485104568, "epoch": 0.18, "percentage": 6.15, "elapsed_time": "0:10:09", "remaining_time": "2:35:04"} +{"current_steps": 175, "total_steps": 2766, "loss": 1.6085, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009901557818024981, "epoch": 0.19, "percentage": 6.33, "elapsed_time": "0:10:26", "remaining_time": "2:34:41"} +{"current_steps": 180, "total_steps": 2766, "loss": 1.5829, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009895872074061885, "epoch": 0.2, "percentage": 6.51, "elapsed_time": "0:10:44", "remaining_time": "2:34:19"} +{"current_steps": 185, "total_steps": 2766, "loss": 1.5407, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009890028436582426, "epoch": 0.2, "percentage": 6.69, "elapsed_time": "0:11:02", "remaining_time": "2:33:57"} +{"current_steps": 190, "total_steps": 2766, "loss": 1.5568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009884027094045871, "epoch": 0.21, "percentage": 6.87, "elapsed_time": "0:11:19", "remaining_time": "2:33:34"} +{"current_steps": 195, "total_steps": 2766, "loss": 1.5831, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009877868239997532, "epoch": 0.21, "percentage": 7.05, "elapsed_time": "0:11:37", "remaining_time": "2:33:13"} +{"current_steps": 200, "total_steps": 2766, "loss": 1.5231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009871552073062516, "epoch": 0.22, "percentage": 7.23, "elapsed_time": "0:11:54", "remaining_time": "2:32:51"} +{"current_steps": 200, "total_steps": 2766, "loss": null, "eval_loss": 1.5717933177947998, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.22, "percentage": 7.23, "elapsed_time": "0:11:54", "remaining_time": "2:32:51"} +{"current_steps": 205, "total_steps": 2766, "loss": 1.5467, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009865078796939327, "epoch": 0.22, "percentage": 7.41, "elapsed_time": "0:12:23", "remaining_time": "2:34:43"} +{"current_steps": 210, "total_steps": 2766, "loss": 1.6403, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000985844862039329, "epoch": 0.23, "percentage": 7.59, "elapsed_time": "0:12:40", "remaining_time": "2:34:19"} +{"current_steps": 215, "total_steps": 2766, "loss": 1.5352, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009851661757249823, "epoch": 0.23, "percentage": 7.77, "elapsed_time": "0:12:58", "remaining_time": "2:33:54"} +{"current_steps": 220, "total_steps": 2766, "loss": 1.5616, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009844718426387537, "epoch": 0.24, "percentage": 7.95, "elapsed_time": "0:13:15", "remaining_time": "2:33:30"} +{"current_steps": 225, "total_steps": 2766, "loss": 1.5274, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000983761885173118, "epoch": 0.24, "percentage": 8.13, "elapsed_time": "0:13:33", "remaining_time": "2:33:06"} +{"current_steps": 230, "total_steps": 2766, "loss": 1.6153, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000983036326224442, "epoch": 0.25, "percentage": 8.32, "elapsed_time": "0:13:51", "remaining_time": "2:32:43"} +{"current_steps": 235, "total_steps": 2766, "loss": 1.5062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009822951891922448, "epoch": 0.25, "percentage": 8.5, "elapsed_time": "0:14:08", "remaining_time": "2:32:20"} +{"current_steps": 240, "total_steps": 2766, "loss": 1.6038, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009815384979784444, "epoch": 0.26, "percentage": 8.68, "elapsed_time": "0:14:26", "remaining_time": "2:31:57"} +{"current_steps": 245, "total_steps": 2766, "loss": 1.5097, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000980766276986586, "epoch": 0.27, "percentage": 8.86, "elapsed_time": "0:14:43", "remaining_time": "2:31:34"} +{"current_steps": 250, "total_steps": 2766, "loss": 1.535, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009799785511210557, "epoch": 0.27, "percentage": 9.04, "elapsed_time": "0:15:01", "remaining_time": "2:31:11"} +{"current_steps": 255, "total_steps": 2766, "loss": 1.52, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000979175345786277, "epoch": 0.28, "percentage": 9.22, "elapsed_time": "0:15:18", "remaining_time": "2:30:49"} +{"current_steps": 260, "total_steps": 2766, "loss": 1.5678, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009783566868858912, "epoch": 0.28, "percentage": 9.4, "elapsed_time": "0:15:36", "remaining_time": "2:30:27"} +{"current_steps": 265, "total_steps": 2766, "loss": 1.5536, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009775226008219224, "epoch": 0.29, "percentage": 9.58, "elapsed_time": "0:15:54", "remaining_time": "2:30:04"} +{"current_steps": 270, "total_steps": 2766, "loss": 1.4826, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009766731144939258, "epoch": 0.29, "percentage": 9.76, "elapsed_time": "0:16:11", "remaining_time": "2:29:43"} +{"current_steps": 275, "total_steps": 2766, "loss": 1.5537, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009758082552981204, "epoch": 0.3, "percentage": 9.94, "elapsed_time": "0:16:29", "remaining_time": "2:29:21"} +{"current_steps": 280, "total_steps": 2766, "loss": 1.5277, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009749280511265056, "epoch": 0.3, "percentage": 10.12, "elapsed_time": "0:16:46", "remaining_time": "2:28:59"} +{"current_steps": 285, "total_steps": 2766, "loss": 1.5445, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009740325303659609, "epoch": 0.31, "percentage": 10.3, "elapsed_time": "0:17:04", "remaining_time": "2:28:38"} +{"current_steps": 290, "total_steps": 2766, "loss": 1.4944, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000973121721897331, "epoch": 0.31, "percentage": 10.48, "elapsed_time": "0:17:22", "remaining_time": "2:28:17"} +{"current_steps": 295, "total_steps": 2766, "loss": 1.5088, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009721956550944948, "epoch": 0.32, "percentage": 10.67, "elapsed_time": "0:17:39", "remaining_time": "2:27:55"} +{"current_steps": 300, "total_steps": 2766, "loss": 1.585, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009712543598234172, "epoch": 0.33, "percentage": 10.85, "elapsed_time": "0:17:57", "remaining_time": "2:27:34"} +{"current_steps": 300, "total_steps": 2766, "loss": null, "eval_loss": 1.5345921516418457, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.33, "percentage": 10.85, "elapsed_time": "0:17:57", "remaining_time": "2:27:34"} +{"current_steps": 305, "total_steps": 2766, "loss": 1.5427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009702978664411863, "epoch": 0.33, "percentage": 11.03, "elapsed_time": "0:18:25", "remaining_time": "2:28:40"} +{"current_steps": 310, "total_steps": 2766, "loss": 1.4475, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009693262057950345, "epoch": 0.34, "percentage": 11.21, "elapsed_time": "0:18:43", "remaining_time": "2:28:18"} +{"current_steps": 315, "total_steps": 2766, "loss": 1.5321, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009683394092213436, "epoch": 0.34, "percentage": 11.39, "elapsed_time": "0:19:00", "remaining_time": "2:27:55"} +{"current_steps": 320, "total_steps": 2766, "loss": 1.5171, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009673375085446339, "epoch": 0.35, "percentage": 11.57, "elapsed_time": "0:19:18", "remaining_time": "2:27:33"} +{"current_steps": 325, "total_steps": 2766, "loss": 1.5198, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009663205360765382, "epoch": 0.35, "percentage": 11.75, "elapsed_time": "0:19:35", "remaining_time": "2:27:11"} +{"current_steps": 330, "total_steps": 2766, "loss": 1.492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00096528852461476, "epoch": 0.36, "percentage": 11.93, "elapsed_time": "0:19:53", "remaining_time": "2:26:49"} +{"current_steps": 335, "total_steps": 2766, "loss": 1.5036, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009642415074420146, "epoch": 0.36, "percentage": 12.11, "elapsed_time": "0:20:11", "remaining_time": "2:26:28"} +{"current_steps": 340, "total_steps": 2766, "loss": 1.5134, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009631795183249573, "epoch": 0.37, "percentage": 12.29, "elapsed_time": "0:20:28", "remaining_time": "2:26:06"} +{"current_steps": 345, "total_steps": 2766, "loss": 1.5568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009621025915130932, "epoch": 0.37, "percentage": 12.47, "elapsed_time": "0:20:46", "remaining_time": "2:25:45"} +{"current_steps": 350, "total_steps": 2766, "loss": 1.503, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009610107617376733, "epoch": 0.38, "percentage": 12.65, "elapsed_time": "0:21:03", "remaining_time": "2:25:23"} +{"current_steps": 355, "total_steps": 2766, "loss": 1.4584, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009599040642105736, "epoch": 0.38, "percentage": 12.83, "elapsed_time": "0:21:21", "remaining_time": "2:25:02"} +{"current_steps": 360, "total_steps": 2766, "loss": 1.4832, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000958782534623161, "epoch": 0.39, "percentage": 13.02, "elapsed_time": "0:21:38", "remaining_time": "2:24:41"} +{"current_steps": 365, "total_steps": 2766, "loss": 1.4598, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009576462091451406, "epoch": 0.4, "percentage": 13.2, "elapsed_time": "0:21:56", "remaining_time": "2:24:20"} +{"current_steps": 370, "total_steps": 2766, "loss": 1.5492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009564951244233901, "epoch": 0.4, "percentage": 13.38, "elapsed_time": "0:22:14", "remaining_time": "2:23:59"} +{"current_steps": 375, "total_steps": 2766, "loss": 1.5145, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000955329317580778, "epoch": 0.41, "percentage": 13.56, "elapsed_time": "0:22:31", "remaining_time": "2:23:38"} +{"current_steps": 380, "total_steps": 2766, "loss": 1.589, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009541488262149661, "epoch": 0.41, "percentage": 13.74, "elapsed_time": "0:22:49", "remaining_time": "2:23:17"} +{"current_steps": 385, "total_steps": 2766, "loss": 1.6003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009529536883971963, "epoch": 0.42, "percentage": 13.92, "elapsed_time": "0:23:06", "remaining_time": "2:22:56"} +{"current_steps": 390, "total_steps": 2766, "loss": 1.55, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009517439426710646, "epoch": 0.42, "percentage": 14.1, "elapsed_time": "0:23:24", "remaining_time": "2:22:36"} +{"current_steps": 395, "total_steps": 2766, "loss": 1.5359, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009505196280512762, "epoch": 0.43, "percentage": 14.28, "elapsed_time": "0:23:42", "remaining_time": "2:22:15"} +{"current_steps": 400, "total_steps": 2766, "loss": 1.4854, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009492807840223881, "epoch": 0.43, "percentage": 14.46, "elapsed_time": "0:23:59", "remaining_time": "2:21:55"} +{"current_steps": 400, "total_steps": 2766, "loss": null, "eval_loss": 1.5193477869033813, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.43, "percentage": 14.46, "elapsed_time": "0:23:59", "remaining_time": "2:21:55"} +{"current_steps": 405, "total_steps": 2766, "loss": 1.4891, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009480274505375358, "epoch": 0.44, "percentage": 14.64, "elapsed_time": "0:24:27", "remaining_time": "2:22:37"} +{"current_steps": 410, "total_steps": 2766, "loss": 1.4719, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009467596680171446, "epoch": 0.44, "percentage": 14.82, "elapsed_time": "0:24:45", "remaining_time": "2:22:16"} +{"current_steps": 415, "total_steps": 2766, "loss": 1.4939, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009454774773476257, "epoch": 0.45, "percentage": 15.0, "elapsed_time": "0:25:03", "remaining_time": "2:21:55"} +{"current_steps": 420, "total_steps": 2766, "loss": 1.4382, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009441809198800587, "epoch": 0.46, "percentage": 15.18, "elapsed_time": "0:25:20", "remaining_time": "2:21:34"} +{"current_steps": 425, "total_steps": 2766, "loss": 1.4427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009428700374288564, "epoch": 0.46, "percentage": 15.37, "elapsed_time": "0:25:38", "remaining_time": "2:21:13"} +{"current_steps": 430, "total_steps": 2766, "loss": 1.4767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009415448722704175, "epoch": 0.47, "percentage": 15.55, "elapsed_time": "0:25:55", "remaining_time": "2:20:52"} +{"current_steps": 435, "total_steps": 2766, "loss": 1.4799, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009402054671417628, "epoch": 0.47, "percentage": 15.73, "elapsed_time": "0:26:13", "remaining_time": "2:20:31"} +{"current_steps": 440, "total_steps": 2766, "loss": 1.4608, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009388518652391571, "epoch": 0.48, "percentage": 15.91, "elapsed_time": "0:26:31", "remaining_time": "2:20:10"} +{"current_steps": 445, "total_steps": 2766, "loss": 1.4937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009374841102167157, "epoch": 0.48, "percentage": 16.09, "elapsed_time": "0:26:48", "remaining_time": "2:19:49"} +{"current_steps": 450, "total_steps": 2766, "loss": 1.5468, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009361022461849965, "epoch": 0.49, "percentage": 16.27, "elapsed_time": "0:27:06", "remaining_time": "2:19:29"} +{"current_steps": 455, "total_steps": 2766, "loss": 1.5481, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009347063177095783, "epoch": 0.49, "percentage": 16.45, "elapsed_time": "0:27:23", "remaining_time": "2:19:08"} +{"current_steps": 460, "total_steps": 2766, "loss": 1.4478, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009332963698096223, "epoch": 0.5, "percentage": 16.63, "elapsed_time": "0:27:41", "remaining_time": "2:18:48"} +{"current_steps": 465, "total_steps": 2766, "loss": 1.4977, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009318724479564215, "epoch": 0.5, "percentage": 16.81, "elapsed_time": "0:27:58", "remaining_time": "2:18:27"} +{"current_steps": 470, "total_steps": 2766, "loss": 1.5091, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009304345980719329, "epoch": 0.51, "percentage": 16.99, "elapsed_time": "0:28:16", "remaining_time": "2:18:07"} +{"current_steps": 475, "total_steps": 2766, "loss": 1.43, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009289828665272977, "epoch": 0.51, "percentage": 17.17, "elapsed_time": "0:28:34", "remaining_time": "2:17:47"} +{"current_steps": 480, "total_steps": 2766, "loss": 1.4725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009275173001413448, "epoch": 0.52, "percentage": 17.35, "elapsed_time": "0:28:51", "remaining_time": "2:17:27"} +{"current_steps": 485, "total_steps": 2766, "loss": 1.3741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009260379461790822, "epoch": 0.53, "percentage": 17.53, "elapsed_time": "0:29:09", "remaining_time": "2:17:06"} +{"current_steps": 490, "total_steps": 2766, "loss": 1.4917, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009245448523501708, "epoch": 0.53, "percentage": 17.72, "elapsed_time": "0:29:26", "remaining_time": "2:16:46"} +{"current_steps": 495, "total_steps": 2766, "loss": 1.4684, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009230380668073877, "epoch": 0.54, "percentage": 17.9, "elapsed_time": "0:29:44", "remaining_time": "2:16:26"} +{"current_steps": 500, "total_steps": 2766, "loss": 1.5209, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009215176381450717, "epoch": 0.54, "percentage": 18.08, "elapsed_time": "0:30:02", "remaining_time": "2:16:06"} +{"current_steps": 500, "total_steps": 2766, "loss": null, "eval_loss": 1.5050214529037476, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.54, "percentage": 18.08, "elapsed_time": "0:30:02", "remaining_time": "2:16:06"} +{"current_steps": 505, "total_steps": 2766, "loss": 1.4913, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009199836153975573, "epoch": 0.55, "percentage": 18.26, "elapsed_time": "0:30:30", "remaining_time": "2:16:34"} +{"current_steps": 510, "total_steps": 2766, "loss": 1.5377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009184360480375926, "epoch": 0.55, "percentage": 18.44, "elapsed_time": "0:30:47", "remaining_time": "2:16:14"} +{"current_steps": 515, "total_steps": 2766, "loss": 1.4608, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009168749859747438, "epoch": 0.56, "percentage": 18.62, "elapsed_time": "0:31:05", "remaining_time": "2:15:53"} +{"current_steps": 520, "total_steps": 2766, "loss": 1.4738, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009153004795537861, "epoch": 0.56, "percentage": 18.8, "elapsed_time": "0:31:23", "remaining_time": "2:15:33"} +{"current_steps": 525, "total_steps": 2766, "loss": 1.4947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009137125795530795, "epoch": 0.57, "percentage": 18.98, "elapsed_time": "0:31:40", "remaining_time": "2:15:13"} +{"current_steps": 530, "total_steps": 2766, "loss": 1.5267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009121113371829318, "epoch": 0.57, "percentage": 19.16, "elapsed_time": "0:31:58", "remaining_time": "2:14:52"} +{"current_steps": 535, "total_steps": 2766, "loss": 1.5116, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009104968040839463, "epoch": 0.58, "percentage": 19.34, "elapsed_time": "0:32:15", "remaining_time": "2:14:32"} +{"current_steps": 540, "total_steps": 2766, "loss": 1.4423, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000908869032325357, "epoch": 0.59, "percentage": 19.52, "elapsed_time": "0:32:33", "remaining_time": "2:14:12"} +{"current_steps": 545, "total_steps": 2766, "loss": 1.4565, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000907228074403349, "epoch": 0.59, "percentage": 19.7, "elapsed_time": "0:32:50", "remaining_time": "2:13:52"} +{"current_steps": 550, "total_steps": 2766, "loss": 1.4923, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009055739832393655, "epoch": 0.6, "percentage": 19.88, "elapsed_time": "0:33:08", "remaining_time": "2:13:32"} +{"current_steps": 555, "total_steps": 2766, "loss": 1.4304, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009039068121784016, "epoch": 0.6, "percentage": 20.07, "elapsed_time": "0:33:26", "remaining_time": "2:13:12"} +{"current_steps": 560, "total_steps": 2766, "loss": 1.4422, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009022266149872829, "epoch": 0.61, "percentage": 20.25, "elapsed_time": "0:33:43", "remaining_time": "2:12:52"} +{"current_steps": 565, "total_steps": 2766, "loss": 1.522, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009005334458529322, "epoch": 0.61, "percentage": 20.43, "elapsed_time": "0:34:01", "remaining_time": "2:12:32"} +{"current_steps": 570, "total_steps": 2766, "loss": 1.499, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008988273593806222, "epoch": 0.62, "percentage": 20.61, "elapsed_time": "0:34:18", "remaining_time": "2:12:12"} +{"current_steps": 575, "total_steps": 2766, "loss": 1.4796, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008971084105922139, "epoch": 0.62, "percentage": 20.79, "elapsed_time": "0:34:36", "remaining_time": "2:11:52"} +{"current_steps": 580, "total_steps": 2766, "loss": 1.4231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008953766549243818, "epoch": 0.63, "percentage": 20.97, "elapsed_time": "0:34:54", "remaining_time": "2:11:32"} +{"current_steps": 585, "total_steps": 2766, "loss": 1.462, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008936321482268275, "epoch": 0.63, "percentage": 21.15, "elapsed_time": "0:35:11", "remaining_time": "2:11:12"} +{"current_steps": 590, "total_steps": 2766, "loss": 1.5191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008918749467604766, "epoch": 0.64, "percentage": 21.33, "elapsed_time": "0:35:29", "remaining_time": "2:10:52"} +{"current_steps": 595, "total_steps": 2766, "loss": 1.4845, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008901051071956661, "epoch": 0.64, "percentage": 21.51, "elapsed_time": "0:35:46", "remaining_time": "2:10:33"} +{"current_steps": 600, "total_steps": 2766, "loss": 1.4652, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008883226866103152, "epoch": 0.65, "percentage": 21.69, "elapsed_time": "0:36:04", "remaining_time": "2:10:13"} +{"current_steps": 600, "total_steps": 2766, "loss": null, "eval_loss": 1.486396074295044, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.65, "percentage": 21.69, "elapsed_time": "0:36:04", "remaining_time": "2:10:13"} +{"current_steps": 605, "total_steps": 2766, "loss": 1.4773, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008865277424880859, "epoch": 0.66, "percentage": 21.87, "elapsed_time": "0:36:32", "remaining_time": "2:10:32"} +{"current_steps": 610, "total_steps": 2766, "loss": 1.4555, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008847203327165278, "epoch": 0.66, "percentage": 22.05, "elapsed_time": "0:36:50", "remaining_time": "2:10:12"} +{"current_steps": 615, "total_steps": 2766, "loss": 1.5235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008829005155852125, "epoch": 0.67, "percentage": 22.23, "elapsed_time": "0:37:07", "remaining_time": "2:09:52"} +{"current_steps": 620, "total_steps": 2766, "loss": 1.4329, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008810683497838525, "epoch": 0.67, "percentage": 22.42, "elapsed_time": "0:37:25", "remaining_time": "2:09:32"} +{"current_steps": 625, "total_steps": 2766, "loss": 1.4515, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008792238944004096, "epoch": 0.68, "percentage": 22.6, "elapsed_time": "0:37:43", "remaining_time": "2:09:12"} +{"current_steps": 630, "total_steps": 2766, "loss": 1.4616, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008773672089191885, "epoch": 0.68, "percentage": 22.78, "elapsed_time": "0:38:00", "remaining_time": "2:08:52"} +{"current_steps": 635, "total_steps": 2766, "loss": 1.3931, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008754983532189185, "epoch": 0.69, "percentage": 22.96, "elapsed_time": "0:38:18", "remaining_time": "2:08:32"} +{"current_steps": 640, "total_steps": 2766, "loss": 1.4714, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008736173875708229, "epoch": 0.69, "percentage": 23.14, "elapsed_time": "0:38:35", "remaining_time": "2:08:12"} +{"current_steps": 645, "total_steps": 2766, "loss": 1.4831, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008717243726366746, "epoch": 0.7, "percentage": 23.32, "elapsed_time": "0:38:53", "remaining_time": "2:07:52"} +{"current_steps": 650, "total_steps": 2766, "loss": 1.4928, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00086981936946684, "epoch": 0.7, "percentage": 23.5, "elapsed_time": "0:39:10", "remaining_time": "2:07:33"} +{"current_steps": 655, "total_steps": 2766, "loss": 1.3735, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008679024394983105, "epoch": 0.71, "percentage": 23.68, "elapsed_time": "0:39:28", "remaining_time": "2:07:13"} +{"current_steps": 660, "total_steps": 2766, "loss": 1.4587, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008659736445527202, "epoch": 0.72, "percentage": 23.86, "elapsed_time": "0:39:46", "remaining_time": "2:06:53"} +{"current_steps": 665, "total_steps": 2766, "loss": 1.5138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008640330468343532, "epoch": 0.72, "percentage": 24.04, "elapsed_time": "0:40:03", "remaining_time": "2:06:34"} +{"current_steps": 670, "total_steps": 2766, "loss": 1.4625, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008620807089281364, "epoch": 0.73, "percentage": 24.22, "elapsed_time": "0:40:21", "remaining_time": "2:06:14"} +{"current_steps": 675, "total_steps": 2766, "loss": 1.4173, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008601166937976226, "epoch": 0.73, "percentage": 24.4, "elapsed_time": "0:40:38", "remaining_time": "2:05:55"} +{"current_steps": 680, "total_steps": 2766, "loss": 1.4901, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000858141064782958, "epoch": 0.74, "percentage": 24.58, "elapsed_time": "0:40:56", "remaining_time": "2:05:35"} +{"current_steps": 685, "total_steps": 2766, "loss": 1.4056, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008561538855988409, "epoch": 0.74, "percentage": 24.77, "elapsed_time": "0:41:14", "remaining_time": "2:05:16"} +{"current_steps": 690, "total_steps": 2766, "loss": 1.4486, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008541552203324667, "epoch": 0.75, "percentage": 24.95, "elapsed_time": "0:41:31", "remaining_time": "2:04:56"} +{"current_steps": 695, "total_steps": 2766, "loss": 1.4147, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008521451334414605, "epoch": 0.75, "percentage": 25.13, "elapsed_time": "0:41:49", "remaining_time": "2:04:37"} +{"current_steps": 700, "total_steps": 2766, "loss": 1.4547, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008501236897517987, "epoch": 0.76, "percentage": 25.31, "elapsed_time": "0:42:06", "remaining_time": "2:04:17"} +{"current_steps": 700, "total_steps": 2766, "loss": null, "eval_loss": 1.4729957580566406, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.76, "percentage": 25.31, "elapsed_time": "0:42:06", "remaining_time": "2:04:17"} +{"current_steps": 705, "total_steps": 2766, "loss": 1.4464, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000848090954455718, "epoch": 0.76, "percentage": 25.49, "elapsed_time": "0:42:35", "remaining_time": "2:04:29"} +{"current_steps": 710, "total_steps": 2766, "loss": 1.4163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008460469931096138, "epoch": 0.77, "percentage": 25.67, "elapsed_time": "0:42:52", "remaining_time": "2:04:09"} +{"current_steps": 715, "total_steps": 2766, "loss": 1.5283, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008439918716319246, "epoch": 0.77, "percentage": 25.85, "elapsed_time": "0:43:10", "remaining_time": "2:03:50"} +{"current_steps": 720, "total_steps": 2766, "loss": 1.4313, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008419256563010076, "epoch": 0.78, "percentage": 26.03, "elapsed_time": "0:43:27", "remaining_time": "2:03:30"} +{"current_steps": 725, "total_steps": 2766, "loss": 1.3995, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000839848413753, "epoch": 0.79, "percentage": 26.21, "elapsed_time": "0:43:45", "remaining_time": "2:03:10"} +{"current_steps": 730, "total_steps": 2766, "loss": 1.4265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008377602109796709, "epoch": 0.79, "percentage": 26.39, "elapsed_time": "0:44:02", "remaining_time": "2:02:51"} +{"current_steps": 735, "total_steps": 2766, "loss": 1.4426, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008356611153262598, "epoch": 0.8, "percentage": 26.57, "elapsed_time": "0:44:20", "remaining_time": "2:02:31"} +{"current_steps": 740, "total_steps": 2766, "loss": 1.4251, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008335511944893057, "epoch": 0.8, "percentage": 26.75, "elapsed_time": "0:44:38", "remaining_time": "2:02:12"} +{"current_steps": 745, "total_steps": 2766, "loss": 1.4686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008314305165144633, "epoch": 0.81, "percentage": 26.93, "elapsed_time": "0:44:55", "remaining_time": "2:01:52"} +{"current_steps": 750, "total_steps": 2766, "loss": 1.4658, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008292991497943081, "epoch": 0.81, "percentage": 27.11, "elapsed_time": "0:45:13", "remaining_time": "2:01:33"} +{"current_steps": 755, "total_steps": 2766, "loss": 1.4347, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008271571630661321, "epoch": 0.82, "percentage": 27.3, "elapsed_time": "0:45:30", "remaining_time": "2:01:13"} +{"current_steps": 760, "total_steps": 2766, "loss": 1.4235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008250046254097255, "epoch": 0.82, "percentage": 27.48, "elapsed_time": "0:45:48", "remaining_time": "2:00:54"} +{"current_steps": 765, "total_steps": 2766, "loss": 1.5047, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008228416062451494, "epoch": 0.83, "percentage": 27.66, "elapsed_time": "0:46:06", "remaining_time": "2:00:35"} +{"current_steps": 770, "total_steps": 2766, "loss": 1.445, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008206681753304976, "epoch": 0.83, "percentage": 27.84, "elapsed_time": "0:46:23", "remaining_time": "2:00:15"} +{"current_steps": 775, "total_steps": 2766, "loss": 1.4077, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008184844027596461, "epoch": 0.84, "percentage": 28.02, "elapsed_time": "0:46:41", "remaining_time": "1:59:56"} +{"current_steps": 780, "total_steps": 2766, "loss": 1.5057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008162903589599924, "epoch": 0.85, "percentage": 28.2, "elapsed_time": "0:46:58", "remaining_time": "1:59:37"} +{"current_steps": 785, "total_steps": 2766, "loss": 1.4445, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008140861146901849, "epoch": 0.85, "percentage": 28.38, "elapsed_time": "0:47:16", "remaining_time": "1:59:17"} +{"current_steps": 790, "total_steps": 2766, "loss": 1.5333, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008118717410378407, "epoch": 0.86, "percentage": 28.56, "elapsed_time": "0:47:33", "remaining_time": "1:58:58"} +{"current_steps": 795, "total_steps": 2766, "loss": 1.3786, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008096473094172527, "epoch": 0.86, "percentage": 28.74, "elapsed_time": "0:47:51", "remaining_time": "1:58:39"} +{"current_steps": 800, "total_steps": 2766, "loss": 1.3781, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008074128915670868, "epoch": 0.87, "percentage": 28.92, "elapsed_time": "0:48:09", "remaining_time": "1:58:20"} +{"current_steps": 800, "total_steps": 2766, "loss": null, "eval_loss": 1.4600605964660645, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.87, "percentage": 28.92, "elapsed_time": "0:48:09", "remaining_time": "1:58:20"} +{"current_steps": 805, "total_steps": 2766, "loss": 1.5097, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008051685595480678, "epoch": 0.87, "percentage": 29.1, "elapsed_time": "0:48:37", "remaining_time": "1:58:26"} +{"current_steps": 810, "total_steps": 2766, "loss": 1.5608, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008029143857406563, "epoch": 0.88, "percentage": 29.28, "elapsed_time": "0:48:55", "remaining_time": "1:58:07"} +{"current_steps": 815, "total_steps": 2766, "loss": 1.4113, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008006504428427133, "epoch": 0.88, "percentage": 29.46, "elapsed_time": "0:49:12", "remaining_time": "1:57:48"} +{"current_steps": 820, "total_steps": 2766, "loss": 1.3781, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007983768038671568, "epoch": 0.89, "percentage": 29.65, "elapsed_time": "0:49:30", "remaining_time": "1:57:28"} +{"current_steps": 825, "total_steps": 2766, "loss": 1.4056, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007960935421396062, "epoch": 0.89, "percentage": 29.83, "elapsed_time": "0:49:47", "remaining_time": "1:57:09"} +{"current_steps": 830, "total_steps": 2766, "loss": 1.4463, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007938007312960178, "epoch": 0.9, "percentage": 30.01, "elapsed_time": "0:50:05", "remaining_time": "1:56:50"} +{"current_steps": 835, "total_steps": 2766, "loss": 1.3983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007914984452803105, "epoch": 0.9, "percentage": 30.19, "elapsed_time": "0:50:22", "remaining_time": "1:56:30"} +{"current_steps": 840, "total_steps": 2766, "loss": 1.3968, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007891867583419805, "epoch": 0.91, "percentage": 30.37, "elapsed_time": "0:50:40", "remaining_time": "1:56:11"} +{"current_steps": 845, "total_steps": 2766, "loss": 1.4587, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007868657450337066, "epoch": 0.92, "percentage": 30.55, "elapsed_time": "0:50:58", "remaining_time": "1:55:52"} +{"current_steps": 850, "total_steps": 2766, "loss": 1.4654, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007845354802089463, "epoch": 0.92, "percentage": 30.73, "elapsed_time": "0:51:15", "remaining_time": "1:55:32"} +{"current_steps": 855, "total_steps": 2766, "loss": 1.4384, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007821960390195224, "epoch": 0.93, "percentage": 30.91, "elapsed_time": "0:51:33", "remaining_time": "1:55:13"} +{"current_steps": 860, "total_steps": 2766, "loss": 1.44, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007798474969131971, "epoch": 0.93, "percentage": 31.09, "elapsed_time": "0:51:50", "remaining_time": "1:54:54"} +{"current_steps": 865, "total_steps": 2766, "loss": 1.4221, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007774899296312414, "epoch": 0.94, "percentage": 31.27, "elapsed_time": "0:52:08", "remaining_time": "1:54:35"} +{"current_steps": 870, "total_steps": 2766, "loss": 1.3795, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007751234132059906, "epoch": 0.94, "percentage": 31.45, "elapsed_time": "0:52:26", "remaining_time": "1:54:16"} +{"current_steps": 875, "total_steps": 2766, "loss": 1.4748, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007727480239583933, "epoch": 0.95, "percentage": 31.63, "elapsed_time": "0:52:43", "remaining_time": "1:53:57"} +{"current_steps": 880, "total_steps": 2766, "loss": 1.5171, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007703638384955494, "epoch": 0.95, "percentage": 31.81, "elapsed_time": "0:53:01", "remaining_time": "1:53:37"} +{"current_steps": 885, "total_steps": 2766, "loss": 1.3996, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007679709337082394, "epoch": 0.96, "percentage": 32.0, "elapsed_time": "0:53:18", "remaining_time": "1:53:18"} +{"current_steps": 890, "total_steps": 2766, "loss": 1.4386, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007655693867684454, "epoch": 0.96, "percentage": 32.18, "elapsed_time": "0:53:36", "remaining_time": "1:52:59"} +{"current_steps": 895, "total_steps": 2766, "loss": 1.3789, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007631592751268618, "epoch": 0.97, "percentage": 32.36, "elapsed_time": "0:53:53", "remaining_time": "1:52:40"} +{"current_steps": 900, "total_steps": 2766, "loss": 1.4553, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007607406765103972, "epoch": 0.98, "percentage": 32.54, "elapsed_time": "0:54:11", "remaining_time": "1:52:21"} +{"current_steps": 900, "total_steps": 2766, "loss": null, "eval_loss": 1.4479364156723022, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.98, "percentage": 32.54, "elapsed_time": "0:54:11", "remaining_time": "1:52:21"} +{"current_steps": 905, "total_steps": 2766, "loss": 1.3962, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000758313668919668, "epoch": 0.98, "percentage": 32.72, "elapsed_time": "0:54:39", "remaining_time": "1:52:24"} +{"current_steps": 910, "total_steps": 2766, "loss": 1.3899, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000755878330626483, "epoch": 0.99, "percentage": 32.9, "elapsed_time": "0:54:57", "remaining_time": "1:52:05"} +{"current_steps": 915, "total_steps": 2766, "loss": 1.3965, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007534347401713191, "epoch": 0.99, "percentage": 33.08, "elapsed_time": "0:55:14", "remaining_time": "1:51:46"} +{"current_steps": 920, "total_steps": 2766, "loss": 1.367, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007509829763607879, "epoch": 1.0, "percentage": 33.26, "elapsed_time": "0:55:32", "remaining_time": "1:51:26"} +{"current_steps": 925, "total_steps": 2766, "loss": 1.4027, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007485231182650945, "epoch": 1.0, "percentage": 33.44, "elapsed_time": "0:55:50", "remaining_time": "1:51:07"} +{"current_steps": 930, "total_steps": 2766, "loss": 1.3563, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007460552452154877, "epoch": 1.01, "percentage": 33.62, "elapsed_time": "0:56:07", "remaining_time": "1:50:48"} +{"current_steps": 935, "total_steps": 2766, "loss": 1.3192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007435794368017007, "epoch": 1.01, "percentage": 33.8, "elapsed_time": "0:56:25", "remaining_time": "1:50:29"} +{"current_steps": 940, "total_steps": 2766, "loss": 1.2772, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007410957728693856, "epoch": 1.02, "percentage": 33.98, "elapsed_time": "0:56:42", "remaining_time": "1:50:10"} +{"current_steps": 945, "total_steps": 2766, "loss": 1.3291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007386043335175367, "epoch": 1.02, "percentage": 34.16, "elapsed_time": "0:57:00", "remaining_time": "1:49:51"} +{"current_steps": 950, "total_steps": 2766, "loss": 1.304, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000736105199095909, "epoch": 1.03, "percentage": 34.35, "elapsed_time": "0:57:18", "remaining_time": "1:49:32"} +{"current_steps": 955, "total_steps": 2766, "loss": 1.3832, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007335984502024256, "epoch": 1.03, "percentage": 34.53, "elapsed_time": "0:57:35", "remaining_time": "1:49:13"} +{"current_steps": 960, "total_steps": 2766, "loss": 1.3351, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007310841676805791, "epoch": 1.04, "percentage": 34.71, "elapsed_time": "0:57:53", "remaining_time": "1:48:54"} +{"current_steps": 965, "total_steps": 2766, "loss": 1.3375, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000728562432616824, "epoch": 1.05, "percentage": 34.89, "elapsed_time": "0:58:10", "remaining_time": "1:48:34"} +{"current_steps": 970, "total_steps": 2766, "loss": 1.3323, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007260333263379619, "epoch": 1.05, "percentage": 35.07, "elapsed_time": "0:58:28", "remaining_time": "1:48:15"} +{"current_steps": 975, "total_steps": 2766, "loss": 1.3293, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007234969304085186, "epoch": 1.06, "percentage": 35.25, "elapsed_time": "0:58:45", "remaining_time": "1:47:56"} +{"current_steps": 980, "total_steps": 2766, "loss": 1.3859, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007209533266281133, "epoch": 1.06, "percentage": 35.43, "elapsed_time": "0:59:03", "remaining_time": "1:47:37"} +{"current_steps": 985, "total_steps": 2766, "loss": 1.3553, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007184025970288211, "epoch": 1.07, "percentage": 35.61, "elapsed_time": "0:59:21", "remaining_time": "1:47:18"} +{"current_steps": 990, "total_steps": 2766, "loss": 1.3607, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000715844823872527, "epoch": 1.07, "percentage": 35.79, "elapsed_time": "0:59:38", "remaining_time": "1:47:00"} +{"current_steps": 995, "total_steps": 2766, "loss": 1.3457, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007132800896482731, "epoch": 1.08, "percentage": 35.97, "elapsed_time": "0:59:56", "remaining_time": "1:46:41"} +{"current_steps": 1000, "total_steps": 2766, "loss": 1.3788, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007107084770695986, "epoch": 1.08, "percentage": 36.15, "elapsed_time": "1:00:13", "remaining_time": "1:46:22"} +{"current_steps": 1000, "total_steps": 2766, "loss": null, "eval_loss": 1.4371482133865356, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.08, "percentage": 36.15, "elapsed_time": "1:00:13", "remaining_time": "1:46:22"} +{"current_steps": 1005, "total_steps": 2766, "loss": 1.3039, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007081300690718709, "epoch": 1.09, "percentage": 36.33, "elapsed_time": "1:00:42", "remaining_time": "1:46:21"} +{"current_steps": 1010, "total_steps": 2766, "loss": 1.2719, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007055449488096132, "epoch": 1.09, "percentage": 36.51, "elapsed_time": "1:00:59", "remaining_time": "1:46:02"} +{"current_steps": 1015, "total_steps": 2766, "loss": 1.4107, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007029531996538212, "epoch": 1.1, "percentage": 36.7, "elapsed_time": "1:01:17", "remaining_time": "1:45:43"} +{"current_steps": 1020, "total_steps": 2766, "loss": 1.38, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007003549051892738, "epoch": 1.11, "percentage": 36.88, "elapsed_time": "1:01:34", "remaining_time": "1:45:24"} +{"current_steps": 1025, "total_steps": 2766, "loss": 1.3408, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006977501492118391, "epoch": 1.11, "percentage": 37.06, "elapsed_time": "1:01:52", "remaining_time": "1:45:05"} +{"current_steps": 1030, "total_steps": 2766, "loss": 1.3704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006951390157257712, "epoch": 1.12, "percentage": 37.24, "elapsed_time": "1:02:10", "remaining_time": "1:44:46"} +{"current_steps": 1035, "total_steps": 2766, "loss": 1.345, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006925215889410004, "epoch": 1.12, "percentage": 37.42, "elapsed_time": "1:02:27", "remaining_time": "1:44:27"} +{"current_steps": 1040, "total_steps": 2766, "loss": 1.3414, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006898979532704186, "epoch": 1.13, "percentage": 37.6, "elapsed_time": "1:02:45", "remaining_time": "1:44:08"} +{"current_steps": 1045, "total_steps": 2766, "loss": 1.3131, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006872681933271559, "epoch": 1.13, "percentage": 37.78, "elapsed_time": "1:03:02", "remaining_time": "1:43:49"} +{"current_steps": 1050, "total_steps": 2766, "loss": 1.3363, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006846323939218526, "epoch": 1.14, "percentage": 37.96, "elapsed_time": "1:03:20", "remaining_time": "1:43:31"} +{"current_steps": 1055, "total_steps": 2766, "loss": 1.3659, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006819906400599234, "epoch": 1.14, "percentage": 38.14, "elapsed_time": "1:03:38", "remaining_time": "1:43:12"} +{"current_steps": 1060, "total_steps": 2766, "loss": 1.3145, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006793430169388163, "epoch": 1.15, "percentage": 38.32, "elapsed_time": "1:03:55", "remaining_time": "1:42:53"} +{"current_steps": 1065, "total_steps": 2766, "loss": 1.3727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006766896099452652, "epoch": 1.15, "percentage": 38.5, "elapsed_time": "1:04:13", "remaining_time": "1:42:34"} +{"current_steps": 1070, "total_steps": 2766, "loss": 1.3478, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006740305046525351, "epoch": 1.16, "percentage": 38.68, "elapsed_time": "1:04:30", "remaining_time": "1:42:15"} +{"current_steps": 1075, "total_steps": 2766, "loss": 1.3848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006713657868176639, "epoch": 1.16, "percentage": 38.86, "elapsed_time": "1:04:48", "remaining_time": "1:41:56"} +{"current_steps": 1080, "total_steps": 2766, "loss": 1.3501, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006686955423786951, "epoch": 1.17, "percentage": 39.05, "elapsed_time": "1:05:05", "remaining_time": "1:41:37"} +{"current_steps": 1085, "total_steps": 2766, "loss": 1.3782, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006660198574519078, "epoch": 1.18, "percentage": 39.23, "elapsed_time": "1:05:23", "remaining_time": "1:41:18"} +{"current_steps": 1090, "total_steps": 2766, "loss": 1.3767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000663338818329038, "epoch": 1.18, "percentage": 39.41, "elapsed_time": "1:05:41", "remaining_time": "1:40:59"} +{"current_steps": 1095, "total_steps": 2766, "loss": 1.3665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006606525114744965, "epoch": 1.19, "percentage": 39.59, "elapsed_time": "1:05:58", "remaining_time": "1:40:41"} +{"current_steps": 1100, "total_steps": 2766, "loss": 1.2234, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006579610235225805, "epoch": 1.19, "percentage": 39.77, "elapsed_time": "1:06:16", "remaining_time": "1:40:22"} +{"current_steps": 1100, "total_steps": 2766, "loss": null, "eval_loss": 1.4341663122177124, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.19, "percentage": 39.77, "elapsed_time": "1:06:16", "remaining_time": "1:40:22"} +{"current_steps": 1105, "total_steps": 2766, "loss": 1.4083, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006552644412746791, "epoch": 1.2, "percentage": 39.95, "elapsed_time": "1:06:44", "remaining_time": "1:40:19"} +{"current_steps": 1110, "total_steps": 2766, "loss": 1.4225, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006525628516964741, "epoch": 1.2, "percentage": 40.13, "elapsed_time": "1:07:02", "remaining_time": "1:40:00"} +{"current_steps": 1115, "total_steps": 2766, "loss": 1.3677, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006498563419151354, "epoch": 1.21, "percentage": 40.31, "elapsed_time": "1:07:19", "remaining_time": "1:39:41"} +{"current_steps": 1120, "total_steps": 2766, "loss": 1.2836, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006471449992165113, "epoch": 1.21, "percentage": 40.49, "elapsed_time": "1:07:37", "remaining_time": "1:39:22"} +{"current_steps": 1125, "total_steps": 2766, "loss": 1.3428, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006444289110423129, "epoch": 1.22, "percentage": 40.67, "elapsed_time": "1:07:54", "remaining_time": "1:39:03"} +{"current_steps": 1130, "total_steps": 2766, "loss": 1.3192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006417081649872952, "epoch": 1.22, "percentage": 40.85, "elapsed_time": "1:08:12", "remaining_time": "1:38:45"} +{"current_steps": 1135, "total_steps": 2766, "loss": 1.3084, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006389828487964305, "epoch": 1.23, "percentage": 41.03, "elapsed_time": "1:08:30", "remaining_time": "1:38:26"} +{"current_steps": 1140, "total_steps": 2766, "loss": 1.3702, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00063625305036208, "epoch": 1.24, "percentage": 41.21, "elapsed_time": "1:08:47", "remaining_time": "1:38:07"} +{"current_steps": 1145, "total_steps": 2766, "loss": 1.3054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000633518857721159, "epoch": 1.24, "percentage": 41.4, "elapsed_time": "1:09:05", "remaining_time": "1:37:48"} +{"current_steps": 1150, "total_steps": 2766, "loss": 1.3211, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006307803590522972, "epoch": 1.25, "percentage": 41.58, "elapsed_time": "1:09:22", "remaining_time": "1:37:29"} +{"current_steps": 1155, "total_steps": 2766, "loss": 1.3319, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006280376426729947, "epoch": 1.25, "percentage": 41.76, "elapsed_time": "1:09:40", "remaining_time": "1:37:10"} +{"current_steps": 1160, "total_steps": 2766, "loss": 1.4346, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006252907970367749, "epoch": 1.26, "percentage": 41.94, "elapsed_time": "1:09:57", "remaining_time": "1:36:52"} +{"current_steps": 1165, "total_steps": 2766, "loss": 1.3938, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006225399107303309, "epoch": 1.26, "percentage": 42.12, "elapsed_time": "1:10:15", "remaining_time": "1:36:33"} +{"current_steps": 1170, "total_steps": 2766, "loss": 1.4371, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006197850724706682, "epoch": 1.27, "percentage": 42.3, "elapsed_time": "1:10:33", "remaining_time": "1:36:14"} +{"current_steps": 1175, "total_steps": 2766, "loss": 1.2925, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006170263711022451, "epoch": 1.27, "percentage": 42.48, "elapsed_time": "1:10:50", "remaining_time": "1:35:55"} +{"current_steps": 1180, "total_steps": 2766, "loss": 1.3135, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006142638955941057, "epoch": 1.28, "percentage": 42.66, "elapsed_time": "1:11:08", "remaining_time": "1:35:36"} +{"current_steps": 1185, "total_steps": 2766, "loss": 1.3572, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006114977350370114, "epoch": 1.28, "percentage": 42.84, "elapsed_time": "1:11:25", "remaining_time": "1:35:18"} +{"current_steps": 1190, "total_steps": 2766, "loss": 1.3918, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006087279786405684, "epoch": 1.29, "percentage": 43.02, "elapsed_time": "1:11:43", "remaining_time": "1:34:59"} +{"current_steps": 1195, "total_steps": 2766, "loss": 1.3732, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006059547157303491, "epoch": 1.3, "percentage": 43.2, "elapsed_time": "1:12:01", "remaining_time": "1:34:40"} +{"current_steps": 1200, "total_steps": 2766, "loss": 1.3541, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006031780357450124, "epoch": 1.3, "percentage": 43.38, "elapsed_time": "1:12:18", "remaining_time": "1:34:21"} +{"current_steps": 1200, "total_steps": 2766, "loss": null, "eval_loss": 1.4208500385284424, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.3, "percentage": 43.38, "elapsed_time": "1:12:18", "remaining_time": "1:34:21"} +{"current_steps": 1205, "total_steps": 2766, "loss": 1.2997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006003980282334191, "epoch": 1.31, "percentage": 43.56, "elapsed_time": "1:12:46", "remaining_time": "1:34:17"} +{"current_steps": 1210, "total_steps": 2766, "loss": 1.2832, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005976147828517439, "epoch": 1.31, "percentage": 43.75, "elapsed_time": "1:13:04", "remaining_time": "1:33:58"} +{"current_steps": 1215, "total_steps": 2766, "loss": 1.3863, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005948283893605839, "epoch": 1.32, "percentage": 43.93, "elapsed_time": "1:13:22", "remaining_time": "1:33:39"} +{"current_steps": 1220, "total_steps": 2766, "loss": 1.3599, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005920389376220633, "epoch": 1.32, "percentage": 44.11, "elapsed_time": "1:13:39", "remaining_time": "1:33:20"} +{"current_steps": 1225, "total_steps": 2766, "loss": 1.3085, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005892465175969366, "epoch": 1.33, "percentage": 44.29, "elapsed_time": "1:13:57", "remaining_time": "1:33:01"} +{"current_steps": 1230, "total_steps": 2766, "loss": 1.3355, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000586451219341686, "epoch": 1.33, "percentage": 44.47, "elapsed_time": "1:14:14", "remaining_time": "1:32:43"} +{"current_steps": 1235, "total_steps": 2766, "loss": 1.291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005836531330056176, "epoch": 1.34, "percentage": 44.65, "elapsed_time": "1:14:32", "remaining_time": "1:32:24"} +{"current_steps": 1240, "total_steps": 2766, "loss": 1.3286, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005808523488279542, "epoch": 1.34, "percentage": 44.83, "elapsed_time": "1:14:50", "remaining_time": "1:32:05"} +{"current_steps": 1245, "total_steps": 2766, "loss": 1.3704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005780489571349249, "epoch": 1.35, "percentage": 45.01, "elapsed_time": "1:15:07", "remaining_time": "1:31:46"} +{"current_steps": 1250, "total_steps": 2766, "loss": 1.3263, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000575243048336852, "epoch": 1.35, "percentage": 45.19, "elapsed_time": "1:15:25", "remaining_time": "1:31:28"} +{"current_steps": 1255, "total_steps": 2766, "loss": 1.3357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005724347129252354, "epoch": 1.36, "percentage": 45.37, "elapsed_time": "1:15:42", "remaining_time": "1:31:09"} +{"current_steps": 1260, "total_steps": 2766, "loss": 1.3665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005696240414698337, "epoch": 1.37, "percentage": 45.55, "elapsed_time": "1:16:00", "remaining_time": "1:30:50"} +{"current_steps": 1265, "total_steps": 2766, "loss": 1.2568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005668111246157441, "epoch": 1.37, "percentage": 45.73, "elapsed_time": "1:16:17", "remaining_time": "1:30:31"} +{"current_steps": 1270, "total_steps": 2766, "loss": 1.3212, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005639960530804787, "epoch": 1.38, "percentage": 45.91, "elapsed_time": "1:16:35", "remaining_time": "1:30:13"} +{"current_steps": 1275, "total_steps": 2766, "loss": 1.3358, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005611789176510384, "epoch": 1.38, "percentage": 46.1, "elapsed_time": "1:16:53", "remaining_time": "1:29:54"} +{"current_steps": 1280, "total_steps": 2766, "loss": 1.3618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005583598091809859, "epoch": 1.39, "percentage": 46.28, "elapsed_time": "1:17:10", "remaining_time": "1:29:35"} +{"current_steps": 1285, "total_steps": 2766, "loss": 1.3273, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005555388185875146, "epoch": 1.39, "percentage": 46.46, "elapsed_time": "1:17:28", "remaining_time": "1:29:17"} +{"current_steps": 1290, "total_steps": 2766, "loss": 1.284, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005527160368485172, "epoch": 1.4, "percentage": 46.64, "elapsed_time": "1:17:45", "remaining_time": "1:28:58"} +{"current_steps": 1295, "total_steps": 2766, "loss": 1.3665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005498915549996516, "epoch": 1.4, "percentage": 46.82, "elapsed_time": "1:18:03", "remaining_time": "1:28:39"} +{"current_steps": 1300, "total_steps": 2766, "loss": 1.2796, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005470654641314045, "epoch": 1.41, "percentage": 47.0, "elapsed_time": "1:18:21", "remaining_time": "1:28:21"} +{"current_steps": 1300, "total_steps": 2766, "loss": null, "eval_loss": 1.4054052829742432, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.41, "percentage": 47.0, "elapsed_time": "1:18:21", "remaining_time": "1:28:21"} +{"current_steps": 1305, "total_steps": 2766, "loss": 1.3107, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005442378553861545, "epoch": 1.41, "percentage": 47.18, "elapsed_time": "1:18:49", "remaining_time": "1:28:14"} +{"current_steps": 1310, "total_steps": 2766, "loss": 1.3665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005414088199552319, "epoch": 1.42, "percentage": 47.36, "elapsed_time": "1:19:06", "remaining_time": "1:27:55"} +{"current_steps": 1315, "total_steps": 2766, "loss": 1.3326, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000538578449075978, "epoch": 1.43, "percentage": 47.54, "elapsed_time": "1:19:24", "remaining_time": "1:27:37"} +{"current_steps": 1320, "total_steps": 2766, "loss": 1.3383, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005357468340288031, "epoch": 1.43, "percentage": 47.72, "elapsed_time": "1:19:42", "remaining_time": "1:27:18"} +{"current_steps": 1325, "total_steps": 2766, "loss": 1.336, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000532914066134242, "epoch": 1.44, "percentage": 47.9, "elapsed_time": "1:19:59", "remaining_time": "1:26:59"} +{"current_steps": 1330, "total_steps": 2766, "loss": 1.3949, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005300802367500093, "epoch": 1.44, "percentage": 48.08, "elapsed_time": "1:20:17", "remaining_time": "1:26:41"} +{"current_steps": 1335, "total_steps": 2766, "loss": 1.3214, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005272454372680532, "epoch": 1.45, "percentage": 48.26, "elapsed_time": "1:20:34", "remaining_time": "1:26:22"} +{"current_steps": 1340, "total_steps": 2766, "loss": 1.376, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005244097591116077, "epoch": 1.45, "percentage": 48.45, "elapsed_time": "1:20:52", "remaining_time": "1:26:03"} +{"current_steps": 1345, "total_steps": 2766, "loss": 1.2345, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005215732937322439, "epoch": 1.46, "percentage": 48.63, "elapsed_time": "1:21:09", "remaining_time": "1:25:45"} +{"current_steps": 1350, "total_steps": 2766, "loss": 1.4495, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005187361326069224, "epoch": 1.46, "percentage": 48.81, "elapsed_time": "1:21:27", "remaining_time": "1:25:26"} +{"current_steps": 1355, "total_steps": 2766, "loss": 1.3978, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005158983672350405, "epoch": 1.47, "percentage": 48.99, "elapsed_time": "1:21:45", "remaining_time": "1:25:07"} +{"current_steps": 1360, "total_steps": 2766, "loss": 1.2517, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005130600891354833, "epoch": 1.47, "percentage": 49.17, "elapsed_time": "1:22:02", "remaining_time": "1:24:49"} +{"current_steps": 1365, "total_steps": 2766, "loss": 1.3823, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005102213898436715, "epoch": 1.48, "percentage": 49.35, "elapsed_time": "1:22:20", "remaining_time": "1:24:30"} +{"current_steps": 1370, "total_steps": 2766, "loss": 1.3219, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005073823609086091, "epoch": 1.48, "percentage": 49.53, "elapsed_time": "1:22:37", "remaining_time": "1:24:11"} +{"current_steps": 1375, "total_steps": 2766, "loss": 1.3354, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005045430938899315, "epoch": 1.49, "percentage": 49.71, "elapsed_time": "1:22:55", "remaining_time": "1:23:53"} +{"current_steps": 1380, "total_steps": 2766, "loss": 1.3054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005017036803549523, "epoch": 1.5, "percentage": 49.89, "elapsed_time": "1:23:13", "remaining_time": "1:23:34"} +{"current_steps": 1385, "total_steps": 2766, "loss": 1.2346, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004988642118757102, "epoch": 1.5, "percentage": 50.07, "elapsed_time": "1:23:30", "remaining_time": "1:23:16"} +{"current_steps": 1390, "total_steps": 2766, "loss": 1.274, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004960247800260161, "epoch": 1.51, "percentage": 50.25, "elapsed_time": "1:23:48", "remaining_time": "1:22:57"} +{"current_steps": 1395, "total_steps": 2766, "loss": 1.4231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004931854763784994, "epoch": 1.51, "percentage": 50.43, "elapsed_time": "1:24:05", "remaining_time": "1:22:38"} +{"current_steps": 1400, "total_steps": 2766, "loss": 1.2872, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000490346392501655, "epoch": 1.52, "percentage": 50.61, "elapsed_time": "1:24:23", "remaining_time": "1:22:20"} +{"current_steps": 1400, "total_steps": 2766, "loss": null, "eval_loss": 1.3990795612335205, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.52, "percentage": 50.61, "elapsed_time": "1:24:23", "remaining_time": "1:22:20"} +{"current_steps": 1405, "total_steps": 2766, "loss": 1.4041, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00048750761995688984, "epoch": 1.52, "percentage": 50.8, "elapsed_time": "1:24:51", "remaining_time": "1:22:12"} +{"current_steps": 1410, "total_steps": 2766, "loss": 1.3405, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004846692502955709, "epoch": 1.53, "percentage": 50.98, "elapsed_time": "1:25:09", "remaining_time": "1:21:53"} +{"current_steps": 1415, "total_steps": 2766, "loss": 1.3198, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00048183137505607154, "epoch": 1.53, "percentage": 51.16, "elapsed_time": "1:25:26", "remaining_time": "1:21:34"} +{"current_steps": 1420, "total_steps": 2766, "loss": 1.3528, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00047899408576082016, "epoch": 1.54, "percentage": 51.34, "elapsed_time": "1:25:44", "remaining_time": "1:21:16"} +{"current_steps": 1425, "total_steps": 2766, "loss": 1.3095, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004761574739133478, "epoch": 1.54, "percentage": 51.52, "elapsed_time": "1:26:02", "remaining_time": "1:20:57"} +{"current_steps": 1430, "total_steps": 2766, "loss": 1.3278, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00047332163099533787, "epoch": 1.55, "percentage": 51.7, "elapsed_time": "1:26:19", "remaining_time": "1:20:39"} +{"current_steps": 1435, "total_steps": 2766, "loss": 1.3305, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00047048664846367587, "epoch": 1.56, "percentage": 51.88, "elapsed_time": "1:26:37", "remaining_time": "1:20:20"} +{"current_steps": 1440, "total_steps": 2766, "loss": 1.3997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004676526177474991, "epoch": 1.56, "percentage": 52.06, "elapsed_time": "1:26:54", "remaining_time": "1:20:01"} +{"current_steps": 1445, "total_steps": 2766, "loss": 1.341, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00046481963024524846, "epoch": 1.57, "percentage": 52.24, "elapsed_time": "1:27:12", "remaining_time": "1:19:43"} +{"current_steps": 1450, "total_steps": 2766, "loss": 1.3008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00046198777732172133, "epoch": 1.57, "percentage": 52.42, "elapsed_time": "1:27:29", "remaining_time": "1:19:24"} +{"current_steps": 1455, "total_steps": 2766, "loss": 1.2643, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00045915715030512405, "epoch": 1.58, "percentage": 52.6, "elapsed_time": "1:27:47", "remaining_time": "1:19:06"} +{"current_steps": 1460, "total_steps": 2766, "loss": 1.3169, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004563278404841273, "epoch": 1.58, "percentage": 52.78, "elapsed_time": "1:28:05", "remaining_time": "1:18:47"} +{"current_steps": 1465, "total_steps": 2766, "loss": 1.3062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00045349993910492154, "epoch": 1.59, "percentage": 52.96, "elapsed_time": "1:28:22", "remaining_time": "1:18:29"} +{"current_steps": 1470, "total_steps": 2766, "loss": 1.2876, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00045067353736827495, "epoch": 1.59, "percentage": 53.15, "elapsed_time": "1:28:40", "remaining_time": "1:18:10"} +{"current_steps": 1475, "total_steps": 2766, "loss": 1.3534, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004478487264265913, "epoch": 1.6, "percentage": 53.33, "elapsed_time": "1:28:57", "remaining_time": "1:17:51"} +{"current_steps": 1480, "total_steps": 2766, "loss": 1.318, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004450255973809707, "epoch": 1.6, "percentage": 53.51, "elapsed_time": "1:29:15", "remaining_time": "1:17:33"} +{"current_steps": 1485, "total_steps": 2766, "loss": 1.3195, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000442204241278272, "epoch": 1.61, "percentage": 53.69, "elapsed_time": "1:29:33", "remaining_time": "1:17:14"} +{"current_steps": 1490, "total_steps": 2766, "loss": 1.3208, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004393847491081756, "epoch": 1.61, "percentage": 53.87, "elapsed_time": "1:29:50", "remaining_time": "1:16:56"} +{"current_steps": 1495, "total_steps": 2766, "loss": 1.3879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004365672118002494, "epoch": 1.62, "percentage": 54.05, "elapsed_time": "1:30:08", "remaining_time": "1:16:37"} +{"current_steps": 1500, "total_steps": 2766, "loss": 1.3356, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004337517202210168, "epoch": 1.63, "percentage": 54.23, "elapsed_time": "1:30:25", "remaining_time": "1:16:19"} +{"current_steps": 1500, "total_steps": 2766, "loss": null, "eval_loss": 1.3873966932296753, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.63, "percentage": 54.23, "elapsed_time": "1:30:25", "remaining_time": "1:16:19"} +{"current_steps": 1505, "total_steps": 2766, "loss": 1.3163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004309383651710254, "epoch": 1.63, "percentage": 54.41, "elapsed_time": "1:30:54", "remaining_time": "1:16:09"} +{"current_steps": 1510, "total_steps": 2766, "loss": 1.3119, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00042812723738191896, "epoch": 1.64, "percentage": 54.59, "elapsed_time": "1:31:11", "remaining_time": "1:15:51"} +{"current_steps": 1515, "total_steps": 2766, "loss": 1.2777, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004253184275135116, "epoch": 1.64, "percentage": 54.77, "elapsed_time": "1:31:29", "remaining_time": "1:15:32"} +{"current_steps": 1520, "total_steps": 2766, "loss": 1.3624, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004225120261508637, "epoch": 1.65, "percentage": 54.95, "elapsed_time": "1:31:46", "remaining_time": "1:15:14"} +{"current_steps": 1525, "total_steps": 2766, "loss": 1.3231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004197081238013602, "epoch": 1.65, "percentage": 55.13, "elapsed_time": "1:32:04", "remaining_time": "1:14:55"} +{"current_steps": 1530, "total_steps": 2766, "loss": 1.3807, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004169068108917924, "epoch": 1.66, "percentage": 55.31, "elapsed_time": "1:32:21", "remaining_time": "1:14:37"} +{"current_steps": 1535, "total_steps": 2766, "loss": 1.3301, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004141081777654412, "epoch": 1.66, "percentage": 55.5, "elapsed_time": "1:32:39", "remaining_time": "1:14:18"} +{"current_steps": 1540, "total_steps": 2766, "loss": 1.3032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004113123146791633, "epoch": 1.67, "percentage": 55.68, "elapsed_time": "1:32:57", "remaining_time": "1:13:59"} +{"current_steps": 1545, "total_steps": 2766, "loss": 1.2957, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000408519311800481, "epoch": 1.67, "percentage": 55.86, "elapsed_time": "1:33:14", "remaining_time": "1:13:41"} +{"current_steps": 1550, "total_steps": 2766, "loss": 1.3138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00040572925920467375, "epoch": 1.68, "percentage": 56.04, "elapsed_time": "1:33:32", "remaining_time": "1:13:22"} +{"current_steps": 1555, "total_steps": 2766, "loss": 1.2496, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004029422468718737, "epoch": 1.69, "percentage": 56.22, "elapsed_time": "1:33:49", "remaining_time": "1:13:04"} +{"current_steps": 1560, "total_steps": 2766, "loss": 1.3796, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004001583646841632, "epoch": 1.69, "percentage": 56.4, "elapsed_time": "1:34:07", "remaining_time": "1:12:45"} +{"current_steps": 1565, "total_steps": 2766, "loss": 1.3492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00039737770242267637, "epoch": 1.7, "percentage": 56.58, "elapsed_time": "1:34:25", "remaining_time": "1:12:27"} +{"current_steps": 1570, "total_steps": 2766, "loss": 1.3138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00039460034976470396, "epoch": 1.7, "percentage": 56.76, "elapsed_time": "1:34:42", "remaining_time": "1:12:08"} +{"current_steps": 1575, "total_steps": 2766, "loss": 1.3172, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003918263962808004, "epoch": 1.71, "percentage": 56.94, "elapsed_time": "1:35:00", "remaining_time": "1:11:50"} +{"current_steps": 1580, "total_steps": 2766, "loss": 1.3446, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003890559314318959, "epoch": 1.71, "percentage": 57.12, "elapsed_time": "1:35:17", "remaining_time": "1:11:31"} +{"current_steps": 1585, "total_steps": 2766, "loss": 1.3062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00038628904456641116, "epoch": 1.72, "percentage": 57.3, "elapsed_time": "1:35:35", "remaining_time": "1:11:13"} +{"current_steps": 1590, "total_steps": 2766, "loss": 1.2899, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00038352582491737547, "epoch": 1.72, "percentage": 57.48, "elapsed_time": "1:35:52", "remaining_time": "1:10:55"} +{"current_steps": 1595, "total_steps": 2766, "loss": 1.2942, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003807663615995491, "epoch": 1.73, "percentage": 57.66, "elapsed_time": "1:36:10", "remaining_time": "1:10:36"} +{"current_steps": 1600, "total_steps": 2766, "loss": 1.2902, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003780107436065498, "epoch": 1.73, "percentage": 57.85, "elapsed_time": "1:36:28", "remaining_time": "1:10:18"} +{"current_steps": 1600, "total_steps": 2766, "loss": null, "eval_loss": 1.379552960395813, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.73, "percentage": 57.85, "elapsed_time": "1:36:28", "remaining_time": "1:10:18"} +{"current_steps": 1605, "total_steps": 2766, "loss": 1.3213, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00037525905980798183, "epoch": 1.74, "percentage": 58.03, "elapsed_time": "1:36:56", "remaining_time": "1:10:07"} +{"current_steps": 1610, "total_steps": 2766, "loss": 1.2286, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003725113989465705, "epoch": 1.74, "percentage": 58.21, "elapsed_time": "1:37:14", "remaining_time": "1:09:48"} +{"current_steps": 1615, "total_steps": 2766, "loss": 1.3394, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00036976784963530017, "epoch": 1.75, "percentage": 58.39, "elapsed_time": "1:37:31", "remaining_time": "1:09:30"} +{"current_steps": 1620, "total_steps": 2766, "loss": 1.2879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003670285003545564, "epoch": 1.76, "percentage": 58.57, "elapsed_time": "1:37:49", "remaining_time": "1:09:11"} +{"current_steps": 1625, "total_steps": 2766, "loss": 1.3369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00036429343944927196, "epoch": 1.76, "percentage": 58.75, "elapsed_time": "1:38:06", "remaining_time": "1:08:53"} +{"current_steps": 1630, "total_steps": 2766, "loss": 1.3393, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003615627551260785, "epoch": 1.77, "percentage": 58.93, "elapsed_time": "1:38:24", "remaining_time": "1:08:34"} +{"current_steps": 1635, "total_steps": 2766, "loss": 1.3437, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003588365354504612, "epoch": 1.77, "percentage": 59.11, "elapsed_time": "1:38:41", "remaining_time": "1:08:16"} +{"current_steps": 1640, "total_steps": 2766, "loss": 1.2843, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00035611486834391894, "epoch": 1.78, "percentage": 59.29, "elapsed_time": "1:38:59", "remaining_time": "1:07:57"} +{"current_steps": 1645, "total_steps": 2766, "loss": 1.3463, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00035339784158112893, "epoch": 1.78, "percentage": 59.47, "elapsed_time": "1:39:17", "remaining_time": "1:07:39"} +{"current_steps": 1650, "total_steps": 2766, "loss": 1.2847, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00035068554278711494, "epoch": 1.79, "percentage": 59.65, "elapsed_time": "1:39:34", "remaining_time": "1:07:21"} +{"current_steps": 1655, "total_steps": 2766, "loss": 1.2493, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00034797805943442313, "epoch": 1.79, "percentage": 59.83, "elapsed_time": "1:39:52", "remaining_time": "1:07:02"} +{"current_steps": 1660, "total_steps": 2766, "loss": 1.3471, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003452754788402996, "epoch": 1.8, "percentage": 60.01, "elapsed_time": "1:40:09", "remaining_time": "1:06:44"} +{"current_steps": 1665, "total_steps": 2766, "loss": 1.2983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00034257788816387475, "epoch": 1.8, "percentage": 60.2, "elapsed_time": "1:40:27", "remaining_time": "1:06:25"} +{"current_steps": 1670, "total_steps": 2766, "loss": 1.3259, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003398853744033529, "epoch": 1.81, "percentage": 60.38, "elapsed_time": "1:40:45", "remaining_time": "1:06:07"} +{"current_steps": 1675, "total_steps": 2766, "loss": 1.333, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003371980243932056, "epoch": 1.82, "percentage": 60.56, "elapsed_time": "1:41:02", "remaining_time": "1:05:48"} +{"current_steps": 1680, "total_steps": 2766, "loss": 1.3071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00033451592480137195, "epoch": 1.82, "percentage": 60.74, "elapsed_time": "1:41:20", "remaining_time": "1:05:30"} +{"current_steps": 1685, "total_steps": 2766, "loss": 1.3238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00033183916212646346, "epoch": 1.83, "percentage": 60.92, "elapsed_time": "1:41:37", "remaining_time": "1:05:11"} +{"current_steps": 1690, "total_steps": 2766, "loss": 1.3129, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003291678226949741, "epoch": 1.83, "percentage": 61.1, "elapsed_time": "1:41:55", "remaining_time": "1:04:53"} +{"current_steps": 1695, "total_steps": 2766, "loss": 1.3235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003265019926584964, "epoch": 1.84, "percentage": 61.28, "elapsed_time": "1:42:12", "remaining_time": "1:04:35"} +{"current_steps": 1700, "total_steps": 2766, "loss": 1.3016, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00032384175799094297, "epoch": 1.84, "percentage": 61.46, "elapsed_time": "1:42:30", "remaining_time": "1:04:16"} +{"current_steps": 1700, "total_steps": 2766, "loss": null, "eval_loss": 1.3693352937698364, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.84, "percentage": 61.46, "elapsed_time": "1:42:30", "remaining_time": "1:04:16"} +{"current_steps": 1705, "total_steps": 2766, "loss": 1.2658, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003211872044857743, "epoch": 1.85, "percentage": 61.64, "elapsed_time": "1:42:58", "remaining_time": "1:04:05"} +{"current_steps": 1710, "total_steps": 2766, "loss": 1.274, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00031853841775323103, "epoch": 1.85, "percentage": 61.82, "elapsed_time": "1:43:16", "remaining_time": "1:03:46"} +{"current_steps": 1715, "total_steps": 2766, "loss": 1.2629, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00031589548321757366, "epoch": 1.86, "percentage": 62.0, "elapsed_time": "1:43:33", "remaining_time": "1:03:28"} +{"current_steps": 1720, "total_steps": 2766, "loss": 1.3052, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003132584861143274, "epoch": 1.86, "percentage": 62.18, "elapsed_time": "1:43:51", "remaining_time": "1:03:09"} +{"current_steps": 1725, "total_steps": 2766, "loss": 1.3099, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003106275114875332, "epoch": 1.87, "percentage": 62.36, "elapsed_time": "1:44:09", "remaining_time": "1:02:51"} +{"current_steps": 1730, "total_steps": 2766, "loss": 1.2878, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003080026441870051, "epoch": 1.87, "percentage": 62.55, "elapsed_time": "1:44:26", "remaining_time": "1:02:32"} +{"current_steps": 1735, "total_steps": 2766, "loss": 1.2815, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00030538396886559393, "epoch": 1.88, "percentage": 62.73, "elapsed_time": "1:44:44", "remaining_time": "1:02:14"} +{"current_steps": 1740, "total_steps": 2766, "loss": 1.2896, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00030277156997645706, "epoch": 1.89, "percentage": 62.91, "elapsed_time": "1:45:01", "remaining_time": "1:01:55"} +{"current_steps": 1745, "total_steps": 2766, "loss": 1.3545, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00030016553177033466, "epoch": 1.89, "percentage": 63.09, "elapsed_time": "1:45:19", "remaining_time": "1:01:37"} +{"current_steps": 1750, "total_steps": 2766, "loss": 1.29, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002975659382928332, "epoch": 1.9, "percentage": 63.27, "elapsed_time": "1:45:37", "remaining_time": "1:01:19"} +{"current_steps": 1755, "total_steps": 2766, "loss": 1.3543, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00029497287338171385, "epoch": 1.9, "percentage": 63.45, "elapsed_time": "1:45:54", "remaining_time": "1:01:00"} +{"current_steps": 1760, "total_steps": 2766, "loss": 1.3202, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00029238642066418995, "epoch": 1.91, "percentage": 63.63, "elapsed_time": "1:46:12", "remaining_time": "1:00:42"} +{"current_steps": 1765, "total_steps": 2766, "loss": 1.3261, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002898066635542288, "epoch": 1.91, "percentage": 63.81, "elapsed_time": "1:46:29", "remaining_time": "1:00:23"} +{"current_steps": 1770, "total_steps": 2766, "loss": 1.2256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002872336852498627, "epoch": 1.92, "percentage": 63.99, "elapsed_time": "1:46:47", "remaining_time": "1:00:05"} +{"current_steps": 1775, "total_steps": 2766, "loss": 1.3423, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002846675687305045, "epoch": 1.92, "percentage": 64.17, "elapsed_time": "1:47:04", "remaining_time": "0:59:47"} +{"current_steps": 1780, "total_steps": 2766, "loss": 1.2896, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002821083967542727, "epoch": 1.93, "percentage": 64.35, "elapsed_time": "1:47:22", "remaining_time": "0:59:28"} +{"current_steps": 1785, "total_steps": 2766, "loss": 1.2312, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027955625185532217, "epoch": 1.93, "percentage": 64.53, "elapsed_time": "1:47:40", "remaining_time": "0:59:10"} +{"current_steps": 1790, "total_steps": 2766, "loss": 1.2822, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027701121634118143, "epoch": 1.94, "percentage": 64.71, "elapsed_time": "1:47:57", "remaining_time": "0:58:51"} +{"current_steps": 1795, "total_steps": 2766, "loss": 1.319, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027447337229009937, "epoch": 1.95, "percentage": 64.9, "elapsed_time": "1:48:15", "remaining_time": "0:58:33"} +{"current_steps": 1800, "total_steps": 2766, "loss": 1.3727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027194280154839824, "epoch": 1.95, "percentage": 65.08, "elapsed_time": "1:48:32", "remaining_time": "0:58:15"} +{"current_steps": 1800, "total_steps": 2766, "loss": null, "eval_loss": 1.3620151281356812, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.95, "percentage": 65.08, "elapsed_time": "1:48:32", "remaining_time": "0:58:15"} +{"current_steps": 1805, "total_steps": 2766, "loss": 1.251, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002694195857278326, "epoch": 1.96, "percentage": 65.26, "elapsed_time": "1:49:01", "remaining_time": "0:58:02"} +{"current_steps": 1810, "total_steps": 2766, "loss": 1.2324, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002669038062029592, "epoch": 1.96, "percentage": 65.44, "elapsed_time": "1:49:18", "remaining_time": "0:57:44"} +{"current_steps": 1815, "total_steps": 2766, "loss": 1.2644, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002643955441085115, "epoch": 1.97, "percentage": 65.62, "elapsed_time": "1:49:36", "remaining_time": "0:57:25"} +{"current_steps": 1820, "total_steps": 2766, "loss": 1.2582, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000261894880336783, "epoch": 1.97, "percentage": 65.8, "elapsed_time": "1:49:53", "remaining_time": "0:57:07"} +{"current_steps": 1825, "total_steps": 2766, "loss": 1.3433, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002594018955350191, "epoch": 1.98, "percentage": 65.98, "elapsed_time": "1:50:11", "remaining_time": "0:56:49"} +{"current_steps": 1830, "total_steps": 2766, "loss": 1.3069, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00025691667010281616, "epoch": 1.98, "percentage": 66.16, "elapsed_time": "1:50:29", "remaining_time": "0:56:30"} +{"current_steps": 1835, "total_steps": 2766, "loss": 1.2895, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00025443928418952724, "epoch": 1.99, "percentage": 66.34, "elapsed_time": "1:50:46", "remaining_time": "0:56:12"} +{"current_steps": 1840, "total_steps": 2766, "loss": 1.2799, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002519698176916791, "epoch": 1.99, "percentage": 66.52, "elapsed_time": "1:51:04", "remaining_time": "0:55:53"} +{"current_steps": 1845, "total_steps": 2766, "loss": 1.3044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000249508350250395, "epoch": 2.0, "percentage": 66.7, "elapsed_time": "1:51:21", "remaining_time": "0:55:35"} +{"current_steps": 1850, "total_steps": 2766, "loss": 1.2324, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002470549612488247, "epoch": 2.0, "percentage": 66.88, "elapsed_time": "1:51:39", "remaining_time": "0:55:17"} +{"current_steps": 1855, "total_steps": 2766, "loss": 1.2357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002446097298095867, "epoch": 2.01, "percentage": 67.06, "elapsed_time": "1:51:57", "remaining_time": "0:54:58"} +{"current_steps": 1860, "total_steps": 2766, "loss": 1.2329, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00024217273479221514, "epoch": 2.02, "percentage": 67.25, "elapsed_time": "1:52:14", "remaining_time": "0:54:40"} +{"current_steps": 1865, "total_steps": 2766, "loss": 1.2486, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023974405479061623, "epoch": 2.02, "percentage": 67.43, "elapsed_time": "1:52:32", "remaining_time": "0:54:22"} +{"current_steps": 1870, "total_steps": 2766, "loss": 1.1959, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002373237681305348, "epoch": 2.03, "percentage": 67.61, "elapsed_time": "1:52:49", "remaining_time": "0:54:03"} +{"current_steps": 1875, "total_steps": 2766, "loss": 1.2485, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023491195286702777, "epoch": 2.03, "percentage": 67.79, "elapsed_time": "1:53:07", "remaining_time": "0:53:45"} +{"current_steps": 1880, "total_steps": 2766, "loss": 1.2585, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023250868678194536, "epoch": 2.04, "percentage": 67.97, "elapsed_time": "1:53:24", "remaining_time": "0:53:27"} +{"current_steps": 1885, "total_steps": 2766, "loss": 1.2108, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023011404738142532, "epoch": 2.04, "percentage": 68.15, "elapsed_time": "1:53:42", "remaining_time": "0:53:08"} +{"current_steps": 1890, "total_steps": 2766, "loss": 1.188, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002277281118933916, "epoch": 2.05, "percentage": 68.33, "elapsed_time": "1:54:00", "remaining_time": "0:52:50"} +{"current_steps": 1895, "total_steps": 2766, "loss": 1.2197, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00022535095726506344, "epoch": 2.05, "percentage": 68.51, "elapsed_time": "1:54:17", "remaining_time": "0:52:32"} +{"current_steps": 1900, "total_steps": 2766, "loss": 1.1352, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00022298266016047513, "epoch": 2.06, "percentage": 68.69, "elapsed_time": "1:54:35", "remaining_time": "0:52:13"} +{"current_steps": 1900, "total_steps": 2766, "loss": null, "eval_loss": 1.3637186288833618, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.06, "percentage": 68.69, "elapsed_time": "1:54:35", "remaining_time": "0:52:13"} +{"current_steps": 1905, "total_steps": 2766, "loss": 1.2265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002206232969580027, "epoch": 2.06, "percentage": 68.87, "elapsed_time": "1:55:03", "remaining_time": "0:52:00"} +{"current_steps": 1910, "total_steps": 2766, "loss": 1.2631, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021827294374790034, "epoch": 2.07, "percentage": 69.05, "elapsed_time": "1:55:21", "remaining_time": "0:51:41"} +{"current_steps": 1915, "total_steps": 2766, "loss": 1.1309, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021593167632984756, "epoch": 2.08, "percentage": 69.23, "elapsed_time": "1:55:38", "remaining_time": "0:51:23"} +{"current_steps": 1920, "total_steps": 2766, "loss": 1.2877, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021359957021050392, "epoch": 2.08, "percentage": 69.41, "elapsed_time": "1:55:56", "remaining_time": "0:51:05"} +{"current_steps": 1925, "total_steps": 2766, "loss": 1.2993, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021127670060107362, "epoch": 2.09, "percentage": 69.6, "elapsed_time": "1:56:13", "remaining_time": "0:50:46"} +{"current_steps": 1930, "total_steps": 2766, "loss": 1.2244, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020896314241488075, "epoch": 2.09, "percentage": 69.78, "elapsed_time": "1:56:31", "remaining_time": "0:50:28"} +{"current_steps": 1935, "total_steps": 2766, "loss": 1.1812, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002066589702649529, "epoch": 2.1, "percentage": 69.96, "elapsed_time": "1:56:49", "remaining_time": "0:50:10"} +{"current_steps": 1940, "total_steps": 2766, "loss": 1.2113, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020436425846161437, "epoch": 2.1, "percentage": 70.14, "elapsed_time": "1:57:06", "remaining_time": "0:49:51"} +{"current_steps": 1945, "total_steps": 2766, "loss": 1.1754, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020207908101009054, "epoch": 2.11, "percentage": 70.32, "elapsed_time": "1:57:24", "remaining_time": "0:49:33"} +{"current_steps": 1950, "total_steps": 2766, "loss": 1.1897, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019980351160812083, "epoch": 2.11, "percentage": 70.5, "elapsed_time": "1:57:41", "remaining_time": "0:49:15"} +{"current_steps": 1955, "total_steps": 2766, "loss": 1.1978, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001975376236435813, "epoch": 2.12, "percentage": 70.68, "elapsed_time": "1:57:59", "remaining_time": "0:48:56"} +{"current_steps": 1960, "total_steps": 2766, "loss": 1.1937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019528149019211883, "epoch": 2.12, "percentage": 70.86, "elapsed_time": "1:58:17", "remaining_time": "0:48:38"} +{"current_steps": 1965, "total_steps": 2766, "loss": 1.2093, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019303518401479414, "epoch": 2.13, "percentage": 71.04, "elapsed_time": "1:58:34", "remaining_time": "0:48:20"} +{"current_steps": 1970, "total_steps": 2766, "loss": 1.2119, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019079877755573442, "epoch": 2.13, "percentage": 71.22, "elapsed_time": "1:58:52", "remaining_time": "0:48:01"} +{"current_steps": 1975, "total_steps": 2766, "loss": 1.1933, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001885723429397983, "epoch": 2.14, "percentage": 71.4, "elapsed_time": "1:59:09", "remaining_time": "0:47:43"} +{"current_steps": 1980, "total_steps": 2766, "loss": 1.2046, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018635595197024886, "epoch": 2.15, "percentage": 71.58, "elapsed_time": "1:59:27", "remaining_time": "0:47:25"} +{"current_steps": 1985, "total_steps": 2766, "loss": 1.1605, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018414967612643814, "epoch": 2.15, "percentage": 71.76, "elapsed_time": "1:59:44", "remaining_time": "0:47:06"} +{"current_steps": 1990, "total_steps": 2766, "loss": 1.1764, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001819535865615018, "epoch": 2.16, "percentage": 71.95, "elapsed_time": "2:00:02", "remaining_time": "0:46:48"} +{"current_steps": 1995, "total_steps": 2766, "loss": 1.2094, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017976775410006508, "epoch": 2.16, "percentage": 72.13, "elapsed_time": "2:00:20", "remaining_time": "0:46:30"} +{"current_steps": 2000, "total_steps": 2766, "loss": 1.146, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000177592249235958, "epoch": 2.17, "percentage": 72.31, "elapsed_time": "2:00:37", "remaining_time": "0:46:12"} +{"current_steps": 2000, "total_steps": 2766, "loss": null, "eval_loss": 1.3614530563354492, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.17, "percentage": 72.31, "elapsed_time": "2:00:37", "remaining_time": "0:46:12"} +{"current_steps": 2005, "total_steps": 2766, "loss": 1.2674, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017542714212994188, "epoch": 2.17, "percentage": 72.49, "elapsed_time": "2:01:06", "remaining_time": "0:45:57"} +{"current_steps": 2010, "total_steps": 2766, "loss": 1.2817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017327250260744698, "epoch": 2.18, "percentage": 72.67, "elapsed_time": "2:01:23", "remaining_time": "0:45:39"} +{"current_steps": 2015, "total_steps": 2766, "loss": 1.2693, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017112840015632086, "epoch": 2.18, "percentage": 72.85, "elapsed_time": "2:01:41", "remaining_time": "0:45:21"} +{"current_steps": 2020, "total_steps": 2766, "loss": 1.253, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016899490392458628, "epoch": 2.19, "percentage": 73.03, "elapsed_time": "2:01:58", "remaining_time": "0:45:02"} +{"current_steps": 2025, "total_steps": 2766, "loss": 1.208, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016687208271821253, "epoch": 2.19, "percentage": 73.21, "elapsed_time": "2:02:16", "remaining_time": "0:44:44"} +{"current_steps": 2030, "total_steps": 2766, "loss": 1.1818, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016476000499889514, "epoch": 2.2, "percentage": 73.39, "elapsed_time": "2:02:33", "remaining_time": "0:44:26"} +{"current_steps": 2035, "total_steps": 2766, "loss": 1.1945, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001626587388818491, "epoch": 2.21, "percentage": 73.57, "elapsed_time": "2:02:51", "remaining_time": "0:44:07"} +{"current_steps": 2040, "total_steps": 2766, "loss": 1.2225, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001605683521336116, "epoch": 2.21, "percentage": 73.75, "elapsed_time": "2:03:09", "remaining_time": "0:43:49"} +{"current_steps": 2045, "total_steps": 2766, "loss": 1.1726, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015848891216985596, "epoch": 2.22, "percentage": 73.93, "elapsed_time": "2:03:26", "remaining_time": "0:43:31"} +{"current_steps": 2050, "total_steps": 2766, "loss": 1.1651, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015642048605321856, "epoch": 2.22, "percentage": 74.11, "elapsed_time": "2:03:44", "remaining_time": "0:43:13"} +{"current_steps": 2055, "total_steps": 2766, "loss": 1.2148, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001543631404911356, "epoch": 2.23, "percentage": 74.3, "elapsed_time": "2:04:01", "remaining_time": "0:42:54"} +{"current_steps": 2060, "total_steps": 2766, "loss": 1.191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015231694183369106, "epoch": 2.23, "percentage": 74.48, "elapsed_time": "2:04:19", "remaining_time": "0:42:36"} +{"current_steps": 2065, "total_steps": 2766, "loss": 1.2421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001502819560714781, "epoch": 2.24, "percentage": 74.66, "elapsed_time": "2:04:37", "remaining_time": "0:42:18"} +{"current_steps": 2070, "total_steps": 2766, "loss": 1.1924, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014825824883347018, "epoch": 2.24, "percentage": 74.84, "elapsed_time": "2:04:54", "remaining_time": "0:41:59"} +{"current_steps": 2075, "total_steps": 2766, "loss": 1.1714, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014624588538490413, "epoch": 2.25, "percentage": 75.02, "elapsed_time": "2:05:12", "remaining_time": "0:41:41"} +{"current_steps": 2080, "total_steps": 2766, "loss": 1.2641, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014424493062517623, "epoch": 2.25, "percentage": 75.2, "elapsed_time": "2:05:29", "remaining_time": "0:41:23"} +{"current_steps": 2085, "total_steps": 2766, "loss": 1.2721, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014225544908574872, "epoch": 2.26, "percentage": 75.38, "elapsed_time": "2:05:47", "remaining_time": "0:41:05"} +{"current_steps": 2090, "total_steps": 2766, "loss": 1.2431, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014027750492806817, "epoch": 2.26, "percentage": 75.56, "elapsed_time": "2:06:04", "remaining_time": "0:40:46"} +{"current_steps": 2095, "total_steps": 2766, "loss": 1.2983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013831116194149712, "epoch": 2.27, "percentage": 75.74, "elapsed_time": "2:06:22", "remaining_time": "0:40:28"} +{"current_steps": 2100, "total_steps": 2766, "loss": 1.2144, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013635648354125662, "epoch": 2.28, "percentage": 75.92, "elapsed_time": "2:06:40", "remaining_time": "0:40:10"} +{"current_steps": 2100, "total_steps": 2766, "loss": null, "eval_loss": 1.3538448810577393, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.28, "percentage": 75.92, "elapsed_time": "2:06:40", "remaining_time": "0:40:10"} +{"current_steps": 2105, "total_steps": 2766, "loss": 1.1463, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001344135327663804, "epoch": 2.28, "percentage": 76.1, "elapsed_time": "2:07:08", "remaining_time": "0:39:55"} +{"current_steps": 2110, "total_steps": 2766, "loss": 1.2751, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013248237227768246, "epoch": 2.29, "percentage": 76.28, "elapsed_time": "2:07:26", "remaining_time": "0:39:37"} +{"current_steps": 2115, "total_steps": 2766, "loss": 1.2196, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013056306435573633, "epoch": 2.29, "percentage": 76.46, "elapsed_time": "2:07:43", "remaining_time": "0:39:18"} +{"current_steps": 2120, "total_steps": 2766, "loss": 1.1964, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012865567089886642, "epoch": 2.3, "percentage": 76.64, "elapsed_time": "2:08:01", "remaining_time": "0:39:00"} +{"current_steps": 2125, "total_steps": 2766, "loss": 1.1749, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012676025342115105, "epoch": 2.3, "percentage": 76.83, "elapsed_time": "2:08:18", "remaining_time": "0:38:42"} +{"current_steps": 2130, "total_steps": 2766, "loss": 1.2615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012487687305043978, "epoch": 2.31, "percentage": 77.01, "elapsed_time": "2:08:36", "remaining_time": "0:38:24"} +{"current_steps": 2135, "total_steps": 2766, "loss": 1.2064, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012300559052638122, "epoch": 2.31, "percentage": 77.19, "elapsed_time": "2:08:53", "remaining_time": "0:38:05"} +{"current_steps": 2140, "total_steps": 2766, "loss": 1.1642, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012114646619846425, "epoch": 2.32, "percentage": 77.37, "elapsed_time": "2:09:11", "remaining_time": "0:37:47"} +{"current_steps": 2145, "total_steps": 2766, "loss": 1.1704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011929956002407194, "epoch": 2.32, "percentage": 77.55, "elapsed_time": "2:09:29", "remaining_time": "0:37:29"} +{"current_steps": 2150, "total_steps": 2766, "loss": 1.1668, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011746493156654814, "epoch": 2.33, "percentage": 77.73, "elapsed_time": "2:09:46", "remaining_time": "0:37:10"} +{"current_steps": 2155, "total_steps": 2766, "loss": 1.1584, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011564263999327546, "epoch": 2.34, "percentage": 77.91, "elapsed_time": "2:10:04", "remaining_time": "0:36:52"} +{"current_steps": 2160, "total_steps": 2766, "loss": 1.2412, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011383274407376848, "epoch": 2.34, "percentage": 78.09, "elapsed_time": "2:10:21", "remaining_time": "0:36:34"} +{"current_steps": 2165, "total_steps": 2766, "loss": 1.1688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001120353021777778, "epoch": 2.35, "percentage": 78.27, "elapsed_time": "2:10:39", "remaining_time": "0:36:16"} +{"current_steps": 2170, "total_steps": 2766, "loss": 1.2097, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011025037227340711, "epoch": 2.35, "percentage": 78.45, "elapsed_time": "2:10:57", "remaining_time": "0:35:57"} +{"current_steps": 2175, "total_steps": 2766, "loss": 1.2057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010847801192524454, "epoch": 2.36, "percentage": 78.63, "elapsed_time": "2:11:14", "remaining_time": "0:35:39"} +{"current_steps": 2180, "total_steps": 2766, "loss": 1.2296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010671827829250585, "epoch": 2.36, "percentage": 78.81, "elapsed_time": "2:11:32", "remaining_time": "0:35:21"} +{"current_steps": 2185, "total_steps": 2766, "loss": 1.2547, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010497122812719068, "epoch": 2.37, "percentage": 78.99, "elapsed_time": "2:11:49", "remaining_time": "0:35:03"} +{"current_steps": 2190, "total_steps": 2766, "loss": 1.1746, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010323691777225286, "epoch": 2.37, "percentage": 79.18, "elapsed_time": "2:12:07", "remaining_time": "0:34:45"} +{"current_steps": 2195, "total_steps": 2766, "loss": 1.1466, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010151540315978314, "epoch": 2.38, "percentage": 79.36, "elapsed_time": "2:12:24", "remaining_time": "0:34:26"} +{"current_steps": 2200, "total_steps": 2766, "loss": 1.1551, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.98067398092049e-05, "epoch": 2.38, "percentage": 79.54, "elapsed_time": "2:12:42", "remaining_time": "0:34:08"} +{"current_steps": 2200, "total_steps": 2766, "loss": null, "eval_loss": 1.349250316619873, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.38, "percentage": 79.54, "elapsed_time": "2:12:42", "remaining_time": "0:34:08"} +{"current_steps": 2205, "total_steps": 2766, "loss": 1.158, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.811098282548447e-05, "epoch": 2.39, "percentage": 79.72, "elapsed_time": "2:13:10", "remaining_time": "0:33:53"} +{"current_steps": 2210, "total_steps": 2766, "loss": 1.1444, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.642818689735305e-05, "epoch": 2.4, "percentage": 79.9, "elapsed_time": "2:13:28", "remaining_time": "0:33:34"} +{"current_steps": 2215, "total_steps": 2766, "loss": 1.2504, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.475840629554394e-05, "epoch": 2.4, "percentage": 80.08, "elapsed_time": "2:13:46", "remaining_time": "0:33:16"} +{"current_steps": 2220, "total_steps": 2766, "loss": 1.1439, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.310169487104131e-05, "epoch": 2.41, "percentage": 80.26, "elapsed_time": "2:14:03", "remaining_time": "0:32:58"} +{"current_steps": 2225, "total_steps": 2766, "loss": 1.2758, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.145810605334454e-05, "epoch": 2.41, "percentage": 80.44, "elapsed_time": "2:14:21", "remaining_time": "0:32:40"} +{"current_steps": 2230, "total_steps": 2766, "loss": 1.1992, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.982769284874386e-05, "epoch": 2.42, "percentage": 80.62, "elapsed_time": "2:14:38", "remaining_time": "0:32:21"} +{"current_steps": 2235, "total_steps": 2766, "loss": 1.2177, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.821050783861212e-05, "epoch": 2.42, "percentage": 80.8, "elapsed_time": "2:14:56", "remaining_time": "0:32:03"} +{"current_steps": 2240, "total_steps": 2766, "loss": 1.1942, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.660660317770841e-05, "epoch": 2.43, "percentage": 80.98, "elapsed_time": "2:15:13", "remaining_time": "0:31:45"} +{"current_steps": 2245, "total_steps": 2766, "loss": 1.163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.501603059249563e-05, "epoch": 2.43, "percentage": 81.16, "elapsed_time": "2:15:31", "remaining_time": "0:31:27"} +{"current_steps": 2250, "total_steps": 2766, "loss": 1.239, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.343884137947333e-05, "epoch": 2.44, "percentage": 81.34, "elapsed_time": "2:15:49", "remaining_time": "0:31:08"} +{"current_steps": 2255, "total_steps": 2766, "loss": 1.1455, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.187508640352265e-05, "epoch": 2.44, "percentage": 81.53, "elapsed_time": "2:16:06", "remaining_time": "0:30:50"} +{"current_steps": 2260, "total_steps": 2766, "loss": 1.2165, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.032481609626575e-05, "epoch": 2.45, "percentage": 81.71, "elapsed_time": "2:16:24", "remaining_time": "0:30:32"} +{"current_steps": 2265, "total_steps": 2766, "loss": 1.1982, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.878808045444014e-05, "epoch": 2.45, "percentage": 81.89, "elapsed_time": "2:16:41", "remaining_time": "0:30:14"} +{"current_steps": 2270, "total_steps": 2766, "loss": 1.212, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.726492903828575e-05, "epoch": 2.46, "percentage": 82.07, "elapsed_time": "2:16:59", "remaining_time": "0:29:55"} +{"current_steps": 2275, "total_steps": 2766, "loss": 1.2453, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.575541096994637e-05, "epoch": 2.47, "percentage": 82.25, "elapsed_time": "2:17:17", "remaining_time": "0:29:37"} +{"current_steps": 2280, "total_steps": 2766, "loss": 1.2607, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.4259574931886e-05, "epoch": 2.47, "percentage": 82.43, "elapsed_time": "2:17:34", "remaining_time": "0:29:19"} +{"current_steps": 2285, "total_steps": 2766, "loss": 1.1936, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.27774691653188e-05, "epoch": 2.48, "percentage": 82.61, "elapsed_time": "2:17:52", "remaining_time": "0:29:01"} +{"current_steps": 2290, "total_steps": 2766, "loss": 1.2702, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.130914146865247e-05, "epoch": 2.48, "percentage": 82.79, "elapsed_time": "2:18:09", "remaining_time": "0:28:43"} +{"current_steps": 2295, "total_steps": 2766, "loss": 1.133, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.985463919594781e-05, "epoch": 2.49, "percentage": 82.97, "elapsed_time": "2:18:27", "remaining_time": "0:28:24"} +{"current_steps": 2300, "total_steps": 2766, "loss": 1.2135, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.841400925539104e-05, "epoch": 2.49, "percentage": 83.15, "elapsed_time": "2:18:44", "remaining_time": "0:28:06"} +{"current_steps": 2300, "total_steps": 2766, "loss": null, "eval_loss": 1.3470078706741333, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.49, "percentage": 83.15, "elapsed_time": "2:18:44", "remaining_time": "0:28:06"} +{"current_steps": 2305, "total_steps": 2766, "loss": 1.1986, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.698729810778065e-05, "epoch": 2.5, "percentage": 83.33, "elapsed_time": "2:19:13", "remaining_time": "0:27:50"} +{"current_steps": 2310, "total_steps": 2766, "loss": 1.2254, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.557455176502986e-05, "epoch": 2.5, "percentage": 83.51, "elapsed_time": "2:19:30", "remaining_time": "0:27:32"} +{"current_steps": 2315, "total_steps": 2766, "loss": 1.212, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.417581578868198e-05, "epoch": 2.51, "percentage": 83.69, "elapsed_time": "2:19:48", "remaining_time": "0:27:14"} +{"current_steps": 2320, "total_steps": 2766, "loss": 1.1517, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.279113528844127e-05, "epoch": 2.51, "percentage": 83.88, "elapsed_time": "2:20:06", "remaining_time": "0:26:55"} +{"current_steps": 2325, "total_steps": 2766, "loss": 1.1889, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.14205549207184e-05, "epoch": 2.52, "percentage": 84.06, "elapsed_time": "2:20:23", "remaining_time": "0:26:37"} +{"current_steps": 2330, "total_steps": 2766, "loss": 1.2348, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.006411888718982e-05, "epoch": 2.53, "percentage": 84.24, "elapsed_time": "2:20:41", "remaining_time": "0:26:19"} +{"current_steps": 2335, "total_steps": 2766, "loss": 1.1862, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.872187093337239e-05, "epoch": 2.53, "percentage": 84.42, "elapsed_time": "2:20:58", "remaining_time": "0:26:01"} +{"current_steps": 2340, "total_steps": 2766, "loss": 1.2143, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.739385434721295e-05, "epoch": 2.54, "percentage": 84.6, "elapsed_time": "2:21:16", "remaining_time": "0:25:43"} +{"current_steps": 2345, "total_steps": 2766, "loss": 1.242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.608011195769186e-05, "epoch": 2.54, "percentage": 84.78, "elapsed_time": "2:21:33", "remaining_time": "0:25:24"} +{"current_steps": 2350, "total_steps": 2766, "loss": 1.1817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.478068613344151e-05, "epoch": 2.55, "percentage": 84.96, "elapsed_time": "2:21:51", "remaining_time": "0:25:06"} +{"current_steps": 2355, "total_steps": 2766, "loss": 1.1916, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.3495618781380764e-05, "epoch": 2.55, "percentage": 85.14, "elapsed_time": "2:22:09", "remaining_time": "0:24:48"} +{"current_steps": 2360, "total_steps": 2766, "loss": 1.1231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.2224951345362703e-05, "epoch": 2.56, "percentage": 85.32, "elapsed_time": "2:22:26", "remaining_time": "0:24:30"} +{"current_steps": 2365, "total_steps": 2766, "loss": 1.2113, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.096872480483816e-05, "epoch": 2.56, "percentage": 85.5, "elapsed_time": "2:22:44", "remaining_time": "0:24:12"} +{"current_steps": 2370, "total_steps": 2766, "loss": 1.164, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.972697967353445e-05, "epoch": 2.57, "percentage": 85.68, "elapsed_time": "2:23:01", "remaining_time": "0:23:53"} +{"current_steps": 2375, "total_steps": 2766, "loss": 1.1947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8499755998148656e-05, "epoch": 2.57, "percentage": 85.86, "elapsed_time": "2:23:19", "remaining_time": "0:23:35"} +{"current_steps": 2380, "total_steps": 2766, "loss": 1.2219, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.728709335705561e-05, "epoch": 2.58, "percentage": 86.04, "elapsed_time": "2:23:37", "remaining_time": "0:23:17"} +{"current_steps": 2385, "total_steps": 2766, "loss": 1.2104, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6089030859032376e-05, "epoch": 2.58, "percentage": 86.23, "elapsed_time": "2:23:54", "remaining_time": "0:22:59"} +{"current_steps": 2390, "total_steps": 2766, "loss": 1.2077, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.490560714199637e-05, "epoch": 2.59, "percentage": 86.41, "elapsed_time": "2:24:12", "remaining_time": "0:22:41"} +{"current_steps": 2395, "total_steps": 2766, "loss": 1.1758, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.373686037175917e-05, "epoch": 2.6, "percentage": 86.59, "elapsed_time": "2:24:29", "remaining_time": "0:22:23"} +{"current_steps": 2400, "total_steps": 2766, "loss": 1.2094, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.258282824079618e-05, "epoch": 2.6, "percentage": 86.77, "elapsed_time": "2:24:47", "remaining_time": "0:22:04"} +{"current_steps": 2400, "total_steps": 2766, "loss": null, "eval_loss": 1.3436678647994995, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.6, "percentage": 86.77, "elapsed_time": "2:24:47", "remaining_time": "0:22:04"} +{"current_steps": 2405, "total_steps": 2766, "loss": 1.2, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1443547967030816e-05, "epoch": 2.61, "percentage": 86.95, "elapsed_time": "2:25:15", "remaining_time": "0:21:48"} +{"current_steps": 2410, "total_steps": 2766, "loss": 1.2246, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.031905629263371e-05, "epoch": 2.61, "percentage": 87.13, "elapsed_time": "2:25:33", "remaining_time": "0:21:30"} +{"current_steps": 2415, "total_steps": 2766, "loss": 1.198, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.92093894828387e-05, "epoch": 2.62, "percentage": 87.31, "elapsed_time": "2:25:50", "remaining_time": "0:21:11"} +{"current_steps": 2420, "total_steps": 2766, "loss": 1.269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.811458332477252e-05, "epoch": 2.62, "percentage": 87.49, "elapsed_time": "2:26:08", "remaining_time": "0:20:53"} +{"current_steps": 2425, "total_steps": 2766, "loss": 1.189, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.703467312630088e-05, "epoch": 2.63, "percentage": 87.67, "elapsed_time": "2:26:26", "remaining_time": "0:20:35"} +{"current_steps": 2430, "total_steps": 2766, "loss": 1.1938, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.596969371488995e-05, "epoch": 2.63, "percentage": 87.85, "elapsed_time": "2:26:43", "remaining_time": "0:20:17"} +{"current_steps": 2435, "total_steps": 2766, "loss": 1.2421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.491967943648289e-05, "epoch": 2.64, "percentage": 88.03, "elapsed_time": "2:27:01", "remaining_time": "0:19:59"} +{"current_steps": 2440, "total_steps": 2766, "loss": 1.1145, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.388466415439234e-05, "epoch": 2.64, "percentage": 88.21, "elapsed_time": "2:27:18", "remaining_time": "0:19:40"} +{"current_steps": 2445, "total_steps": 2766, "loss": 1.1678, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2864681248208184e-05, "epoch": 2.65, "percentage": 88.39, "elapsed_time": "2:27:36", "remaining_time": "0:19:22"} +{"current_steps": 2450, "total_steps": 2766, "loss": 1.2627, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.185976361272125e-05, "epoch": 2.66, "percentage": 88.58, "elapsed_time": "2:27:53", "remaining_time": "0:19:04"} +{"current_steps": 2455, "total_steps": 2766, "loss": 1.1344, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.086994365686246e-05, "epoch": 2.66, "percentage": 88.76, "elapsed_time": "2:28:11", "remaining_time": "0:18:46"} +{"current_steps": 2460, "total_steps": 2766, "loss": 1.2325, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9895253302657188e-05, "epoch": 2.67, "percentage": 88.94, "elapsed_time": "2:28:29", "remaining_time": "0:18:28"} +{"current_steps": 2465, "total_steps": 2766, "loss": 1.2533, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8935723984196304e-05, "epoch": 2.67, "percentage": 89.12, "elapsed_time": "2:28:46", "remaining_time": "0:18:10"} +{"current_steps": 2470, "total_steps": 2766, "loss": 1.226, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7991386646622207e-05, "epoch": 2.68, "percentage": 89.3, "elapsed_time": "2:29:04", "remaining_time": "0:17:51"} +{"current_steps": 2475, "total_steps": 2766, "loss": 1.0925, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7062271745130595e-05, "epoch": 2.68, "percentage": 89.48, "elapsed_time": "2:29:21", "remaining_time": "0:17:33"} +{"current_steps": 2480, "total_steps": 2766, "loss": 1.1444, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.614840924398876e-05, "epoch": 2.69, "percentage": 89.66, "elapsed_time": "2:29:39", "remaining_time": "0:17:15"} +{"current_steps": 2485, "total_steps": 2766, "loss": 1.214, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5249828615568794e-05, "epoch": 2.69, "percentage": 89.84, "elapsed_time": "2:29:57", "remaining_time": "0:16:57"} +{"current_steps": 2490, "total_steps": 2766, "loss": 1.2427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.436655883939737e-05, "epoch": 2.7, "percentage": 90.02, "elapsed_time": "2:30:14", "remaining_time": "0:16:39"} +{"current_steps": 2495, "total_steps": 2766, "loss": 1.2447, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3498628401221078e-05, "epoch": 2.7, "percentage": 90.2, "elapsed_time": "2:30:32", "remaining_time": "0:16:21"} +{"current_steps": 2500, "total_steps": 2766, "loss": 1.1835, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2646065292087403e-05, "epoch": 2.71, "percentage": 90.38, "elapsed_time": "2:30:49", "remaining_time": "0:16:02"} +{"current_steps": 2500, "total_steps": 2766, "loss": null, "eval_loss": 1.343565821647644, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.71, "percentage": 90.38, "elapsed_time": "2:30:49", "remaining_time": "0:16:02"} +{"current_steps": 2505, "total_steps": 2766, "loss": 1.2457, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1808897007442762e-05, "epoch": 2.71, "percentage": 90.56, "elapsed_time": "2:31:18", "remaining_time": "0:15:45"} +{"current_steps": 2510, "total_steps": 2766, "loss": 1.2248, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.098715054624506e-05, "epoch": 2.72, "percentage": 90.74, "elapsed_time": "2:31:35", "remaining_time": "0:15:27"} +{"current_steps": 2515, "total_steps": 2766, "loss": 1.2419, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0180852410093153e-05, "epoch": 2.73, "percentage": 90.93, "elapsed_time": "2:31:53", "remaining_time": "0:15:09"} +{"current_steps": 2520, "total_steps": 2766, "loss": 1.1763, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.939002860237249e-05, "epoch": 2.73, "percentage": 91.11, "elapsed_time": "2:32:10", "remaining_time": "0:14:51"} +{"current_steps": 2525, "total_steps": 2766, "loss": 1.2035, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8614704627416045e-05, "epoch": 2.74, "percentage": 91.29, "elapsed_time": "2:32:28", "remaining_time": "0:14:33"} +{"current_steps": 2530, "total_steps": 2766, "loss": 1.1767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7854905489681993e-05, "epoch": 2.74, "percentage": 91.47, "elapsed_time": "2:32:45", "remaining_time": "0:14:15"} +{"current_steps": 2535, "total_steps": 2766, "loss": 1.254, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7110655692947397e-05, "epoch": 2.75, "percentage": 91.65, "elapsed_time": "2:33:03", "remaining_time": "0:13:56"} +{"current_steps": 2540, "total_steps": 2766, "loss": 1.1941, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.638197923951784e-05, "epoch": 2.75, "percentage": 91.83, "elapsed_time": "2:33:21", "remaining_time": "0:13:38"} +{"current_steps": 2545, "total_steps": 2766, "loss": 1.2568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5668899629453225e-05, "epoch": 2.76, "percentage": 92.01, "elapsed_time": "2:33:38", "remaining_time": "0:13:20"} +{"current_steps": 2550, "total_steps": 2766, "loss": 1.2237, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4971439859810199e-05, "epoch": 2.76, "percentage": 92.19, "elapsed_time": "2:33:56", "remaining_time": "0:13:02"} +{"current_steps": 2555, "total_steps": 2766, "loss": 1.172, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.428962242390025e-05, "epoch": 2.77, "percentage": 92.37, "elapsed_time": "2:34:13", "remaining_time": "0:12:44"} +{"current_steps": 2560, "total_steps": 2766, "loss": 1.1835, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3623469310564408e-05, "epoch": 2.77, "percentage": 92.55, "elapsed_time": "2:34:31", "remaining_time": "0:12:26"} +{"current_steps": 2565, "total_steps": 2766, "loss": 1.1335, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2973002003463797e-05, "epoch": 2.78, "percentage": 92.73, "elapsed_time": "2:34:49", "remaining_time": "0:12:07"} +{"current_steps": 2570, "total_steps": 2766, "loss": 1.1968, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2338241480387369e-05, "epoch": 2.79, "percentage": 92.91, "elapsed_time": "2:35:06", "remaining_time": "0:11:49"} +{"current_steps": 2575, "total_steps": 2766, "loss": 1.1962, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1719208212574939e-05, "epoch": 2.79, "percentage": 93.09, "elapsed_time": "2:35:24", "remaining_time": "0:11:31"} +{"current_steps": 2580, "total_steps": 2766, "loss": 1.2107, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.111592216405688e-05, "epoch": 2.8, "percentage": 93.28, "elapsed_time": "2:35:41", "remaining_time": "0:11:13"} +{"current_steps": 2585, "total_steps": 2766, "loss": 1.2148, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0528402791010582e-05, "epoch": 2.8, "percentage": 93.46, "elapsed_time": "2:35:59", "remaining_time": "0:10:55"} +{"current_steps": 2590, "total_steps": 2766, "loss": 1.1443, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.956669041133015e-06, "epoch": 2.81, "percentage": 93.64, "elapsed_time": "2:36:17", "remaining_time": "0:10:37"} +{"current_steps": 2595, "total_steps": 2766, "loss": 1.1805, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.400739353029209e-06, "epoch": 2.81, "percentage": 93.82, "elapsed_time": "2:36:34", "remaining_time": "0:10:19"} +{"current_steps": 2600, "total_steps": 2766, "loss": 1.2061, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.860631655618124e-06, "epoch": 2.82, "percentage": 94.0, "elapsed_time": "2:36:52", "remaining_time": "0:10:00"} +{"current_steps": 2600, "total_steps": 2766, "loss": null, "eval_loss": 1.3423666954040527, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.82, "percentage": 94.0, "elapsed_time": "2:36:52", "remaining_time": "0:10:00"} +{"current_steps": 2605, "total_steps": 2766, "loss": 1.2418, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.336363367554112e-06, "epoch": 2.82, "percentage": 94.18, "elapsed_time": "2:37:20", "remaining_time": "0:09:43"} +{"current_steps": 2610, "total_steps": 2766, "loss": 1.2532, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.827951396665312e-06, "epoch": 2.83, "percentage": 94.36, "elapsed_time": "2:37:38", "remaining_time": "0:09:25"} +{"current_steps": 2615, "total_steps": 2766, "loss": 1.2836, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.335412139408248e-06, "epoch": 2.83, "percentage": 94.54, "elapsed_time": "2:37:55", "remaining_time": "0:09:07"} +{"current_steps": 2620, "total_steps": 2766, "loss": 1.2918, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.85876148033926e-06, "epoch": 2.84, "percentage": 94.72, "elapsed_time": "2:38:13", "remaining_time": "0:08:49"} +{"current_steps": 2625, "total_steps": 2766, "loss": 1.1381, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.398014791601847e-06, "epoch": 2.84, "percentage": 94.9, "elapsed_time": "2:38:30", "remaining_time": "0:08:30"} +{"current_steps": 2630, "total_steps": 2766, "loss": 1.1686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.953186932431298e-06, "epoch": 2.85, "percentage": 95.08, "elapsed_time": "2:38:48", "remaining_time": "0:08:12"} +{"current_steps": 2635, "total_steps": 2766, "loss": 1.2104, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.524292248675289e-06, "epoch": 2.86, "percentage": 95.26, "elapsed_time": "2:39:05", "remaining_time": "0:07:54"} +{"current_steps": 2640, "total_steps": 2766, "loss": 1.2651, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.111344572331145e-06, "epoch": 2.86, "percentage": 95.44, "elapsed_time": "2:39:23", "remaining_time": "0:07:36"} +{"current_steps": 2645, "total_steps": 2766, "loss": 1.1296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.714357221099974e-06, "epoch": 2.87, "percentage": 95.63, "elapsed_time": "2:39:41", "remaining_time": "0:07:18"} +{"current_steps": 2650, "total_steps": 2766, "loss": 1.1534, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.333342997957013e-06, "epoch": 2.87, "percentage": 95.81, "elapsed_time": "2:39:58", "remaining_time": "0:07:00"} +{"current_steps": 2655, "total_steps": 2766, "loss": 1.2065, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.96831419073862e-06, "epoch": 2.88, "percentage": 95.99, "elapsed_time": "2:40:16", "remaining_time": "0:06:42"} +{"current_steps": 2660, "total_steps": 2766, "loss": 1.2118, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6192825717464294e-06, "epoch": 2.88, "percentage": 96.17, "elapsed_time": "2:40:33", "remaining_time": "0:06:23"} +{"current_steps": 2665, "total_steps": 2766, "loss": 1.1139, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2862593973670975e-06, "epoch": 2.89, "percentage": 96.35, "elapsed_time": "2:40:51", "remaining_time": "0:06:05"} +{"current_steps": 2670, "total_steps": 2766, "loss": 1.1702, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.969255407709648e-06, "epoch": 2.89, "percentage": 96.53, "elapsed_time": "2:41:09", "remaining_time": "0:05:47"} +{"current_steps": 2675, "total_steps": 2766, "loss": 1.1821, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.668280826259195e-06, "epoch": 2.9, "percentage": 96.71, "elapsed_time": "2:41:26", "remaining_time": "0:05:29"} +{"current_steps": 2680, "total_steps": 2766, "loss": 1.1826, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.383345359546818e-06, "epoch": 2.9, "percentage": 96.89, "elapsed_time": "2:41:44", "remaining_time": "0:05:11"} +{"current_steps": 2685, "total_steps": 2766, "loss": 1.236, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1144581968369213e-06, "epoch": 2.91, "percentage": 97.07, "elapsed_time": "2:42:01", "remaining_time": "0:04:53"} +{"current_steps": 2690, "total_steps": 2766, "loss": 1.2308, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.861628009830696e-06, "epoch": 2.92, "percentage": 97.25, "elapsed_time": "2:42:19", "remaining_time": "0:04:35"} +{"current_steps": 2695, "total_steps": 2766, "loss": 1.1957, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6248629523865077e-06, "epoch": 2.92, "percentage": 97.43, "elapsed_time": "2:42:37", "remaining_time": "0:04:17"} +{"current_steps": 2700, "total_steps": 2766, "loss": 1.1613, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4041706602567206e-06, "epoch": 2.93, "percentage": 97.61, "elapsed_time": "2:42:54", "remaining_time": "0:03:58"} +{"current_steps": 2700, "total_steps": 2766, "loss": null, "eval_loss": 1.3419121503829956, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.93, "percentage": 97.61, "elapsed_time": "2:42:54", "remaining_time": "0:03:58"} +{"current_steps": 2705, "total_steps": 2766, "loss": 1.152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1995582508418924e-06, "epoch": 2.93, "percentage": 97.79, "elapsed_time": "2:43:22", "remaining_time": "0:03:41"} +{"current_steps": 2710, "total_steps": 2766, "loss": 1.2173, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0110323229608476e-06, "epoch": 2.94, "percentage": 97.98, "elapsed_time": "2:43:40", "remaining_time": "0:03:22"} +{"current_steps": 2715, "total_steps": 2766, "loss": 1.2202, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.385989566379593e-07, "epoch": 2.94, "percentage": 98.16, "elapsed_time": "2:43:58", "remaining_time": "0:03:04"} +{"current_steps": 2720, "total_steps": 2766, "loss": 1.1492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.82263712907083e-07, "epoch": 2.95, "percentage": 98.34, "elapsed_time": "2:44:15", "remaining_time": "0:02:46"} +{"current_steps": 2725, "total_steps": 2766, "loss": 1.2548, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.420316336323117e-07, "epoch": 2.95, "percentage": 98.52, "elapsed_time": "2:44:33", "remaining_time": "0:02:28"} +{"current_steps": 2730, "total_steps": 2766, "loss": 1.2085, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1790724134521676e-07, "epoch": 2.96, "percentage": 98.7, "elapsed_time": "2:44:50", "remaining_time": "0:02:10"} +{"current_steps": 2735, "total_steps": 2766, "loss": 1.2253, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.098945390991315e-07, "epoch": 2.96, "percentage": 98.88, "elapsed_time": "2:45:08", "remaining_time": "0:01:52"} +{"current_steps": 2740, "total_steps": 2766, "loss": 1.1604, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.179970103398654e-07, "epoch": 2.97, "percentage": 99.06, "elapsed_time": "2:45:25", "remaining_time": "0:01:34"} +{"current_steps": 2745, "total_steps": 2766, "loss": 1.2269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4221761879351648e-07, "epoch": 2.97, "percentage": 99.24, "elapsed_time": "2:45:43", "remaining_time": "0:01:16"} +{"current_steps": 2750, "total_steps": 2766, "loss": 1.2413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.25588083709361e-08, "epoch": 2.98, "percentage": 99.42, "elapsed_time": "2:46:01", "remaining_time": "0:00:57"} +{"current_steps": 2755, "total_steps": 2766, "loss": 1.2255, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9022503088737006e-08, "epoch": 2.99, "percentage": 99.6, "elapsed_time": "2:46:18", "remaining_time": "0:00:39"} +{"current_steps": 2760, "total_steps": 2766, "loss": 1.2192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1610107007398175e-08, "epoch": 2.99, "percentage": 99.78, "elapsed_time": "2:46:36", "remaining_time": "0:00:21"} +{"current_steps": 2765, "total_steps": 2766, "loss": 1.1758, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2250418585677564e-10, "epoch": 3.0, "percentage": 99.96, "elapsed_time": "2:46:53", "remaining_time": "0:00:03"} +{"current_steps": 2766, "total_steps": 2766, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "2:46:57", "remaining_time": "0:00:00"} +{"current_steps": 19, "total_steps": 19, "loss": null, "eval_loss": 1.3419121503829956, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "2:47:07", "remaining_time": "0:00:00"} diff --git a/PT/.ipynb_checkpoints/trainer_state-checkpoint.json b/PT/.ipynb_checkpoints/trainer_state-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..bcb159af38153c9a102aadb872540e4199b71f63 --- /dev/null +++ b/PT/.ipynb_checkpoints/trainer_state-checkpoint.json @@ -0,0 +1,3562 @@ +{ + "best_metric": 1.3419121503829956, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2700", + "epoch": 2.9975616364128963, + "eval_steps": 100, + "global_step": 2766, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001344135327663804, + "loss": 1.1463, + "step": 2105 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013248237227768246, + "loss": 1.2751, + "step": 2110 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013056306435573633, + "loss": 1.2196, + "step": 2115 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012865567089886642, + "loss": 1.1964, + "step": 2120 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012676025342115105, + "loss": 1.1749, + "step": 2125 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012487687305043978, + "loss": 1.2615, + "step": 2130 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012300559052638122, + "loss": 1.2064, + "step": 2135 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012114646619846425, + "loss": 1.1642, + "step": 2140 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011929956002407194, + "loss": 1.1704, + "step": 2145 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011746493156654814, + "loss": 1.1668, + "step": 2150 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011564263999327546, + "loss": 1.1584, + "step": 2155 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011383274407376848, + "loss": 1.2412, + "step": 2160 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001120353021777778, + "loss": 1.1688, + "step": 2165 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011025037227340711, + "loss": 1.2097, + "step": 2170 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010847801192524454, + "loss": 1.2057, + "step": 2175 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010671827829250585, + "loss": 1.2296, + "step": 2180 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010497122812719068, + "loss": 1.2547, + "step": 2185 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010323691777225286, + "loss": 1.1746, + "step": 2190 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010151540315978314, + "loss": 1.1466, + "step": 2195 + }, + { + "epoch": 2.38, + "learning_rate": 9.98067398092049e-05, + "loss": 1.1551, + "step": 2200 + }, + { + "epoch": 2.38, + "eval_loss": 1.349250316619873, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2200 + }, + { + "epoch": 2.39, + "learning_rate": 9.811098282548447e-05, + "loss": 1.158, + "step": 2205 + }, + { + "epoch": 2.4, + "learning_rate": 9.642818689735305e-05, + "loss": 1.1444, + "step": 2210 + }, + { + "epoch": 2.4, + "learning_rate": 9.475840629554394e-05, + "loss": 1.2504, + "step": 2215 + }, + { + "epoch": 2.41, + "learning_rate": 9.310169487104131e-05, + "loss": 1.1439, + "step": 2220 + }, + { + "epoch": 2.41, + "learning_rate": 9.145810605334454e-05, + "loss": 1.2758, + "step": 2225 + }, + { + "epoch": 2.42, + "learning_rate": 8.982769284874386e-05, + "loss": 1.1992, + "step": 2230 + }, + { + "epoch": 2.42, + "learning_rate": 8.821050783861212e-05, + "loss": 1.2177, + "step": 2235 + }, + { + "epoch": 2.43, + "learning_rate": 8.660660317770841e-05, + "loss": 1.1942, + "step": 2240 + }, + { + "epoch": 2.43, + "learning_rate": 8.501603059249563e-05, + "loss": 1.163, + "step": 2245 + }, + { + "epoch": 2.44, + "learning_rate": 8.343884137947333e-05, + "loss": 1.239, + "step": 2250 + }, + { + "epoch": 2.44, + "learning_rate": 8.187508640352265e-05, + "loss": 1.1455, + "step": 2255 + }, + { + "epoch": 2.45, + "learning_rate": 8.032481609626575e-05, + "loss": 1.2165, + "step": 2260 + }, + { + "epoch": 2.45, + "learning_rate": 7.878808045444014e-05, + "loss": 1.1982, + "step": 2265 + }, + { + "epoch": 2.46, + "learning_rate": 7.726492903828575e-05, + "loss": 1.212, + "step": 2270 + }, + { + "epoch": 2.47, + "learning_rate": 7.575541096994637e-05, + "loss": 1.2453, + "step": 2275 + }, + { + "epoch": 2.47, + "learning_rate": 7.4259574931886e-05, + "loss": 1.2607, + "step": 2280 + }, + { + "epoch": 2.48, + "learning_rate": 7.27774691653188e-05, + "loss": 1.1936, + "step": 2285 + }, + { + "epoch": 2.48, + "learning_rate": 7.130914146865247e-05, + "loss": 1.2702, + "step": 2290 + }, + { + "epoch": 2.49, + "learning_rate": 6.985463919594781e-05, + "loss": 1.133, + "step": 2295 + }, + { + "epoch": 2.49, + "learning_rate": 6.841400925539104e-05, + "loss": 1.2135, + "step": 2300 + }, + { + "epoch": 2.49, + "eval_loss": 1.3470078706741333, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2300 + }, + { + "epoch": 2.5, + "learning_rate": 6.698729810778065e-05, + "loss": 1.1986, + "step": 2305 + }, + { + "epoch": 2.5, + "learning_rate": 6.557455176502986e-05, + "loss": 1.2254, + "step": 2310 + }, + { + "epoch": 2.51, + "learning_rate": 6.417581578868198e-05, + "loss": 1.212, + "step": 2315 + }, + { + "epoch": 2.51, + "learning_rate": 6.279113528844127e-05, + "loss": 1.1517, + "step": 2320 + }, + { + "epoch": 2.52, + "learning_rate": 6.14205549207184e-05, + "loss": 1.1889, + "step": 2325 + }, + { + "epoch": 2.53, + "learning_rate": 6.006411888718982e-05, + "loss": 1.2348, + "step": 2330 + }, + { + "epoch": 2.53, + "learning_rate": 5.872187093337239e-05, + "loss": 1.1862, + "step": 2335 + }, + { + "epoch": 2.54, + "learning_rate": 5.739385434721295e-05, + "loss": 1.2143, + "step": 2340 + }, + { + "epoch": 2.54, + "learning_rate": 5.608011195769186e-05, + "loss": 1.242, + "step": 2345 + }, + { + "epoch": 2.55, + "learning_rate": 5.478068613344151e-05, + "loss": 1.1817, + "step": 2350 + }, + { + "epoch": 2.55, + "learning_rate": 5.3495618781380764e-05, + "loss": 1.1916, + "step": 2355 + }, + { + "epoch": 2.56, + "learning_rate": 5.2224951345362703e-05, + "loss": 1.1231, + "step": 2360 + }, + { + "epoch": 2.56, + "learning_rate": 5.096872480483816e-05, + "loss": 1.2113, + "step": 2365 + }, + { + "epoch": 2.57, + "learning_rate": 4.972697967353445e-05, + "loss": 1.164, + "step": 2370 + }, + { + "epoch": 2.57, + "learning_rate": 4.8499755998148656e-05, + "loss": 1.1947, + "step": 2375 + }, + { + "epoch": 2.58, + "learning_rate": 4.728709335705561e-05, + "loss": 1.2219, + "step": 2380 + }, + { + "epoch": 2.58, + "learning_rate": 4.6089030859032376e-05, + "loss": 1.2104, + "step": 2385 + }, + { + "epoch": 2.59, + "learning_rate": 4.490560714199637e-05, + "loss": 1.2077, + "step": 2390 + }, + { + "epoch": 2.6, + "learning_rate": 4.373686037175917e-05, + "loss": 1.1758, + "step": 2395 + }, + { + "epoch": 2.6, + "learning_rate": 4.258282824079618e-05, + "loss": 1.2094, + "step": 2400 + }, + { + "epoch": 2.6, + "eval_loss": 1.3436678647994995, + "eval_runtime": 10.6699, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 2400 + }, + { + "epoch": 2.61, + "learning_rate": 4.1443547967030816e-05, + "loss": 1.2, + "step": 2405 + }, + { + "epoch": 2.61, + "learning_rate": 4.031905629263371e-05, + "loss": 1.2246, + "step": 2410 + }, + { + "epoch": 2.62, + "learning_rate": 3.92093894828387e-05, + "loss": 1.198, + "step": 2415 + }, + { + "epoch": 2.62, + "learning_rate": 3.811458332477252e-05, + "loss": 1.269, + "step": 2420 + }, + { + "epoch": 2.63, + "learning_rate": 3.703467312630088e-05, + "loss": 1.189, + "step": 2425 + }, + { + "epoch": 2.63, + "learning_rate": 3.596969371488995e-05, + "loss": 1.1938, + "step": 2430 + }, + { + "epoch": 2.64, + "learning_rate": 3.491967943648289e-05, + "loss": 1.2421, + "step": 2435 + }, + { + "epoch": 2.64, + "learning_rate": 3.388466415439234e-05, + "loss": 1.1145, + "step": 2440 + }, + { + "epoch": 2.65, + "learning_rate": 3.2864681248208184e-05, + "loss": 1.1678, + "step": 2445 + }, + { + "epoch": 2.66, + "learning_rate": 3.185976361272125e-05, + "loss": 1.2627, + "step": 2450 + }, + { + "epoch": 2.66, + "learning_rate": 3.086994365686246e-05, + "loss": 1.1344, + "step": 2455 + }, + { + "epoch": 2.67, + "learning_rate": 2.9895253302657188e-05, + "loss": 1.2325, + "step": 2460 + }, + { + "epoch": 2.67, + "learning_rate": 2.8935723984196304e-05, + "loss": 1.2533, + "step": 2465 + }, + { + "epoch": 2.68, + "learning_rate": 2.7991386646622207e-05, + "loss": 1.226, + "step": 2470 + }, + { + "epoch": 2.68, + "learning_rate": 2.7062271745130595e-05, + "loss": 1.0925, + "step": 2475 + }, + { + "epoch": 2.69, + "learning_rate": 2.614840924398876e-05, + "loss": 1.1444, + "step": 2480 + }, + { + "epoch": 2.69, + "learning_rate": 2.5249828615568794e-05, + "loss": 1.214, + "step": 2485 + }, + { + "epoch": 2.7, + "learning_rate": 2.436655883939737e-05, + "loss": 1.2427, + "step": 2490 + }, + { + "epoch": 2.7, + "learning_rate": 2.3498628401221078e-05, + "loss": 1.2447, + "step": 2495 + }, + { + "epoch": 2.71, + "learning_rate": 2.2646065292087403e-05, + "loss": 1.1835, + "step": 2500 + }, + { + "epoch": 2.71, + "eval_loss": 1.343565821647644, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2500 + }, + { + "epoch": 2.71, + "learning_rate": 2.1808897007442762e-05, + "loss": 1.2457, + "step": 2505 + }, + { + "epoch": 2.72, + "learning_rate": 2.098715054624506e-05, + "loss": 1.2248, + "step": 2510 + }, + { + "epoch": 2.73, + "learning_rate": 2.0180852410093153e-05, + "loss": 1.2419, + "step": 2515 + }, + { + "epoch": 2.73, + "learning_rate": 1.939002860237249e-05, + "loss": 1.1763, + "step": 2520 + }, + { + "epoch": 2.74, + "learning_rate": 1.8614704627416045e-05, + "loss": 1.2035, + "step": 2525 + }, + { + "epoch": 2.74, + "learning_rate": 1.7854905489681993e-05, + "loss": 1.1767, + "step": 2530 + }, + { + "epoch": 2.75, + "learning_rate": 1.7110655692947397e-05, + "loss": 1.254, + "step": 2535 + }, + { + "epoch": 2.75, + "learning_rate": 1.638197923951784e-05, + "loss": 1.1941, + "step": 2540 + }, + { + "epoch": 2.76, + "learning_rate": 1.5668899629453225e-05, + "loss": 1.2568, + "step": 2545 + }, + { + "epoch": 2.76, + "learning_rate": 1.4971439859810199e-05, + "loss": 1.2237, + "step": 2550 + }, + { + "epoch": 2.77, + "learning_rate": 1.428962242390025e-05, + "loss": 1.172, + "step": 2555 + }, + { + "epoch": 2.77, + "learning_rate": 1.3623469310564408e-05, + "loss": 1.1835, + "step": 2560 + }, + { + "epoch": 2.78, + "learning_rate": 1.2973002003463797e-05, + "loss": 1.1335, + "step": 2565 + }, + { + "epoch": 2.79, + "learning_rate": 1.2338241480387369e-05, + "loss": 1.1968, + "step": 2570 + }, + { + "epoch": 2.79, + "learning_rate": 1.1719208212574939e-05, + "loss": 1.1962, + "step": 2575 + }, + { + "epoch": 2.8, + "learning_rate": 1.111592216405688e-05, + "loss": 1.2107, + "step": 2580 + }, + { + "epoch": 2.8, + "learning_rate": 1.0528402791010582e-05, + "loss": 1.2148, + "step": 2585 + }, + { + "epoch": 2.81, + "learning_rate": 9.956669041133015e-06, + "loss": 1.1443, + "step": 2590 + }, + { + "epoch": 2.81, + "learning_rate": 9.400739353029209e-06, + "loss": 1.1805, + "step": 2595 + }, + { + "epoch": 2.82, + "learning_rate": 8.860631655618124e-06, + "loss": 1.2061, + "step": 2600 + }, + { + "epoch": 2.82, + "eval_loss": 1.3423666954040527, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 2600 + }, + { + "epoch": 2.82, + "learning_rate": 8.336363367554112e-06, + "loss": 1.2418, + "step": 2605 + }, + { + "epoch": 2.83, + "learning_rate": 7.827951396665312e-06, + "loss": 1.2532, + "step": 2610 + }, + { + "epoch": 2.83, + "learning_rate": 7.335412139408248e-06, + "loss": 1.2836, + "step": 2615 + }, + { + "epoch": 2.84, + "learning_rate": 6.85876148033926e-06, + "loss": 1.2918, + "step": 2620 + }, + { + "epoch": 2.84, + "learning_rate": 6.398014791601847e-06, + "loss": 1.1381, + "step": 2625 + }, + { + "epoch": 2.85, + "learning_rate": 5.953186932431298e-06, + "loss": 1.1686, + "step": 2630 + }, + { + "epoch": 2.86, + "learning_rate": 5.524292248675289e-06, + "loss": 1.2104, + "step": 2635 + }, + { + "epoch": 2.86, + "learning_rate": 5.111344572331145e-06, + "loss": 1.2651, + "step": 2640 + }, + { + "epoch": 2.87, + "learning_rate": 4.714357221099974e-06, + "loss": 1.1296, + "step": 2645 + }, + { + "epoch": 2.87, + "learning_rate": 4.333342997957013e-06, + "loss": 1.1534, + "step": 2650 + }, + { + "epoch": 2.88, + "learning_rate": 3.96831419073862e-06, + "loss": 1.2065, + "step": 2655 + }, + { + "epoch": 2.88, + "learning_rate": 3.6192825717464294e-06, + "loss": 1.2118, + "step": 2660 + }, + { + "epoch": 2.89, + "learning_rate": 3.2862593973670975e-06, + "loss": 1.1139, + "step": 2665 + }, + { + "epoch": 2.89, + "learning_rate": 2.969255407709648e-06, + "loss": 1.1702, + "step": 2670 + }, + { + "epoch": 2.9, + "learning_rate": 2.668280826259195e-06, + "loss": 1.1821, + "step": 2675 + }, + { + "epoch": 2.9, + "learning_rate": 2.383345359546818e-06, + "loss": 1.1826, + "step": 2680 + }, + { + "epoch": 2.91, + "learning_rate": 2.1144581968369213e-06, + "loss": 1.236, + "step": 2685 + }, + { + "epoch": 2.92, + "learning_rate": 1.861628009830696e-06, + "loss": 1.2308, + "step": 2690 + }, + { + "epoch": 2.92, + "learning_rate": 1.6248629523865077e-06, + "loss": 1.1957, + "step": 2695 + }, + { + "epoch": 2.93, + "learning_rate": 1.4041706602567206e-06, + "loss": 1.1613, + "step": 2700 + }, + { + "epoch": 2.93, + "eval_loss": 1.3419121503829956, + "eval_runtime": 10.672, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 2700 + }, + { + "epoch": 2.93, + "learning_rate": 1.1995582508418924e-06, + "loss": 1.152, + "step": 2705 + }, + { + "epoch": 2.94, + "learning_rate": 1.0110323229608476e-06, + "loss": 1.2173, + "step": 2710 + }, + { + "epoch": 2.94, + "learning_rate": 8.385989566379593e-07, + "loss": 1.2202, + "step": 2715 + }, + { + "epoch": 2.95, + "learning_rate": 6.82263712907083e-07, + "loss": 1.1492, + "step": 2720 + }, + { + "epoch": 2.95, + "learning_rate": 5.420316336323117e-07, + "loss": 1.2548, + "step": 2725 + }, + { + "epoch": 2.96, + "learning_rate": 4.1790724134521676e-07, + "loss": 1.2085, + "step": 2730 + }, + { + "epoch": 2.96, + "learning_rate": 3.098945390991315e-07, + "loss": 1.2253, + "step": 2735 + }, + { + "epoch": 2.97, + "learning_rate": 2.179970103398654e-07, + "loss": 1.1604, + "step": 2740 + }, + { + "epoch": 2.97, + "learning_rate": 1.4221761879351648e-07, + "loss": 1.2269, + "step": 2745 + }, + { + "epoch": 2.98, + "learning_rate": 8.25588083709361e-08, + "loss": 1.2413, + "step": 2750 + }, + { + "epoch": 2.99, + "learning_rate": 3.9022503088737006e-08, + "loss": 1.2255, + "step": 2755 + }, + { + "epoch": 2.99, + "learning_rate": 1.1610107007398175e-08, + "loss": 1.2192, + "step": 2760 + }, + { + "epoch": 3.0, + "learning_rate": 3.2250418585677564e-10, + "loss": 1.1758, + "step": 2765 + }, + { + "epoch": 3.0, + "step": 2766, + "total_flos": 9.118407110185452e+17, + "train_loss": 1.3472231076148138, + "train_runtime": 10021.615, + "train_samples_per_second": 4.42, + "train_steps_per_second": 0.276 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 9.118407110185452e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/README.md b/PT/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/adapter_config.json b/PT/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/adapter_model.bin b/PT/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7f605850fce20203ba7257c723b003ceb595b340 --- /dev/null +++ b/PT/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe3a9d305afa1374c0d7c1da1c75af38231ab61d9862cde4ed44fefc836c91e +size 16821197 diff --git a/PT/all_results.json b/PT/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2e3df8f499e28bcd843d3226a9938a4186802e73 --- /dev/null +++ b/PT/all_results.json @@ -0,0 +1,12 @@ +{ + "epoch": 3.0, + "eval_loss": 1.3419121503829956, + "eval_runtime": 10.6724, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "perplexity": 3.826353077140856, + "train_loss": 1.3472231076148138, + "train_runtime": 10021.615, + "train_samples_per_second": 4.42, + "train_steps_per_second": 0.276 +} \ No newline at end of file diff --git a/PT/checkpoint-100/README.md b/PT/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-100/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-100/adapter_config.json b/PT/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-100/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-100/adapter_model.bin b/PT/checkpoint-100/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ee5df2e6b2dff02858f83e3b3cb6f735fbc8cc99 --- /dev/null +++ b/PT/checkpoint-100/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e64ad4f4fb7a9f9696ca468a7dadd744bf651d640f7b782c8f470cc81dc07590 +size 16821197 diff --git a/PT/checkpoint-100/finetuning_args.json b/PT/checkpoint-100/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-100/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-100/optimizer.pt b/PT/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cbbffc6f4d7f457e22443469c81d88629d60b9d3 --- /dev/null +++ b/PT/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac8a25a341da5df094e4fdc50492c6c58b9431270483cf2dfc8669a07631bf4d +size 33661637 diff --git a/PT/checkpoint-100/rng_state.pth b/PT/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..09ee0c02dfec95f5639d5078370e72bbf6954bca --- /dev/null +++ b/PT/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea4eb26f16a2d549c9e222fe1c63f2e84ab55e4f2fc94f3a4814b5e85dd515e +size 14575 diff --git a/PT/checkpoint-100/scheduler.pt b/PT/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..386a69d384f0d5feee55780be4b91825d7237021 --- /dev/null +++ b/PT/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba067af54d3620c71c97f9dcb1a629e8d838e8c8a20dd2cf3d481e5eaed1dd56 +size 627 diff --git a/PT/checkpoint-100/trainer_state.json b/PT/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a888d8263f0dcc3d7c082390b46bff21dea46514 --- /dev/null +++ b/PT/checkpoint-100/trainer_state.json @@ -0,0 +1,147 @@ +{ + "best_metric": 1.6186352968215942, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-100", + "epoch": 0.1083717149823896, + "eval_steps": 100, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 3.29660416131072e+16, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-100/training_args.bin b/PT/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1000/README.md b/PT/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1000/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1000/adapter_config.json b/PT/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1000/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1000/adapter_model.bin b/PT/checkpoint-1000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..990e91c84a982b32ba01d9111518122594c3a905 --- /dev/null +++ b/PT/checkpoint-1000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9aee5f7297ade26099112c1280d3999370ce9bf8c4a0a2a2e946417d6e062b3 +size 16821197 diff --git a/PT/checkpoint-1000/finetuning_args.json b/PT/checkpoint-1000/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1000/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1000/optimizer.pt b/PT/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..37a072829d284ad9506d6702f76e375dca5fbcf7 --- /dev/null +++ b/PT/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28c78ad2d1f67a150941160c9fedbffa03909f13635ba653b85300625d84482e +size 33661637 diff --git a/PT/checkpoint-1000/rng_state.pth b/PT/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..7e0595e8270a7d9a18bfbff5ef4e6da30d09ee1a --- /dev/null +++ b/PT/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80bdc69615d3c180f2a67d650f36262deba81d719ff2e53618656a662f8af967 +size 14575 diff --git a/PT/checkpoint-1000/scheduler.pt b/PT/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fccd7adc4e37024d46887b0cab171289646af801 --- /dev/null +++ b/PT/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bcb088942bba4359580381f9df28ae6d5001d209124c797c6bda9b3cb67f724 +size 627 diff --git a/PT/checkpoint-1000/trainer_state.json b/PT/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..73a26f70e9f45e6bce6f9129f4b1523257913ffa --- /dev/null +++ b/PT/checkpoint-1000/trainer_state.json @@ -0,0 +1,1299 @@ +{ + "best_metric": 1.4371482133865356, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1000", + "epoch": 1.083717149823896, + "eval_steps": 100, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 3.29660416131072e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1000/training_args.bin b/PT/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1100/README.md b/PT/checkpoint-1100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1100/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1100/adapter_config.json b/PT/checkpoint-1100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1100/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1100/adapter_model.bin b/PT/checkpoint-1100/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..72599b9d79b238f73709e9a6b7268ba4cd8b2651 --- /dev/null +++ b/PT/checkpoint-1100/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:786422b27506dc35c31b50bb98897ad38ee6f6da37e55fa382403e772a2d3221 +size 16821197 diff --git a/PT/checkpoint-1100/finetuning_args.json b/PT/checkpoint-1100/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1100/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1100/optimizer.pt b/PT/checkpoint-1100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a6ca58eeec740660755f71fa1906cc188acbbed --- /dev/null +++ b/PT/checkpoint-1100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cee19bdbba1758412d986c546868938abfdeb0cff0a5de8503fdc24d8bba3be +size 33661637 diff --git a/PT/checkpoint-1100/rng_state.pth b/PT/checkpoint-1100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e55dd0f154c59d7a8c4c5d73526439a341e3997 --- /dev/null +++ b/PT/checkpoint-1100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54b1dacc16ffe8fd7546f373d2c466209350184e9dfbfde3184ee6554b697d30 +size 14575 diff --git a/PT/checkpoint-1100/scheduler.pt b/PT/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e278ba8de18098ad5164d1ef1709d3dffc82008 --- /dev/null +++ b/PT/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dc3b9c570d927a69a05a23c60e5464e04891fbfe2e88f930e20c96d011493c0 +size 627 diff --git a/PT/checkpoint-1100/trainer_state.json b/PT/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0129511de089b6145f74b75e8900ca98e1f9eca --- /dev/null +++ b/PT/checkpoint-1100/trainer_state.json @@ -0,0 +1,1427 @@ +{ + "best_metric": 1.4341663122177124, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1100", + "epoch": 1.1920888648062855, + "eval_steps": 100, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 3.626264577441792e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1100/training_args.bin b/PT/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1200/README.md b/PT/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1200/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1200/adapter_config.json b/PT/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1200/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1200/adapter_model.bin b/PT/checkpoint-1200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e2bdfc7d5c7bbbf5375eccfbf993f39992016659 --- /dev/null +++ b/PT/checkpoint-1200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a87b2839fabef023a0db26ca4ffa31984ea9c004e1f65a8e495697d5e3145a2 +size 16821197 diff --git a/PT/checkpoint-1200/finetuning_args.json b/PT/checkpoint-1200/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1200/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1200/optimizer.pt b/PT/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a11fc24d1ed007b200280f9307a33eef6e0a975b --- /dev/null +++ b/PT/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:868d29b7cdc2c911e783980e8fe684c11d156da1ba064f49edc4bef16e000ff5 +size 33661637 diff --git a/PT/checkpoint-1200/rng_state.pth b/PT/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..34ba3490351dcb5f5cbada739b27299695ff7df3 --- /dev/null +++ b/PT/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0956d8070f5f9ada01c112db42149503879020cbcbd7eeb72b2574fa029496cd +size 14575 diff --git a/PT/checkpoint-1200/scheduler.pt b/PT/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6cb5095d0a96a5a43f0d5c49d64718d562e43aec --- /dev/null +++ b/PT/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:025f589e8ee8469c0312794fece8bc042cb759a81718376bea43d569128aef2e +size 627 diff --git a/PT/checkpoint-1200/trainer_state.json b/PT/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..91070846c205533ee82757d531e9a1fbac80535b --- /dev/null +++ b/PT/checkpoint-1200/trainer_state.json @@ -0,0 +1,1555 @@ +{ + "best_metric": 1.4208500385284424, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1200", + "epoch": 1.300460579788675, + "eval_steps": 100, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 3.955924993572864e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1200/training_args.bin b/PT/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1300/README.md b/PT/checkpoint-1300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1300/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1300/adapter_config.json b/PT/checkpoint-1300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1300/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1300/adapter_model.bin b/PT/checkpoint-1300/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6d68b57bb579c30bfe5ba2e4cd57ab97a845be99 --- /dev/null +++ b/PT/checkpoint-1300/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c4c2a4a0211e165b6c691ed707a060d933872f9a952cd7cfa616c2167c99099 +size 16821197 diff --git a/PT/checkpoint-1300/finetuning_args.json b/PT/checkpoint-1300/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1300/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1300/optimizer.pt b/PT/checkpoint-1300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5322f17508da6430118fc00108f159195fff6a7c --- /dev/null +++ b/PT/checkpoint-1300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac34d94c72e90e6c66d25a5c1ac3685ecc7bb7a62666a997160ae4cc96f8b838 +size 33661637 diff --git a/PT/checkpoint-1300/rng_state.pth b/PT/checkpoint-1300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8600b462f278db5e199dad4cf9e64bd14e8e347 --- /dev/null +++ b/PT/checkpoint-1300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a185cc423536818910de0be0f941cd28d682132a8fe9c368661ee5102299abbf +size 14575 diff --git a/PT/checkpoint-1300/scheduler.pt b/PT/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..94886cae97b22875d9d714f15afe79b100758167 --- /dev/null +++ b/PT/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:538bec71c7352734aa2e4ef86d84aabfb5c8875bf93539d9537d21e952d1ef49 +size 627 diff --git a/PT/checkpoint-1300/trainer_state.json b/PT/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..db7e4acfae3ea0e81089705e939b0cf585e73e0d --- /dev/null +++ b/PT/checkpoint-1300/trainer_state.json @@ -0,0 +1,1683 @@ +{ + "best_metric": 1.4054052829742432, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1300", + "epoch": 1.4088322947710648, + "eval_steps": 100, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 4.285585409703936e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1300/training_args.bin b/PT/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1400/README.md b/PT/checkpoint-1400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1400/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1400/adapter_config.json b/PT/checkpoint-1400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1400/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1400/adapter_model.bin b/PT/checkpoint-1400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..17fefc0577330c2f4f15bc7e669089cdddb71e7f --- /dev/null +++ b/PT/checkpoint-1400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2227ecff339f4d2ce069d504459492f633329d711e660aea4f544680d73f7318 +size 16821197 diff --git a/PT/checkpoint-1400/finetuning_args.json b/PT/checkpoint-1400/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1400/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1400/optimizer.pt b/PT/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..369b2ae1735213902f79db209ff64e0f2771b663 --- /dev/null +++ b/PT/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d37353ab10195f4b5f521ea7471a050772954a0474471d0105fd66d872762f1 +size 33661637 diff --git a/PT/checkpoint-1400/rng_state.pth b/PT/checkpoint-1400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b9088b8c57d2b59badef81ad6e70ac3ac93f7af6 --- /dev/null +++ b/PT/checkpoint-1400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d55d0893ef0c812a15c31b34f8a03645bd86953328f20705b9e0d7aa008327 +size 14575 diff --git a/PT/checkpoint-1400/scheduler.pt b/PT/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..136c1b95c61e3d9f8e9527d3ff1993c783154ce5 --- /dev/null +++ b/PT/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:756e31d55572525c2d7c047328ab958956e3c65c6996de4d02570da6c4e34387 +size 627 diff --git a/PT/checkpoint-1400/trainer_state.json b/PT/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e1436019fb8aad08916947230e23d3b30e9671b1 --- /dev/null +++ b/PT/checkpoint-1400/trainer_state.json @@ -0,0 +1,1811 @@ +{ + "best_metric": 1.3990795612335205, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1400", + "epoch": 1.5172040097534545, + "eval_steps": 100, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 4.615245825835008e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1400/training_args.bin b/PT/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1500/README.md b/PT/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1500/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1500/adapter_config.json b/PT/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1500/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1500/adapter_model.bin b/PT/checkpoint-1500/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b336e90aac036de7ee2ec14ff686efee09cf7fa0 --- /dev/null +++ b/PT/checkpoint-1500/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d77cdbac61984c82d08f1e56bc1cbf40a9d394ad43e5310795fdb8bb80c9faf5 +size 16821197 diff --git a/PT/checkpoint-1500/finetuning_args.json b/PT/checkpoint-1500/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1500/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1500/optimizer.pt b/PT/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..62eea29841532c71d5855338df82899784baafb6 --- /dev/null +++ b/PT/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04de54aee2e37e9bc5872307d14945574dffa2619913b0e693655d9383d6529a +size 33661637 diff --git a/PT/checkpoint-1500/rng_state.pth b/PT/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a4af82e992cf4b38291589196153353f66873c7 --- /dev/null +++ b/PT/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9af9ea4ecb330ad04f7453d46d1bd60b8d8d5b8ffa16e37c54e1d1de38324d71 +size 14575 diff --git a/PT/checkpoint-1500/scheduler.pt b/PT/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..93394a546aea603b69c130b8b04095ec63a38491 --- /dev/null +++ b/PT/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff3205f19072a58dccb3f00780fcff08dc0d62b2cc40a49de440730a35bb6a38 +size 627 diff --git a/PT/checkpoint-1500/trainer_state.json b/PT/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5d683339b892894b9b2a130d0073db74351ac71a --- /dev/null +++ b/PT/checkpoint-1500/trainer_state.json @@ -0,0 +1,1939 @@ +{ + "best_metric": 1.3873966932296753, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1500", + "epoch": 1.625575724735844, + "eval_steps": 100, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 4.94490624196608e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1500/training_args.bin b/PT/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1600/README.md b/PT/checkpoint-1600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1600/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1600/adapter_config.json b/PT/checkpoint-1600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1600/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1600/adapter_model.bin b/PT/checkpoint-1600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d16a99d10e834b0b08af644a8180fa0f2a8d04a4 --- /dev/null +++ b/PT/checkpoint-1600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5479d4c5a0bd56422e6edab4831cb629d7891588bc4592332276b9add5136421 +size 16821197 diff --git a/PT/checkpoint-1600/finetuning_args.json b/PT/checkpoint-1600/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1600/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1600/optimizer.pt b/PT/checkpoint-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4cba946b9f637e7a965cdb8f0fcb7ad775c4683 --- /dev/null +++ b/PT/checkpoint-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0ea708deb97b4719c5e9ee604ced9f3b72cb525895684fd8cea3e88d51b6af9 +size 33661637 diff --git a/PT/checkpoint-1600/rng_state.pth b/PT/checkpoint-1600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ea11319a9d3a3973a90155ee9866bf6ebece37f3 --- /dev/null +++ b/PT/checkpoint-1600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a337949019446126212df7c37fe4bd6937d49d1579643e33971abebf2341f99c +size 14575 diff --git a/PT/checkpoint-1600/scheduler.pt b/PT/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..63cac1747c78f13191f73fff2ef9858d4a3e89e3 --- /dev/null +++ b/PT/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809e3bfe93ea7398b520d48004e1e2cd4789ee7182e8b443ef7f94aac8155e8c +size 627 diff --git a/PT/checkpoint-1600/trainer_state.json b/PT/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2f1dbc2fc4ff8d70839eebc3aff1c0c24640a00a --- /dev/null +++ b/PT/checkpoint-1600/trainer_state.json @@ -0,0 +1,2067 @@ +{ + "best_metric": 1.379552960395813, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1600", + "epoch": 1.7339474397182335, + "eval_steps": 100, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 5.274566658097152e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1600/training_args.bin b/PT/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1700/README.md b/PT/checkpoint-1700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1700/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1700/adapter_config.json b/PT/checkpoint-1700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1700/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1700/adapter_model.bin b/PT/checkpoint-1700/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..2b49bc9e4e6b6c74e7ffd51d8abc26687d131158 --- /dev/null +++ b/PT/checkpoint-1700/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cb52a95b80ef6e26682609eaee01310b9c255773b8b4cb6087d463d0a357431 +size 16821197 diff --git a/PT/checkpoint-1700/finetuning_args.json b/PT/checkpoint-1700/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1700/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1700/optimizer.pt b/PT/checkpoint-1700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..015925d3fe07bd37d4cd2f4335d634ef7ae3eb23 --- /dev/null +++ b/PT/checkpoint-1700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f15643575a429f5e26e08bf3198f91dc8b136ab3c9a7fc26726e475e64eecf +size 33661637 diff --git a/PT/checkpoint-1700/rng_state.pth b/PT/checkpoint-1700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e2203fef9f28a061a9ee6c7727616b24d5f7392d --- /dev/null +++ b/PT/checkpoint-1700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:197c2d6c2c11cd5730ab503e6833d3877fc243571d57a158a2b132375cb47c1b +size 14575 diff --git a/PT/checkpoint-1700/scheduler.pt b/PT/checkpoint-1700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c9a87f1a7d8ed6c7540dd551944df3a0208ecfb9 --- /dev/null +++ b/PT/checkpoint-1700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c518f177a3abf24a13404b2df1952f6e3f65f2fe8f7ddd04e393fca5af516fd +size 627 diff --git a/PT/checkpoint-1700/trainer_state.json b/PT/checkpoint-1700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0897c03e368680a1ad3e90a2983ff39005aba66a --- /dev/null +++ b/PT/checkpoint-1700/trainer_state.json @@ -0,0 +1,2195 @@ +{ + "best_metric": 1.3693352937698364, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1700", + "epoch": 1.8423191547006232, + "eval_steps": 100, + "global_step": 1700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 5.604227074228224e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1700/training_args.bin b/PT/checkpoint-1700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1800/README.md b/PT/checkpoint-1800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1800/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1800/adapter_config.json b/PT/checkpoint-1800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1800/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1800/adapter_model.bin b/PT/checkpoint-1800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..1c3b692cd902c1aa4d7d227938849ef2d3241b2b --- /dev/null +++ b/PT/checkpoint-1800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0593760c0804b04ab27edbb2baae40411610f7a65ec7c47caba49006ea406168 +size 16821197 diff --git a/PT/checkpoint-1800/finetuning_args.json b/PT/checkpoint-1800/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1800/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1800/optimizer.pt b/PT/checkpoint-1800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b00decb7d8894d1f2440e7a4d18713c15c0feda --- /dev/null +++ b/PT/checkpoint-1800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdf680341624170f3e748d033c29f7609f7548983f0717832117c65990e80277 +size 33661637 diff --git a/PT/checkpoint-1800/rng_state.pth b/PT/checkpoint-1800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1bbceb8c86a34e03bd4f215fde6fd061617fa085 --- /dev/null +++ b/PT/checkpoint-1800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34cdee73e9815c0dee4a4df2c4189f2f82c9ff06fb0c4ae6d7f15acf2d303b60 +size 14575 diff --git a/PT/checkpoint-1800/scheduler.pt b/PT/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ac35095ac113aeca911b86d8ebd2df14a33f078c --- /dev/null +++ b/PT/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0ba829ee7d6220f075bd3ef17fe11526da368b4c49d0f3db3027bc4369e6a2b +size 627 diff --git a/PT/checkpoint-1800/trainer_state.json b/PT/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d5eb07e6ba5b19f7d0b5efa53b7b0cb2e11aaa96 --- /dev/null +++ b/PT/checkpoint-1800/trainer_state.json @@ -0,0 +1,2323 @@ +{ + "best_metric": 1.3620151281356812, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1800", + "epoch": 1.9506908696830128, + "eval_steps": 100, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 5.933887490359296e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1800/training_args.bin b/PT/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-1900/README.md b/PT/checkpoint-1900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-1900/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-1900/adapter_config.json b/PT/checkpoint-1900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-1900/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-1900/adapter_model.bin b/PT/checkpoint-1900/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..bea9274117f03c273eea6157b62e5c3c349f081c --- /dev/null +++ b/PT/checkpoint-1900/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf58dd17cd3535c76d001a4870d0bd529e21b775b214c1977fd781eb3167900 +size 16821197 diff --git a/PT/checkpoint-1900/finetuning_args.json b/PT/checkpoint-1900/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-1900/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-1900/optimizer.pt b/PT/checkpoint-1900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a667841d0e29d9f94f12b45692eba64f2f226b8 --- /dev/null +++ b/PT/checkpoint-1900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9d8d403b9978bb470a8e16827e90304dd83d9826504ba123c7c0859593d6d5 +size 33661637 diff --git a/PT/checkpoint-1900/rng_state.pth b/PT/checkpoint-1900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3e4fa00bd86324976145fc4cbbfc96e8f2c4ae9b --- /dev/null +++ b/PT/checkpoint-1900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ef35bb746180cc90852a23739225c499603055cea713563ac6579c87fad4d8 +size 14575 diff --git a/PT/checkpoint-1900/scheduler.pt b/PT/checkpoint-1900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9fdcc0de4ca81f07aae8b045214467801c473105 --- /dev/null +++ b/PT/checkpoint-1900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c69ff6375f1286fd7ba83ee3e48838606d54a35142306ef62efe6994f84bd20d +size 627 diff --git a/PT/checkpoint-1900/trainer_state.json b/PT/checkpoint-1900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..790f25bfa77cd1a39b0da34b7b3ecaa967384ad0 --- /dev/null +++ b/PT/checkpoint-1900/trainer_state.json @@ -0,0 +1,2451 @@ +{ + "best_metric": 1.3620151281356812, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-1800", + "epoch": 2.0590625846654023, + "eval_steps": 100, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 6.263547906490368e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-1900/training_args.bin b/PT/checkpoint-1900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-1900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-200/README.md b/PT/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-200/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-200/adapter_config.json b/PT/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-200/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-200/adapter_model.bin b/PT/checkpoint-200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..10330df66799a4005abc2d2e3015feaddb80ae6b --- /dev/null +++ b/PT/checkpoint-200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39b1bcfa7d21d33713bde1537e009adfc112c27f9cdc8a0a34e78c4d87ae9b95 +size 16821197 diff --git a/PT/checkpoint-200/finetuning_args.json b/PT/checkpoint-200/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-200/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-200/optimizer.pt b/PT/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3be623c25c548b83a0b9fabe0ee3b8a71864f6ef --- /dev/null +++ b/PT/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33f96e3a48c44d11d280f99ee702943710bb0ef34cdc4cc34e770b025e7bf7c0 +size 33661637 diff --git a/PT/checkpoint-200/rng_state.pth b/PT/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..772f25160cb3d13b4e7f72d18dc3d6f4a7937d00 --- /dev/null +++ b/PT/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:716f219d8bb6015286f7b797d8a42401d36b2358edb762232a8989e50eea25b9 +size 14575 diff --git a/PT/checkpoint-200/scheduler.pt b/PT/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..562cf0d846b8d161ba0b7d0ef82cb12a41518392 --- /dev/null +++ b/PT/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70996fd124f5116a79a6ae8231d45e921f8e9bfdeb21ea3dc386bfc88edbd40e +size 627 diff --git a/PT/checkpoint-200/trainer_state.json b/PT/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d385f365c452e279d23299769b82136ded9c3c76 --- /dev/null +++ b/PT/checkpoint-200/trainer_state.json @@ -0,0 +1,275 @@ +{ + "best_metric": 1.5717933177947998, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-200", + "epoch": 0.2167434299647792, + "eval_steps": 100, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 6.59320832262144e+16, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-200/training_args.bin b/PT/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-2000/README.md b/PT/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-2000/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-2000/adapter_config.json b/PT/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-2000/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-2000/adapter_model.bin b/PT/checkpoint-2000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..44715377a3cfca567a42269ea7bfe7cf8acfeadc --- /dev/null +++ b/PT/checkpoint-2000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49181251e5dd9593192faff1f90ca7205fdda17224bef039cb63d2e633f251a +size 16821197 diff --git a/PT/checkpoint-2000/finetuning_args.json b/PT/checkpoint-2000/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-2000/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-2000/optimizer.pt b/PT/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..da68b9375761cae30b5ed0f69b395d7d0e0cd738 --- /dev/null +++ b/PT/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7162b819f5808bd03031dc6b22acc8bf1e31a0d8cb14291ac4dc1e468e59ebe +size 33661637 diff --git a/PT/checkpoint-2000/rng_state.pth b/PT/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e9526cf573843688db95ca2952c4af5f7aa4f495 --- /dev/null +++ b/PT/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b40ebaf3ca1a230a308f4f0131d0e7918a4c25037255b674ee47cd160ada01fe +size 14575 diff --git a/PT/checkpoint-2000/scheduler.pt b/PT/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7ef30c742fc7c43e67bdc2934d13e43ae634374 --- /dev/null +++ b/PT/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ff19c607009427c50a34e80c15225bc9b660b3dc0fd88391afc40d9e1a3e170 +size 627 diff --git a/PT/checkpoint-2000/trainer_state.json b/PT/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eb406bd3fcf0f617b14cff89b25cd110821e68cc --- /dev/null +++ b/PT/checkpoint-2000/trainer_state.json @@ -0,0 +1,2579 @@ +{ + "best_metric": 1.3614530563354492, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2000", + "epoch": 2.167434299647792, + "eval_steps": 100, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 6.59320832262144e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-2000/training_args.bin b/PT/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-2100/README.md b/PT/checkpoint-2100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-2100/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-2100/adapter_config.json b/PT/checkpoint-2100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-2100/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-2100/adapter_model.bin b/PT/checkpoint-2100/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b2c85dcfb04e9a217ab5fc7a0b55231235f05877 --- /dev/null +++ b/PT/checkpoint-2100/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4019f5a171a3ce6c27ff87e8b9dd4fbd6d95885bca147463db730839634f683d +size 16821197 diff --git a/PT/checkpoint-2100/finetuning_args.json b/PT/checkpoint-2100/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-2100/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-2100/optimizer.pt b/PT/checkpoint-2100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe4cfb5f42c3e3800d3f9eb4ba29e9bd5d767ce2 --- /dev/null +++ b/PT/checkpoint-2100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc67475ffb856cbfbd5913f8aaa5f7171d9ec5bfc92da84bfa5a81e931838b7c +size 33661637 diff --git a/PT/checkpoint-2100/rng_state.pth b/PT/checkpoint-2100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2bd1e404e89b0fe426fde56fce76fa91342af2db --- /dev/null +++ b/PT/checkpoint-2100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd99b4288dbb684977ae9ae7e5e5e26ef8d219d33ad2868fed2e90b6a0ccc9f +size 14575 diff --git a/PT/checkpoint-2100/scheduler.pt b/PT/checkpoint-2100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..56bb699a22ced2f5ad7ad16a90ef85505d4d4608 --- /dev/null +++ b/PT/checkpoint-2100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e635572188a54e9d6cef777353612a863e47b41561c0cb400d69ae03c1d8ea1 +size 627 diff --git a/PT/checkpoint-2100/trainer_state.json b/PT/checkpoint-2100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8e61209c4aecfc642d11721d2a9640c158af025d --- /dev/null +++ b/PT/checkpoint-2100/trainer_state.json @@ -0,0 +1,2707 @@ +{ + "best_metric": 1.3538448810577393, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2100", + "epoch": 2.2758060146301817, + "eval_steps": 100, + "global_step": 2100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 6.922868738752512e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-2100/training_args.bin b/PT/checkpoint-2100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-2100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-2200/README.md b/PT/checkpoint-2200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-2200/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-2200/adapter_config.json b/PT/checkpoint-2200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-2200/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-2200/adapter_model.bin b/PT/checkpoint-2200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..dfa21f18d1b8d1964958ddd6f33431c14e20318e --- /dev/null +++ b/PT/checkpoint-2200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fbcd198a70002a3bf664de89a18a59756986a86573151382355ae7269e4392a +size 16821197 diff --git a/PT/checkpoint-2200/finetuning_args.json b/PT/checkpoint-2200/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-2200/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-2200/optimizer.pt b/PT/checkpoint-2200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..df31d291a2bfe37de5df77044ddeafd9232af9f5 --- /dev/null +++ b/PT/checkpoint-2200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a826065eaae3836011c4038833f185464c39cc678a596838b13ec437a5b23cef +size 33661637 diff --git a/PT/checkpoint-2200/rng_state.pth b/PT/checkpoint-2200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..58ab7897a779b466183a9bd99be96343961911eb --- /dev/null +++ b/PT/checkpoint-2200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf316590893f9ef4529c757be272ebc8fb341b8c91df16f6f6ae1214c3d95f55 +size 14575 diff --git a/PT/checkpoint-2200/scheduler.pt b/PT/checkpoint-2200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d096b1d05edc7c49595fc600937b60bfb2a7ec31 --- /dev/null +++ b/PT/checkpoint-2200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:208966107253438f0c80d952d6cdc6593d2b7ca26cec80d4db22343b62316a0e +size 627 diff --git a/PT/checkpoint-2200/trainer_state.json b/PT/checkpoint-2200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4292bb3a6875e51a432a3699f48c4f9c608b582b --- /dev/null +++ b/PT/checkpoint-2200/trainer_state.json @@ -0,0 +1,2835 @@ +{ + "best_metric": 1.349250316619873, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2200", + "epoch": 2.384177729612571, + "eval_steps": 100, + "global_step": 2200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001344135327663804, + "loss": 1.1463, + "step": 2105 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013248237227768246, + "loss": 1.2751, + "step": 2110 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013056306435573633, + "loss": 1.2196, + "step": 2115 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012865567089886642, + "loss": 1.1964, + "step": 2120 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012676025342115105, + "loss": 1.1749, + "step": 2125 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012487687305043978, + "loss": 1.2615, + "step": 2130 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012300559052638122, + "loss": 1.2064, + "step": 2135 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012114646619846425, + "loss": 1.1642, + "step": 2140 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011929956002407194, + "loss": 1.1704, + "step": 2145 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011746493156654814, + "loss": 1.1668, + "step": 2150 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011564263999327546, + "loss": 1.1584, + "step": 2155 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011383274407376848, + "loss": 1.2412, + "step": 2160 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001120353021777778, + "loss": 1.1688, + "step": 2165 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011025037227340711, + "loss": 1.2097, + "step": 2170 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010847801192524454, + "loss": 1.2057, + "step": 2175 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010671827829250585, + "loss": 1.2296, + "step": 2180 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010497122812719068, + "loss": 1.2547, + "step": 2185 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010323691777225286, + "loss": 1.1746, + "step": 2190 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010151540315978314, + "loss": 1.1466, + "step": 2195 + }, + { + "epoch": 2.38, + "learning_rate": 9.98067398092049e-05, + "loss": 1.1551, + "step": 2200 + }, + { + "epoch": 2.38, + "eval_loss": 1.349250316619873, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2200 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 7.252529154883584e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-2200/training_args.bin b/PT/checkpoint-2200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-2200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-2300/README.md b/PT/checkpoint-2300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-2300/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-2300/adapter_config.json b/PT/checkpoint-2300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-2300/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-2300/adapter_model.bin b/PT/checkpoint-2300/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ad28073470129fd6179633dfe55a3f7f9664b6fb --- /dev/null +++ b/PT/checkpoint-2300/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da09eb4b7d4fcb5a7f134e116d4677f9df52b8edf2973268ead989e6194f5695 +size 16821197 diff --git a/PT/checkpoint-2300/finetuning_args.json b/PT/checkpoint-2300/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-2300/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-2300/optimizer.pt b/PT/checkpoint-2300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d529c9464d3b9ffc897022f364511e1afa7afdc --- /dev/null +++ b/PT/checkpoint-2300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eda6c407755c5d68f3be9eac0d2f1e1cdf5b4fd658ed3a48c42f31c325e35301 +size 33661637 diff --git a/PT/checkpoint-2300/rng_state.pth b/PT/checkpoint-2300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..314c8323579451fa5d9c29c3dbcd038840984533 --- /dev/null +++ b/PT/checkpoint-2300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b35e6bb67ffa8891f378771ce194bbec014d50118e0de99064b8ce172edfa5d4 +size 14575 diff --git a/PT/checkpoint-2300/scheduler.pt b/PT/checkpoint-2300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a45968b80d7795ecd568877691c760e90e767a1e --- /dev/null +++ b/PT/checkpoint-2300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e651282824d28533ac4067aeda7f6c9dea2c977721041d00f17e81b7ad597da1 +size 627 diff --git a/PT/checkpoint-2300/trainer_state.json b/PT/checkpoint-2300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d56cdf5b72182656135c0ef90d4bf0e086763ded --- /dev/null +++ b/PT/checkpoint-2300/trainer_state.json @@ -0,0 +1,2963 @@ +{ + "best_metric": 1.3470078706741333, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2300", + "epoch": 2.4925494445949608, + "eval_steps": 100, + "global_step": 2300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001344135327663804, + "loss": 1.1463, + "step": 2105 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013248237227768246, + "loss": 1.2751, + "step": 2110 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013056306435573633, + "loss": 1.2196, + "step": 2115 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012865567089886642, + "loss": 1.1964, + "step": 2120 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012676025342115105, + "loss": 1.1749, + "step": 2125 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012487687305043978, + "loss": 1.2615, + "step": 2130 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012300559052638122, + "loss": 1.2064, + "step": 2135 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012114646619846425, + "loss": 1.1642, + "step": 2140 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011929956002407194, + "loss": 1.1704, + "step": 2145 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011746493156654814, + "loss": 1.1668, + "step": 2150 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011564263999327546, + "loss": 1.1584, + "step": 2155 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011383274407376848, + "loss": 1.2412, + "step": 2160 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001120353021777778, + "loss": 1.1688, + "step": 2165 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011025037227340711, + "loss": 1.2097, + "step": 2170 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010847801192524454, + "loss": 1.2057, + "step": 2175 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010671827829250585, + "loss": 1.2296, + "step": 2180 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010497122812719068, + "loss": 1.2547, + "step": 2185 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010323691777225286, + "loss": 1.1746, + "step": 2190 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010151540315978314, + "loss": 1.1466, + "step": 2195 + }, + { + "epoch": 2.38, + "learning_rate": 9.98067398092049e-05, + "loss": 1.1551, + "step": 2200 + }, + { + "epoch": 2.38, + "eval_loss": 1.349250316619873, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2200 + }, + { + "epoch": 2.39, + "learning_rate": 9.811098282548447e-05, + "loss": 1.158, + "step": 2205 + }, + { + "epoch": 2.4, + "learning_rate": 9.642818689735305e-05, + "loss": 1.1444, + "step": 2210 + }, + { + "epoch": 2.4, + "learning_rate": 9.475840629554394e-05, + "loss": 1.2504, + "step": 2215 + }, + { + "epoch": 2.41, + "learning_rate": 9.310169487104131e-05, + "loss": 1.1439, + "step": 2220 + }, + { + "epoch": 2.41, + "learning_rate": 9.145810605334454e-05, + "loss": 1.2758, + "step": 2225 + }, + { + "epoch": 2.42, + "learning_rate": 8.982769284874386e-05, + "loss": 1.1992, + "step": 2230 + }, + { + "epoch": 2.42, + "learning_rate": 8.821050783861212e-05, + "loss": 1.2177, + "step": 2235 + }, + { + "epoch": 2.43, + "learning_rate": 8.660660317770841e-05, + "loss": 1.1942, + "step": 2240 + }, + { + "epoch": 2.43, + "learning_rate": 8.501603059249563e-05, + "loss": 1.163, + "step": 2245 + }, + { + "epoch": 2.44, + "learning_rate": 8.343884137947333e-05, + "loss": 1.239, + "step": 2250 + }, + { + "epoch": 2.44, + "learning_rate": 8.187508640352265e-05, + "loss": 1.1455, + "step": 2255 + }, + { + "epoch": 2.45, + "learning_rate": 8.032481609626575e-05, + "loss": 1.2165, + "step": 2260 + }, + { + "epoch": 2.45, + "learning_rate": 7.878808045444014e-05, + "loss": 1.1982, + "step": 2265 + }, + { + "epoch": 2.46, + "learning_rate": 7.726492903828575e-05, + "loss": 1.212, + "step": 2270 + }, + { + "epoch": 2.47, + "learning_rate": 7.575541096994637e-05, + "loss": 1.2453, + "step": 2275 + }, + { + "epoch": 2.47, + "learning_rate": 7.4259574931886e-05, + "loss": 1.2607, + "step": 2280 + }, + { + "epoch": 2.48, + "learning_rate": 7.27774691653188e-05, + "loss": 1.1936, + "step": 2285 + }, + { + "epoch": 2.48, + "learning_rate": 7.130914146865247e-05, + "loss": 1.2702, + "step": 2290 + }, + { + "epoch": 2.49, + "learning_rate": 6.985463919594781e-05, + "loss": 1.133, + "step": 2295 + }, + { + "epoch": 2.49, + "learning_rate": 6.841400925539104e-05, + "loss": 1.2135, + "step": 2300 + }, + { + "epoch": 2.49, + "eval_loss": 1.3470078706741333, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2300 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 7.582189571014656e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-2300/training_args.bin b/PT/checkpoint-2300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-2300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-2400/README.md b/PT/checkpoint-2400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-2400/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-2400/adapter_config.json b/PT/checkpoint-2400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-2400/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-2400/adapter_model.bin b/PT/checkpoint-2400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7ebd7493f5a8282323d8b507ec9085288c1251ab --- /dev/null +++ b/PT/checkpoint-2400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c29443cd6e0e5327c71bc9a4803f8eb0fc627ccca4ebb148c989fe67cd2683a0 +size 16821197 diff --git a/PT/checkpoint-2400/finetuning_args.json b/PT/checkpoint-2400/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-2400/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-2400/optimizer.pt b/PT/checkpoint-2400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..85807d113070e7ecca5a581b448e8bf8f8659fb7 --- /dev/null +++ b/PT/checkpoint-2400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d80e91250065b6ce18d956bc65e597d8d00e05e73b3f4a715470d5f36977f27f +size 33661637 diff --git a/PT/checkpoint-2400/rng_state.pth b/PT/checkpoint-2400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..83844a5b68b25d236c263ffd25a2978d9eb9cdd4 --- /dev/null +++ b/PT/checkpoint-2400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea1bc9dcb85837571d264426910a298fae0746e6ab6c53618deb708270ec8865 +size 14575 diff --git a/PT/checkpoint-2400/scheduler.pt b/PT/checkpoint-2400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c455fb829b747d9ad26b71d0aab96ccec95c133d --- /dev/null +++ b/PT/checkpoint-2400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d621bf32cb8aa583f95febd79709f28f047f4b0f4c94d6fafa62a1b845cdd17d +size 627 diff --git a/PT/checkpoint-2400/trainer_state.json b/PT/checkpoint-2400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8546d7b04640b90567fbb52c35c5683ff6c86f40 --- /dev/null +++ b/PT/checkpoint-2400/trainer_state.json @@ -0,0 +1,3091 @@ +{ + "best_metric": 1.3436678647994995, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2400", + "epoch": 2.60092115957735, + "eval_steps": 100, + "global_step": 2400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001344135327663804, + "loss": 1.1463, + "step": 2105 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013248237227768246, + "loss": 1.2751, + "step": 2110 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013056306435573633, + "loss": 1.2196, + "step": 2115 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012865567089886642, + "loss": 1.1964, + "step": 2120 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012676025342115105, + "loss": 1.1749, + "step": 2125 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012487687305043978, + "loss": 1.2615, + "step": 2130 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012300559052638122, + "loss": 1.2064, + "step": 2135 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012114646619846425, + "loss": 1.1642, + "step": 2140 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011929956002407194, + "loss": 1.1704, + "step": 2145 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011746493156654814, + "loss": 1.1668, + "step": 2150 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011564263999327546, + "loss": 1.1584, + "step": 2155 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011383274407376848, + "loss": 1.2412, + "step": 2160 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001120353021777778, + "loss": 1.1688, + "step": 2165 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011025037227340711, + "loss": 1.2097, + "step": 2170 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010847801192524454, + "loss": 1.2057, + "step": 2175 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010671827829250585, + "loss": 1.2296, + "step": 2180 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010497122812719068, + "loss": 1.2547, + "step": 2185 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010323691777225286, + "loss": 1.1746, + "step": 2190 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010151540315978314, + "loss": 1.1466, + "step": 2195 + }, + { + "epoch": 2.38, + "learning_rate": 9.98067398092049e-05, + "loss": 1.1551, + "step": 2200 + }, + { + "epoch": 2.38, + "eval_loss": 1.349250316619873, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2200 + }, + { + "epoch": 2.39, + "learning_rate": 9.811098282548447e-05, + "loss": 1.158, + "step": 2205 + }, + { + "epoch": 2.4, + "learning_rate": 9.642818689735305e-05, + "loss": 1.1444, + "step": 2210 + }, + { + "epoch": 2.4, + "learning_rate": 9.475840629554394e-05, + "loss": 1.2504, + "step": 2215 + }, + { + "epoch": 2.41, + "learning_rate": 9.310169487104131e-05, + "loss": 1.1439, + "step": 2220 + }, + { + "epoch": 2.41, + "learning_rate": 9.145810605334454e-05, + "loss": 1.2758, + "step": 2225 + }, + { + "epoch": 2.42, + "learning_rate": 8.982769284874386e-05, + "loss": 1.1992, + "step": 2230 + }, + { + "epoch": 2.42, + "learning_rate": 8.821050783861212e-05, + "loss": 1.2177, + "step": 2235 + }, + { + "epoch": 2.43, + "learning_rate": 8.660660317770841e-05, + "loss": 1.1942, + "step": 2240 + }, + { + "epoch": 2.43, + "learning_rate": 8.501603059249563e-05, + "loss": 1.163, + "step": 2245 + }, + { + "epoch": 2.44, + "learning_rate": 8.343884137947333e-05, + "loss": 1.239, + "step": 2250 + }, + { + "epoch": 2.44, + "learning_rate": 8.187508640352265e-05, + "loss": 1.1455, + "step": 2255 + }, + { + "epoch": 2.45, + "learning_rate": 8.032481609626575e-05, + "loss": 1.2165, + "step": 2260 + }, + { + "epoch": 2.45, + "learning_rate": 7.878808045444014e-05, + "loss": 1.1982, + "step": 2265 + }, + { + "epoch": 2.46, + "learning_rate": 7.726492903828575e-05, + "loss": 1.212, + "step": 2270 + }, + { + "epoch": 2.47, + "learning_rate": 7.575541096994637e-05, + "loss": 1.2453, + "step": 2275 + }, + { + "epoch": 2.47, + "learning_rate": 7.4259574931886e-05, + "loss": 1.2607, + "step": 2280 + }, + { + "epoch": 2.48, + "learning_rate": 7.27774691653188e-05, + "loss": 1.1936, + "step": 2285 + }, + { + "epoch": 2.48, + "learning_rate": 7.130914146865247e-05, + "loss": 1.2702, + "step": 2290 + }, + { + "epoch": 2.49, + "learning_rate": 6.985463919594781e-05, + "loss": 1.133, + "step": 2295 + }, + { + "epoch": 2.49, + "learning_rate": 6.841400925539104e-05, + "loss": 1.2135, + "step": 2300 + }, + { + "epoch": 2.49, + "eval_loss": 1.3470078706741333, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2300 + }, + { + "epoch": 2.5, + "learning_rate": 6.698729810778065e-05, + "loss": 1.1986, + "step": 2305 + }, + { + "epoch": 2.5, + "learning_rate": 6.557455176502986e-05, + "loss": 1.2254, + "step": 2310 + }, + { + "epoch": 2.51, + "learning_rate": 6.417581578868198e-05, + "loss": 1.212, + "step": 2315 + }, + { + "epoch": 2.51, + "learning_rate": 6.279113528844127e-05, + "loss": 1.1517, + "step": 2320 + }, + { + "epoch": 2.52, + "learning_rate": 6.14205549207184e-05, + "loss": 1.1889, + "step": 2325 + }, + { + "epoch": 2.53, + "learning_rate": 6.006411888718982e-05, + "loss": 1.2348, + "step": 2330 + }, + { + "epoch": 2.53, + "learning_rate": 5.872187093337239e-05, + "loss": 1.1862, + "step": 2335 + }, + { + "epoch": 2.54, + "learning_rate": 5.739385434721295e-05, + "loss": 1.2143, + "step": 2340 + }, + { + "epoch": 2.54, + "learning_rate": 5.608011195769186e-05, + "loss": 1.242, + "step": 2345 + }, + { + "epoch": 2.55, + "learning_rate": 5.478068613344151e-05, + "loss": 1.1817, + "step": 2350 + }, + { + "epoch": 2.55, + "learning_rate": 5.3495618781380764e-05, + "loss": 1.1916, + "step": 2355 + }, + { + "epoch": 2.56, + "learning_rate": 5.2224951345362703e-05, + "loss": 1.1231, + "step": 2360 + }, + { + "epoch": 2.56, + "learning_rate": 5.096872480483816e-05, + "loss": 1.2113, + "step": 2365 + }, + { + "epoch": 2.57, + "learning_rate": 4.972697967353445e-05, + "loss": 1.164, + "step": 2370 + }, + { + "epoch": 2.57, + "learning_rate": 4.8499755998148656e-05, + "loss": 1.1947, + "step": 2375 + }, + { + "epoch": 2.58, + "learning_rate": 4.728709335705561e-05, + "loss": 1.2219, + "step": 2380 + }, + { + "epoch": 2.58, + "learning_rate": 4.6089030859032376e-05, + "loss": 1.2104, + "step": 2385 + }, + { + "epoch": 2.59, + "learning_rate": 4.490560714199637e-05, + "loss": 1.2077, + "step": 2390 + }, + { + "epoch": 2.6, + "learning_rate": 4.373686037175917e-05, + "loss": 1.1758, + "step": 2395 + }, + { + "epoch": 2.6, + "learning_rate": 4.258282824079618e-05, + "loss": 1.2094, + "step": 2400 + }, + { + "epoch": 2.6, + "eval_loss": 1.3436678647994995, + "eval_runtime": 10.6699, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 2400 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 7.911849987145728e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-2400/training_args.bin b/PT/checkpoint-2400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-2400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-2500/README.md b/PT/checkpoint-2500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-2500/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-2500/adapter_config.json b/PT/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-2500/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-2500/adapter_model.bin b/PT/checkpoint-2500/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..bd15889f2361b86913d4453acbda8fe74897a4ec --- /dev/null +++ b/PT/checkpoint-2500/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b38c300430a412b45be278583da7ca183df9ad9722835ea175a332a9ff70ec7 +size 16821197 diff --git a/PT/checkpoint-2500/finetuning_args.json b/PT/checkpoint-2500/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-2500/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-2500/optimizer.pt b/PT/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec9ee644c19b59fcf2bec48917ffc87eede475f9 --- /dev/null +++ b/PT/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1162e14a0927619009e63de5a56b9b2f0cc052420c0ee7f49cc3121451dc0d8b +size 33661637 diff --git a/PT/checkpoint-2500/rng_state.pth b/PT/checkpoint-2500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1ddaca078aac8478785037f07e0f49a0d6065dfd --- /dev/null +++ b/PT/checkpoint-2500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9939614a90481578d5985af89631a4db6bb0bb7dc04627618fc1d9a4257cddf0 +size 14575 diff --git a/PT/checkpoint-2500/scheduler.pt b/PT/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..970c6d90dc683516fd97e3ce751324d0f32cc1d0 --- /dev/null +++ b/PT/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7334c729e1f90d25bf24d894a8f4b395a37ac0c1a9e23cd25544c29c670beaa +size 627 diff --git a/PT/checkpoint-2500/trainer_state.json b/PT/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ccc0105964da77f562431a8e76e0c32661b35346 --- /dev/null +++ b/PT/checkpoint-2500/trainer_state.json @@ -0,0 +1,3219 @@ +{ + "best_metric": 1.343565821647644, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2500", + "epoch": 2.7092928745597398, + "eval_steps": 100, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001344135327663804, + "loss": 1.1463, + "step": 2105 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013248237227768246, + "loss": 1.2751, + "step": 2110 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013056306435573633, + "loss": 1.2196, + "step": 2115 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012865567089886642, + "loss": 1.1964, + "step": 2120 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012676025342115105, + "loss": 1.1749, + "step": 2125 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012487687305043978, + "loss": 1.2615, + "step": 2130 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012300559052638122, + "loss": 1.2064, + "step": 2135 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012114646619846425, + "loss": 1.1642, + "step": 2140 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011929956002407194, + "loss": 1.1704, + "step": 2145 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011746493156654814, + "loss": 1.1668, + "step": 2150 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011564263999327546, + "loss": 1.1584, + "step": 2155 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011383274407376848, + "loss": 1.2412, + "step": 2160 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001120353021777778, + "loss": 1.1688, + "step": 2165 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011025037227340711, + "loss": 1.2097, + "step": 2170 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010847801192524454, + "loss": 1.2057, + "step": 2175 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010671827829250585, + "loss": 1.2296, + "step": 2180 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010497122812719068, + "loss": 1.2547, + "step": 2185 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010323691777225286, + "loss": 1.1746, + "step": 2190 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010151540315978314, + "loss": 1.1466, + "step": 2195 + }, + { + "epoch": 2.38, + "learning_rate": 9.98067398092049e-05, + "loss": 1.1551, + "step": 2200 + }, + { + "epoch": 2.38, + "eval_loss": 1.349250316619873, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2200 + }, + { + "epoch": 2.39, + "learning_rate": 9.811098282548447e-05, + "loss": 1.158, + "step": 2205 + }, + { + "epoch": 2.4, + "learning_rate": 9.642818689735305e-05, + "loss": 1.1444, + "step": 2210 + }, + { + "epoch": 2.4, + "learning_rate": 9.475840629554394e-05, + "loss": 1.2504, + "step": 2215 + }, + { + "epoch": 2.41, + "learning_rate": 9.310169487104131e-05, + "loss": 1.1439, + "step": 2220 + }, + { + "epoch": 2.41, + "learning_rate": 9.145810605334454e-05, + "loss": 1.2758, + "step": 2225 + }, + { + "epoch": 2.42, + "learning_rate": 8.982769284874386e-05, + "loss": 1.1992, + "step": 2230 + }, + { + "epoch": 2.42, + "learning_rate": 8.821050783861212e-05, + "loss": 1.2177, + "step": 2235 + }, + { + "epoch": 2.43, + "learning_rate": 8.660660317770841e-05, + "loss": 1.1942, + "step": 2240 + }, + { + "epoch": 2.43, + "learning_rate": 8.501603059249563e-05, + "loss": 1.163, + "step": 2245 + }, + { + "epoch": 2.44, + "learning_rate": 8.343884137947333e-05, + "loss": 1.239, + "step": 2250 + }, + { + "epoch": 2.44, + "learning_rate": 8.187508640352265e-05, + "loss": 1.1455, + "step": 2255 + }, + { + "epoch": 2.45, + "learning_rate": 8.032481609626575e-05, + "loss": 1.2165, + "step": 2260 + }, + { + "epoch": 2.45, + "learning_rate": 7.878808045444014e-05, + "loss": 1.1982, + "step": 2265 + }, + { + "epoch": 2.46, + "learning_rate": 7.726492903828575e-05, + "loss": 1.212, + "step": 2270 + }, + { + "epoch": 2.47, + "learning_rate": 7.575541096994637e-05, + "loss": 1.2453, + "step": 2275 + }, + { + "epoch": 2.47, + "learning_rate": 7.4259574931886e-05, + "loss": 1.2607, + "step": 2280 + }, + { + "epoch": 2.48, + "learning_rate": 7.27774691653188e-05, + "loss": 1.1936, + "step": 2285 + }, + { + "epoch": 2.48, + "learning_rate": 7.130914146865247e-05, + "loss": 1.2702, + "step": 2290 + }, + { + "epoch": 2.49, + "learning_rate": 6.985463919594781e-05, + "loss": 1.133, + "step": 2295 + }, + { + "epoch": 2.49, + "learning_rate": 6.841400925539104e-05, + "loss": 1.2135, + "step": 2300 + }, + { + "epoch": 2.49, + "eval_loss": 1.3470078706741333, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2300 + }, + { + "epoch": 2.5, + "learning_rate": 6.698729810778065e-05, + "loss": 1.1986, + "step": 2305 + }, + { + "epoch": 2.5, + "learning_rate": 6.557455176502986e-05, + "loss": 1.2254, + "step": 2310 + }, + { + "epoch": 2.51, + "learning_rate": 6.417581578868198e-05, + "loss": 1.212, + "step": 2315 + }, + { + "epoch": 2.51, + "learning_rate": 6.279113528844127e-05, + "loss": 1.1517, + "step": 2320 + }, + { + "epoch": 2.52, + "learning_rate": 6.14205549207184e-05, + "loss": 1.1889, + "step": 2325 + }, + { + "epoch": 2.53, + "learning_rate": 6.006411888718982e-05, + "loss": 1.2348, + "step": 2330 + }, + { + "epoch": 2.53, + "learning_rate": 5.872187093337239e-05, + "loss": 1.1862, + "step": 2335 + }, + { + "epoch": 2.54, + "learning_rate": 5.739385434721295e-05, + "loss": 1.2143, + "step": 2340 + }, + { + "epoch": 2.54, + "learning_rate": 5.608011195769186e-05, + "loss": 1.242, + "step": 2345 + }, + { + "epoch": 2.55, + "learning_rate": 5.478068613344151e-05, + "loss": 1.1817, + "step": 2350 + }, + { + "epoch": 2.55, + "learning_rate": 5.3495618781380764e-05, + "loss": 1.1916, + "step": 2355 + }, + { + "epoch": 2.56, + "learning_rate": 5.2224951345362703e-05, + "loss": 1.1231, + "step": 2360 + }, + { + "epoch": 2.56, + "learning_rate": 5.096872480483816e-05, + "loss": 1.2113, + "step": 2365 + }, + { + "epoch": 2.57, + "learning_rate": 4.972697967353445e-05, + "loss": 1.164, + "step": 2370 + }, + { + "epoch": 2.57, + "learning_rate": 4.8499755998148656e-05, + "loss": 1.1947, + "step": 2375 + }, + { + "epoch": 2.58, + "learning_rate": 4.728709335705561e-05, + "loss": 1.2219, + "step": 2380 + }, + { + "epoch": 2.58, + "learning_rate": 4.6089030859032376e-05, + "loss": 1.2104, + "step": 2385 + }, + { + "epoch": 2.59, + "learning_rate": 4.490560714199637e-05, + "loss": 1.2077, + "step": 2390 + }, + { + "epoch": 2.6, + "learning_rate": 4.373686037175917e-05, + "loss": 1.1758, + "step": 2395 + }, + { + "epoch": 2.6, + "learning_rate": 4.258282824079618e-05, + "loss": 1.2094, + "step": 2400 + }, + { + "epoch": 2.6, + "eval_loss": 1.3436678647994995, + "eval_runtime": 10.6699, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 2400 + }, + { + "epoch": 2.61, + "learning_rate": 4.1443547967030816e-05, + "loss": 1.2, + "step": 2405 + }, + { + "epoch": 2.61, + "learning_rate": 4.031905629263371e-05, + "loss": 1.2246, + "step": 2410 + }, + { + "epoch": 2.62, + "learning_rate": 3.92093894828387e-05, + "loss": 1.198, + "step": 2415 + }, + { + "epoch": 2.62, + "learning_rate": 3.811458332477252e-05, + "loss": 1.269, + "step": 2420 + }, + { + "epoch": 2.63, + "learning_rate": 3.703467312630088e-05, + "loss": 1.189, + "step": 2425 + }, + { + "epoch": 2.63, + "learning_rate": 3.596969371488995e-05, + "loss": 1.1938, + "step": 2430 + }, + { + "epoch": 2.64, + "learning_rate": 3.491967943648289e-05, + "loss": 1.2421, + "step": 2435 + }, + { + "epoch": 2.64, + "learning_rate": 3.388466415439234e-05, + "loss": 1.1145, + "step": 2440 + }, + { + "epoch": 2.65, + "learning_rate": 3.2864681248208184e-05, + "loss": 1.1678, + "step": 2445 + }, + { + "epoch": 2.66, + "learning_rate": 3.185976361272125e-05, + "loss": 1.2627, + "step": 2450 + }, + { + "epoch": 2.66, + "learning_rate": 3.086994365686246e-05, + "loss": 1.1344, + "step": 2455 + }, + { + "epoch": 2.67, + "learning_rate": 2.9895253302657188e-05, + "loss": 1.2325, + "step": 2460 + }, + { + "epoch": 2.67, + "learning_rate": 2.8935723984196304e-05, + "loss": 1.2533, + "step": 2465 + }, + { + "epoch": 2.68, + "learning_rate": 2.7991386646622207e-05, + "loss": 1.226, + "step": 2470 + }, + { + "epoch": 2.68, + "learning_rate": 2.7062271745130595e-05, + "loss": 1.0925, + "step": 2475 + }, + { + "epoch": 2.69, + "learning_rate": 2.614840924398876e-05, + "loss": 1.1444, + "step": 2480 + }, + { + "epoch": 2.69, + "learning_rate": 2.5249828615568794e-05, + "loss": 1.214, + "step": 2485 + }, + { + "epoch": 2.7, + "learning_rate": 2.436655883939737e-05, + "loss": 1.2427, + "step": 2490 + }, + { + "epoch": 2.7, + "learning_rate": 2.3498628401221078e-05, + "loss": 1.2447, + "step": 2495 + }, + { + "epoch": 2.71, + "learning_rate": 2.2646065292087403e-05, + "loss": 1.1835, + "step": 2500 + }, + { + "epoch": 2.71, + "eval_loss": 1.343565821647644, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2500 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 8.2415104032768e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-2500/training_args.bin b/PT/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-2600/README.md b/PT/checkpoint-2600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-2600/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-2600/adapter_config.json b/PT/checkpoint-2600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-2600/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-2600/adapter_model.bin b/PT/checkpoint-2600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c54e1ab99aa8f3df02c38d55e535e5d47b02eabe --- /dev/null +++ b/PT/checkpoint-2600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:579d617a0beb45da3ee7c98c1a72fcc91c624f20b3de223b97fab9e1d5d25553 +size 16821197 diff --git a/PT/checkpoint-2600/finetuning_args.json b/PT/checkpoint-2600/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-2600/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-2600/optimizer.pt b/PT/checkpoint-2600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3a63c90860958539340fe5e668ef07dea1c44b3 --- /dev/null +++ b/PT/checkpoint-2600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52e55900ff9cbba5fb5574437848ca1d430b45c75ba642e04f05bdfd48a5fa37 +size 33661637 diff --git a/PT/checkpoint-2600/rng_state.pth b/PT/checkpoint-2600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b618f6a57ce12f855dccfedc02849013f6d0b31 --- /dev/null +++ b/PT/checkpoint-2600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da8d72b8a46554ef83ec4d1bc63936d36603c2a7d7650d2c908ce157b71af89f +size 14575 diff --git a/PT/checkpoint-2600/scheduler.pt b/PT/checkpoint-2600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c160be4348401c63443bfcdaee2a3bd0d4d4939 --- /dev/null +++ b/PT/checkpoint-2600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:369f9ef9470484e4bba63c61dc786c2b089c04c6a14764bb2f1a6f742388b283 +size 627 diff --git a/PT/checkpoint-2600/trainer_state.json b/PT/checkpoint-2600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..00230f67ba83f66ee46c974f9fe560b0b4927bf6 --- /dev/null +++ b/PT/checkpoint-2600/trainer_state.json @@ -0,0 +1,3347 @@ +{ + "best_metric": 1.3423666954040527, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2600", + "epoch": 2.8176645895421295, + "eval_steps": 100, + "global_step": 2600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001344135327663804, + "loss": 1.1463, + "step": 2105 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013248237227768246, + "loss": 1.2751, + "step": 2110 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013056306435573633, + "loss": 1.2196, + "step": 2115 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012865567089886642, + "loss": 1.1964, + "step": 2120 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012676025342115105, + "loss": 1.1749, + "step": 2125 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012487687305043978, + "loss": 1.2615, + "step": 2130 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012300559052638122, + "loss": 1.2064, + "step": 2135 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012114646619846425, + "loss": 1.1642, + "step": 2140 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011929956002407194, + "loss": 1.1704, + "step": 2145 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011746493156654814, + "loss": 1.1668, + "step": 2150 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011564263999327546, + "loss": 1.1584, + "step": 2155 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011383274407376848, + "loss": 1.2412, + "step": 2160 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001120353021777778, + "loss": 1.1688, + "step": 2165 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011025037227340711, + "loss": 1.2097, + "step": 2170 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010847801192524454, + "loss": 1.2057, + "step": 2175 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010671827829250585, + "loss": 1.2296, + "step": 2180 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010497122812719068, + "loss": 1.2547, + "step": 2185 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010323691777225286, + "loss": 1.1746, + "step": 2190 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010151540315978314, + "loss": 1.1466, + "step": 2195 + }, + { + "epoch": 2.38, + "learning_rate": 9.98067398092049e-05, + "loss": 1.1551, + "step": 2200 + }, + { + "epoch": 2.38, + "eval_loss": 1.349250316619873, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2200 + }, + { + "epoch": 2.39, + "learning_rate": 9.811098282548447e-05, + "loss": 1.158, + "step": 2205 + }, + { + "epoch": 2.4, + "learning_rate": 9.642818689735305e-05, + "loss": 1.1444, + "step": 2210 + }, + { + "epoch": 2.4, + "learning_rate": 9.475840629554394e-05, + "loss": 1.2504, + "step": 2215 + }, + { + "epoch": 2.41, + "learning_rate": 9.310169487104131e-05, + "loss": 1.1439, + "step": 2220 + }, + { + "epoch": 2.41, + "learning_rate": 9.145810605334454e-05, + "loss": 1.2758, + "step": 2225 + }, + { + "epoch": 2.42, + "learning_rate": 8.982769284874386e-05, + "loss": 1.1992, + "step": 2230 + }, + { + "epoch": 2.42, + "learning_rate": 8.821050783861212e-05, + "loss": 1.2177, + "step": 2235 + }, + { + "epoch": 2.43, + "learning_rate": 8.660660317770841e-05, + "loss": 1.1942, + "step": 2240 + }, + { + "epoch": 2.43, + "learning_rate": 8.501603059249563e-05, + "loss": 1.163, + "step": 2245 + }, + { + "epoch": 2.44, + "learning_rate": 8.343884137947333e-05, + "loss": 1.239, + "step": 2250 + }, + { + "epoch": 2.44, + "learning_rate": 8.187508640352265e-05, + "loss": 1.1455, + "step": 2255 + }, + { + "epoch": 2.45, + "learning_rate": 8.032481609626575e-05, + "loss": 1.2165, + "step": 2260 + }, + { + "epoch": 2.45, + "learning_rate": 7.878808045444014e-05, + "loss": 1.1982, + "step": 2265 + }, + { + "epoch": 2.46, + "learning_rate": 7.726492903828575e-05, + "loss": 1.212, + "step": 2270 + }, + { + "epoch": 2.47, + "learning_rate": 7.575541096994637e-05, + "loss": 1.2453, + "step": 2275 + }, + { + "epoch": 2.47, + "learning_rate": 7.4259574931886e-05, + "loss": 1.2607, + "step": 2280 + }, + { + "epoch": 2.48, + "learning_rate": 7.27774691653188e-05, + "loss": 1.1936, + "step": 2285 + }, + { + "epoch": 2.48, + "learning_rate": 7.130914146865247e-05, + "loss": 1.2702, + "step": 2290 + }, + { + "epoch": 2.49, + "learning_rate": 6.985463919594781e-05, + "loss": 1.133, + "step": 2295 + }, + { + "epoch": 2.49, + "learning_rate": 6.841400925539104e-05, + "loss": 1.2135, + "step": 2300 + }, + { + "epoch": 2.49, + "eval_loss": 1.3470078706741333, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2300 + }, + { + "epoch": 2.5, + "learning_rate": 6.698729810778065e-05, + "loss": 1.1986, + "step": 2305 + }, + { + "epoch": 2.5, + "learning_rate": 6.557455176502986e-05, + "loss": 1.2254, + "step": 2310 + }, + { + "epoch": 2.51, + "learning_rate": 6.417581578868198e-05, + "loss": 1.212, + "step": 2315 + }, + { + "epoch": 2.51, + "learning_rate": 6.279113528844127e-05, + "loss": 1.1517, + "step": 2320 + }, + { + "epoch": 2.52, + "learning_rate": 6.14205549207184e-05, + "loss": 1.1889, + "step": 2325 + }, + { + "epoch": 2.53, + "learning_rate": 6.006411888718982e-05, + "loss": 1.2348, + "step": 2330 + }, + { + "epoch": 2.53, + "learning_rate": 5.872187093337239e-05, + "loss": 1.1862, + "step": 2335 + }, + { + "epoch": 2.54, + "learning_rate": 5.739385434721295e-05, + "loss": 1.2143, + "step": 2340 + }, + { + "epoch": 2.54, + "learning_rate": 5.608011195769186e-05, + "loss": 1.242, + "step": 2345 + }, + { + "epoch": 2.55, + "learning_rate": 5.478068613344151e-05, + "loss": 1.1817, + "step": 2350 + }, + { + "epoch": 2.55, + "learning_rate": 5.3495618781380764e-05, + "loss": 1.1916, + "step": 2355 + }, + { + "epoch": 2.56, + "learning_rate": 5.2224951345362703e-05, + "loss": 1.1231, + "step": 2360 + }, + { + "epoch": 2.56, + "learning_rate": 5.096872480483816e-05, + "loss": 1.2113, + "step": 2365 + }, + { + "epoch": 2.57, + "learning_rate": 4.972697967353445e-05, + "loss": 1.164, + "step": 2370 + }, + { + "epoch": 2.57, + "learning_rate": 4.8499755998148656e-05, + "loss": 1.1947, + "step": 2375 + }, + { + "epoch": 2.58, + "learning_rate": 4.728709335705561e-05, + "loss": 1.2219, + "step": 2380 + }, + { + "epoch": 2.58, + "learning_rate": 4.6089030859032376e-05, + "loss": 1.2104, + "step": 2385 + }, + { + "epoch": 2.59, + "learning_rate": 4.490560714199637e-05, + "loss": 1.2077, + "step": 2390 + }, + { + "epoch": 2.6, + "learning_rate": 4.373686037175917e-05, + "loss": 1.1758, + "step": 2395 + }, + { + "epoch": 2.6, + "learning_rate": 4.258282824079618e-05, + "loss": 1.2094, + "step": 2400 + }, + { + "epoch": 2.6, + "eval_loss": 1.3436678647994995, + "eval_runtime": 10.6699, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 2400 + }, + { + "epoch": 2.61, + "learning_rate": 4.1443547967030816e-05, + "loss": 1.2, + "step": 2405 + }, + { + "epoch": 2.61, + "learning_rate": 4.031905629263371e-05, + "loss": 1.2246, + "step": 2410 + }, + { + "epoch": 2.62, + "learning_rate": 3.92093894828387e-05, + "loss": 1.198, + "step": 2415 + }, + { + "epoch": 2.62, + "learning_rate": 3.811458332477252e-05, + "loss": 1.269, + "step": 2420 + }, + { + "epoch": 2.63, + "learning_rate": 3.703467312630088e-05, + "loss": 1.189, + "step": 2425 + }, + { + "epoch": 2.63, + "learning_rate": 3.596969371488995e-05, + "loss": 1.1938, + "step": 2430 + }, + { + "epoch": 2.64, + "learning_rate": 3.491967943648289e-05, + "loss": 1.2421, + "step": 2435 + }, + { + "epoch": 2.64, + "learning_rate": 3.388466415439234e-05, + "loss": 1.1145, + "step": 2440 + }, + { + "epoch": 2.65, + "learning_rate": 3.2864681248208184e-05, + "loss": 1.1678, + "step": 2445 + }, + { + "epoch": 2.66, + "learning_rate": 3.185976361272125e-05, + "loss": 1.2627, + "step": 2450 + }, + { + "epoch": 2.66, + "learning_rate": 3.086994365686246e-05, + "loss": 1.1344, + "step": 2455 + }, + { + "epoch": 2.67, + "learning_rate": 2.9895253302657188e-05, + "loss": 1.2325, + "step": 2460 + }, + { + "epoch": 2.67, + "learning_rate": 2.8935723984196304e-05, + "loss": 1.2533, + "step": 2465 + }, + { + "epoch": 2.68, + "learning_rate": 2.7991386646622207e-05, + "loss": 1.226, + "step": 2470 + }, + { + "epoch": 2.68, + "learning_rate": 2.7062271745130595e-05, + "loss": 1.0925, + "step": 2475 + }, + { + "epoch": 2.69, + "learning_rate": 2.614840924398876e-05, + "loss": 1.1444, + "step": 2480 + }, + { + "epoch": 2.69, + "learning_rate": 2.5249828615568794e-05, + "loss": 1.214, + "step": 2485 + }, + { + "epoch": 2.7, + "learning_rate": 2.436655883939737e-05, + "loss": 1.2427, + "step": 2490 + }, + { + "epoch": 2.7, + "learning_rate": 2.3498628401221078e-05, + "loss": 1.2447, + "step": 2495 + }, + { + "epoch": 2.71, + "learning_rate": 2.2646065292087403e-05, + "loss": 1.1835, + "step": 2500 + }, + { + "epoch": 2.71, + "eval_loss": 1.343565821647644, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2500 + }, + { + "epoch": 2.71, + "learning_rate": 2.1808897007442762e-05, + "loss": 1.2457, + "step": 2505 + }, + { + "epoch": 2.72, + "learning_rate": 2.098715054624506e-05, + "loss": 1.2248, + "step": 2510 + }, + { + "epoch": 2.73, + "learning_rate": 2.0180852410093153e-05, + "loss": 1.2419, + "step": 2515 + }, + { + "epoch": 2.73, + "learning_rate": 1.939002860237249e-05, + "loss": 1.1763, + "step": 2520 + }, + { + "epoch": 2.74, + "learning_rate": 1.8614704627416045e-05, + "loss": 1.2035, + "step": 2525 + }, + { + "epoch": 2.74, + "learning_rate": 1.7854905489681993e-05, + "loss": 1.1767, + "step": 2530 + }, + { + "epoch": 2.75, + "learning_rate": 1.7110655692947397e-05, + "loss": 1.254, + "step": 2535 + }, + { + "epoch": 2.75, + "learning_rate": 1.638197923951784e-05, + "loss": 1.1941, + "step": 2540 + }, + { + "epoch": 2.76, + "learning_rate": 1.5668899629453225e-05, + "loss": 1.2568, + "step": 2545 + }, + { + "epoch": 2.76, + "learning_rate": 1.4971439859810199e-05, + "loss": 1.2237, + "step": 2550 + }, + { + "epoch": 2.77, + "learning_rate": 1.428962242390025e-05, + "loss": 1.172, + "step": 2555 + }, + { + "epoch": 2.77, + "learning_rate": 1.3623469310564408e-05, + "loss": 1.1835, + "step": 2560 + }, + { + "epoch": 2.78, + "learning_rate": 1.2973002003463797e-05, + "loss": 1.1335, + "step": 2565 + }, + { + "epoch": 2.79, + "learning_rate": 1.2338241480387369e-05, + "loss": 1.1968, + "step": 2570 + }, + { + "epoch": 2.79, + "learning_rate": 1.1719208212574939e-05, + "loss": 1.1962, + "step": 2575 + }, + { + "epoch": 2.8, + "learning_rate": 1.111592216405688e-05, + "loss": 1.2107, + "step": 2580 + }, + { + "epoch": 2.8, + "learning_rate": 1.0528402791010582e-05, + "loss": 1.2148, + "step": 2585 + }, + { + "epoch": 2.81, + "learning_rate": 9.956669041133015e-06, + "loss": 1.1443, + "step": 2590 + }, + { + "epoch": 2.81, + "learning_rate": 9.400739353029209e-06, + "loss": 1.1805, + "step": 2595 + }, + { + "epoch": 2.82, + "learning_rate": 8.860631655618124e-06, + "loss": 1.2061, + "step": 2600 + }, + { + "epoch": 2.82, + "eval_loss": 1.3423666954040527, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 2600 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 8.571170819407872e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-2600/training_args.bin b/PT/checkpoint-2600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-2600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-2700/README.md b/PT/checkpoint-2700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-2700/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-2700/adapter_config.json b/PT/checkpoint-2700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-2700/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-2700/adapter_model.bin b/PT/checkpoint-2700/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7f605850fce20203ba7257c723b003ceb595b340 --- /dev/null +++ b/PT/checkpoint-2700/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fe3a9d305afa1374c0d7c1da1c75af38231ab61d9862cde4ed44fefc836c91e +size 16821197 diff --git a/PT/checkpoint-2700/finetuning_args.json b/PT/checkpoint-2700/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-2700/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-2700/optimizer.pt b/PT/checkpoint-2700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1352e3e63189b0bd0be1227cb400d23879ff0ac6 --- /dev/null +++ b/PT/checkpoint-2700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77a5d49dde70b530a22484d39a73107a74356a6df9dacea978cddc98a0ae2c0f +size 33661637 diff --git a/PT/checkpoint-2700/rng_state.pth b/PT/checkpoint-2700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d528250611f4f1cfc8ce788083f8744a08c6b16e --- /dev/null +++ b/PT/checkpoint-2700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54cfbb215dad9e26f25c41f5cf960b249f08a4175534e87074c067686948b014 +size 14575 diff --git a/PT/checkpoint-2700/scheduler.pt b/PT/checkpoint-2700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3c901334fdfca18cc6a90915bbfb961803be72e --- /dev/null +++ b/PT/checkpoint-2700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d47dd9475987bd7432d7e7ff8a0a6c1f49acb15379f0b979c67ca2407f284bf +size 627 diff --git a/PT/checkpoint-2700/trainer_state.json b/PT/checkpoint-2700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..385dc73268b89869e2240b1f89f28a5ab358711e --- /dev/null +++ b/PT/checkpoint-2700/trainer_state.json @@ -0,0 +1,3475 @@ +{ + "best_metric": 1.3419121503829956, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2700", + "epoch": 2.9260363045245192, + "eval_steps": 100, + "global_step": 2700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001344135327663804, + "loss": 1.1463, + "step": 2105 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013248237227768246, + "loss": 1.2751, + "step": 2110 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013056306435573633, + "loss": 1.2196, + "step": 2115 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012865567089886642, + "loss": 1.1964, + "step": 2120 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012676025342115105, + "loss": 1.1749, + "step": 2125 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012487687305043978, + "loss": 1.2615, + "step": 2130 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012300559052638122, + "loss": 1.2064, + "step": 2135 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012114646619846425, + "loss": 1.1642, + "step": 2140 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011929956002407194, + "loss": 1.1704, + "step": 2145 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011746493156654814, + "loss": 1.1668, + "step": 2150 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011564263999327546, + "loss": 1.1584, + "step": 2155 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011383274407376848, + "loss": 1.2412, + "step": 2160 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001120353021777778, + "loss": 1.1688, + "step": 2165 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011025037227340711, + "loss": 1.2097, + "step": 2170 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010847801192524454, + "loss": 1.2057, + "step": 2175 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010671827829250585, + "loss": 1.2296, + "step": 2180 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010497122812719068, + "loss": 1.2547, + "step": 2185 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010323691777225286, + "loss": 1.1746, + "step": 2190 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010151540315978314, + "loss": 1.1466, + "step": 2195 + }, + { + "epoch": 2.38, + "learning_rate": 9.98067398092049e-05, + "loss": 1.1551, + "step": 2200 + }, + { + "epoch": 2.38, + "eval_loss": 1.349250316619873, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2200 + }, + { + "epoch": 2.39, + "learning_rate": 9.811098282548447e-05, + "loss": 1.158, + "step": 2205 + }, + { + "epoch": 2.4, + "learning_rate": 9.642818689735305e-05, + "loss": 1.1444, + "step": 2210 + }, + { + "epoch": 2.4, + "learning_rate": 9.475840629554394e-05, + "loss": 1.2504, + "step": 2215 + }, + { + "epoch": 2.41, + "learning_rate": 9.310169487104131e-05, + "loss": 1.1439, + "step": 2220 + }, + { + "epoch": 2.41, + "learning_rate": 9.145810605334454e-05, + "loss": 1.2758, + "step": 2225 + }, + { + "epoch": 2.42, + "learning_rate": 8.982769284874386e-05, + "loss": 1.1992, + "step": 2230 + }, + { + "epoch": 2.42, + "learning_rate": 8.821050783861212e-05, + "loss": 1.2177, + "step": 2235 + }, + { + "epoch": 2.43, + "learning_rate": 8.660660317770841e-05, + "loss": 1.1942, + "step": 2240 + }, + { + "epoch": 2.43, + "learning_rate": 8.501603059249563e-05, + "loss": 1.163, + "step": 2245 + }, + { + "epoch": 2.44, + "learning_rate": 8.343884137947333e-05, + "loss": 1.239, + "step": 2250 + }, + { + "epoch": 2.44, + "learning_rate": 8.187508640352265e-05, + "loss": 1.1455, + "step": 2255 + }, + { + "epoch": 2.45, + "learning_rate": 8.032481609626575e-05, + "loss": 1.2165, + "step": 2260 + }, + { + "epoch": 2.45, + "learning_rate": 7.878808045444014e-05, + "loss": 1.1982, + "step": 2265 + }, + { + "epoch": 2.46, + "learning_rate": 7.726492903828575e-05, + "loss": 1.212, + "step": 2270 + }, + { + "epoch": 2.47, + "learning_rate": 7.575541096994637e-05, + "loss": 1.2453, + "step": 2275 + }, + { + "epoch": 2.47, + "learning_rate": 7.4259574931886e-05, + "loss": 1.2607, + "step": 2280 + }, + { + "epoch": 2.48, + "learning_rate": 7.27774691653188e-05, + "loss": 1.1936, + "step": 2285 + }, + { + "epoch": 2.48, + "learning_rate": 7.130914146865247e-05, + "loss": 1.2702, + "step": 2290 + }, + { + "epoch": 2.49, + "learning_rate": 6.985463919594781e-05, + "loss": 1.133, + "step": 2295 + }, + { + "epoch": 2.49, + "learning_rate": 6.841400925539104e-05, + "loss": 1.2135, + "step": 2300 + }, + { + "epoch": 2.49, + "eval_loss": 1.3470078706741333, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2300 + }, + { + "epoch": 2.5, + "learning_rate": 6.698729810778065e-05, + "loss": 1.1986, + "step": 2305 + }, + { + "epoch": 2.5, + "learning_rate": 6.557455176502986e-05, + "loss": 1.2254, + "step": 2310 + }, + { + "epoch": 2.51, + "learning_rate": 6.417581578868198e-05, + "loss": 1.212, + "step": 2315 + }, + { + "epoch": 2.51, + "learning_rate": 6.279113528844127e-05, + "loss": 1.1517, + "step": 2320 + }, + { + "epoch": 2.52, + "learning_rate": 6.14205549207184e-05, + "loss": 1.1889, + "step": 2325 + }, + { + "epoch": 2.53, + "learning_rate": 6.006411888718982e-05, + "loss": 1.2348, + "step": 2330 + }, + { + "epoch": 2.53, + "learning_rate": 5.872187093337239e-05, + "loss": 1.1862, + "step": 2335 + }, + { + "epoch": 2.54, + "learning_rate": 5.739385434721295e-05, + "loss": 1.2143, + "step": 2340 + }, + { + "epoch": 2.54, + "learning_rate": 5.608011195769186e-05, + "loss": 1.242, + "step": 2345 + }, + { + "epoch": 2.55, + "learning_rate": 5.478068613344151e-05, + "loss": 1.1817, + "step": 2350 + }, + { + "epoch": 2.55, + "learning_rate": 5.3495618781380764e-05, + "loss": 1.1916, + "step": 2355 + }, + { + "epoch": 2.56, + "learning_rate": 5.2224951345362703e-05, + "loss": 1.1231, + "step": 2360 + }, + { + "epoch": 2.56, + "learning_rate": 5.096872480483816e-05, + "loss": 1.2113, + "step": 2365 + }, + { + "epoch": 2.57, + "learning_rate": 4.972697967353445e-05, + "loss": 1.164, + "step": 2370 + }, + { + "epoch": 2.57, + "learning_rate": 4.8499755998148656e-05, + "loss": 1.1947, + "step": 2375 + }, + { + "epoch": 2.58, + "learning_rate": 4.728709335705561e-05, + "loss": 1.2219, + "step": 2380 + }, + { + "epoch": 2.58, + "learning_rate": 4.6089030859032376e-05, + "loss": 1.2104, + "step": 2385 + }, + { + "epoch": 2.59, + "learning_rate": 4.490560714199637e-05, + "loss": 1.2077, + "step": 2390 + }, + { + "epoch": 2.6, + "learning_rate": 4.373686037175917e-05, + "loss": 1.1758, + "step": 2395 + }, + { + "epoch": 2.6, + "learning_rate": 4.258282824079618e-05, + "loss": 1.2094, + "step": 2400 + }, + { + "epoch": 2.6, + "eval_loss": 1.3436678647994995, + "eval_runtime": 10.6699, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 2400 + }, + { + "epoch": 2.61, + "learning_rate": 4.1443547967030816e-05, + "loss": 1.2, + "step": 2405 + }, + { + "epoch": 2.61, + "learning_rate": 4.031905629263371e-05, + "loss": 1.2246, + "step": 2410 + }, + { + "epoch": 2.62, + "learning_rate": 3.92093894828387e-05, + "loss": 1.198, + "step": 2415 + }, + { + "epoch": 2.62, + "learning_rate": 3.811458332477252e-05, + "loss": 1.269, + "step": 2420 + }, + { + "epoch": 2.63, + "learning_rate": 3.703467312630088e-05, + "loss": 1.189, + "step": 2425 + }, + { + "epoch": 2.63, + "learning_rate": 3.596969371488995e-05, + "loss": 1.1938, + "step": 2430 + }, + { + "epoch": 2.64, + "learning_rate": 3.491967943648289e-05, + "loss": 1.2421, + "step": 2435 + }, + { + "epoch": 2.64, + "learning_rate": 3.388466415439234e-05, + "loss": 1.1145, + "step": 2440 + }, + { + "epoch": 2.65, + "learning_rate": 3.2864681248208184e-05, + "loss": 1.1678, + "step": 2445 + }, + { + "epoch": 2.66, + "learning_rate": 3.185976361272125e-05, + "loss": 1.2627, + "step": 2450 + }, + { + "epoch": 2.66, + "learning_rate": 3.086994365686246e-05, + "loss": 1.1344, + "step": 2455 + }, + { + "epoch": 2.67, + "learning_rate": 2.9895253302657188e-05, + "loss": 1.2325, + "step": 2460 + }, + { + "epoch": 2.67, + "learning_rate": 2.8935723984196304e-05, + "loss": 1.2533, + "step": 2465 + }, + { + "epoch": 2.68, + "learning_rate": 2.7991386646622207e-05, + "loss": 1.226, + "step": 2470 + }, + { + "epoch": 2.68, + "learning_rate": 2.7062271745130595e-05, + "loss": 1.0925, + "step": 2475 + }, + { + "epoch": 2.69, + "learning_rate": 2.614840924398876e-05, + "loss": 1.1444, + "step": 2480 + }, + { + "epoch": 2.69, + "learning_rate": 2.5249828615568794e-05, + "loss": 1.214, + "step": 2485 + }, + { + "epoch": 2.7, + "learning_rate": 2.436655883939737e-05, + "loss": 1.2427, + "step": 2490 + }, + { + "epoch": 2.7, + "learning_rate": 2.3498628401221078e-05, + "loss": 1.2447, + "step": 2495 + }, + { + "epoch": 2.71, + "learning_rate": 2.2646065292087403e-05, + "loss": 1.1835, + "step": 2500 + }, + { + "epoch": 2.71, + "eval_loss": 1.343565821647644, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2500 + }, + { + "epoch": 2.71, + "learning_rate": 2.1808897007442762e-05, + "loss": 1.2457, + "step": 2505 + }, + { + "epoch": 2.72, + "learning_rate": 2.098715054624506e-05, + "loss": 1.2248, + "step": 2510 + }, + { + "epoch": 2.73, + "learning_rate": 2.0180852410093153e-05, + "loss": 1.2419, + "step": 2515 + }, + { + "epoch": 2.73, + "learning_rate": 1.939002860237249e-05, + "loss": 1.1763, + "step": 2520 + }, + { + "epoch": 2.74, + "learning_rate": 1.8614704627416045e-05, + "loss": 1.2035, + "step": 2525 + }, + { + "epoch": 2.74, + "learning_rate": 1.7854905489681993e-05, + "loss": 1.1767, + "step": 2530 + }, + { + "epoch": 2.75, + "learning_rate": 1.7110655692947397e-05, + "loss": 1.254, + "step": 2535 + }, + { + "epoch": 2.75, + "learning_rate": 1.638197923951784e-05, + "loss": 1.1941, + "step": 2540 + }, + { + "epoch": 2.76, + "learning_rate": 1.5668899629453225e-05, + "loss": 1.2568, + "step": 2545 + }, + { + "epoch": 2.76, + "learning_rate": 1.4971439859810199e-05, + "loss": 1.2237, + "step": 2550 + }, + { + "epoch": 2.77, + "learning_rate": 1.428962242390025e-05, + "loss": 1.172, + "step": 2555 + }, + { + "epoch": 2.77, + "learning_rate": 1.3623469310564408e-05, + "loss": 1.1835, + "step": 2560 + }, + { + "epoch": 2.78, + "learning_rate": 1.2973002003463797e-05, + "loss": 1.1335, + "step": 2565 + }, + { + "epoch": 2.79, + "learning_rate": 1.2338241480387369e-05, + "loss": 1.1968, + "step": 2570 + }, + { + "epoch": 2.79, + "learning_rate": 1.1719208212574939e-05, + "loss": 1.1962, + "step": 2575 + }, + { + "epoch": 2.8, + "learning_rate": 1.111592216405688e-05, + "loss": 1.2107, + "step": 2580 + }, + { + "epoch": 2.8, + "learning_rate": 1.0528402791010582e-05, + "loss": 1.2148, + "step": 2585 + }, + { + "epoch": 2.81, + "learning_rate": 9.956669041133015e-06, + "loss": 1.1443, + "step": 2590 + }, + { + "epoch": 2.81, + "learning_rate": 9.400739353029209e-06, + "loss": 1.1805, + "step": 2595 + }, + { + "epoch": 2.82, + "learning_rate": 8.860631655618124e-06, + "loss": 1.2061, + "step": 2600 + }, + { + "epoch": 2.82, + "eval_loss": 1.3423666954040527, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 2600 + }, + { + "epoch": 2.82, + "learning_rate": 8.336363367554112e-06, + "loss": 1.2418, + "step": 2605 + }, + { + "epoch": 2.83, + "learning_rate": 7.827951396665312e-06, + "loss": 1.2532, + "step": 2610 + }, + { + "epoch": 2.83, + "learning_rate": 7.335412139408248e-06, + "loss": 1.2836, + "step": 2615 + }, + { + "epoch": 2.84, + "learning_rate": 6.85876148033926e-06, + "loss": 1.2918, + "step": 2620 + }, + { + "epoch": 2.84, + "learning_rate": 6.398014791601847e-06, + "loss": 1.1381, + "step": 2625 + }, + { + "epoch": 2.85, + "learning_rate": 5.953186932431298e-06, + "loss": 1.1686, + "step": 2630 + }, + { + "epoch": 2.86, + "learning_rate": 5.524292248675289e-06, + "loss": 1.2104, + "step": 2635 + }, + { + "epoch": 2.86, + "learning_rate": 5.111344572331145e-06, + "loss": 1.2651, + "step": 2640 + }, + { + "epoch": 2.87, + "learning_rate": 4.714357221099974e-06, + "loss": 1.1296, + "step": 2645 + }, + { + "epoch": 2.87, + "learning_rate": 4.333342997957013e-06, + "loss": 1.1534, + "step": 2650 + }, + { + "epoch": 2.88, + "learning_rate": 3.96831419073862e-06, + "loss": 1.2065, + "step": 2655 + }, + { + "epoch": 2.88, + "learning_rate": 3.6192825717464294e-06, + "loss": 1.2118, + "step": 2660 + }, + { + "epoch": 2.89, + "learning_rate": 3.2862593973670975e-06, + "loss": 1.1139, + "step": 2665 + }, + { + "epoch": 2.89, + "learning_rate": 2.969255407709648e-06, + "loss": 1.1702, + "step": 2670 + }, + { + "epoch": 2.9, + "learning_rate": 2.668280826259195e-06, + "loss": 1.1821, + "step": 2675 + }, + { + "epoch": 2.9, + "learning_rate": 2.383345359546818e-06, + "loss": 1.1826, + "step": 2680 + }, + { + "epoch": 2.91, + "learning_rate": 2.1144581968369213e-06, + "loss": 1.236, + "step": 2685 + }, + { + "epoch": 2.92, + "learning_rate": 1.861628009830696e-06, + "loss": 1.2308, + "step": 2690 + }, + { + "epoch": 2.92, + "learning_rate": 1.6248629523865077e-06, + "loss": 1.1957, + "step": 2695 + }, + { + "epoch": 2.93, + "learning_rate": 1.4041706602567206e-06, + "loss": 1.1613, + "step": 2700 + }, + { + "epoch": 2.93, + "eval_loss": 1.3419121503829956, + "eval_runtime": 10.672, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 2700 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 8.900831235538944e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-2700/training_args.bin b/PT/checkpoint-2700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-2700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-300/README.md b/PT/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-300/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-300/adapter_config.json b/PT/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-300/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-300/adapter_model.bin b/PT/checkpoint-300/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..11c9f1cbf85813f509097affa23e53a9324e6bc7 --- /dev/null +++ b/PT/checkpoint-300/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e303237c1e275debfd54a1dd25aae283bdd51395137413c07f0fbc9cc8c73a48 +size 16821197 diff --git a/PT/checkpoint-300/finetuning_args.json b/PT/checkpoint-300/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-300/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-300/optimizer.pt b/PT/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0573d9e27e4051dbc76fea4a3326290b6c6b08b --- /dev/null +++ b/PT/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b55b03e6c73222f0732899c2ee2e17447a845c4879c1e78143877c6f6c5fe6 +size 33661637 diff --git a/PT/checkpoint-300/rng_state.pth b/PT/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ed3f136da9365c104ec216f6000d244363939f83 --- /dev/null +++ b/PT/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdc5466e07a5669235c13287f136216bca1c3eea39beb7bc5ce9623c484e4faa +size 14575 diff --git a/PT/checkpoint-300/scheduler.pt b/PT/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..31d1adf1cfefc302641c15f1d5c21b0e6572dcc5 --- /dev/null +++ b/PT/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d49dd541be5be023a4d470892c88fbc252714d2438c8ff5deaa607fe72601bb +size 627 diff --git a/PT/checkpoint-300/trainer_state.json b/PT/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..76f3151acaf200b38078692a0e766161198fa930 --- /dev/null +++ b/PT/checkpoint-300/trainer_state.json @@ -0,0 +1,403 @@ +{ + "best_metric": 1.5345921516418457, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-300", + "epoch": 0.3251151449471688, + "eval_steps": 100, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 9.88981248393216e+16, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-300/training_args.bin b/PT/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-400/README.md b/PT/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-400/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-400/adapter_config.json b/PT/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-400/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-400/adapter_model.bin b/PT/checkpoint-400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b04b6852c783c5808b8486b91d41bde3ab9d2869 --- /dev/null +++ b/PT/checkpoint-400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49f22c29d38955fe8443e13eb77706be734901c9814399b9bafe02c236dcde15 +size 16821197 diff --git a/PT/checkpoint-400/finetuning_args.json b/PT/checkpoint-400/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-400/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-400/optimizer.pt b/PT/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4dd786e0242cb4faef1cf71a3b9df1974b42ce6c --- /dev/null +++ b/PT/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c963bc4f26b62c94837c5cfde55713b398db8cfeb0095ec6b863acb9086fa108 +size 33661637 diff --git a/PT/checkpoint-400/rng_state.pth b/PT/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3607c7d6ef57ef7e03de040eb22048b302ae5e5 --- /dev/null +++ b/PT/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab45b54cc08eb7282509f1aa0b9e9e1ddfe69be01ac70284fd555109f1653b9 +size 14575 diff --git a/PT/checkpoint-400/scheduler.pt b/PT/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c076922aa684afdcb471a1395b7344ee5aed2ea7 --- /dev/null +++ b/PT/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e69a40944ddb1a7575b6a8ae917bbd6340ad29d8470cf0caad364b6d73877a +size 627 diff --git a/PT/checkpoint-400/trainer_state.json b/PT/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c66b5236ad650407549ac35db31862c5d0d0e20a --- /dev/null +++ b/PT/checkpoint-400/trainer_state.json @@ -0,0 +1,531 @@ +{ + "best_metric": 1.5193477869033813, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-400", + "epoch": 0.4334868599295584, + "eval_steps": 100, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 1.318641664524288e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-400/training_args.bin b/PT/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-500/README.md b/PT/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-500/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-500/adapter_config.json b/PT/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-500/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-500/adapter_model.bin b/PT/checkpoint-500/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..fa1fd2cb26f5f9ffb44d7cf838738be6ccebdece --- /dev/null +++ b/PT/checkpoint-500/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f8c3fd75d1109627b2c6a5fa107e68424857f8e05c8bb3b3109a669b6525015 +size 16821197 diff --git a/PT/checkpoint-500/finetuning_args.json b/PT/checkpoint-500/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-500/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-500/optimizer.pt b/PT/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c752c21edd6f1bd3da8a17aaa25bdd8b0892707d --- /dev/null +++ b/PT/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:418ac3d6652ff6cadbe61753612cb5dc552e9b6d3bb7cf349f8dc4c2f148b90d +size 33661637 diff --git a/PT/checkpoint-500/rng_state.pth b/PT/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b5cc5727b9a462e93321bfb8228d5efe5bc86154 --- /dev/null +++ b/PT/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74a0ff66876a2f7f9509cd2b90baea2475bbac89323af2d42ff9483e8f5aec5d +size 14575 diff --git a/PT/checkpoint-500/scheduler.pt b/PT/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e9aebda5719944a4ebaa74e89ed214fbf4fc982 --- /dev/null +++ b/PT/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aae2c685756d33aa18d7d40ba2a49636f0aadb7740cd897ef15e1374154af70 +size 627 diff --git a/PT/checkpoint-500/trainer_state.json b/PT/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..941cf223df417fc0c5b9850a05ab267608b2e4a7 --- /dev/null +++ b/PT/checkpoint-500/trainer_state.json @@ -0,0 +1,659 @@ +{ + "best_metric": 1.5050214529037476, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-500", + "epoch": 0.541858574911948, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 1.64830208065536e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-500/training_args.bin b/PT/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-600/README.md b/PT/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-600/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-600/adapter_config.json b/PT/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-600/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-600/adapter_model.bin b/PT/checkpoint-600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..cf60c5d4dbf3686983e18fb2225b3daddeacff47 --- /dev/null +++ b/PT/checkpoint-600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a448f7b747dceb45e2a30755b8aea253f3feb4ad3513bbe9a72b8b95ff61d9da +size 16821197 diff --git a/PT/checkpoint-600/finetuning_args.json b/PT/checkpoint-600/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-600/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-600/optimizer.pt b/PT/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..92db958fe32f16e03613ca8f42de2b4b99b73b14 --- /dev/null +++ b/PT/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64e7d37bf7a066164c4297caad6c9dd308aa9468091c5297b94ac98fc88668f9 +size 33661637 diff --git a/PT/checkpoint-600/rng_state.pth b/PT/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f076abccf82ad391a6e6a1bcf4239cf20c6ff961 --- /dev/null +++ b/PT/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25276c64f8e61828ca69578e234daafb0c286e0f4c8c5f01a312d10e36e0d7e1 +size 14575 diff --git a/PT/checkpoint-600/scheduler.pt b/PT/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..69378976904300646f0f57684518621820d883d8 --- /dev/null +++ b/PT/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb4047741503b5bf7b0c6f5ca5e2a8c17869d7563d759a900c347650df47791 +size 627 diff --git a/PT/checkpoint-600/trainer_state.json b/PT/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8e65274b475fa2603c75b9c47bae1d0715a8e773 --- /dev/null +++ b/PT/checkpoint-600/trainer_state.json @@ -0,0 +1,787 @@ +{ + "best_metric": 1.486396074295044, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-600", + "epoch": 0.6502302898943376, + "eval_steps": 100, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 1.977962496786432e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-600/training_args.bin b/PT/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-700/README.md b/PT/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-700/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-700/adapter_config.json b/PT/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-700/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-700/adapter_model.bin b/PT/checkpoint-700/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..d62b65d3391b34d1014e6b96ac629e592ffbd0e8 --- /dev/null +++ b/PT/checkpoint-700/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3652297aec21637c3b67994ea110569630b2f86b8c05abfba4e56f7add07df3 +size 16821197 diff --git a/PT/checkpoint-700/finetuning_args.json b/PT/checkpoint-700/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-700/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-700/optimizer.pt b/PT/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..aa51a71191839b7607acc662740fae5a3f4bb70c --- /dev/null +++ b/PT/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc20af00f7391bc4e2e3ddda5211210637db7a5651b500349c47a6d77043850c +size 33661637 diff --git a/PT/checkpoint-700/rng_state.pth b/PT/checkpoint-700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f931644d2030673bd03126cad7c318e2952fb775 --- /dev/null +++ b/PT/checkpoint-700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c44ce74472d01ca4cf1019200614eb1aec73c6ea029a01d2dc129d57a13f39d +size 14575 diff --git a/PT/checkpoint-700/scheduler.pt b/PT/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d74781559fe15e5cdad468302ce6eaede460ee80 --- /dev/null +++ b/PT/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:546f58473d46e63dadb618618207af5204a3c2c9f82bfeabf6c643413bd8dcda +size 627 diff --git a/PT/checkpoint-700/trainer_state.json b/PT/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d8e3365599530a1a8ce76a804e9780a615004f86 --- /dev/null +++ b/PT/checkpoint-700/trainer_state.json @@ -0,0 +1,915 @@ +{ + "best_metric": 1.4729957580566406, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-700", + "epoch": 0.7586020048767271, + "eval_steps": 100, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 2.307622912917504e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-700/training_args.bin b/PT/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-800/README.md b/PT/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-800/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-800/adapter_config.json b/PT/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-800/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-800/adapter_model.bin b/PT/checkpoint-800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3a9cc2488032ca2049f9c5dc7c4c7a06a6017dfd --- /dev/null +++ b/PT/checkpoint-800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e416002daba5843e2fee6de2b34162c1eaebbbf2fb0dde86a7fadc2decb0cd61 +size 16821197 diff --git a/PT/checkpoint-800/finetuning_args.json b/PT/checkpoint-800/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-800/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-800/optimizer.pt b/PT/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..63939feccdb6128e3346350dc46fa36a826e30b9 --- /dev/null +++ b/PT/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbd8eb350798768425b1e1491351c49085d96ebf79d5f741145df7b6d45b1754 +size 33661637 diff --git a/PT/checkpoint-800/rng_state.pth b/PT/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e8b43e89f91999ef6a1b466a9fbb6407bedd9be --- /dev/null +++ b/PT/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c82274b078a41c4e61dc34ff0bf1692b2f83d20658fd7ea700d3050d60a60258 +size 14575 diff --git a/PT/checkpoint-800/scheduler.pt b/PT/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b956bfa661ea6b6b75ca1ac966cb0ebb162ed3d --- /dev/null +++ b/PT/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b044837cfbe55bfd3fca033441d39a62a5284841f5337e5dd52df0557fe49e7b +size 627 diff --git a/PT/checkpoint-800/trainer_state.json b/PT/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..563bbcc454304ad5aa8a03d727348e1d46c88c9e --- /dev/null +++ b/PT/checkpoint-800/trainer_state.json @@ -0,0 +1,1043 @@ +{ + "best_metric": 1.4600605964660645, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-800", + "epoch": 0.8669737198591168, + "eval_steps": 100, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 2.637283329048576e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-800/training_args.bin b/PT/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/checkpoint-900/README.md b/PT/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/PT/checkpoint-900/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/PT/checkpoint-900/adapter_config.json b/PT/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/PT/checkpoint-900/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/PT/checkpoint-900/adapter_model.bin b/PT/checkpoint-900/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..57964c7b97efea8e9eeb542fc9e96d3c9789e084 --- /dev/null +++ b/PT/checkpoint-900/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e957d146218565eafd2db265c58561550f5d992c453a9d35923b96b6f642c836 +size 16821197 diff --git a/PT/checkpoint-900/finetuning_args.json b/PT/checkpoint-900/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/checkpoint-900/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/checkpoint-900/optimizer.pt b/PT/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a2ba2abcd2792c21585d21237411c49d96c02001 --- /dev/null +++ b/PT/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f70bdd87be7ed99e09e1a96cbdec57ce36dbbd30afffca7891d87cb9f5341b0 +size 33661637 diff --git a/PT/checkpoint-900/rng_state.pth b/PT/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..19138d58d718f33aae208223a44d2deb95d1608d --- /dev/null +++ b/PT/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce32ab6647e5b81b645b0629a452b6c46cf3d2b735b8cfb71be07eb33ec5bec4 +size 14575 diff --git a/PT/checkpoint-900/scheduler.pt b/PT/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ef3a75c474a12e8ba4c33f49444b7cc99deb74d --- /dev/null +++ b/PT/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf05da155b68964906a32ec9d900efa42ecc975b5ef3bfd7d6cc38111dedc921 +size 627 diff --git a/PT/checkpoint-900/trainer_state.json b/PT/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9936fc340c2beb69a55993f9125dab69fede7a17 --- /dev/null +++ b/PT/checkpoint-900/trainer_state.json @@ -0,0 +1,1171 @@ +{ + "best_metric": 1.4479364156723022, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-900", + "epoch": 0.9753454348415064, + "eval_steps": 100, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 2.966943745179648e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/checkpoint-900/training_args.bin b/PT/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/PT/eval_results.json b/PT/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..682c2e0364cdd2812e9dce664e454c8715ca8e15 --- /dev/null +++ b/PT/eval_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 3.0, + "eval_loss": 1.3419121503829956, + "eval_runtime": 10.6724, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "perplexity": 3.826353077140856 +} \ No newline at end of file diff --git a/PT/finetuning_args.json b/PT/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/PT/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/PT/train_results.json b/PT/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f5307f6c24b98b2c999ee45c8ee3e2b4176929ff --- /dev/null +++ b/PT/train_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.0, + "train_loss": 1.3472231076148138, + "train_runtime": 10021.615, + "train_samples_per_second": 4.42, + "train_steps_per_second": 0.276 +} \ No newline at end of file diff --git a/PT/trainer_log.jsonl b/PT/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..41e1a08e24a916eb1d8fe30e21ff61df365c1f76 --- /dev/null +++ b/PT/trainer_log.jsonl @@ -0,0 +1,582 @@ +{"current_steps": 5, "total_steps": 2766, "loss": 2.0025, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009999919374161553, "epoch": 0.01, "percentage": 0.18, "elapsed_time": "0:00:18", "remaining_time": "2:48:35"} +{"current_steps": 10, "total_steps": 2766, "loss": 1.7737, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009999677499246417, "epoch": 0.01, "percentage": 0.36, "elapsed_time": "0:00:35", "remaining_time": "2:44:54"} +{"current_steps": 15, "total_steps": 2766, "loss": 1.7391, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009999274383055143, "epoch": 0.02, "percentage": 0.54, "elapsed_time": "0:00:53", "remaining_time": "2:43:29"} +{"current_steps": 20, "total_steps": 2766, "loss": 1.7959, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009998710038588363, "epoch": 0.02, "percentage": 0.72, "elapsed_time": "0:01:11", "remaining_time": "2:42:37"} +{"current_steps": 25, "total_steps": 2766, "loss": 1.713, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009997984484046375, "epoch": 0.03, "percentage": 0.9, "elapsed_time": "0:01:28", "remaining_time": "2:41:59"} +{"current_steps": 30, "total_steps": 2766, "loss": 1.6441, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009997097742828556, "epoch": 0.03, "percentage": 1.08, "elapsed_time": "0:01:46", "remaining_time": "2:41:28"} +{"current_steps": 35, "total_steps": 2766, "loss": 1.704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009996049843532607, "epoch": 0.04, "percentage": 1.27, "elapsed_time": "0:02:03", "remaining_time": "2:41:01"} +{"current_steps": 40, "total_steps": 2766, "loss": 1.6532, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009994840819953633, "epoch": 0.04, "percentage": 1.45, "elapsed_time": "0:02:21", "remaining_time": "2:40:36"} +{"current_steps": 45, "total_steps": 2766, "loss": 1.6791, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009993470711083048, "epoch": 0.05, "percentage": 1.63, "elapsed_time": "0:02:38", "remaining_time": "2:40:13"} +{"current_steps": 50, "total_steps": 2766, "loss": 1.6465, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009991939561107325, "epoch": 0.05, "percentage": 1.81, "elapsed_time": "0:02:56", "remaining_time": "2:39:51"} +{"current_steps": 55, "total_steps": 2766, "loss": 1.6511, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000999024741940656, "epoch": 0.06, "percentage": 1.99, "elapsed_time": "0:03:14", "remaining_time": "2:39:30"} +{"current_steps": 60, "total_steps": 2766, "loss": 1.6727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009988394340552898, "epoch": 0.07, "percentage": 2.17, "elapsed_time": "0:03:31", "remaining_time": "2:39:09"} +{"current_steps": 65, "total_steps": 2766, "loss": 1.6653, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009986380384308746, "epoch": 0.07, "percentage": 2.35, "elapsed_time": "0:03:49", "remaining_time": "2:38:49"} +{"current_steps": 70, "total_steps": 2766, "loss": 1.6339, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009984205615624873, "epoch": 0.08, "percentage": 2.53, "elapsed_time": "0:04:06", "remaining_time": "2:38:29"} +{"current_steps": 75, "total_steps": 2766, "loss": 1.5562, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009981870104638294, "epoch": 0.08, "percentage": 2.71, "elapsed_time": "0:04:24", "remaining_time": "2:38:09"} +{"current_steps": 80, "total_steps": 2766, "loss": 1.6291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009979373926670028, "epoch": 0.09, "percentage": 2.89, "elapsed_time": "0:04:42", "remaining_time": "2:37:50"} +{"current_steps": 85, "total_steps": 2766, "loss": 1.625, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009976717162222645, "epoch": 0.09, "percentage": 3.07, "elapsed_time": "0:04:59", "remaining_time": "2:37:31"} +{"current_steps": 90, "total_steps": 2766, "loss": 1.6008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009973899896977695, "epoch": 0.1, "percentage": 3.25, "elapsed_time": "0:05:17", "remaining_time": "2:37:12"} +{"current_steps": 95, "total_steps": 2766, "loss": 1.6821, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000997092222179292, "epoch": 0.1, "percentage": 3.43, "elapsed_time": "0:05:34", "remaining_time": "2:36:54"} +{"current_steps": 100, "total_steps": 2766, "loss": 1.582, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009967784232699352, "epoch": 0.11, "percentage": 3.62, "elapsed_time": "0:05:52", "remaining_time": "2:36:35"} +{"current_steps": 100, "total_steps": 2766, "loss": null, "eval_loss": 1.6186352968215942, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.11, "percentage": 3.62, "elapsed_time": "0:05:52", "remaining_time": "2:36:35"} +{"current_steps": 105, "total_steps": 2766, "loss": 1.5769, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009964486030898186, "epoch": 0.11, "percentage": 3.8, "elapsed_time": "0:06:20", "remaining_time": "2:40:48"} +{"current_steps": 110, "total_steps": 2766, "loss": 1.5868, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009961027722757538, "epoch": 0.12, "percentage": 3.98, "elapsed_time": "0:06:38", "remaining_time": "2:40:17"} +{"current_steps": 115, "total_steps": 2766, "loss": 1.5601, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009957409419809006, "epoch": 0.12, "percentage": 4.16, "elapsed_time": "0:06:55", "remaining_time": "2:39:47"} +{"current_steps": 120, "total_steps": 2766, "loss": 1.6061, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000995363123874407, "epoch": 0.13, "percentage": 4.34, "elapsed_time": "0:07:13", "remaining_time": "2:39:18"} +{"current_steps": 125, "total_steps": 2766, "loss": 1.6073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009949693301410341, "epoch": 0.14, "percentage": 4.52, "elapsed_time": "0:07:31", "remaining_time": "2:38:50"} +{"current_steps": 130, "total_steps": 2766, "loss": 1.4998, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009945595734807615, "epoch": 0.14, "percentage": 4.7, "elapsed_time": "0:07:48", "remaining_time": "2:38:22"} +{"current_steps": 135, "total_steps": 2766, "loss": 1.5295, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009941338671083794, "epoch": 0.15, "percentage": 4.88, "elapsed_time": "0:08:06", "remaining_time": "2:37:56"} +{"current_steps": 140, "total_steps": 2766, "loss": 1.5418, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009936922247530606, "epoch": 0.15, "percentage": 5.06, "elapsed_time": "0:08:23", "remaining_time": "2:37:30"} +{"current_steps": 145, "total_steps": 2766, "loss": 1.554, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009932346606579192, "epoch": 0.16, "percentage": 5.24, "elapsed_time": "0:08:41", "remaining_time": "2:37:04"} +{"current_steps": 150, "total_steps": 2766, "loss": 1.5509, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009927611895795513, "epoch": 0.16, "percentage": 5.42, "elapsed_time": "0:08:58", "remaining_time": "2:36:40"} +{"current_steps": 155, "total_steps": 2766, "loss": 1.6123, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009922718267875571, "epoch": 0.17, "percentage": 5.6, "elapsed_time": "0:09:16", "remaining_time": "2:36:15"} +{"current_steps": 160, "total_steps": 2766, "loss": 1.6267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009917665880640515, "epoch": 0.17, "percentage": 5.78, "elapsed_time": "0:09:34", "remaining_time": "2:35:51"} +{"current_steps": 165, "total_steps": 2766, "loss": 1.6116, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009912454897031524, "epoch": 0.18, "percentage": 5.97, "elapsed_time": "0:09:51", "remaining_time": "2:35:28"} +{"current_steps": 170, "total_steps": 2766, "loss": 1.5618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009907085485104568, "epoch": 0.18, "percentage": 6.15, "elapsed_time": "0:10:09", "remaining_time": "2:35:04"} +{"current_steps": 175, "total_steps": 2766, "loss": 1.6085, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009901557818024981, "epoch": 0.19, "percentage": 6.33, "elapsed_time": "0:10:26", "remaining_time": "2:34:41"} +{"current_steps": 180, "total_steps": 2766, "loss": 1.5829, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009895872074061885, "epoch": 0.2, "percentage": 6.51, "elapsed_time": "0:10:44", "remaining_time": "2:34:19"} +{"current_steps": 185, "total_steps": 2766, "loss": 1.5407, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009890028436582426, "epoch": 0.2, "percentage": 6.69, "elapsed_time": "0:11:02", "remaining_time": "2:33:57"} +{"current_steps": 190, "total_steps": 2766, "loss": 1.5568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009884027094045871, "epoch": 0.21, "percentage": 6.87, "elapsed_time": "0:11:19", "remaining_time": "2:33:34"} +{"current_steps": 195, "total_steps": 2766, "loss": 1.5831, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009877868239997532, "epoch": 0.21, "percentage": 7.05, "elapsed_time": "0:11:37", "remaining_time": "2:33:13"} +{"current_steps": 200, "total_steps": 2766, "loss": 1.5231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009871552073062516, "epoch": 0.22, "percentage": 7.23, "elapsed_time": "0:11:54", "remaining_time": "2:32:51"} +{"current_steps": 200, "total_steps": 2766, "loss": null, "eval_loss": 1.5717933177947998, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.22, "percentage": 7.23, "elapsed_time": "0:11:54", "remaining_time": "2:32:51"} +{"current_steps": 205, "total_steps": 2766, "loss": 1.5467, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009865078796939327, "epoch": 0.22, "percentage": 7.41, "elapsed_time": "0:12:23", "remaining_time": "2:34:43"} +{"current_steps": 210, "total_steps": 2766, "loss": 1.6403, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000985844862039329, "epoch": 0.23, "percentage": 7.59, "elapsed_time": "0:12:40", "remaining_time": "2:34:19"} +{"current_steps": 215, "total_steps": 2766, "loss": 1.5352, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009851661757249823, "epoch": 0.23, "percentage": 7.77, "elapsed_time": "0:12:58", "remaining_time": "2:33:54"} +{"current_steps": 220, "total_steps": 2766, "loss": 1.5616, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009844718426387537, "epoch": 0.24, "percentage": 7.95, "elapsed_time": "0:13:15", "remaining_time": "2:33:30"} +{"current_steps": 225, "total_steps": 2766, "loss": 1.5274, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000983761885173118, "epoch": 0.24, "percentage": 8.13, "elapsed_time": "0:13:33", "remaining_time": "2:33:06"} +{"current_steps": 230, "total_steps": 2766, "loss": 1.6153, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000983036326224442, "epoch": 0.25, "percentage": 8.32, "elapsed_time": "0:13:51", "remaining_time": "2:32:43"} +{"current_steps": 235, "total_steps": 2766, "loss": 1.5062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009822951891922448, "epoch": 0.25, "percentage": 8.5, "elapsed_time": "0:14:08", "remaining_time": "2:32:20"} +{"current_steps": 240, "total_steps": 2766, "loss": 1.6038, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009815384979784444, "epoch": 0.26, "percentage": 8.68, "elapsed_time": "0:14:26", "remaining_time": "2:31:57"} +{"current_steps": 245, "total_steps": 2766, "loss": 1.5097, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000980766276986586, "epoch": 0.27, "percentage": 8.86, "elapsed_time": "0:14:43", "remaining_time": "2:31:34"} +{"current_steps": 250, "total_steps": 2766, "loss": 1.535, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009799785511210557, "epoch": 0.27, "percentage": 9.04, "elapsed_time": "0:15:01", "remaining_time": "2:31:11"} +{"current_steps": 255, "total_steps": 2766, "loss": 1.52, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000979175345786277, "epoch": 0.28, "percentage": 9.22, "elapsed_time": "0:15:18", "remaining_time": "2:30:49"} +{"current_steps": 260, "total_steps": 2766, "loss": 1.5678, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009783566868858912, "epoch": 0.28, "percentage": 9.4, "elapsed_time": "0:15:36", "remaining_time": "2:30:27"} +{"current_steps": 265, "total_steps": 2766, "loss": 1.5536, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009775226008219224, "epoch": 0.29, "percentage": 9.58, "elapsed_time": "0:15:54", "remaining_time": "2:30:04"} +{"current_steps": 270, "total_steps": 2766, "loss": 1.4826, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009766731144939258, "epoch": 0.29, "percentage": 9.76, "elapsed_time": "0:16:11", "remaining_time": "2:29:43"} +{"current_steps": 275, "total_steps": 2766, "loss": 1.5537, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009758082552981204, "epoch": 0.3, "percentage": 9.94, "elapsed_time": "0:16:29", "remaining_time": "2:29:21"} +{"current_steps": 280, "total_steps": 2766, "loss": 1.5277, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009749280511265056, "epoch": 0.3, "percentage": 10.12, "elapsed_time": "0:16:46", "remaining_time": "2:28:59"} +{"current_steps": 285, "total_steps": 2766, "loss": 1.5445, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009740325303659609, "epoch": 0.31, "percentage": 10.3, "elapsed_time": "0:17:04", "remaining_time": "2:28:38"} +{"current_steps": 290, "total_steps": 2766, "loss": 1.4944, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000973121721897331, "epoch": 0.31, "percentage": 10.48, "elapsed_time": "0:17:22", "remaining_time": "2:28:17"} +{"current_steps": 295, "total_steps": 2766, "loss": 1.5088, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009721956550944948, "epoch": 0.32, "percentage": 10.67, "elapsed_time": "0:17:39", "remaining_time": "2:27:55"} +{"current_steps": 300, "total_steps": 2766, "loss": 1.585, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009712543598234172, "epoch": 0.33, "percentage": 10.85, "elapsed_time": "0:17:57", "remaining_time": "2:27:34"} +{"current_steps": 300, "total_steps": 2766, "loss": null, "eval_loss": 1.5345921516418457, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.33, "percentage": 10.85, "elapsed_time": "0:17:57", "remaining_time": "2:27:34"} +{"current_steps": 305, "total_steps": 2766, "loss": 1.5427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009702978664411863, "epoch": 0.33, "percentage": 11.03, "elapsed_time": "0:18:25", "remaining_time": "2:28:40"} +{"current_steps": 310, "total_steps": 2766, "loss": 1.4475, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009693262057950345, "epoch": 0.34, "percentage": 11.21, "elapsed_time": "0:18:43", "remaining_time": "2:28:18"} +{"current_steps": 315, "total_steps": 2766, "loss": 1.5321, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009683394092213436, "epoch": 0.34, "percentage": 11.39, "elapsed_time": "0:19:00", "remaining_time": "2:27:55"} +{"current_steps": 320, "total_steps": 2766, "loss": 1.5171, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009673375085446339, "epoch": 0.35, "percentage": 11.57, "elapsed_time": "0:19:18", "remaining_time": "2:27:33"} +{"current_steps": 325, "total_steps": 2766, "loss": 1.5198, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009663205360765382, "epoch": 0.35, "percentage": 11.75, "elapsed_time": "0:19:35", "remaining_time": "2:27:11"} +{"current_steps": 330, "total_steps": 2766, "loss": 1.492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00096528852461476, "epoch": 0.36, "percentage": 11.93, "elapsed_time": "0:19:53", "remaining_time": "2:26:49"} +{"current_steps": 335, "total_steps": 2766, "loss": 1.5036, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009642415074420146, "epoch": 0.36, "percentage": 12.11, "elapsed_time": "0:20:11", "remaining_time": "2:26:28"} +{"current_steps": 340, "total_steps": 2766, "loss": 1.5134, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009631795183249573, "epoch": 0.37, "percentage": 12.29, "elapsed_time": "0:20:28", "remaining_time": "2:26:06"} +{"current_steps": 345, "total_steps": 2766, "loss": 1.5568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009621025915130932, "epoch": 0.37, "percentage": 12.47, "elapsed_time": "0:20:46", "remaining_time": "2:25:45"} +{"current_steps": 350, "total_steps": 2766, "loss": 1.503, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009610107617376733, "epoch": 0.38, "percentage": 12.65, "elapsed_time": "0:21:03", "remaining_time": "2:25:23"} +{"current_steps": 355, "total_steps": 2766, "loss": 1.4584, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009599040642105736, "epoch": 0.38, "percentage": 12.83, "elapsed_time": "0:21:21", "remaining_time": "2:25:02"} +{"current_steps": 360, "total_steps": 2766, "loss": 1.4832, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000958782534623161, "epoch": 0.39, "percentage": 13.02, "elapsed_time": "0:21:38", "remaining_time": "2:24:41"} +{"current_steps": 365, "total_steps": 2766, "loss": 1.4598, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009576462091451406, "epoch": 0.4, "percentage": 13.2, "elapsed_time": "0:21:56", "remaining_time": "2:24:20"} +{"current_steps": 370, "total_steps": 2766, "loss": 1.5492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009564951244233901, "epoch": 0.4, "percentage": 13.38, "elapsed_time": "0:22:14", "remaining_time": "2:23:59"} +{"current_steps": 375, "total_steps": 2766, "loss": 1.5145, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000955329317580778, "epoch": 0.41, "percentage": 13.56, "elapsed_time": "0:22:31", "remaining_time": "2:23:38"} +{"current_steps": 380, "total_steps": 2766, "loss": 1.589, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009541488262149661, "epoch": 0.41, "percentage": 13.74, "elapsed_time": "0:22:49", "remaining_time": "2:23:17"} +{"current_steps": 385, "total_steps": 2766, "loss": 1.6003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009529536883971963, "epoch": 0.42, "percentage": 13.92, "elapsed_time": "0:23:06", "remaining_time": "2:22:56"} +{"current_steps": 390, "total_steps": 2766, "loss": 1.55, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009517439426710646, "epoch": 0.42, "percentage": 14.1, "elapsed_time": "0:23:24", "remaining_time": "2:22:36"} +{"current_steps": 395, "total_steps": 2766, "loss": 1.5359, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009505196280512762, "epoch": 0.43, "percentage": 14.28, "elapsed_time": "0:23:42", "remaining_time": "2:22:15"} +{"current_steps": 400, "total_steps": 2766, "loss": 1.4854, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009492807840223881, "epoch": 0.43, "percentage": 14.46, "elapsed_time": "0:23:59", "remaining_time": "2:21:55"} +{"current_steps": 400, "total_steps": 2766, "loss": null, "eval_loss": 1.5193477869033813, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.43, "percentage": 14.46, "elapsed_time": "0:23:59", "remaining_time": "2:21:55"} +{"current_steps": 405, "total_steps": 2766, "loss": 1.4891, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009480274505375358, "epoch": 0.44, "percentage": 14.64, "elapsed_time": "0:24:27", "remaining_time": "2:22:37"} +{"current_steps": 410, "total_steps": 2766, "loss": 1.4719, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009467596680171446, "epoch": 0.44, "percentage": 14.82, "elapsed_time": "0:24:45", "remaining_time": "2:22:16"} +{"current_steps": 415, "total_steps": 2766, "loss": 1.4939, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009454774773476257, "epoch": 0.45, "percentage": 15.0, "elapsed_time": "0:25:03", "remaining_time": "2:21:55"} +{"current_steps": 420, "total_steps": 2766, "loss": 1.4382, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009441809198800587, "epoch": 0.46, "percentage": 15.18, "elapsed_time": "0:25:20", "remaining_time": "2:21:34"} +{"current_steps": 425, "total_steps": 2766, "loss": 1.4427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009428700374288564, "epoch": 0.46, "percentage": 15.37, "elapsed_time": "0:25:38", "remaining_time": "2:21:13"} +{"current_steps": 430, "total_steps": 2766, "loss": 1.4767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009415448722704175, "epoch": 0.47, "percentage": 15.55, "elapsed_time": "0:25:55", "remaining_time": "2:20:52"} +{"current_steps": 435, "total_steps": 2766, "loss": 1.4799, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009402054671417628, "epoch": 0.47, "percentage": 15.73, "elapsed_time": "0:26:13", "remaining_time": "2:20:31"} +{"current_steps": 440, "total_steps": 2766, "loss": 1.4608, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009388518652391571, "epoch": 0.48, "percentage": 15.91, "elapsed_time": "0:26:31", "remaining_time": "2:20:10"} +{"current_steps": 445, "total_steps": 2766, "loss": 1.4937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009374841102167157, "epoch": 0.48, "percentage": 16.09, "elapsed_time": "0:26:48", "remaining_time": "2:19:49"} +{"current_steps": 450, "total_steps": 2766, "loss": 1.5468, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009361022461849965, "epoch": 0.49, "percentage": 16.27, "elapsed_time": "0:27:06", "remaining_time": "2:19:29"} +{"current_steps": 455, "total_steps": 2766, "loss": 1.5481, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009347063177095783, "epoch": 0.49, "percentage": 16.45, "elapsed_time": "0:27:23", "remaining_time": "2:19:08"} +{"current_steps": 460, "total_steps": 2766, "loss": 1.4478, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009332963698096223, "epoch": 0.5, "percentage": 16.63, "elapsed_time": "0:27:41", "remaining_time": "2:18:48"} +{"current_steps": 465, "total_steps": 2766, "loss": 1.4977, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009318724479564215, "epoch": 0.5, "percentage": 16.81, "elapsed_time": "0:27:58", "remaining_time": "2:18:27"} +{"current_steps": 470, "total_steps": 2766, "loss": 1.5091, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009304345980719329, "epoch": 0.51, "percentage": 16.99, "elapsed_time": "0:28:16", "remaining_time": "2:18:07"} +{"current_steps": 475, "total_steps": 2766, "loss": 1.43, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009289828665272977, "epoch": 0.51, "percentage": 17.17, "elapsed_time": "0:28:34", "remaining_time": "2:17:47"} +{"current_steps": 480, "total_steps": 2766, "loss": 1.4725, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009275173001413448, "epoch": 0.52, "percentage": 17.35, "elapsed_time": "0:28:51", "remaining_time": "2:17:27"} +{"current_steps": 485, "total_steps": 2766, "loss": 1.3741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009260379461790822, "epoch": 0.53, "percentage": 17.53, "elapsed_time": "0:29:09", "remaining_time": "2:17:06"} +{"current_steps": 490, "total_steps": 2766, "loss": 1.4917, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009245448523501708, "epoch": 0.53, "percentage": 17.72, "elapsed_time": "0:29:26", "remaining_time": "2:16:46"} +{"current_steps": 495, "total_steps": 2766, "loss": 1.4684, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009230380668073877, "epoch": 0.54, "percentage": 17.9, "elapsed_time": "0:29:44", "remaining_time": "2:16:26"} +{"current_steps": 500, "total_steps": 2766, "loss": 1.5209, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009215176381450717, "epoch": 0.54, "percentage": 18.08, "elapsed_time": "0:30:02", "remaining_time": "2:16:06"} +{"current_steps": 500, "total_steps": 2766, "loss": null, "eval_loss": 1.5050214529037476, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.54, "percentage": 18.08, "elapsed_time": "0:30:02", "remaining_time": "2:16:06"} +{"current_steps": 505, "total_steps": 2766, "loss": 1.4913, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009199836153975573, "epoch": 0.55, "percentage": 18.26, "elapsed_time": "0:30:30", "remaining_time": "2:16:34"} +{"current_steps": 510, "total_steps": 2766, "loss": 1.5377, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009184360480375926, "epoch": 0.55, "percentage": 18.44, "elapsed_time": "0:30:47", "remaining_time": "2:16:14"} +{"current_steps": 515, "total_steps": 2766, "loss": 1.4608, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009168749859747438, "epoch": 0.56, "percentage": 18.62, "elapsed_time": "0:31:05", "remaining_time": "2:15:53"} +{"current_steps": 520, "total_steps": 2766, "loss": 1.4738, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009153004795537861, "epoch": 0.56, "percentage": 18.8, "elapsed_time": "0:31:23", "remaining_time": "2:15:33"} +{"current_steps": 525, "total_steps": 2766, "loss": 1.4947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009137125795530795, "epoch": 0.57, "percentage": 18.98, "elapsed_time": "0:31:40", "remaining_time": "2:15:13"} +{"current_steps": 530, "total_steps": 2766, "loss": 1.5267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009121113371829318, "epoch": 0.57, "percentage": 19.16, "elapsed_time": "0:31:58", "remaining_time": "2:14:52"} +{"current_steps": 535, "total_steps": 2766, "loss": 1.5116, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009104968040839463, "epoch": 0.58, "percentage": 19.34, "elapsed_time": "0:32:15", "remaining_time": "2:14:32"} +{"current_steps": 540, "total_steps": 2766, "loss": 1.4423, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000908869032325357, "epoch": 0.59, "percentage": 19.52, "elapsed_time": "0:32:33", "remaining_time": "2:14:12"} +{"current_steps": 545, "total_steps": 2766, "loss": 1.4565, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000907228074403349, "epoch": 0.59, "percentage": 19.7, "elapsed_time": "0:32:50", "remaining_time": "2:13:52"} +{"current_steps": 550, "total_steps": 2766, "loss": 1.4923, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009055739832393655, "epoch": 0.6, "percentage": 19.88, "elapsed_time": "0:33:08", "remaining_time": "2:13:32"} +{"current_steps": 555, "total_steps": 2766, "loss": 1.4304, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009039068121784016, "epoch": 0.6, "percentage": 20.07, "elapsed_time": "0:33:26", "remaining_time": "2:13:12"} +{"current_steps": 560, "total_steps": 2766, "loss": 1.4422, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009022266149872829, "epoch": 0.61, "percentage": 20.25, "elapsed_time": "0:33:43", "remaining_time": "2:12:52"} +{"current_steps": 565, "total_steps": 2766, "loss": 1.522, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009005334458529322, "epoch": 0.61, "percentage": 20.43, "elapsed_time": "0:34:01", "remaining_time": "2:12:32"} +{"current_steps": 570, "total_steps": 2766, "loss": 1.499, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008988273593806222, "epoch": 0.62, "percentage": 20.61, "elapsed_time": "0:34:18", "remaining_time": "2:12:12"} +{"current_steps": 575, "total_steps": 2766, "loss": 1.4796, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008971084105922139, "epoch": 0.62, "percentage": 20.79, "elapsed_time": "0:34:36", "remaining_time": "2:11:52"} +{"current_steps": 580, "total_steps": 2766, "loss": 1.4231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008953766549243818, "epoch": 0.63, "percentage": 20.97, "elapsed_time": "0:34:54", "remaining_time": "2:11:32"} +{"current_steps": 585, "total_steps": 2766, "loss": 1.462, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008936321482268275, "epoch": 0.63, "percentage": 21.15, "elapsed_time": "0:35:11", "remaining_time": "2:11:12"} +{"current_steps": 590, "total_steps": 2766, "loss": 1.5191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008918749467604766, "epoch": 0.64, "percentage": 21.33, "elapsed_time": "0:35:29", "remaining_time": "2:10:52"} +{"current_steps": 595, "total_steps": 2766, "loss": 1.4845, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008901051071956661, "epoch": 0.64, "percentage": 21.51, "elapsed_time": "0:35:46", "remaining_time": "2:10:33"} +{"current_steps": 600, "total_steps": 2766, "loss": 1.4652, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008883226866103152, "epoch": 0.65, "percentage": 21.69, "elapsed_time": "0:36:04", "remaining_time": "2:10:13"} +{"current_steps": 600, "total_steps": 2766, "loss": null, "eval_loss": 1.486396074295044, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.65, "percentage": 21.69, "elapsed_time": "0:36:04", "remaining_time": "2:10:13"} +{"current_steps": 605, "total_steps": 2766, "loss": 1.4773, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008865277424880859, "epoch": 0.66, "percentage": 21.87, "elapsed_time": "0:36:32", "remaining_time": "2:10:32"} +{"current_steps": 610, "total_steps": 2766, "loss": 1.4555, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008847203327165278, "epoch": 0.66, "percentage": 22.05, "elapsed_time": "0:36:50", "remaining_time": "2:10:12"} +{"current_steps": 615, "total_steps": 2766, "loss": 1.5235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008829005155852125, "epoch": 0.67, "percentage": 22.23, "elapsed_time": "0:37:07", "remaining_time": "2:09:52"} +{"current_steps": 620, "total_steps": 2766, "loss": 1.4329, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008810683497838525, "epoch": 0.67, "percentage": 22.42, "elapsed_time": "0:37:25", "remaining_time": "2:09:32"} +{"current_steps": 625, "total_steps": 2766, "loss": 1.4515, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008792238944004096, "epoch": 0.68, "percentage": 22.6, "elapsed_time": "0:37:43", "remaining_time": "2:09:12"} +{"current_steps": 630, "total_steps": 2766, "loss": 1.4616, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008773672089191885, "epoch": 0.68, "percentage": 22.78, "elapsed_time": "0:38:00", "remaining_time": "2:08:52"} +{"current_steps": 635, "total_steps": 2766, "loss": 1.3931, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008754983532189185, "epoch": 0.69, "percentage": 22.96, "elapsed_time": "0:38:18", "remaining_time": "2:08:32"} +{"current_steps": 640, "total_steps": 2766, "loss": 1.4714, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008736173875708229, "epoch": 0.69, "percentage": 23.14, "elapsed_time": "0:38:35", "remaining_time": "2:08:12"} +{"current_steps": 645, "total_steps": 2766, "loss": 1.4831, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008717243726366746, "epoch": 0.7, "percentage": 23.32, "elapsed_time": "0:38:53", "remaining_time": "2:07:52"} +{"current_steps": 650, "total_steps": 2766, "loss": 1.4928, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00086981936946684, "epoch": 0.7, "percentage": 23.5, "elapsed_time": "0:39:10", "remaining_time": "2:07:33"} +{"current_steps": 655, "total_steps": 2766, "loss": 1.3735, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008679024394983105, "epoch": 0.71, "percentage": 23.68, "elapsed_time": "0:39:28", "remaining_time": "2:07:13"} +{"current_steps": 660, "total_steps": 2766, "loss": 1.4587, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008659736445527202, "epoch": 0.72, "percentage": 23.86, "elapsed_time": "0:39:46", "remaining_time": "2:06:53"} +{"current_steps": 665, "total_steps": 2766, "loss": 1.5138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008640330468343532, "epoch": 0.72, "percentage": 24.04, "elapsed_time": "0:40:03", "remaining_time": "2:06:34"} +{"current_steps": 670, "total_steps": 2766, "loss": 1.4625, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008620807089281364, "epoch": 0.73, "percentage": 24.22, "elapsed_time": "0:40:21", "remaining_time": "2:06:14"} +{"current_steps": 675, "total_steps": 2766, "loss": 1.4173, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008601166937976226, "epoch": 0.73, "percentage": 24.4, "elapsed_time": "0:40:38", "remaining_time": "2:05:55"} +{"current_steps": 680, "total_steps": 2766, "loss": 1.4901, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000858141064782958, "epoch": 0.74, "percentage": 24.58, "elapsed_time": "0:40:56", "remaining_time": "2:05:35"} +{"current_steps": 685, "total_steps": 2766, "loss": 1.4056, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008561538855988409, "epoch": 0.74, "percentage": 24.77, "elapsed_time": "0:41:14", "remaining_time": "2:05:16"} +{"current_steps": 690, "total_steps": 2766, "loss": 1.4486, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008541552203324667, "epoch": 0.75, "percentage": 24.95, "elapsed_time": "0:41:31", "remaining_time": "2:04:56"} +{"current_steps": 695, "total_steps": 2766, "loss": 1.4147, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008521451334414605, "epoch": 0.75, "percentage": 25.13, "elapsed_time": "0:41:49", "remaining_time": "2:04:37"} +{"current_steps": 700, "total_steps": 2766, "loss": 1.4547, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008501236897517987, "epoch": 0.76, "percentage": 25.31, "elapsed_time": "0:42:06", "remaining_time": "2:04:17"} +{"current_steps": 700, "total_steps": 2766, "loss": null, "eval_loss": 1.4729957580566406, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.76, "percentage": 25.31, "elapsed_time": "0:42:06", "remaining_time": "2:04:17"} +{"current_steps": 705, "total_steps": 2766, "loss": 1.4464, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000848090954455718, "epoch": 0.76, "percentage": 25.49, "elapsed_time": "0:42:35", "remaining_time": "2:04:29"} +{"current_steps": 710, "total_steps": 2766, "loss": 1.4163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008460469931096138, "epoch": 0.77, "percentage": 25.67, "elapsed_time": "0:42:52", "remaining_time": "2:04:09"} +{"current_steps": 715, "total_steps": 2766, "loss": 1.5283, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008439918716319246, "epoch": 0.77, "percentage": 25.85, "elapsed_time": "0:43:10", "remaining_time": "2:03:50"} +{"current_steps": 720, "total_steps": 2766, "loss": 1.4313, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008419256563010076, "epoch": 0.78, "percentage": 26.03, "elapsed_time": "0:43:27", "remaining_time": "2:03:30"} +{"current_steps": 725, "total_steps": 2766, "loss": 1.3995, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000839848413753, "epoch": 0.79, "percentage": 26.21, "elapsed_time": "0:43:45", "remaining_time": "2:03:10"} +{"current_steps": 730, "total_steps": 2766, "loss": 1.4265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008377602109796709, "epoch": 0.79, "percentage": 26.39, "elapsed_time": "0:44:02", "remaining_time": "2:02:51"} +{"current_steps": 735, "total_steps": 2766, "loss": 1.4426, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008356611153262598, "epoch": 0.8, "percentage": 26.57, "elapsed_time": "0:44:20", "remaining_time": "2:02:31"} +{"current_steps": 740, "total_steps": 2766, "loss": 1.4251, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008335511944893057, "epoch": 0.8, "percentage": 26.75, "elapsed_time": "0:44:38", "remaining_time": "2:02:12"} +{"current_steps": 745, "total_steps": 2766, "loss": 1.4686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008314305165144633, "epoch": 0.81, "percentage": 26.93, "elapsed_time": "0:44:55", "remaining_time": "2:01:52"} +{"current_steps": 750, "total_steps": 2766, "loss": 1.4658, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008292991497943081, "epoch": 0.81, "percentage": 27.11, "elapsed_time": "0:45:13", "remaining_time": "2:01:33"} +{"current_steps": 755, "total_steps": 2766, "loss": 1.4347, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008271571630661321, "epoch": 0.82, "percentage": 27.3, "elapsed_time": "0:45:30", "remaining_time": "2:01:13"} +{"current_steps": 760, "total_steps": 2766, "loss": 1.4235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008250046254097255, "epoch": 0.82, "percentage": 27.48, "elapsed_time": "0:45:48", "remaining_time": "2:00:54"} +{"current_steps": 765, "total_steps": 2766, "loss": 1.5047, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008228416062451494, "epoch": 0.83, "percentage": 27.66, "elapsed_time": "0:46:06", "remaining_time": "2:00:35"} +{"current_steps": 770, "total_steps": 2766, "loss": 1.445, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008206681753304976, "epoch": 0.83, "percentage": 27.84, "elapsed_time": "0:46:23", "remaining_time": "2:00:15"} +{"current_steps": 775, "total_steps": 2766, "loss": 1.4077, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008184844027596461, "epoch": 0.84, "percentage": 28.02, "elapsed_time": "0:46:41", "remaining_time": "1:59:56"} +{"current_steps": 780, "total_steps": 2766, "loss": 1.5057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008162903589599924, "epoch": 0.85, "percentage": 28.2, "elapsed_time": "0:46:58", "remaining_time": "1:59:37"} +{"current_steps": 785, "total_steps": 2766, "loss": 1.4445, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008140861146901849, "epoch": 0.85, "percentage": 28.38, "elapsed_time": "0:47:16", "remaining_time": "1:59:17"} +{"current_steps": 790, "total_steps": 2766, "loss": 1.5333, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008118717410378407, "epoch": 0.86, "percentage": 28.56, "elapsed_time": "0:47:33", "remaining_time": "1:58:58"} +{"current_steps": 795, "total_steps": 2766, "loss": 1.3786, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008096473094172527, "epoch": 0.86, "percentage": 28.74, "elapsed_time": "0:47:51", "remaining_time": "1:58:39"} +{"current_steps": 800, "total_steps": 2766, "loss": 1.3781, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008074128915670868, "epoch": 0.87, "percentage": 28.92, "elapsed_time": "0:48:09", "remaining_time": "1:58:20"} +{"current_steps": 800, "total_steps": 2766, "loss": null, "eval_loss": 1.4600605964660645, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.87, "percentage": 28.92, "elapsed_time": "0:48:09", "remaining_time": "1:58:20"} +{"current_steps": 805, "total_steps": 2766, "loss": 1.5097, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008051685595480678, "epoch": 0.87, "percentage": 29.1, "elapsed_time": "0:48:37", "remaining_time": "1:58:26"} +{"current_steps": 810, "total_steps": 2766, "loss": 1.5608, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008029143857406563, "epoch": 0.88, "percentage": 29.28, "elapsed_time": "0:48:55", "remaining_time": "1:58:07"} +{"current_steps": 815, "total_steps": 2766, "loss": 1.4113, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008006504428427133, "epoch": 0.88, "percentage": 29.46, "elapsed_time": "0:49:12", "remaining_time": "1:57:48"} +{"current_steps": 820, "total_steps": 2766, "loss": 1.3781, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007983768038671568, "epoch": 0.89, "percentage": 29.65, "elapsed_time": "0:49:30", "remaining_time": "1:57:28"} +{"current_steps": 825, "total_steps": 2766, "loss": 1.4056, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007960935421396062, "epoch": 0.89, "percentage": 29.83, "elapsed_time": "0:49:47", "remaining_time": "1:57:09"} +{"current_steps": 830, "total_steps": 2766, "loss": 1.4463, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007938007312960178, "epoch": 0.9, "percentage": 30.01, "elapsed_time": "0:50:05", "remaining_time": "1:56:50"} +{"current_steps": 835, "total_steps": 2766, "loss": 1.3983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007914984452803105, "epoch": 0.9, "percentage": 30.19, "elapsed_time": "0:50:22", "remaining_time": "1:56:30"} +{"current_steps": 840, "total_steps": 2766, "loss": 1.3968, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007891867583419805, "epoch": 0.91, "percentage": 30.37, "elapsed_time": "0:50:40", "remaining_time": "1:56:11"} +{"current_steps": 845, "total_steps": 2766, "loss": 1.4587, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007868657450337066, "epoch": 0.92, "percentage": 30.55, "elapsed_time": "0:50:58", "remaining_time": "1:55:52"} +{"current_steps": 850, "total_steps": 2766, "loss": 1.4654, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007845354802089463, "epoch": 0.92, "percentage": 30.73, "elapsed_time": "0:51:15", "remaining_time": "1:55:32"} +{"current_steps": 855, "total_steps": 2766, "loss": 1.4384, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007821960390195224, "epoch": 0.93, "percentage": 30.91, "elapsed_time": "0:51:33", "remaining_time": "1:55:13"} +{"current_steps": 860, "total_steps": 2766, "loss": 1.44, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007798474969131971, "epoch": 0.93, "percentage": 31.09, "elapsed_time": "0:51:50", "remaining_time": "1:54:54"} +{"current_steps": 865, "total_steps": 2766, "loss": 1.4221, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007774899296312414, "epoch": 0.94, "percentage": 31.27, "elapsed_time": "0:52:08", "remaining_time": "1:54:35"} +{"current_steps": 870, "total_steps": 2766, "loss": 1.3795, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007751234132059906, "epoch": 0.94, "percentage": 31.45, "elapsed_time": "0:52:26", "remaining_time": "1:54:16"} +{"current_steps": 875, "total_steps": 2766, "loss": 1.4748, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007727480239583933, "epoch": 0.95, "percentage": 31.63, "elapsed_time": "0:52:43", "remaining_time": "1:53:57"} +{"current_steps": 880, "total_steps": 2766, "loss": 1.5171, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007703638384955494, "epoch": 0.95, "percentage": 31.81, "elapsed_time": "0:53:01", "remaining_time": "1:53:37"} +{"current_steps": 885, "total_steps": 2766, "loss": 1.3996, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007679709337082394, "epoch": 0.96, "percentage": 32.0, "elapsed_time": "0:53:18", "remaining_time": "1:53:18"} +{"current_steps": 890, "total_steps": 2766, "loss": 1.4386, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007655693867684454, "epoch": 0.96, "percentage": 32.18, "elapsed_time": "0:53:36", "remaining_time": "1:52:59"} +{"current_steps": 895, "total_steps": 2766, "loss": 1.3789, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007631592751268618, "epoch": 0.97, "percentage": 32.36, "elapsed_time": "0:53:53", "remaining_time": "1:52:40"} +{"current_steps": 900, "total_steps": 2766, "loss": 1.4553, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007607406765103972, "epoch": 0.98, "percentage": 32.54, "elapsed_time": "0:54:11", "remaining_time": "1:52:21"} +{"current_steps": 900, "total_steps": 2766, "loss": null, "eval_loss": 1.4479364156723022, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.98, "percentage": 32.54, "elapsed_time": "0:54:11", "remaining_time": "1:52:21"} +{"current_steps": 905, "total_steps": 2766, "loss": 1.3962, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000758313668919668, "epoch": 0.98, "percentage": 32.72, "elapsed_time": "0:54:39", "remaining_time": "1:52:24"} +{"current_steps": 910, "total_steps": 2766, "loss": 1.3899, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000755878330626483, "epoch": 0.99, "percentage": 32.9, "elapsed_time": "0:54:57", "remaining_time": "1:52:05"} +{"current_steps": 915, "total_steps": 2766, "loss": 1.3965, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007534347401713191, "epoch": 0.99, "percentage": 33.08, "elapsed_time": "0:55:14", "remaining_time": "1:51:46"} +{"current_steps": 920, "total_steps": 2766, "loss": 1.367, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007509829763607879, "epoch": 1.0, "percentage": 33.26, "elapsed_time": "0:55:32", "remaining_time": "1:51:26"} +{"current_steps": 925, "total_steps": 2766, "loss": 1.4027, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007485231182650945, "epoch": 1.0, "percentage": 33.44, "elapsed_time": "0:55:50", "remaining_time": "1:51:07"} +{"current_steps": 930, "total_steps": 2766, "loss": 1.3563, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007460552452154877, "epoch": 1.01, "percentage": 33.62, "elapsed_time": "0:56:07", "remaining_time": "1:50:48"} +{"current_steps": 935, "total_steps": 2766, "loss": 1.3192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007435794368017007, "epoch": 1.01, "percentage": 33.8, "elapsed_time": "0:56:25", "remaining_time": "1:50:29"} +{"current_steps": 940, "total_steps": 2766, "loss": 1.2772, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007410957728693856, "epoch": 1.02, "percentage": 33.98, "elapsed_time": "0:56:42", "remaining_time": "1:50:10"} +{"current_steps": 945, "total_steps": 2766, "loss": 1.3291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007386043335175367, "epoch": 1.02, "percentage": 34.16, "elapsed_time": "0:57:00", "remaining_time": "1:49:51"} +{"current_steps": 950, "total_steps": 2766, "loss": 1.304, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000736105199095909, "epoch": 1.03, "percentage": 34.35, "elapsed_time": "0:57:18", "remaining_time": "1:49:32"} +{"current_steps": 955, "total_steps": 2766, "loss": 1.3832, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007335984502024256, "epoch": 1.03, "percentage": 34.53, "elapsed_time": "0:57:35", "remaining_time": "1:49:13"} +{"current_steps": 960, "total_steps": 2766, "loss": 1.3351, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007310841676805791, "epoch": 1.04, "percentage": 34.71, "elapsed_time": "0:57:53", "remaining_time": "1:48:54"} +{"current_steps": 965, "total_steps": 2766, "loss": 1.3375, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000728562432616824, "epoch": 1.05, "percentage": 34.89, "elapsed_time": "0:58:10", "remaining_time": "1:48:34"} +{"current_steps": 970, "total_steps": 2766, "loss": 1.3323, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007260333263379619, "epoch": 1.05, "percentage": 35.07, "elapsed_time": "0:58:28", "remaining_time": "1:48:15"} +{"current_steps": 975, "total_steps": 2766, "loss": 1.3293, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007234969304085186, "epoch": 1.06, "percentage": 35.25, "elapsed_time": "0:58:45", "remaining_time": "1:47:56"} +{"current_steps": 980, "total_steps": 2766, "loss": 1.3859, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007209533266281133, "epoch": 1.06, "percentage": 35.43, "elapsed_time": "0:59:03", "remaining_time": "1:47:37"} +{"current_steps": 985, "total_steps": 2766, "loss": 1.3553, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007184025970288211, "epoch": 1.07, "percentage": 35.61, "elapsed_time": "0:59:21", "remaining_time": "1:47:18"} +{"current_steps": 990, "total_steps": 2766, "loss": 1.3607, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000715844823872527, "epoch": 1.07, "percentage": 35.79, "elapsed_time": "0:59:38", "remaining_time": "1:47:00"} +{"current_steps": 995, "total_steps": 2766, "loss": 1.3457, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007132800896482731, "epoch": 1.08, "percentage": 35.97, "elapsed_time": "0:59:56", "remaining_time": "1:46:41"} +{"current_steps": 1000, "total_steps": 2766, "loss": 1.3788, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007107084770695986, "epoch": 1.08, "percentage": 36.15, "elapsed_time": "1:00:13", "remaining_time": "1:46:22"} +{"current_steps": 1000, "total_steps": 2766, "loss": null, "eval_loss": 1.4371482133865356, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.08, "percentage": 36.15, "elapsed_time": "1:00:13", "remaining_time": "1:46:22"} +{"current_steps": 1005, "total_steps": 2766, "loss": 1.3039, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007081300690718709, "epoch": 1.09, "percentage": 36.33, "elapsed_time": "1:00:42", "remaining_time": "1:46:21"} +{"current_steps": 1010, "total_steps": 2766, "loss": 1.2719, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007055449488096132, "epoch": 1.09, "percentage": 36.51, "elapsed_time": "1:00:59", "remaining_time": "1:46:02"} +{"current_steps": 1015, "total_steps": 2766, "loss": 1.4107, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007029531996538212, "epoch": 1.1, "percentage": 36.7, "elapsed_time": "1:01:17", "remaining_time": "1:45:43"} +{"current_steps": 1020, "total_steps": 2766, "loss": 1.38, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007003549051892738, "epoch": 1.11, "percentage": 36.88, "elapsed_time": "1:01:34", "remaining_time": "1:45:24"} +{"current_steps": 1025, "total_steps": 2766, "loss": 1.3408, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006977501492118391, "epoch": 1.11, "percentage": 37.06, "elapsed_time": "1:01:52", "remaining_time": "1:45:05"} +{"current_steps": 1030, "total_steps": 2766, "loss": 1.3704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006951390157257712, "epoch": 1.12, "percentage": 37.24, "elapsed_time": "1:02:10", "remaining_time": "1:44:46"} +{"current_steps": 1035, "total_steps": 2766, "loss": 1.345, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006925215889410004, "epoch": 1.12, "percentage": 37.42, "elapsed_time": "1:02:27", "remaining_time": "1:44:27"} +{"current_steps": 1040, "total_steps": 2766, "loss": 1.3414, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006898979532704186, "epoch": 1.13, "percentage": 37.6, "elapsed_time": "1:02:45", "remaining_time": "1:44:08"} +{"current_steps": 1045, "total_steps": 2766, "loss": 1.3131, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006872681933271559, "epoch": 1.13, "percentage": 37.78, "elapsed_time": "1:03:02", "remaining_time": "1:43:49"} +{"current_steps": 1050, "total_steps": 2766, "loss": 1.3363, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006846323939218526, "epoch": 1.14, "percentage": 37.96, "elapsed_time": "1:03:20", "remaining_time": "1:43:31"} +{"current_steps": 1055, "total_steps": 2766, "loss": 1.3659, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006819906400599234, "epoch": 1.14, "percentage": 38.14, "elapsed_time": "1:03:38", "remaining_time": "1:43:12"} +{"current_steps": 1060, "total_steps": 2766, "loss": 1.3145, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006793430169388163, "epoch": 1.15, "percentage": 38.32, "elapsed_time": "1:03:55", "remaining_time": "1:42:53"} +{"current_steps": 1065, "total_steps": 2766, "loss": 1.3727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006766896099452652, "epoch": 1.15, "percentage": 38.5, "elapsed_time": "1:04:13", "remaining_time": "1:42:34"} +{"current_steps": 1070, "total_steps": 2766, "loss": 1.3478, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006740305046525351, "epoch": 1.16, "percentage": 38.68, "elapsed_time": "1:04:30", "remaining_time": "1:42:15"} +{"current_steps": 1075, "total_steps": 2766, "loss": 1.3848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006713657868176639, "epoch": 1.16, "percentage": 38.86, "elapsed_time": "1:04:48", "remaining_time": "1:41:56"} +{"current_steps": 1080, "total_steps": 2766, "loss": 1.3501, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006686955423786951, "epoch": 1.17, "percentage": 39.05, "elapsed_time": "1:05:05", "remaining_time": "1:41:37"} +{"current_steps": 1085, "total_steps": 2766, "loss": 1.3782, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006660198574519078, "epoch": 1.18, "percentage": 39.23, "elapsed_time": "1:05:23", "remaining_time": "1:41:18"} +{"current_steps": 1090, "total_steps": 2766, "loss": 1.3767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000663338818329038, "epoch": 1.18, "percentage": 39.41, "elapsed_time": "1:05:41", "remaining_time": "1:40:59"} +{"current_steps": 1095, "total_steps": 2766, "loss": 1.3665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006606525114744965, "epoch": 1.19, "percentage": 39.59, "elapsed_time": "1:05:58", "remaining_time": "1:40:41"} +{"current_steps": 1100, "total_steps": 2766, "loss": 1.2234, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006579610235225805, "epoch": 1.19, "percentage": 39.77, "elapsed_time": "1:06:16", "remaining_time": "1:40:22"} +{"current_steps": 1100, "total_steps": 2766, "loss": null, "eval_loss": 1.4341663122177124, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.19, "percentage": 39.77, "elapsed_time": "1:06:16", "remaining_time": "1:40:22"} +{"current_steps": 1105, "total_steps": 2766, "loss": 1.4083, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006552644412746791, "epoch": 1.2, "percentage": 39.95, "elapsed_time": "1:06:44", "remaining_time": "1:40:19"} +{"current_steps": 1110, "total_steps": 2766, "loss": 1.4225, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006525628516964741, "epoch": 1.2, "percentage": 40.13, "elapsed_time": "1:07:02", "remaining_time": "1:40:00"} +{"current_steps": 1115, "total_steps": 2766, "loss": 1.3677, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006498563419151354, "epoch": 1.21, "percentage": 40.31, "elapsed_time": "1:07:19", "remaining_time": "1:39:41"} +{"current_steps": 1120, "total_steps": 2766, "loss": 1.2836, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006471449992165113, "epoch": 1.21, "percentage": 40.49, "elapsed_time": "1:07:37", "remaining_time": "1:39:22"} +{"current_steps": 1125, "total_steps": 2766, "loss": 1.3428, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006444289110423129, "epoch": 1.22, "percentage": 40.67, "elapsed_time": "1:07:54", "remaining_time": "1:39:03"} +{"current_steps": 1130, "total_steps": 2766, "loss": 1.3192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006417081649872952, "epoch": 1.22, "percentage": 40.85, "elapsed_time": "1:08:12", "remaining_time": "1:38:45"} +{"current_steps": 1135, "total_steps": 2766, "loss": 1.3084, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006389828487964305, "epoch": 1.23, "percentage": 41.03, "elapsed_time": "1:08:30", "remaining_time": "1:38:26"} +{"current_steps": 1140, "total_steps": 2766, "loss": 1.3702, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00063625305036208, "epoch": 1.24, "percentage": 41.21, "elapsed_time": "1:08:47", "remaining_time": "1:38:07"} +{"current_steps": 1145, "total_steps": 2766, "loss": 1.3054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000633518857721159, "epoch": 1.24, "percentage": 41.4, "elapsed_time": "1:09:05", "remaining_time": "1:37:48"} +{"current_steps": 1150, "total_steps": 2766, "loss": 1.3211, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006307803590522972, "epoch": 1.25, "percentage": 41.58, "elapsed_time": "1:09:22", "remaining_time": "1:37:29"} +{"current_steps": 1155, "total_steps": 2766, "loss": 1.3319, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006280376426729947, "epoch": 1.25, "percentage": 41.76, "elapsed_time": "1:09:40", "remaining_time": "1:37:10"} +{"current_steps": 1160, "total_steps": 2766, "loss": 1.4346, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006252907970367749, "epoch": 1.26, "percentage": 41.94, "elapsed_time": "1:09:57", "remaining_time": "1:36:52"} +{"current_steps": 1165, "total_steps": 2766, "loss": 1.3938, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006225399107303309, "epoch": 1.26, "percentage": 42.12, "elapsed_time": "1:10:15", "remaining_time": "1:36:33"} +{"current_steps": 1170, "total_steps": 2766, "loss": 1.4371, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006197850724706682, "epoch": 1.27, "percentage": 42.3, "elapsed_time": "1:10:33", "remaining_time": "1:36:14"} +{"current_steps": 1175, "total_steps": 2766, "loss": 1.2925, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006170263711022451, "epoch": 1.27, "percentage": 42.48, "elapsed_time": "1:10:50", "remaining_time": "1:35:55"} +{"current_steps": 1180, "total_steps": 2766, "loss": 1.3135, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006142638955941057, "epoch": 1.28, "percentage": 42.66, "elapsed_time": "1:11:08", "remaining_time": "1:35:36"} +{"current_steps": 1185, "total_steps": 2766, "loss": 1.3572, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006114977350370114, "epoch": 1.28, "percentage": 42.84, "elapsed_time": "1:11:25", "remaining_time": "1:35:18"} +{"current_steps": 1190, "total_steps": 2766, "loss": 1.3918, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006087279786405684, "epoch": 1.29, "percentage": 43.02, "elapsed_time": "1:11:43", "remaining_time": "1:34:59"} +{"current_steps": 1195, "total_steps": 2766, "loss": 1.3732, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006059547157303491, "epoch": 1.3, "percentage": 43.2, "elapsed_time": "1:12:01", "remaining_time": "1:34:40"} +{"current_steps": 1200, "total_steps": 2766, "loss": 1.3541, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006031780357450124, "epoch": 1.3, "percentage": 43.38, "elapsed_time": "1:12:18", "remaining_time": "1:34:21"} +{"current_steps": 1200, "total_steps": 2766, "loss": null, "eval_loss": 1.4208500385284424, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.3, "percentage": 43.38, "elapsed_time": "1:12:18", "remaining_time": "1:34:21"} +{"current_steps": 1205, "total_steps": 2766, "loss": 1.2997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006003980282334191, "epoch": 1.31, "percentage": 43.56, "elapsed_time": "1:12:46", "remaining_time": "1:34:17"} +{"current_steps": 1210, "total_steps": 2766, "loss": 1.2832, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005976147828517439, "epoch": 1.31, "percentage": 43.75, "elapsed_time": "1:13:04", "remaining_time": "1:33:58"} +{"current_steps": 1215, "total_steps": 2766, "loss": 1.3863, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005948283893605839, "epoch": 1.32, "percentage": 43.93, "elapsed_time": "1:13:22", "remaining_time": "1:33:39"} +{"current_steps": 1220, "total_steps": 2766, "loss": 1.3599, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005920389376220633, "epoch": 1.32, "percentage": 44.11, "elapsed_time": "1:13:39", "remaining_time": "1:33:20"} +{"current_steps": 1225, "total_steps": 2766, "loss": 1.3085, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005892465175969366, "epoch": 1.33, "percentage": 44.29, "elapsed_time": "1:13:57", "remaining_time": "1:33:01"} +{"current_steps": 1230, "total_steps": 2766, "loss": 1.3355, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000586451219341686, "epoch": 1.33, "percentage": 44.47, "elapsed_time": "1:14:14", "remaining_time": "1:32:43"} +{"current_steps": 1235, "total_steps": 2766, "loss": 1.291, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005836531330056176, "epoch": 1.34, "percentage": 44.65, "elapsed_time": "1:14:32", "remaining_time": "1:32:24"} +{"current_steps": 1240, "total_steps": 2766, "loss": 1.3286, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005808523488279542, "epoch": 1.34, "percentage": 44.83, "elapsed_time": "1:14:50", "remaining_time": "1:32:05"} +{"current_steps": 1245, "total_steps": 2766, "loss": 1.3704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005780489571349249, "epoch": 1.35, "percentage": 45.01, "elapsed_time": "1:15:07", "remaining_time": "1:31:46"} +{"current_steps": 1250, "total_steps": 2766, "loss": 1.3263, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000575243048336852, "epoch": 1.35, "percentage": 45.19, "elapsed_time": "1:15:25", "remaining_time": "1:31:28"} +{"current_steps": 1255, "total_steps": 2766, "loss": 1.3357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005724347129252354, "epoch": 1.36, "percentage": 45.37, "elapsed_time": "1:15:42", "remaining_time": "1:31:09"} +{"current_steps": 1260, "total_steps": 2766, "loss": 1.3665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005696240414698337, "epoch": 1.37, "percentage": 45.55, "elapsed_time": "1:16:00", "remaining_time": "1:30:50"} +{"current_steps": 1265, "total_steps": 2766, "loss": 1.2568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005668111246157441, "epoch": 1.37, "percentage": 45.73, "elapsed_time": "1:16:17", "remaining_time": "1:30:31"} +{"current_steps": 1270, "total_steps": 2766, "loss": 1.3212, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005639960530804787, "epoch": 1.38, "percentage": 45.91, "elapsed_time": "1:16:35", "remaining_time": "1:30:13"} +{"current_steps": 1275, "total_steps": 2766, "loss": 1.3358, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005611789176510384, "epoch": 1.38, "percentage": 46.1, "elapsed_time": "1:16:53", "remaining_time": "1:29:54"} +{"current_steps": 1280, "total_steps": 2766, "loss": 1.3618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005583598091809859, "epoch": 1.39, "percentage": 46.28, "elapsed_time": "1:17:10", "remaining_time": "1:29:35"} +{"current_steps": 1285, "total_steps": 2766, "loss": 1.3273, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005555388185875146, "epoch": 1.39, "percentage": 46.46, "elapsed_time": "1:17:28", "remaining_time": "1:29:17"} +{"current_steps": 1290, "total_steps": 2766, "loss": 1.284, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005527160368485172, "epoch": 1.4, "percentage": 46.64, "elapsed_time": "1:17:45", "remaining_time": "1:28:58"} +{"current_steps": 1295, "total_steps": 2766, "loss": 1.3665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005498915549996516, "epoch": 1.4, "percentage": 46.82, "elapsed_time": "1:18:03", "remaining_time": "1:28:39"} +{"current_steps": 1300, "total_steps": 2766, "loss": 1.2796, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005470654641314045, "epoch": 1.41, "percentage": 47.0, "elapsed_time": "1:18:21", "remaining_time": "1:28:21"} +{"current_steps": 1300, "total_steps": 2766, "loss": null, "eval_loss": 1.4054052829742432, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.41, "percentage": 47.0, "elapsed_time": "1:18:21", "remaining_time": "1:28:21"} +{"current_steps": 1305, "total_steps": 2766, "loss": 1.3107, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005442378553861545, "epoch": 1.41, "percentage": 47.18, "elapsed_time": "1:18:49", "remaining_time": "1:28:14"} +{"current_steps": 1310, "total_steps": 2766, "loss": 1.3665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005414088199552319, "epoch": 1.42, "percentage": 47.36, "elapsed_time": "1:19:06", "remaining_time": "1:27:55"} +{"current_steps": 1315, "total_steps": 2766, "loss": 1.3326, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000538578449075978, "epoch": 1.43, "percentage": 47.54, "elapsed_time": "1:19:24", "remaining_time": "1:27:37"} +{"current_steps": 1320, "total_steps": 2766, "loss": 1.3383, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005357468340288031, "epoch": 1.43, "percentage": 47.72, "elapsed_time": "1:19:42", "remaining_time": "1:27:18"} +{"current_steps": 1325, "total_steps": 2766, "loss": 1.336, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000532914066134242, "epoch": 1.44, "percentage": 47.9, "elapsed_time": "1:19:59", "remaining_time": "1:26:59"} +{"current_steps": 1330, "total_steps": 2766, "loss": 1.3949, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005300802367500093, "epoch": 1.44, "percentage": 48.08, "elapsed_time": "1:20:17", "remaining_time": "1:26:41"} +{"current_steps": 1335, "total_steps": 2766, "loss": 1.3214, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005272454372680532, "epoch": 1.45, "percentage": 48.26, "elapsed_time": "1:20:34", "remaining_time": "1:26:22"} +{"current_steps": 1340, "total_steps": 2766, "loss": 1.376, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005244097591116077, "epoch": 1.45, "percentage": 48.45, "elapsed_time": "1:20:52", "remaining_time": "1:26:03"} +{"current_steps": 1345, "total_steps": 2766, "loss": 1.2345, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005215732937322439, "epoch": 1.46, "percentage": 48.63, "elapsed_time": "1:21:09", "remaining_time": "1:25:45"} +{"current_steps": 1350, "total_steps": 2766, "loss": 1.4495, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005187361326069224, "epoch": 1.46, "percentage": 48.81, "elapsed_time": "1:21:27", "remaining_time": "1:25:26"} +{"current_steps": 1355, "total_steps": 2766, "loss": 1.3978, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005158983672350405, "epoch": 1.47, "percentage": 48.99, "elapsed_time": "1:21:45", "remaining_time": "1:25:07"} +{"current_steps": 1360, "total_steps": 2766, "loss": 1.2517, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005130600891354833, "epoch": 1.47, "percentage": 49.17, "elapsed_time": "1:22:02", "remaining_time": "1:24:49"} +{"current_steps": 1365, "total_steps": 2766, "loss": 1.3823, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005102213898436715, "epoch": 1.48, "percentage": 49.35, "elapsed_time": "1:22:20", "remaining_time": "1:24:30"} +{"current_steps": 1370, "total_steps": 2766, "loss": 1.3219, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005073823609086091, "epoch": 1.48, "percentage": 49.53, "elapsed_time": "1:22:37", "remaining_time": "1:24:11"} +{"current_steps": 1375, "total_steps": 2766, "loss": 1.3354, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005045430938899315, "epoch": 1.49, "percentage": 49.71, "elapsed_time": "1:22:55", "remaining_time": "1:23:53"} +{"current_steps": 1380, "total_steps": 2766, "loss": 1.3054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005017036803549523, "epoch": 1.5, "percentage": 49.89, "elapsed_time": "1:23:13", "remaining_time": "1:23:34"} +{"current_steps": 1385, "total_steps": 2766, "loss": 1.2346, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004988642118757102, "epoch": 1.5, "percentage": 50.07, "elapsed_time": "1:23:30", "remaining_time": "1:23:16"} +{"current_steps": 1390, "total_steps": 2766, "loss": 1.274, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004960247800260161, "epoch": 1.51, "percentage": 50.25, "elapsed_time": "1:23:48", "remaining_time": "1:22:57"} +{"current_steps": 1395, "total_steps": 2766, "loss": 1.4231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004931854763784994, "epoch": 1.51, "percentage": 50.43, "elapsed_time": "1:24:05", "remaining_time": "1:22:38"} +{"current_steps": 1400, "total_steps": 2766, "loss": 1.2872, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000490346392501655, "epoch": 1.52, "percentage": 50.61, "elapsed_time": "1:24:23", "remaining_time": "1:22:20"} +{"current_steps": 1400, "total_steps": 2766, "loss": null, "eval_loss": 1.3990795612335205, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.52, "percentage": 50.61, "elapsed_time": "1:24:23", "remaining_time": "1:22:20"} +{"current_steps": 1405, "total_steps": 2766, "loss": 1.4041, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00048750761995688984, "epoch": 1.52, "percentage": 50.8, "elapsed_time": "1:24:51", "remaining_time": "1:22:12"} +{"current_steps": 1410, "total_steps": 2766, "loss": 1.3405, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004846692502955709, "epoch": 1.53, "percentage": 50.98, "elapsed_time": "1:25:09", "remaining_time": "1:21:53"} +{"current_steps": 1415, "total_steps": 2766, "loss": 1.3198, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00048183137505607154, "epoch": 1.53, "percentage": 51.16, "elapsed_time": "1:25:26", "remaining_time": "1:21:34"} +{"current_steps": 1420, "total_steps": 2766, "loss": 1.3528, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00047899408576082016, "epoch": 1.54, "percentage": 51.34, "elapsed_time": "1:25:44", "remaining_time": "1:21:16"} +{"current_steps": 1425, "total_steps": 2766, "loss": 1.3095, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004761574739133478, "epoch": 1.54, "percentage": 51.52, "elapsed_time": "1:26:02", "remaining_time": "1:20:57"} +{"current_steps": 1430, "total_steps": 2766, "loss": 1.3278, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00047332163099533787, "epoch": 1.55, "percentage": 51.7, "elapsed_time": "1:26:19", "remaining_time": "1:20:39"} +{"current_steps": 1435, "total_steps": 2766, "loss": 1.3305, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00047048664846367587, "epoch": 1.56, "percentage": 51.88, "elapsed_time": "1:26:37", "remaining_time": "1:20:20"} +{"current_steps": 1440, "total_steps": 2766, "loss": 1.3997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004676526177474991, "epoch": 1.56, "percentage": 52.06, "elapsed_time": "1:26:54", "remaining_time": "1:20:01"} +{"current_steps": 1445, "total_steps": 2766, "loss": 1.341, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00046481963024524846, "epoch": 1.57, "percentage": 52.24, "elapsed_time": "1:27:12", "remaining_time": "1:19:43"} +{"current_steps": 1450, "total_steps": 2766, "loss": 1.3008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00046198777732172133, "epoch": 1.57, "percentage": 52.42, "elapsed_time": "1:27:29", "remaining_time": "1:19:24"} +{"current_steps": 1455, "total_steps": 2766, "loss": 1.2643, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00045915715030512405, "epoch": 1.58, "percentage": 52.6, "elapsed_time": "1:27:47", "remaining_time": "1:19:06"} +{"current_steps": 1460, "total_steps": 2766, "loss": 1.3169, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004563278404841273, "epoch": 1.58, "percentage": 52.78, "elapsed_time": "1:28:05", "remaining_time": "1:18:47"} +{"current_steps": 1465, "total_steps": 2766, "loss": 1.3062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00045349993910492154, "epoch": 1.59, "percentage": 52.96, "elapsed_time": "1:28:22", "remaining_time": "1:18:29"} +{"current_steps": 1470, "total_steps": 2766, "loss": 1.2876, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00045067353736827495, "epoch": 1.59, "percentage": 53.15, "elapsed_time": "1:28:40", "remaining_time": "1:18:10"} +{"current_steps": 1475, "total_steps": 2766, "loss": 1.3534, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004478487264265913, "epoch": 1.6, "percentage": 53.33, "elapsed_time": "1:28:57", "remaining_time": "1:17:51"} +{"current_steps": 1480, "total_steps": 2766, "loss": 1.318, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004450255973809707, "epoch": 1.6, "percentage": 53.51, "elapsed_time": "1:29:15", "remaining_time": "1:17:33"} +{"current_steps": 1485, "total_steps": 2766, "loss": 1.3195, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000442204241278272, "epoch": 1.61, "percentage": 53.69, "elapsed_time": "1:29:33", "remaining_time": "1:17:14"} +{"current_steps": 1490, "total_steps": 2766, "loss": 1.3208, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004393847491081756, "epoch": 1.61, "percentage": 53.87, "elapsed_time": "1:29:50", "remaining_time": "1:16:56"} +{"current_steps": 1495, "total_steps": 2766, "loss": 1.3879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004365672118002494, "epoch": 1.62, "percentage": 54.05, "elapsed_time": "1:30:08", "remaining_time": "1:16:37"} +{"current_steps": 1500, "total_steps": 2766, "loss": 1.3356, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004337517202210168, "epoch": 1.63, "percentage": 54.23, "elapsed_time": "1:30:25", "remaining_time": "1:16:19"} +{"current_steps": 1500, "total_steps": 2766, "loss": null, "eval_loss": 1.3873966932296753, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.63, "percentage": 54.23, "elapsed_time": "1:30:25", "remaining_time": "1:16:19"} +{"current_steps": 1505, "total_steps": 2766, "loss": 1.3163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004309383651710254, "epoch": 1.63, "percentage": 54.41, "elapsed_time": "1:30:54", "remaining_time": "1:16:09"} +{"current_steps": 1510, "total_steps": 2766, "loss": 1.3119, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00042812723738191896, "epoch": 1.64, "percentage": 54.59, "elapsed_time": "1:31:11", "remaining_time": "1:15:51"} +{"current_steps": 1515, "total_steps": 2766, "loss": 1.2777, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004253184275135116, "epoch": 1.64, "percentage": 54.77, "elapsed_time": "1:31:29", "remaining_time": "1:15:32"} +{"current_steps": 1520, "total_steps": 2766, "loss": 1.3624, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004225120261508637, "epoch": 1.65, "percentage": 54.95, "elapsed_time": "1:31:46", "remaining_time": "1:15:14"} +{"current_steps": 1525, "total_steps": 2766, "loss": 1.3231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004197081238013602, "epoch": 1.65, "percentage": 55.13, "elapsed_time": "1:32:04", "remaining_time": "1:14:55"} +{"current_steps": 1530, "total_steps": 2766, "loss": 1.3807, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004169068108917924, "epoch": 1.66, "percentage": 55.31, "elapsed_time": "1:32:21", "remaining_time": "1:14:37"} +{"current_steps": 1535, "total_steps": 2766, "loss": 1.3301, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004141081777654412, "epoch": 1.66, "percentage": 55.5, "elapsed_time": "1:32:39", "remaining_time": "1:14:18"} +{"current_steps": 1540, "total_steps": 2766, "loss": 1.3032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004113123146791633, "epoch": 1.67, "percentage": 55.68, "elapsed_time": "1:32:57", "remaining_time": "1:13:59"} +{"current_steps": 1545, "total_steps": 2766, "loss": 1.2957, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000408519311800481, "epoch": 1.67, "percentage": 55.86, "elapsed_time": "1:33:14", "remaining_time": "1:13:41"} +{"current_steps": 1550, "total_steps": 2766, "loss": 1.3138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00040572925920467375, "epoch": 1.68, "percentage": 56.04, "elapsed_time": "1:33:32", "remaining_time": "1:13:22"} +{"current_steps": 1555, "total_steps": 2766, "loss": 1.2496, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004029422468718737, "epoch": 1.69, "percentage": 56.22, "elapsed_time": "1:33:49", "remaining_time": "1:13:04"} +{"current_steps": 1560, "total_steps": 2766, "loss": 1.3796, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004001583646841632, "epoch": 1.69, "percentage": 56.4, "elapsed_time": "1:34:07", "remaining_time": "1:12:45"} +{"current_steps": 1565, "total_steps": 2766, "loss": 1.3492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00039737770242267637, "epoch": 1.7, "percentage": 56.58, "elapsed_time": "1:34:25", "remaining_time": "1:12:27"} +{"current_steps": 1570, "total_steps": 2766, "loss": 1.3138, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00039460034976470396, "epoch": 1.7, "percentage": 56.76, "elapsed_time": "1:34:42", "remaining_time": "1:12:08"} +{"current_steps": 1575, "total_steps": 2766, "loss": 1.3172, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003918263962808004, "epoch": 1.71, "percentage": 56.94, "elapsed_time": "1:35:00", "remaining_time": "1:11:50"} +{"current_steps": 1580, "total_steps": 2766, "loss": 1.3446, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003890559314318959, "epoch": 1.71, "percentage": 57.12, "elapsed_time": "1:35:17", "remaining_time": "1:11:31"} +{"current_steps": 1585, "total_steps": 2766, "loss": 1.3062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00038628904456641116, "epoch": 1.72, "percentage": 57.3, "elapsed_time": "1:35:35", "remaining_time": "1:11:13"} +{"current_steps": 1590, "total_steps": 2766, "loss": 1.2899, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00038352582491737547, "epoch": 1.72, "percentage": 57.48, "elapsed_time": "1:35:52", "remaining_time": "1:10:55"} +{"current_steps": 1595, "total_steps": 2766, "loss": 1.2942, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003807663615995491, "epoch": 1.73, "percentage": 57.66, "elapsed_time": "1:36:10", "remaining_time": "1:10:36"} +{"current_steps": 1600, "total_steps": 2766, "loss": 1.2902, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003780107436065498, "epoch": 1.73, "percentage": 57.85, "elapsed_time": "1:36:28", "remaining_time": "1:10:18"} +{"current_steps": 1600, "total_steps": 2766, "loss": null, "eval_loss": 1.379552960395813, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.73, "percentage": 57.85, "elapsed_time": "1:36:28", "remaining_time": "1:10:18"} +{"current_steps": 1605, "total_steps": 2766, "loss": 1.3213, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00037525905980798183, "epoch": 1.74, "percentage": 58.03, "elapsed_time": "1:36:56", "remaining_time": "1:10:07"} +{"current_steps": 1610, "total_steps": 2766, "loss": 1.2286, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003725113989465705, "epoch": 1.74, "percentage": 58.21, "elapsed_time": "1:37:14", "remaining_time": "1:09:48"} +{"current_steps": 1615, "total_steps": 2766, "loss": 1.3394, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00036976784963530017, "epoch": 1.75, "percentage": 58.39, "elapsed_time": "1:37:31", "remaining_time": "1:09:30"} +{"current_steps": 1620, "total_steps": 2766, "loss": 1.2879, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003670285003545564, "epoch": 1.76, "percentage": 58.57, "elapsed_time": "1:37:49", "remaining_time": "1:09:11"} +{"current_steps": 1625, "total_steps": 2766, "loss": 1.3369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00036429343944927196, "epoch": 1.76, "percentage": 58.75, "elapsed_time": "1:38:06", "remaining_time": "1:08:53"} +{"current_steps": 1630, "total_steps": 2766, "loss": 1.3393, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003615627551260785, "epoch": 1.77, "percentage": 58.93, "elapsed_time": "1:38:24", "remaining_time": "1:08:34"} +{"current_steps": 1635, "total_steps": 2766, "loss": 1.3437, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003588365354504612, "epoch": 1.77, "percentage": 59.11, "elapsed_time": "1:38:41", "remaining_time": "1:08:16"} +{"current_steps": 1640, "total_steps": 2766, "loss": 1.2843, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00035611486834391894, "epoch": 1.78, "percentage": 59.29, "elapsed_time": "1:38:59", "remaining_time": "1:07:57"} +{"current_steps": 1645, "total_steps": 2766, "loss": 1.3463, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00035339784158112893, "epoch": 1.78, "percentage": 59.47, "elapsed_time": "1:39:17", "remaining_time": "1:07:39"} +{"current_steps": 1650, "total_steps": 2766, "loss": 1.2847, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00035068554278711494, "epoch": 1.79, "percentage": 59.65, "elapsed_time": "1:39:34", "remaining_time": "1:07:21"} +{"current_steps": 1655, "total_steps": 2766, "loss": 1.2493, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00034797805943442313, "epoch": 1.79, "percentage": 59.83, "elapsed_time": "1:39:52", "remaining_time": "1:07:02"} +{"current_steps": 1660, "total_steps": 2766, "loss": 1.3471, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003452754788402996, "epoch": 1.8, "percentage": 60.01, "elapsed_time": "1:40:09", "remaining_time": "1:06:44"} +{"current_steps": 1665, "total_steps": 2766, "loss": 1.2983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00034257788816387475, "epoch": 1.8, "percentage": 60.2, "elapsed_time": "1:40:27", "remaining_time": "1:06:25"} +{"current_steps": 1670, "total_steps": 2766, "loss": 1.3259, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003398853744033529, "epoch": 1.81, "percentage": 60.38, "elapsed_time": "1:40:45", "remaining_time": "1:06:07"} +{"current_steps": 1675, "total_steps": 2766, "loss": 1.333, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003371980243932056, "epoch": 1.82, "percentage": 60.56, "elapsed_time": "1:41:02", "remaining_time": "1:05:48"} +{"current_steps": 1680, "total_steps": 2766, "loss": 1.3071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00033451592480137195, "epoch": 1.82, "percentage": 60.74, "elapsed_time": "1:41:20", "remaining_time": "1:05:30"} +{"current_steps": 1685, "total_steps": 2766, "loss": 1.3238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00033183916212646346, "epoch": 1.83, "percentage": 60.92, "elapsed_time": "1:41:37", "remaining_time": "1:05:11"} +{"current_steps": 1690, "total_steps": 2766, "loss": 1.3129, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003291678226949741, "epoch": 1.83, "percentage": 61.1, "elapsed_time": "1:41:55", "remaining_time": "1:04:53"} +{"current_steps": 1695, "total_steps": 2766, "loss": 1.3235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003265019926584964, "epoch": 1.84, "percentage": 61.28, "elapsed_time": "1:42:12", "remaining_time": "1:04:35"} +{"current_steps": 1700, "total_steps": 2766, "loss": 1.3016, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00032384175799094297, "epoch": 1.84, "percentage": 61.46, "elapsed_time": "1:42:30", "remaining_time": "1:04:16"} +{"current_steps": 1700, "total_steps": 2766, "loss": null, "eval_loss": 1.3693352937698364, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.84, "percentage": 61.46, "elapsed_time": "1:42:30", "remaining_time": "1:04:16"} +{"current_steps": 1705, "total_steps": 2766, "loss": 1.2658, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003211872044857743, "epoch": 1.85, "percentage": 61.64, "elapsed_time": "1:42:58", "remaining_time": "1:04:05"} +{"current_steps": 1710, "total_steps": 2766, "loss": 1.274, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00031853841775323103, "epoch": 1.85, "percentage": 61.82, "elapsed_time": "1:43:16", "remaining_time": "1:03:46"} +{"current_steps": 1715, "total_steps": 2766, "loss": 1.2629, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00031589548321757366, "epoch": 1.86, "percentage": 62.0, "elapsed_time": "1:43:33", "remaining_time": "1:03:28"} +{"current_steps": 1720, "total_steps": 2766, "loss": 1.3052, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003132584861143274, "epoch": 1.86, "percentage": 62.18, "elapsed_time": "1:43:51", "remaining_time": "1:03:09"} +{"current_steps": 1725, "total_steps": 2766, "loss": 1.3099, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003106275114875332, "epoch": 1.87, "percentage": 62.36, "elapsed_time": "1:44:09", "remaining_time": "1:02:51"} +{"current_steps": 1730, "total_steps": 2766, "loss": 1.2878, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003080026441870051, "epoch": 1.87, "percentage": 62.55, "elapsed_time": "1:44:26", "remaining_time": "1:02:32"} +{"current_steps": 1735, "total_steps": 2766, "loss": 1.2815, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00030538396886559393, "epoch": 1.88, "percentage": 62.73, "elapsed_time": "1:44:44", "remaining_time": "1:02:14"} +{"current_steps": 1740, "total_steps": 2766, "loss": 1.2896, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00030277156997645706, "epoch": 1.89, "percentage": 62.91, "elapsed_time": "1:45:01", "remaining_time": "1:01:55"} +{"current_steps": 1745, "total_steps": 2766, "loss": 1.3545, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00030016553177033466, "epoch": 1.89, "percentage": 63.09, "elapsed_time": "1:45:19", "remaining_time": "1:01:37"} +{"current_steps": 1750, "total_steps": 2766, "loss": 1.29, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002975659382928332, "epoch": 1.9, "percentage": 63.27, "elapsed_time": "1:45:37", "remaining_time": "1:01:19"} +{"current_steps": 1755, "total_steps": 2766, "loss": 1.3543, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00029497287338171385, "epoch": 1.9, "percentage": 63.45, "elapsed_time": "1:45:54", "remaining_time": "1:01:00"} +{"current_steps": 1760, "total_steps": 2766, "loss": 1.3202, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00029238642066418995, "epoch": 1.91, "percentage": 63.63, "elapsed_time": "1:46:12", "remaining_time": "1:00:42"} +{"current_steps": 1765, "total_steps": 2766, "loss": 1.3261, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002898066635542288, "epoch": 1.91, "percentage": 63.81, "elapsed_time": "1:46:29", "remaining_time": "1:00:23"} +{"current_steps": 1770, "total_steps": 2766, "loss": 1.2256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002872336852498627, "epoch": 1.92, "percentage": 63.99, "elapsed_time": "1:46:47", "remaining_time": "1:00:05"} +{"current_steps": 1775, "total_steps": 2766, "loss": 1.3423, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002846675687305045, "epoch": 1.92, "percentage": 64.17, "elapsed_time": "1:47:04", "remaining_time": "0:59:47"} +{"current_steps": 1780, "total_steps": 2766, "loss": 1.2896, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002821083967542727, "epoch": 1.93, "percentage": 64.35, "elapsed_time": "1:47:22", "remaining_time": "0:59:28"} +{"current_steps": 1785, "total_steps": 2766, "loss": 1.2312, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027955625185532217, "epoch": 1.93, "percentage": 64.53, "elapsed_time": "1:47:40", "remaining_time": "0:59:10"} +{"current_steps": 1790, "total_steps": 2766, "loss": 1.2822, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027701121634118143, "epoch": 1.94, "percentage": 64.71, "elapsed_time": "1:47:57", "remaining_time": "0:58:51"} +{"current_steps": 1795, "total_steps": 2766, "loss": 1.319, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027447337229009937, "epoch": 1.95, "percentage": 64.9, "elapsed_time": "1:48:15", "remaining_time": "0:58:33"} +{"current_steps": 1800, "total_steps": 2766, "loss": 1.3727, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027194280154839824, "epoch": 1.95, "percentage": 65.08, "elapsed_time": "1:48:32", "remaining_time": "0:58:15"} +{"current_steps": 1800, "total_steps": 2766, "loss": null, "eval_loss": 1.3620151281356812, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.95, "percentage": 65.08, "elapsed_time": "1:48:32", "remaining_time": "0:58:15"} +{"current_steps": 1805, "total_steps": 2766, "loss": 1.251, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002694195857278326, "epoch": 1.96, "percentage": 65.26, "elapsed_time": "1:49:01", "remaining_time": "0:58:02"} +{"current_steps": 1810, "total_steps": 2766, "loss": 1.2324, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002669038062029592, "epoch": 1.96, "percentage": 65.44, "elapsed_time": "1:49:18", "remaining_time": "0:57:44"} +{"current_steps": 1815, "total_steps": 2766, "loss": 1.2644, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002643955441085115, "epoch": 1.97, "percentage": 65.62, "elapsed_time": "1:49:36", "remaining_time": "0:57:25"} +{"current_steps": 1820, "total_steps": 2766, "loss": 1.2582, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000261894880336783, "epoch": 1.97, "percentage": 65.8, "elapsed_time": "1:49:53", "remaining_time": "0:57:07"} +{"current_steps": 1825, "total_steps": 2766, "loss": 1.3433, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002594018955350191, "epoch": 1.98, "percentage": 65.98, "elapsed_time": "1:50:11", "remaining_time": "0:56:49"} +{"current_steps": 1830, "total_steps": 2766, "loss": 1.3069, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00025691667010281616, "epoch": 1.98, "percentage": 66.16, "elapsed_time": "1:50:29", "remaining_time": "0:56:30"} +{"current_steps": 1835, "total_steps": 2766, "loss": 1.2895, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00025443928418952724, "epoch": 1.99, "percentage": 66.34, "elapsed_time": "1:50:46", "remaining_time": "0:56:12"} +{"current_steps": 1840, "total_steps": 2766, "loss": 1.2799, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002519698176916791, "epoch": 1.99, "percentage": 66.52, "elapsed_time": "1:51:04", "remaining_time": "0:55:53"} +{"current_steps": 1845, "total_steps": 2766, "loss": 1.3044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000249508350250395, "epoch": 2.0, "percentage": 66.7, "elapsed_time": "1:51:21", "remaining_time": "0:55:35"} +{"current_steps": 1850, "total_steps": 2766, "loss": 1.2324, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002470549612488247, "epoch": 2.0, "percentage": 66.88, "elapsed_time": "1:51:39", "remaining_time": "0:55:17"} +{"current_steps": 1855, "total_steps": 2766, "loss": 1.2357, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002446097298095867, "epoch": 2.01, "percentage": 67.06, "elapsed_time": "1:51:57", "remaining_time": "0:54:58"} +{"current_steps": 1860, "total_steps": 2766, "loss": 1.2329, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00024217273479221514, "epoch": 2.02, "percentage": 67.25, "elapsed_time": "1:52:14", "remaining_time": "0:54:40"} +{"current_steps": 1865, "total_steps": 2766, "loss": 1.2486, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023974405479061623, "epoch": 2.02, "percentage": 67.43, "elapsed_time": "1:52:32", "remaining_time": "0:54:22"} +{"current_steps": 1870, "total_steps": 2766, "loss": 1.1959, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002373237681305348, "epoch": 2.03, "percentage": 67.61, "elapsed_time": "1:52:49", "remaining_time": "0:54:03"} +{"current_steps": 1875, "total_steps": 2766, "loss": 1.2485, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023491195286702777, "epoch": 2.03, "percentage": 67.79, "elapsed_time": "1:53:07", "remaining_time": "0:53:45"} +{"current_steps": 1880, "total_steps": 2766, "loss": 1.2585, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023250868678194536, "epoch": 2.04, "percentage": 67.97, "elapsed_time": "1:53:24", "remaining_time": "0:53:27"} +{"current_steps": 1885, "total_steps": 2766, "loss": 1.2108, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023011404738142532, "epoch": 2.04, "percentage": 68.15, "elapsed_time": "1:53:42", "remaining_time": "0:53:08"} +{"current_steps": 1890, "total_steps": 2766, "loss": 1.188, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002277281118933916, "epoch": 2.05, "percentage": 68.33, "elapsed_time": "1:54:00", "remaining_time": "0:52:50"} +{"current_steps": 1895, "total_steps": 2766, "loss": 1.2197, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00022535095726506344, "epoch": 2.05, "percentage": 68.51, "elapsed_time": "1:54:17", "remaining_time": "0:52:32"} +{"current_steps": 1900, "total_steps": 2766, "loss": 1.1352, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00022298266016047513, "epoch": 2.06, "percentage": 68.69, "elapsed_time": "1:54:35", "remaining_time": "0:52:13"} +{"current_steps": 1900, "total_steps": 2766, "loss": null, "eval_loss": 1.3637186288833618, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.06, "percentage": 68.69, "elapsed_time": "1:54:35", "remaining_time": "0:52:13"} +{"current_steps": 1905, "total_steps": 2766, "loss": 1.2265, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002206232969580027, "epoch": 2.06, "percentage": 68.87, "elapsed_time": "1:55:03", "remaining_time": "0:52:00"} +{"current_steps": 1910, "total_steps": 2766, "loss": 1.2631, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021827294374790034, "epoch": 2.07, "percentage": 69.05, "elapsed_time": "1:55:21", "remaining_time": "0:51:41"} +{"current_steps": 1915, "total_steps": 2766, "loss": 1.1309, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021593167632984756, "epoch": 2.08, "percentage": 69.23, "elapsed_time": "1:55:38", "remaining_time": "0:51:23"} +{"current_steps": 1920, "total_steps": 2766, "loss": 1.2877, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021359957021050392, "epoch": 2.08, "percentage": 69.41, "elapsed_time": "1:55:56", "remaining_time": "0:51:05"} +{"current_steps": 1925, "total_steps": 2766, "loss": 1.2993, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021127670060107362, "epoch": 2.09, "percentage": 69.6, "elapsed_time": "1:56:13", "remaining_time": "0:50:46"} +{"current_steps": 1930, "total_steps": 2766, "loss": 1.2244, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020896314241488075, "epoch": 2.09, "percentage": 69.78, "elapsed_time": "1:56:31", "remaining_time": "0:50:28"} +{"current_steps": 1935, "total_steps": 2766, "loss": 1.1812, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002066589702649529, "epoch": 2.1, "percentage": 69.96, "elapsed_time": "1:56:49", "remaining_time": "0:50:10"} +{"current_steps": 1940, "total_steps": 2766, "loss": 1.2113, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020436425846161437, "epoch": 2.1, "percentage": 70.14, "elapsed_time": "1:57:06", "remaining_time": "0:49:51"} +{"current_steps": 1945, "total_steps": 2766, "loss": 1.1754, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020207908101009054, "epoch": 2.11, "percentage": 70.32, "elapsed_time": "1:57:24", "remaining_time": "0:49:33"} +{"current_steps": 1950, "total_steps": 2766, "loss": 1.1897, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019980351160812083, "epoch": 2.11, "percentage": 70.5, "elapsed_time": "1:57:41", "remaining_time": "0:49:15"} +{"current_steps": 1955, "total_steps": 2766, "loss": 1.1978, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001975376236435813, "epoch": 2.12, "percentage": 70.68, "elapsed_time": "1:57:59", "remaining_time": "0:48:56"} +{"current_steps": 1960, "total_steps": 2766, "loss": 1.1937, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019528149019211883, "epoch": 2.12, "percentage": 70.86, "elapsed_time": "1:58:17", "remaining_time": "0:48:38"} +{"current_steps": 1965, "total_steps": 2766, "loss": 1.2093, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019303518401479414, "epoch": 2.13, "percentage": 71.04, "elapsed_time": "1:58:34", "remaining_time": "0:48:20"} +{"current_steps": 1970, "total_steps": 2766, "loss": 1.2119, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019079877755573442, "epoch": 2.13, "percentage": 71.22, "elapsed_time": "1:58:52", "remaining_time": "0:48:01"} +{"current_steps": 1975, "total_steps": 2766, "loss": 1.1933, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001885723429397983, "epoch": 2.14, "percentage": 71.4, "elapsed_time": "1:59:09", "remaining_time": "0:47:43"} +{"current_steps": 1980, "total_steps": 2766, "loss": 1.2046, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018635595197024886, "epoch": 2.15, "percentage": 71.58, "elapsed_time": "1:59:27", "remaining_time": "0:47:25"} +{"current_steps": 1985, "total_steps": 2766, "loss": 1.1605, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018414967612643814, "epoch": 2.15, "percentage": 71.76, "elapsed_time": "1:59:44", "remaining_time": "0:47:06"} +{"current_steps": 1990, "total_steps": 2766, "loss": 1.1764, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001819535865615018, "epoch": 2.16, "percentage": 71.95, "elapsed_time": "2:00:02", "remaining_time": "0:46:48"} +{"current_steps": 1995, "total_steps": 2766, "loss": 1.2094, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017976775410006508, "epoch": 2.16, "percentage": 72.13, "elapsed_time": "2:00:20", "remaining_time": "0:46:30"} +{"current_steps": 2000, "total_steps": 2766, "loss": 1.146, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000177592249235958, "epoch": 2.17, "percentage": 72.31, "elapsed_time": "2:00:37", "remaining_time": "0:46:12"} +{"current_steps": 2000, "total_steps": 2766, "loss": null, "eval_loss": 1.3614530563354492, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.17, "percentage": 72.31, "elapsed_time": "2:00:37", "remaining_time": "0:46:12"} +{"current_steps": 2005, "total_steps": 2766, "loss": 1.2674, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017542714212994188, "epoch": 2.17, "percentage": 72.49, "elapsed_time": "2:01:06", "remaining_time": "0:45:57"} +{"current_steps": 2010, "total_steps": 2766, "loss": 1.2817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017327250260744698, "epoch": 2.18, "percentage": 72.67, "elapsed_time": "2:01:23", "remaining_time": "0:45:39"} +{"current_steps": 2015, "total_steps": 2766, "loss": 1.2693, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017112840015632086, "epoch": 2.18, "percentage": 72.85, "elapsed_time": "2:01:41", "remaining_time": "0:45:21"} +{"current_steps": 2020, "total_steps": 2766, "loss": 1.253, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016899490392458628, "epoch": 2.19, "percentage": 73.03, "elapsed_time": "2:01:58", "remaining_time": "0:45:02"} +{"current_steps": 2025, "total_steps": 2766, "loss": 1.208, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016687208271821253, "epoch": 2.19, "percentage": 73.21, "elapsed_time": "2:02:16", "remaining_time": "0:44:44"} +{"current_steps": 2030, "total_steps": 2766, "loss": 1.1818, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016476000499889514, "epoch": 2.2, "percentage": 73.39, "elapsed_time": "2:02:33", "remaining_time": "0:44:26"} +{"current_steps": 2035, "total_steps": 2766, "loss": 1.1945, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001626587388818491, "epoch": 2.21, "percentage": 73.57, "elapsed_time": "2:02:51", "remaining_time": "0:44:07"} +{"current_steps": 2040, "total_steps": 2766, "loss": 1.2225, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001605683521336116, "epoch": 2.21, "percentage": 73.75, "elapsed_time": "2:03:09", "remaining_time": "0:43:49"} +{"current_steps": 2045, "total_steps": 2766, "loss": 1.1726, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015848891216985596, "epoch": 2.22, "percentage": 73.93, "elapsed_time": "2:03:26", "remaining_time": "0:43:31"} +{"current_steps": 2050, "total_steps": 2766, "loss": 1.1651, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015642048605321856, "epoch": 2.22, "percentage": 74.11, "elapsed_time": "2:03:44", "remaining_time": "0:43:13"} +{"current_steps": 2055, "total_steps": 2766, "loss": 1.2148, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001543631404911356, "epoch": 2.23, "percentage": 74.3, "elapsed_time": "2:04:01", "remaining_time": "0:42:54"} +{"current_steps": 2060, "total_steps": 2766, "loss": 1.191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015231694183369106, "epoch": 2.23, "percentage": 74.48, "elapsed_time": "2:04:19", "remaining_time": "0:42:36"} +{"current_steps": 2065, "total_steps": 2766, "loss": 1.2421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001502819560714781, "epoch": 2.24, "percentage": 74.66, "elapsed_time": "2:04:37", "remaining_time": "0:42:18"} +{"current_steps": 2070, "total_steps": 2766, "loss": 1.1924, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014825824883347018, "epoch": 2.24, "percentage": 74.84, "elapsed_time": "2:04:54", "remaining_time": "0:41:59"} +{"current_steps": 2075, "total_steps": 2766, "loss": 1.1714, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014624588538490413, "epoch": 2.25, "percentage": 75.02, "elapsed_time": "2:05:12", "remaining_time": "0:41:41"} +{"current_steps": 2080, "total_steps": 2766, "loss": 1.2641, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014424493062517623, "epoch": 2.25, "percentage": 75.2, "elapsed_time": "2:05:29", "remaining_time": "0:41:23"} +{"current_steps": 2085, "total_steps": 2766, "loss": 1.2721, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014225544908574872, "epoch": 2.26, "percentage": 75.38, "elapsed_time": "2:05:47", "remaining_time": "0:41:05"} +{"current_steps": 2090, "total_steps": 2766, "loss": 1.2431, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014027750492806817, "epoch": 2.26, "percentage": 75.56, "elapsed_time": "2:06:04", "remaining_time": "0:40:46"} +{"current_steps": 2095, "total_steps": 2766, "loss": 1.2983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013831116194149712, "epoch": 2.27, "percentage": 75.74, "elapsed_time": "2:06:22", "remaining_time": "0:40:28"} +{"current_steps": 2100, "total_steps": 2766, "loss": 1.2144, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013635648354125662, "epoch": 2.28, "percentage": 75.92, "elapsed_time": "2:06:40", "remaining_time": "0:40:10"} +{"current_steps": 2100, "total_steps": 2766, "loss": null, "eval_loss": 1.3538448810577393, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.28, "percentage": 75.92, "elapsed_time": "2:06:40", "remaining_time": "0:40:10"} +{"current_steps": 2105, "total_steps": 2766, "loss": 1.1463, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001344135327663804, "epoch": 2.28, "percentage": 76.1, "elapsed_time": "2:07:08", "remaining_time": "0:39:55"} +{"current_steps": 2110, "total_steps": 2766, "loss": 1.2751, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013248237227768246, "epoch": 2.29, "percentage": 76.28, "elapsed_time": "2:07:26", "remaining_time": "0:39:37"} +{"current_steps": 2115, "total_steps": 2766, "loss": 1.2196, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013056306435573633, "epoch": 2.29, "percentage": 76.46, "elapsed_time": "2:07:43", "remaining_time": "0:39:18"} +{"current_steps": 2120, "total_steps": 2766, "loss": 1.1964, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012865567089886642, "epoch": 2.3, "percentage": 76.64, "elapsed_time": "2:08:01", "remaining_time": "0:39:00"} +{"current_steps": 2125, "total_steps": 2766, "loss": 1.1749, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012676025342115105, "epoch": 2.3, "percentage": 76.83, "elapsed_time": "2:08:18", "remaining_time": "0:38:42"} +{"current_steps": 2130, "total_steps": 2766, "loss": 1.2615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012487687305043978, "epoch": 2.31, "percentage": 77.01, "elapsed_time": "2:08:36", "remaining_time": "0:38:24"} +{"current_steps": 2135, "total_steps": 2766, "loss": 1.2064, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012300559052638122, "epoch": 2.31, "percentage": 77.19, "elapsed_time": "2:08:53", "remaining_time": "0:38:05"} +{"current_steps": 2140, "total_steps": 2766, "loss": 1.1642, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012114646619846425, "epoch": 2.32, "percentage": 77.37, "elapsed_time": "2:09:11", "remaining_time": "0:37:47"} +{"current_steps": 2145, "total_steps": 2766, "loss": 1.1704, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011929956002407194, "epoch": 2.32, "percentage": 77.55, "elapsed_time": "2:09:29", "remaining_time": "0:37:29"} +{"current_steps": 2150, "total_steps": 2766, "loss": 1.1668, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011746493156654814, "epoch": 2.33, "percentage": 77.73, "elapsed_time": "2:09:46", "remaining_time": "0:37:10"} +{"current_steps": 2155, "total_steps": 2766, "loss": 1.1584, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011564263999327546, "epoch": 2.34, "percentage": 77.91, "elapsed_time": "2:10:04", "remaining_time": "0:36:52"} +{"current_steps": 2160, "total_steps": 2766, "loss": 1.2412, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011383274407376848, "epoch": 2.34, "percentage": 78.09, "elapsed_time": "2:10:21", "remaining_time": "0:36:34"} +{"current_steps": 2165, "total_steps": 2766, "loss": 1.1688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001120353021777778, "epoch": 2.35, "percentage": 78.27, "elapsed_time": "2:10:39", "remaining_time": "0:36:16"} +{"current_steps": 2170, "total_steps": 2766, "loss": 1.2097, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011025037227340711, "epoch": 2.35, "percentage": 78.45, "elapsed_time": "2:10:57", "remaining_time": "0:35:57"} +{"current_steps": 2175, "total_steps": 2766, "loss": 1.2057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010847801192524454, "epoch": 2.36, "percentage": 78.63, "elapsed_time": "2:11:14", "remaining_time": "0:35:39"} +{"current_steps": 2180, "total_steps": 2766, "loss": 1.2296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010671827829250585, "epoch": 2.36, "percentage": 78.81, "elapsed_time": "2:11:32", "remaining_time": "0:35:21"} +{"current_steps": 2185, "total_steps": 2766, "loss": 1.2547, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010497122812719068, "epoch": 2.37, "percentage": 78.99, "elapsed_time": "2:11:49", "remaining_time": "0:35:03"} +{"current_steps": 2190, "total_steps": 2766, "loss": 1.1746, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010323691777225286, "epoch": 2.37, "percentage": 79.18, "elapsed_time": "2:12:07", "remaining_time": "0:34:45"} +{"current_steps": 2195, "total_steps": 2766, "loss": 1.1466, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010151540315978314, "epoch": 2.38, "percentage": 79.36, "elapsed_time": "2:12:24", "remaining_time": "0:34:26"} +{"current_steps": 2200, "total_steps": 2766, "loss": 1.1551, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.98067398092049e-05, "epoch": 2.38, "percentage": 79.54, "elapsed_time": "2:12:42", "remaining_time": "0:34:08"} +{"current_steps": 2200, "total_steps": 2766, "loss": null, "eval_loss": 1.349250316619873, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.38, "percentage": 79.54, "elapsed_time": "2:12:42", "remaining_time": "0:34:08"} +{"current_steps": 2205, "total_steps": 2766, "loss": 1.158, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.811098282548447e-05, "epoch": 2.39, "percentage": 79.72, "elapsed_time": "2:13:10", "remaining_time": "0:33:53"} +{"current_steps": 2210, "total_steps": 2766, "loss": 1.1444, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.642818689735305e-05, "epoch": 2.4, "percentage": 79.9, "elapsed_time": "2:13:28", "remaining_time": "0:33:34"} +{"current_steps": 2215, "total_steps": 2766, "loss": 1.2504, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.475840629554394e-05, "epoch": 2.4, "percentage": 80.08, "elapsed_time": "2:13:46", "remaining_time": "0:33:16"} +{"current_steps": 2220, "total_steps": 2766, "loss": 1.1439, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.310169487104131e-05, "epoch": 2.41, "percentage": 80.26, "elapsed_time": "2:14:03", "remaining_time": "0:32:58"} +{"current_steps": 2225, "total_steps": 2766, "loss": 1.2758, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.145810605334454e-05, "epoch": 2.41, "percentage": 80.44, "elapsed_time": "2:14:21", "remaining_time": "0:32:40"} +{"current_steps": 2230, "total_steps": 2766, "loss": 1.1992, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.982769284874386e-05, "epoch": 2.42, "percentage": 80.62, "elapsed_time": "2:14:38", "remaining_time": "0:32:21"} +{"current_steps": 2235, "total_steps": 2766, "loss": 1.2177, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.821050783861212e-05, "epoch": 2.42, "percentage": 80.8, "elapsed_time": "2:14:56", "remaining_time": "0:32:03"} +{"current_steps": 2240, "total_steps": 2766, "loss": 1.1942, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.660660317770841e-05, "epoch": 2.43, "percentage": 80.98, "elapsed_time": "2:15:13", "remaining_time": "0:31:45"} +{"current_steps": 2245, "total_steps": 2766, "loss": 1.163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.501603059249563e-05, "epoch": 2.43, "percentage": 81.16, "elapsed_time": "2:15:31", "remaining_time": "0:31:27"} +{"current_steps": 2250, "total_steps": 2766, "loss": 1.239, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.343884137947333e-05, "epoch": 2.44, "percentage": 81.34, "elapsed_time": "2:15:49", "remaining_time": "0:31:08"} +{"current_steps": 2255, "total_steps": 2766, "loss": 1.1455, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.187508640352265e-05, "epoch": 2.44, "percentage": 81.53, "elapsed_time": "2:16:06", "remaining_time": "0:30:50"} +{"current_steps": 2260, "total_steps": 2766, "loss": 1.2165, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.032481609626575e-05, "epoch": 2.45, "percentage": 81.71, "elapsed_time": "2:16:24", "remaining_time": "0:30:32"} +{"current_steps": 2265, "total_steps": 2766, "loss": 1.1982, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.878808045444014e-05, "epoch": 2.45, "percentage": 81.89, "elapsed_time": "2:16:41", "remaining_time": "0:30:14"} +{"current_steps": 2270, "total_steps": 2766, "loss": 1.212, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.726492903828575e-05, "epoch": 2.46, "percentage": 82.07, "elapsed_time": "2:16:59", "remaining_time": "0:29:55"} +{"current_steps": 2275, "total_steps": 2766, "loss": 1.2453, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.575541096994637e-05, "epoch": 2.47, "percentage": 82.25, "elapsed_time": "2:17:17", "remaining_time": "0:29:37"} +{"current_steps": 2280, "total_steps": 2766, "loss": 1.2607, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.4259574931886e-05, "epoch": 2.47, "percentage": 82.43, "elapsed_time": "2:17:34", "remaining_time": "0:29:19"} +{"current_steps": 2285, "total_steps": 2766, "loss": 1.1936, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.27774691653188e-05, "epoch": 2.48, "percentage": 82.61, "elapsed_time": "2:17:52", "remaining_time": "0:29:01"} +{"current_steps": 2290, "total_steps": 2766, "loss": 1.2702, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.130914146865247e-05, "epoch": 2.48, "percentage": 82.79, "elapsed_time": "2:18:09", "remaining_time": "0:28:43"} +{"current_steps": 2295, "total_steps": 2766, "loss": 1.133, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.985463919594781e-05, "epoch": 2.49, "percentage": 82.97, "elapsed_time": "2:18:27", "remaining_time": "0:28:24"} +{"current_steps": 2300, "total_steps": 2766, "loss": 1.2135, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.841400925539104e-05, "epoch": 2.49, "percentage": 83.15, "elapsed_time": "2:18:44", "remaining_time": "0:28:06"} +{"current_steps": 2300, "total_steps": 2766, "loss": null, "eval_loss": 1.3470078706741333, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.49, "percentage": 83.15, "elapsed_time": "2:18:44", "remaining_time": "0:28:06"} +{"current_steps": 2305, "total_steps": 2766, "loss": 1.1986, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.698729810778065e-05, "epoch": 2.5, "percentage": 83.33, "elapsed_time": "2:19:13", "remaining_time": "0:27:50"} +{"current_steps": 2310, "total_steps": 2766, "loss": 1.2254, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.557455176502986e-05, "epoch": 2.5, "percentage": 83.51, "elapsed_time": "2:19:30", "remaining_time": "0:27:32"} +{"current_steps": 2315, "total_steps": 2766, "loss": 1.212, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.417581578868198e-05, "epoch": 2.51, "percentage": 83.69, "elapsed_time": "2:19:48", "remaining_time": "0:27:14"} +{"current_steps": 2320, "total_steps": 2766, "loss": 1.1517, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.279113528844127e-05, "epoch": 2.51, "percentage": 83.88, "elapsed_time": "2:20:06", "remaining_time": "0:26:55"} +{"current_steps": 2325, "total_steps": 2766, "loss": 1.1889, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.14205549207184e-05, "epoch": 2.52, "percentage": 84.06, "elapsed_time": "2:20:23", "remaining_time": "0:26:37"} +{"current_steps": 2330, "total_steps": 2766, "loss": 1.2348, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.006411888718982e-05, "epoch": 2.53, "percentage": 84.24, "elapsed_time": "2:20:41", "remaining_time": "0:26:19"} +{"current_steps": 2335, "total_steps": 2766, "loss": 1.1862, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.872187093337239e-05, "epoch": 2.53, "percentage": 84.42, "elapsed_time": "2:20:58", "remaining_time": "0:26:01"} +{"current_steps": 2340, "total_steps": 2766, "loss": 1.2143, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.739385434721295e-05, "epoch": 2.54, "percentage": 84.6, "elapsed_time": "2:21:16", "remaining_time": "0:25:43"} +{"current_steps": 2345, "total_steps": 2766, "loss": 1.242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.608011195769186e-05, "epoch": 2.54, "percentage": 84.78, "elapsed_time": "2:21:33", "remaining_time": "0:25:24"} +{"current_steps": 2350, "total_steps": 2766, "loss": 1.1817, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.478068613344151e-05, "epoch": 2.55, "percentage": 84.96, "elapsed_time": "2:21:51", "remaining_time": "0:25:06"} +{"current_steps": 2355, "total_steps": 2766, "loss": 1.1916, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.3495618781380764e-05, "epoch": 2.55, "percentage": 85.14, "elapsed_time": "2:22:09", "remaining_time": "0:24:48"} +{"current_steps": 2360, "total_steps": 2766, "loss": 1.1231, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.2224951345362703e-05, "epoch": 2.56, "percentage": 85.32, "elapsed_time": "2:22:26", "remaining_time": "0:24:30"} +{"current_steps": 2365, "total_steps": 2766, "loss": 1.2113, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.096872480483816e-05, "epoch": 2.56, "percentage": 85.5, "elapsed_time": "2:22:44", "remaining_time": "0:24:12"} +{"current_steps": 2370, "total_steps": 2766, "loss": 1.164, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.972697967353445e-05, "epoch": 2.57, "percentage": 85.68, "elapsed_time": "2:23:01", "remaining_time": "0:23:53"} +{"current_steps": 2375, "total_steps": 2766, "loss": 1.1947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8499755998148656e-05, "epoch": 2.57, "percentage": 85.86, "elapsed_time": "2:23:19", "remaining_time": "0:23:35"} +{"current_steps": 2380, "total_steps": 2766, "loss": 1.2219, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.728709335705561e-05, "epoch": 2.58, "percentage": 86.04, "elapsed_time": "2:23:37", "remaining_time": "0:23:17"} +{"current_steps": 2385, "total_steps": 2766, "loss": 1.2104, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6089030859032376e-05, "epoch": 2.58, "percentage": 86.23, "elapsed_time": "2:23:54", "remaining_time": "0:22:59"} +{"current_steps": 2390, "total_steps": 2766, "loss": 1.2077, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.490560714199637e-05, "epoch": 2.59, "percentage": 86.41, "elapsed_time": "2:24:12", "remaining_time": "0:22:41"} +{"current_steps": 2395, "total_steps": 2766, "loss": 1.1758, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.373686037175917e-05, "epoch": 2.6, "percentage": 86.59, "elapsed_time": "2:24:29", "remaining_time": "0:22:23"} +{"current_steps": 2400, "total_steps": 2766, "loss": 1.2094, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.258282824079618e-05, "epoch": 2.6, "percentage": 86.77, "elapsed_time": "2:24:47", "remaining_time": "0:22:04"} +{"current_steps": 2400, "total_steps": 2766, "loss": null, "eval_loss": 1.3436678647994995, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.6, "percentage": 86.77, "elapsed_time": "2:24:47", "remaining_time": "0:22:04"} +{"current_steps": 2405, "total_steps": 2766, "loss": 1.2, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1443547967030816e-05, "epoch": 2.61, "percentage": 86.95, "elapsed_time": "2:25:15", "remaining_time": "0:21:48"} +{"current_steps": 2410, "total_steps": 2766, "loss": 1.2246, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.031905629263371e-05, "epoch": 2.61, "percentage": 87.13, "elapsed_time": "2:25:33", "remaining_time": "0:21:30"} +{"current_steps": 2415, "total_steps": 2766, "loss": 1.198, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.92093894828387e-05, "epoch": 2.62, "percentage": 87.31, "elapsed_time": "2:25:50", "remaining_time": "0:21:11"} +{"current_steps": 2420, "total_steps": 2766, "loss": 1.269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.811458332477252e-05, "epoch": 2.62, "percentage": 87.49, "elapsed_time": "2:26:08", "remaining_time": "0:20:53"} +{"current_steps": 2425, "total_steps": 2766, "loss": 1.189, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.703467312630088e-05, "epoch": 2.63, "percentage": 87.67, "elapsed_time": "2:26:26", "remaining_time": "0:20:35"} +{"current_steps": 2430, "total_steps": 2766, "loss": 1.1938, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.596969371488995e-05, "epoch": 2.63, "percentage": 87.85, "elapsed_time": "2:26:43", "remaining_time": "0:20:17"} +{"current_steps": 2435, "total_steps": 2766, "loss": 1.2421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.491967943648289e-05, "epoch": 2.64, "percentage": 88.03, "elapsed_time": "2:27:01", "remaining_time": "0:19:59"} +{"current_steps": 2440, "total_steps": 2766, "loss": 1.1145, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.388466415439234e-05, "epoch": 2.64, "percentage": 88.21, "elapsed_time": "2:27:18", "remaining_time": "0:19:40"} +{"current_steps": 2445, "total_steps": 2766, "loss": 1.1678, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2864681248208184e-05, "epoch": 2.65, "percentage": 88.39, "elapsed_time": "2:27:36", "remaining_time": "0:19:22"} +{"current_steps": 2450, "total_steps": 2766, "loss": 1.2627, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.185976361272125e-05, "epoch": 2.66, "percentage": 88.58, "elapsed_time": "2:27:53", "remaining_time": "0:19:04"} +{"current_steps": 2455, "total_steps": 2766, "loss": 1.1344, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.086994365686246e-05, "epoch": 2.66, "percentage": 88.76, "elapsed_time": "2:28:11", "remaining_time": "0:18:46"} +{"current_steps": 2460, "total_steps": 2766, "loss": 1.2325, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9895253302657188e-05, "epoch": 2.67, "percentage": 88.94, "elapsed_time": "2:28:29", "remaining_time": "0:18:28"} +{"current_steps": 2465, "total_steps": 2766, "loss": 1.2533, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8935723984196304e-05, "epoch": 2.67, "percentage": 89.12, "elapsed_time": "2:28:46", "remaining_time": "0:18:10"} +{"current_steps": 2470, "total_steps": 2766, "loss": 1.226, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7991386646622207e-05, "epoch": 2.68, "percentage": 89.3, "elapsed_time": "2:29:04", "remaining_time": "0:17:51"} +{"current_steps": 2475, "total_steps": 2766, "loss": 1.0925, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7062271745130595e-05, "epoch": 2.68, "percentage": 89.48, "elapsed_time": "2:29:21", "remaining_time": "0:17:33"} +{"current_steps": 2480, "total_steps": 2766, "loss": 1.1444, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.614840924398876e-05, "epoch": 2.69, "percentage": 89.66, "elapsed_time": "2:29:39", "remaining_time": "0:17:15"} +{"current_steps": 2485, "total_steps": 2766, "loss": 1.214, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5249828615568794e-05, "epoch": 2.69, "percentage": 89.84, "elapsed_time": "2:29:57", "remaining_time": "0:16:57"} +{"current_steps": 2490, "total_steps": 2766, "loss": 1.2427, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.436655883939737e-05, "epoch": 2.7, "percentage": 90.02, "elapsed_time": "2:30:14", "remaining_time": "0:16:39"} +{"current_steps": 2495, "total_steps": 2766, "loss": 1.2447, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3498628401221078e-05, "epoch": 2.7, "percentage": 90.2, "elapsed_time": "2:30:32", "remaining_time": "0:16:21"} +{"current_steps": 2500, "total_steps": 2766, "loss": 1.1835, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2646065292087403e-05, "epoch": 2.71, "percentage": 90.38, "elapsed_time": "2:30:49", "remaining_time": "0:16:02"} +{"current_steps": 2500, "total_steps": 2766, "loss": null, "eval_loss": 1.343565821647644, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.71, "percentage": 90.38, "elapsed_time": "2:30:49", "remaining_time": "0:16:02"} +{"current_steps": 2505, "total_steps": 2766, "loss": 1.2457, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1808897007442762e-05, "epoch": 2.71, "percentage": 90.56, "elapsed_time": "2:31:18", "remaining_time": "0:15:45"} +{"current_steps": 2510, "total_steps": 2766, "loss": 1.2248, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.098715054624506e-05, "epoch": 2.72, "percentage": 90.74, "elapsed_time": "2:31:35", "remaining_time": "0:15:27"} +{"current_steps": 2515, "total_steps": 2766, "loss": 1.2419, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0180852410093153e-05, "epoch": 2.73, "percentage": 90.93, "elapsed_time": "2:31:53", "remaining_time": "0:15:09"} +{"current_steps": 2520, "total_steps": 2766, "loss": 1.1763, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.939002860237249e-05, "epoch": 2.73, "percentage": 91.11, "elapsed_time": "2:32:10", "remaining_time": "0:14:51"} +{"current_steps": 2525, "total_steps": 2766, "loss": 1.2035, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.8614704627416045e-05, "epoch": 2.74, "percentage": 91.29, "elapsed_time": "2:32:28", "remaining_time": "0:14:33"} +{"current_steps": 2530, "total_steps": 2766, "loss": 1.1767, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7854905489681993e-05, "epoch": 2.74, "percentage": 91.47, "elapsed_time": "2:32:45", "remaining_time": "0:14:15"} +{"current_steps": 2535, "total_steps": 2766, "loss": 1.254, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7110655692947397e-05, "epoch": 2.75, "percentage": 91.65, "elapsed_time": "2:33:03", "remaining_time": "0:13:56"} +{"current_steps": 2540, "total_steps": 2766, "loss": 1.1941, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.638197923951784e-05, "epoch": 2.75, "percentage": 91.83, "elapsed_time": "2:33:21", "remaining_time": "0:13:38"} +{"current_steps": 2545, "total_steps": 2766, "loss": 1.2568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5668899629453225e-05, "epoch": 2.76, "percentage": 92.01, "elapsed_time": "2:33:38", "remaining_time": "0:13:20"} +{"current_steps": 2550, "total_steps": 2766, "loss": 1.2237, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4971439859810199e-05, "epoch": 2.76, "percentage": 92.19, "elapsed_time": "2:33:56", "remaining_time": "0:13:02"} +{"current_steps": 2555, "total_steps": 2766, "loss": 1.172, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.428962242390025e-05, "epoch": 2.77, "percentage": 92.37, "elapsed_time": "2:34:13", "remaining_time": "0:12:44"} +{"current_steps": 2560, "total_steps": 2766, "loss": 1.1835, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3623469310564408e-05, "epoch": 2.77, "percentage": 92.55, "elapsed_time": "2:34:31", "remaining_time": "0:12:26"} +{"current_steps": 2565, "total_steps": 2766, "loss": 1.1335, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2973002003463797e-05, "epoch": 2.78, "percentage": 92.73, "elapsed_time": "2:34:49", "remaining_time": "0:12:07"} +{"current_steps": 2570, "total_steps": 2766, "loss": 1.1968, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2338241480387369e-05, "epoch": 2.79, "percentage": 92.91, "elapsed_time": "2:35:06", "remaining_time": "0:11:49"} +{"current_steps": 2575, "total_steps": 2766, "loss": 1.1962, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1719208212574939e-05, "epoch": 2.79, "percentage": 93.09, "elapsed_time": "2:35:24", "remaining_time": "0:11:31"} +{"current_steps": 2580, "total_steps": 2766, "loss": 1.2107, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.111592216405688e-05, "epoch": 2.8, "percentage": 93.28, "elapsed_time": "2:35:41", "remaining_time": "0:11:13"} +{"current_steps": 2585, "total_steps": 2766, "loss": 1.2148, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0528402791010582e-05, "epoch": 2.8, "percentage": 93.46, "elapsed_time": "2:35:59", "remaining_time": "0:10:55"} +{"current_steps": 2590, "total_steps": 2766, "loss": 1.1443, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.956669041133015e-06, "epoch": 2.81, "percentage": 93.64, "elapsed_time": "2:36:17", "remaining_time": "0:10:37"} +{"current_steps": 2595, "total_steps": 2766, "loss": 1.1805, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.400739353029209e-06, "epoch": 2.81, "percentage": 93.82, "elapsed_time": "2:36:34", "remaining_time": "0:10:19"} +{"current_steps": 2600, "total_steps": 2766, "loss": 1.2061, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.860631655618124e-06, "epoch": 2.82, "percentage": 94.0, "elapsed_time": "2:36:52", "remaining_time": "0:10:00"} +{"current_steps": 2600, "total_steps": 2766, "loss": null, "eval_loss": 1.3423666954040527, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.82, "percentage": 94.0, "elapsed_time": "2:36:52", "remaining_time": "0:10:00"} +{"current_steps": 2605, "total_steps": 2766, "loss": 1.2418, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.336363367554112e-06, "epoch": 2.82, "percentage": 94.18, "elapsed_time": "2:37:20", "remaining_time": "0:09:43"} +{"current_steps": 2610, "total_steps": 2766, "loss": 1.2532, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.827951396665312e-06, "epoch": 2.83, "percentage": 94.36, "elapsed_time": "2:37:38", "remaining_time": "0:09:25"} +{"current_steps": 2615, "total_steps": 2766, "loss": 1.2836, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.335412139408248e-06, "epoch": 2.83, "percentage": 94.54, "elapsed_time": "2:37:55", "remaining_time": "0:09:07"} +{"current_steps": 2620, "total_steps": 2766, "loss": 1.2918, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.85876148033926e-06, "epoch": 2.84, "percentage": 94.72, "elapsed_time": "2:38:13", "remaining_time": "0:08:49"} +{"current_steps": 2625, "total_steps": 2766, "loss": 1.1381, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.398014791601847e-06, "epoch": 2.84, "percentage": 94.9, "elapsed_time": "2:38:30", "remaining_time": "0:08:30"} +{"current_steps": 2630, "total_steps": 2766, "loss": 1.1686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.953186932431298e-06, "epoch": 2.85, "percentage": 95.08, "elapsed_time": "2:38:48", "remaining_time": "0:08:12"} +{"current_steps": 2635, "total_steps": 2766, "loss": 1.2104, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.524292248675289e-06, "epoch": 2.86, "percentage": 95.26, "elapsed_time": "2:39:05", "remaining_time": "0:07:54"} +{"current_steps": 2640, "total_steps": 2766, "loss": 1.2651, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.111344572331145e-06, "epoch": 2.86, "percentage": 95.44, "elapsed_time": "2:39:23", "remaining_time": "0:07:36"} +{"current_steps": 2645, "total_steps": 2766, "loss": 1.1296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.714357221099974e-06, "epoch": 2.87, "percentage": 95.63, "elapsed_time": "2:39:41", "remaining_time": "0:07:18"} +{"current_steps": 2650, "total_steps": 2766, "loss": 1.1534, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.333342997957013e-06, "epoch": 2.87, "percentage": 95.81, "elapsed_time": "2:39:58", "remaining_time": "0:07:00"} +{"current_steps": 2655, "total_steps": 2766, "loss": 1.2065, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.96831419073862e-06, "epoch": 2.88, "percentage": 95.99, "elapsed_time": "2:40:16", "remaining_time": "0:06:42"} +{"current_steps": 2660, "total_steps": 2766, "loss": 1.2118, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6192825717464294e-06, "epoch": 2.88, "percentage": 96.17, "elapsed_time": "2:40:33", "remaining_time": "0:06:23"} +{"current_steps": 2665, "total_steps": 2766, "loss": 1.1139, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2862593973670975e-06, "epoch": 2.89, "percentage": 96.35, "elapsed_time": "2:40:51", "remaining_time": "0:06:05"} +{"current_steps": 2670, "total_steps": 2766, "loss": 1.1702, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.969255407709648e-06, "epoch": 2.89, "percentage": 96.53, "elapsed_time": "2:41:09", "remaining_time": "0:05:47"} +{"current_steps": 2675, "total_steps": 2766, "loss": 1.1821, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.668280826259195e-06, "epoch": 2.9, "percentage": 96.71, "elapsed_time": "2:41:26", "remaining_time": "0:05:29"} +{"current_steps": 2680, "total_steps": 2766, "loss": 1.1826, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.383345359546818e-06, "epoch": 2.9, "percentage": 96.89, "elapsed_time": "2:41:44", "remaining_time": "0:05:11"} +{"current_steps": 2685, "total_steps": 2766, "loss": 1.236, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.1144581968369213e-06, "epoch": 2.91, "percentage": 97.07, "elapsed_time": "2:42:01", "remaining_time": "0:04:53"} +{"current_steps": 2690, "total_steps": 2766, "loss": 1.2308, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.861628009830696e-06, "epoch": 2.92, "percentage": 97.25, "elapsed_time": "2:42:19", "remaining_time": "0:04:35"} +{"current_steps": 2695, "total_steps": 2766, "loss": 1.1957, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6248629523865077e-06, "epoch": 2.92, "percentage": 97.43, "elapsed_time": "2:42:37", "remaining_time": "0:04:17"} +{"current_steps": 2700, "total_steps": 2766, "loss": 1.1613, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4041706602567206e-06, "epoch": 2.93, "percentage": 97.61, "elapsed_time": "2:42:54", "remaining_time": "0:03:58"} +{"current_steps": 2700, "total_steps": 2766, "loss": null, "eval_loss": 1.3419121503829956, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.93, "percentage": 97.61, "elapsed_time": "2:42:54", "remaining_time": "0:03:58"} +{"current_steps": 2705, "total_steps": 2766, "loss": 1.152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1995582508418924e-06, "epoch": 2.93, "percentage": 97.79, "elapsed_time": "2:43:22", "remaining_time": "0:03:41"} +{"current_steps": 2710, "total_steps": 2766, "loss": 1.2173, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0110323229608476e-06, "epoch": 2.94, "percentage": 97.98, "elapsed_time": "2:43:40", "remaining_time": "0:03:22"} +{"current_steps": 2715, "total_steps": 2766, "loss": 1.2202, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.385989566379593e-07, "epoch": 2.94, "percentage": 98.16, "elapsed_time": "2:43:58", "remaining_time": "0:03:04"} +{"current_steps": 2720, "total_steps": 2766, "loss": 1.1492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.82263712907083e-07, "epoch": 2.95, "percentage": 98.34, "elapsed_time": "2:44:15", "remaining_time": "0:02:46"} +{"current_steps": 2725, "total_steps": 2766, "loss": 1.2548, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.420316336323117e-07, "epoch": 2.95, "percentage": 98.52, "elapsed_time": "2:44:33", "remaining_time": "0:02:28"} +{"current_steps": 2730, "total_steps": 2766, "loss": 1.2085, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1790724134521676e-07, "epoch": 2.96, "percentage": 98.7, "elapsed_time": "2:44:50", "remaining_time": "0:02:10"} +{"current_steps": 2735, "total_steps": 2766, "loss": 1.2253, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.098945390991315e-07, "epoch": 2.96, "percentage": 98.88, "elapsed_time": "2:45:08", "remaining_time": "0:01:52"} +{"current_steps": 2740, "total_steps": 2766, "loss": 1.1604, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.179970103398654e-07, "epoch": 2.97, "percentage": 99.06, "elapsed_time": "2:45:25", "remaining_time": "0:01:34"} +{"current_steps": 2745, "total_steps": 2766, "loss": 1.2269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4221761879351648e-07, "epoch": 2.97, "percentage": 99.24, "elapsed_time": "2:45:43", "remaining_time": "0:01:16"} +{"current_steps": 2750, "total_steps": 2766, "loss": 1.2413, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.25588083709361e-08, "epoch": 2.98, "percentage": 99.42, "elapsed_time": "2:46:01", "remaining_time": "0:00:57"} +{"current_steps": 2755, "total_steps": 2766, "loss": 1.2255, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9022503088737006e-08, "epoch": 2.99, "percentage": 99.6, "elapsed_time": "2:46:18", "remaining_time": "0:00:39"} +{"current_steps": 2760, "total_steps": 2766, "loss": 1.2192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1610107007398175e-08, "epoch": 2.99, "percentage": 99.78, "elapsed_time": "2:46:36", "remaining_time": "0:00:21"} +{"current_steps": 2765, "total_steps": 2766, "loss": 1.1758, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2250418585677564e-10, "epoch": 3.0, "percentage": 99.96, "elapsed_time": "2:46:53", "remaining_time": "0:00:03"} +{"current_steps": 2766, "total_steps": 2766, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "2:46:57", "remaining_time": "0:00:00"} +{"current_steps": 19, "total_steps": 19, "loss": null, "eval_loss": 1.3419121503829956, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "2:47:07", "remaining_time": "0:00:00"} diff --git a/PT/trainer_state.json b/PT/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bcb159af38153c9a102aadb872540e4199b71f63 --- /dev/null +++ b/PT/trainer_state.json @@ -0,0 +1,3562 @@ +{ + "best_metric": 1.3419121503829956, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/2023-09-07-12-02-29/checkpoint-2700", + "epoch": 2.9975616364128963, + "eval_steps": 100, + "global_step": 2766, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.0009999919374161553, + "loss": 2.0025, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999677499246417, + "loss": 1.7737, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009999274383055143, + "loss": 1.7391, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 0.0009998710038588363, + "loss": 1.7959, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997984484046375, + "loss": 1.713, + "step": 25 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997097742828556, + "loss": 1.6441, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009996049843532607, + "loss": 1.704, + "step": 35 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009994840819953633, + "loss": 1.6532, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009993470711083048, + "loss": 1.6791, + "step": 45 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009991939561107325, + "loss": 1.6465, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 0.000999024741940656, + "loss": 1.6511, + "step": 55 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009988394340552898, + "loss": 1.6727, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 0.0009986380384308746, + "loss": 1.6653, + "step": 65 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009984205615624873, + "loss": 1.6339, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009981870104638294, + "loss": 1.5562, + "step": 75 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979373926670028, + "loss": 1.6291, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009976717162222645, + "loss": 1.625, + "step": 85 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009973899896977695, + "loss": 1.6008, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 0.000997092222179292, + "loss": 1.6821, + "step": 95 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009967784232699352, + "loss": 1.582, + "step": 100 + }, + { + "epoch": 0.11, + "eval_loss": 1.6186352968215942, + "eval_runtime": 10.6735, + "eval_samples_per_second": 14.054, + "eval_steps_per_second": 1.78, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 0.0009964486030898186, + "loss": 1.5769, + "step": 105 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009961027722757538, + "loss": 1.5868, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009957409419809006, + "loss": 1.5601, + "step": 115 + }, + { + "epoch": 0.13, + "learning_rate": 0.000995363123874407, + "loss": 1.6061, + "step": 120 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009949693301410341, + "loss": 1.6073, + "step": 125 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009945595734807615, + "loss": 1.4998, + "step": 130 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009941338671083794, + "loss": 1.5295, + "step": 135 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009936922247530606, + "loss": 1.5418, + "step": 140 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009932346606579192, + "loss": 1.554, + "step": 145 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009927611895795513, + "loss": 1.5509, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922718267875571, + "loss": 1.6123, + "step": 155 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009917665880640515, + "loss": 1.6267, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009912454897031524, + "loss": 1.6116, + "step": 165 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009907085485104568, + "loss": 1.5618, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009901557818024981, + "loss": 1.6085, + "step": 175 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895872074061885, + "loss": 1.5829, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009890028436582426, + "loss": 1.5407, + "step": 185 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009884027094045871, + "loss": 1.5568, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009877868239997532, + "loss": 1.5831, + "step": 195 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009871552073062516, + "loss": 1.5231, + "step": 200 + }, + { + "epoch": 0.22, + "eval_loss": 1.5717933177947998, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 0.0009865078796939327, + "loss": 1.5467, + "step": 205 + }, + { + "epoch": 0.23, + "learning_rate": 0.000985844862039329, + "loss": 1.6403, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009851661757249823, + "loss": 1.5352, + "step": 215 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009844718426387537, + "loss": 1.5616, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 0.000983761885173118, + "loss": 1.5274, + "step": 225 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983036326224442, + "loss": 1.6153, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009822951891922448, + "loss": 1.5062, + "step": 235 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009815384979784444, + "loss": 1.6038, + "step": 240 + }, + { + "epoch": 0.27, + "learning_rate": 0.000980766276986586, + "loss": 1.5097, + "step": 245 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799785511210557, + "loss": 1.535, + "step": 250 + }, + { + "epoch": 0.28, + "learning_rate": 0.000979175345786277, + "loss": 1.52, + "step": 255 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009783566868858912, + "loss": 1.5678, + "step": 260 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009775226008219224, + "loss": 1.5536, + "step": 265 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009766731144939258, + "loss": 1.4826, + "step": 270 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009758082552981204, + "loss": 1.5537, + "step": 275 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009749280511265056, + "loss": 1.5277, + "step": 280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009740325303659609, + "loss": 1.5445, + "step": 285 + }, + { + "epoch": 0.31, + "learning_rate": 0.000973121721897331, + "loss": 1.4944, + "step": 290 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009721956550944948, + "loss": 1.5088, + "step": 295 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009712543598234172, + "loss": 1.585, + "step": 300 + }, + { + "epoch": 0.33, + "eval_loss": 1.5345921516418457, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 300 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009702978664411863, + "loss": 1.5427, + "step": 305 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009693262057950345, + "loss": 1.4475, + "step": 310 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009683394092213436, + "loss": 1.5321, + "step": 315 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009673375085446339, + "loss": 1.5171, + "step": 320 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009663205360765382, + "loss": 1.5198, + "step": 325 + }, + { + "epoch": 0.36, + "learning_rate": 0.00096528852461476, + "loss": 1.492, + "step": 330 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009642415074420146, + "loss": 1.5036, + "step": 335 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009631795183249573, + "loss": 1.5134, + "step": 340 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009621025915130932, + "loss": 1.5568, + "step": 345 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009610107617376733, + "loss": 1.503, + "step": 350 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009599040642105736, + "loss": 1.4584, + "step": 355 + }, + { + "epoch": 0.39, + "learning_rate": 0.000958782534623161, + "loss": 1.4832, + "step": 360 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009576462091451406, + "loss": 1.4598, + "step": 365 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564951244233901, + "loss": 1.5492, + "step": 370 + }, + { + "epoch": 0.41, + "learning_rate": 0.000955329317580778, + "loss": 1.5145, + "step": 375 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009541488262149661, + "loss": 1.589, + "step": 380 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009529536883971963, + "loss": 1.6003, + "step": 385 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009517439426710646, + "loss": 1.55, + "step": 390 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009505196280512762, + "loss": 1.5359, + "step": 395 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009492807840223881, + "loss": 1.4854, + "step": 400 + }, + { + "epoch": 0.43, + "eval_loss": 1.5193477869033813, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 400 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009480274505375358, + "loss": 1.4891, + "step": 405 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009467596680171446, + "loss": 1.4719, + "step": 410 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009454774773476257, + "loss": 1.4939, + "step": 415 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009441809198800587, + "loss": 1.4382, + "step": 420 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009428700374288564, + "loss": 1.4427, + "step": 425 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009415448722704175, + "loss": 1.4767, + "step": 430 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009402054671417628, + "loss": 1.4799, + "step": 435 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009388518652391571, + "loss": 1.4608, + "step": 440 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009374841102167157, + "loss": 1.4937, + "step": 445 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009361022461849965, + "loss": 1.5468, + "step": 450 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009347063177095783, + "loss": 1.5481, + "step": 455 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009332963698096223, + "loss": 1.4478, + "step": 460 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009318724479564215, + "loss": 1.4977, + "step": 465 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009304345980719329, + "loss": 1.5091, + "step": 470 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009289828665272977, + "loss": 1.43, + "step": 475 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009275173001413448, + "loss": 1.4725, + "step": 480 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009260379461790822, + "loss": 1.3741, + "step": 485 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245448523501708, + "loss": 1.4917, + "step": 490 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009230380668073877, + "loss": 1.4684, + "step": 495 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009215176381450717, + "loss": 1.5209, + "step": 500 + }, + { + "epoch": 0.54, + "eval_loss": 1.5050214529037476, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 500 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009199836153975573, + "loss": 1.4913, + "step": 505 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009184360480375926, + "loss": 1.5377, + "step": 510 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009168749859747438, + "loss": 1.4608, + "step": 515 + }, + { + "epoch": 0.56, + "learning_rate": 0.0009153004795537861, + "loss": 1.4738, + "step": 520 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009137125795530795, + "loss": 1.4947, + "step": 525 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009121113371829318, + "loss": 1.5267, + "step": 530 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009104968040839463, + "loss": 1.5116, + "step": 535 + }, + { + "epoch": 0.59, + "learning_rate": 0.000908869032325357, + "loss": 1.4423, + "step": 540 + }, + { + "epoch": 0.59, + "learning_rate": 0.000907228074403349, + "loss": 1.4565, + "step": 545 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009055739832393655, + "loss": 1.4923, + "step": 550 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009039068121784016, + "loss": 1.4304, + "step": 555 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009022266149872829, + "loss": 1.4422, + "step": 560 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009005334458529322, + "loss": 1.522, + "step": 565 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008988273593806222, + "loss": 1.499, + "step": 570 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008971084105922139, + "loss": 1.4796, + "step": 575 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008953766549243818, + "loss": 1.4231, + "step": 580 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008936321482268275, + "loss": 1.462, + "step": 585 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008918749467604766, + "loss": 1.5191, + "step": 590 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008901051071956661, + "loss": 1.4845, + "step": 595 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008883226866103152, + "loss": 1.4652, + "step": 600 + }, + { + "epoch": 0.65, + "eval_loss": 1.486396074295044, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 600 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008865277424880859, + "loss": 1.4773, + "step": 605 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008847203327165278, + "loss": 1.4555, + "step": 610 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008829005155852125, + "loss": 1.5235, + "step": 615 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008810683497838525, + "loss": 1.4329, + "step": 620 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008792238944004096, + "loss": 1.4515, + "step": 625 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008773672089191885, + "loss": 1.4616, + "step": 630 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008754983532189185, + "loss": 1.3931, + "step": 635 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008736173875708229, + "loss": 1.4714, + "step": 640 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008717243726366746, + "loss": 1.4831, + "step": 645 + }, + { + "epoch": 0.7, + "learning_rate": 0.00086981936946684, + "loss": 1.4928, + "step": 650 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008679024394983105, + "loss": 1.3735, + "step": 655 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008659736445527202, + "loss": 1.4587, + "step": 660 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008640330468343532, + "loss": 1.5138, + "step": 665 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008620807089281364, + "loss": 1.4625, + "step": 670 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008601166937976226, + "loss": 1.4173, + "step": 675 + }, + { + "epoch": 0.74, + "learning_rate": 0.000858141064782958, + "loss": 1.4901, + "step": 680 + }, + { + "epoch": 0.74, + "learning_rate": 0.0008561538855988409, + "loss": 1.4056, + "step": 685 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008541552203324667, + "loss": 1.4486, + "step": 690 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008521451334414605, + "loss": 1.4147, + "step": 695 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008501236897517987, + "loss": 1.4547, + "step": 700 + }, + { + "epoch": 0.76, + "eval_loss": 1.4729957580566406, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 700 + }, + { + "epoch": 0.76, + "learning_rate": 0.000848090954455718, + "loss": 1.4464, + "step": 705 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008460469931096138, + "loss": 1.4163, + "step": 710 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008439918716319246, + "loss": 1.5283, + "step": 715 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008419256563010076, + "loss": 1.4313, + "step": 720 + }, + { + "epoch": 0.79, + "learning_rate": 0.000839848413753, + "loss": 1.3995, + "step": 725 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377602109796709, + "loss": 1.4265, + "step": 730 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008356611153262598, + "loss": 1.4426, + "step": 735 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008335511944893057, + "loss": 1.4251, + "step": 740 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008314305165144633, + "loss": 1.4686, + "step": 745 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008292991497943081, + "loss": 1.4658, + "step": 750 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008271571630661321, + "loss": 1.4347, + "step": 755 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008250046254097255, + "loss": 1.4235, + "step": 760 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008228416062451494, + "loss": 1.5047, + "step": 765 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008206681753304976, + "loss": 1.445, + "step": 770 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008184844027596461, + "loss": 1.4077, + "step": 775 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008162903589599924, + "loss": 1.5057, + "step": 780 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008140861146901849, + "loss": 1.4445, + "step": 785 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008118717410378407, + "loss": 1.5333, + "step": 790 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008096473094172527, + "loss": 1.3786, + "step": 795 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008074128915670868, + "loss": 1.3781, + "step": 800 + }, + { + "epoch": 0.87, + "eval_loss": 1.4600605964660645, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 800 + }, + { + "epoch": 0.87, + "learning_rate": 0.0008051685595480678, + "loss": 1.5097, + "step": 805 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008029143857406563, + "loss": 1.5608, + "step": 810 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008006504428427133, + "loss": 1.4113, + "step": 815 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007983768038671568, + "loss": 1.3781, + "step": 820 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007960935421396062, + "loss": 1.4056, + "step": 825 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007938007312960178, + "loss": 1.4463, + "step": 830 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007914984452803105, + "loss": 1.3983, + "step": 835 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007891867583419805, + "loss": 1.3968, + "step": 840 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007868657450337066, + "loss": 1.4587, + "step": 845 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845354802089463, + "loss": 1.4654, + "step": 850 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007821960390195224, + "loss": 1.4384, + "step": 855 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007798474969131971, + "loss": 1.44, + "step": 860 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007774899296312414, + "loss": 1.4221, + "step": 865 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007751234132059906, + "loss": 1.3795, + "step": 870 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007727480239583933, + "loss": 1.4748, + "step": 875 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007703638384955494, + "loss": 1.5171, + "step": 880 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007679709337082394, + "loss": 1.3996, + "step": 885 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007655693867684454, + "loss": 1.4386, + "step": 890 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007631592751268618, + "loss": 1.3789, + "step": 895 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007607406765103972, + "loss": 1.4553, + "step": 900 + }, + { + "epoch": 0.98, + "eval_loss": 1.4479364156723022, + "eval_runtime": 10.6698, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 900 + }, + { + "epoch": 0.98, + "learning_rate": 0.000758313668919668, + "loss": 1.3962, + "step": 905 + }, + { + "epoch": 0.99, + "learning_rate": 0.000755878330626483, + "loss": 1.3899, + "step": 910 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007534347401713191, + "loss": 1.3965, + "step": 915 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007509829763607879, + "loss": 1.367, + "step": 920 + }, + { + "epoch": 1.0, + "learning_rate": 0.0007485231182650945, + "loss": 1.4027, + "step": 925 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007460552452154877, + "loss": 1.3563, + "step": 930 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007435794368017007, + "loss": 1.3192, + "step": 935 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007410957728693856, + "loss": 1.2772, + "step": 940 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007386043335175367, + "loss": 1.3291, + "step": 945 + }, + { + "epoch": 1.03, + "learning_rate": 0.000736105199095909, + "loss": 1.304, + "step": 950 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007335984502024256, + "loss": 1.3832, + "step": 955 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007310841676805791, + "loss": 1.3351, + "step": 960 + }, + { + "epoch": 1.05, + "learning_rate": 0.000728562432616824, + "loss": 1.3375, + "step": 965 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260333263379619, + "loss": 1.3323, + "step": 970 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007234969304085186, + "loss": 1.3293, + "step": 975 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007209533266281133, + "loss": 1.3859, + "step": 980 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007184025970288211, + "loss": 1.3553, + "step": 985 + }, + { + "epoch": 1.07, + "learning_rate": 0.000715844823872527, + "loss": 1.3607, + "step": 990 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007132800896482731, + "loss": 1.3457, + "step": 995 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007107084770695986, + "loss": 1.3788, + "step": 1000 + }, + { + "epoch": 1.08, + "eval_loss": 1.4371482133865356, + "eval_runtime": 10.6719, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1000 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007081300690718709, + "loss": 1.3039, + "step": 1005 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007055449488096132, + "loss": 1.2719, + "step": 1010 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007029531996538212, + "loss": 1.4107, + "step": 1015 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007003549051892738, + "loss": 1.38, + "step": 1020 + }, + { + "epoch": 1.11, + "learning_rate": 0.0006977501492118391, + "loss": 1.3408, + "step": 1025 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006951390157257712, + "loss": 1.3704, + "step": 1030 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006925215889410004, + "loss": 1.345, + "step": 1035 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006898979532704186, + "loss": 1.3414, + "step": 1040 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006872681933271559, + "loss": 1.3131, + "step": 1045 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006846323939218526, + "loss": 1.3363, + "step": 1050 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006819906400599234, + "loss": 1.3659, + "step": 1055 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006793430169388163, + "loss": 1.3145, + "step": 1060 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006766896099452652, + "loss": 1.3727, + "step": 1065 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006740305046525351, + "loss": 1.3478, + "step": 1070 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006713657868176639, + "loss": 1.3848, + "step": 1075 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006686955423786951, + "loss": 1.3501, + "step": 1080 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006660198574519078, + "loss": 1.3782, + "step": 1085 + }, + { + "epoch": 1.18, + "learning_rate": 0.000663338818329038, + "loss": 1.3767, + "step": 1090 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006606525114744965, + "loss": 1.3665, + "step": 1095 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006579610235225805, + "loss": 1.2234, + "step": 1100 + }, + { + "epoch": 1.19, + "eval_loss": 1.4341663122177124, + "eval_runtime": 10.6713, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 1100 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006552644412746791, + "loss": 1.4083, + "step": 1105 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006525628516964741, + "loss": 1.4225, + "step": 1110 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006498563419151354, + "loss": 1.3677, + "step": 1115 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006471449992165113, + "loss": 1.2836, + "step": 1120 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006444289110423129, + "loss": 1.3428, + "step": 1125 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006417081649872952, + "loss": 1.3192, + "step": 1130 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006389828487964305, + "loss": 1.3084, + "step": 1135 + }, + { + "epoch": 1.24, + "learning_rate": 0.00063625305036208, + "loss": 1.3702, + "step": 1140 + }, + { + "epoch": 1.24, + "learning_rate": 0.000633518857721159, + "loss": 1.3054, + "step": 1145 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006307803590522972, + "loss": 1.3211, + "step": 1150 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006280376426729947, + "loss": 1.3319, + "step": 1155 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006252907970367749, + "loss": 1.4346, + "step": 1160 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006225399107303309, + "loss": 1.3938, + "step": 1165 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006197850724706682, + "loss": 1.4371, + "step": 1170 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006170263711022451, + "loss": 1.2925, + "step": 1175 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006142638955941057, + "loss": 1.3135, + "step": 1180 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006114977350370114, + "loss": 1.3572, + "step": 1185 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006087279786405684, + "loss": 1.3918, + "step": 1190 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006059547157303491, + "loss": 1.3732, + "step": 1195 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006031780357450124, + "loss": 1.3541, + "step": 1200 + }, + { + "epoch": 1.3, + "eval_loss": 1.4208500385284424, + "eval_runtime": 10.67, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1200 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006003980282334191, + "loss": 1.2997, + "step": 1205 + }, + { + "epoch": 1.31, + "learning_rate": 0.0005976147828517439, + "loss": 1.2832, + "step": 1210 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005948283893605839, + "loss": 1.3863, + "step": 1215 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005920389376220633, + "loss": 1.3599, + "step": 1220 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005892465175969366, + "loss": 1.3085, + "step": 1225 + }, + { + "epoch": 1.33, + "learning_rate": 0.000586451219341686, + "loss": 1.3355, + "step": 1230 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005836531330056176, + "loss": 1.291, + "step": 1235 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005808523488279542, + "loss": 1.3286, + "step": 1240 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005780489571349249, + "loss": 1.3704, + "step": 1245 + }, + { + "epoch": 1.35, + "learning_rate": 0.000575243048336852, + "loss": 1.3263, + "step": 1250 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005724347129252354, + "loss": 1.3357, + "step": 1255 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005696240414698337, + "loss": 1.3665, + "step": 1260 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005668111246157441, + "loss": 1.2568, + "step": 1265 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005639960530804787, + "loss": 1.3212, + "step": 1270 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005611789176510384, + "loss": 1.3358, + "step": 1275 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005583598091809859, + "loss": 1.3618, + "step": 1280 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005555388185875146, + "loss": 1.3273, + "step": 1285 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005527160368485172, + "loss": 1.284, + "step": 1290 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005498915549996516, + "loss": 1.3665, + "step": 1295 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005470654641314045, + "loss": 1.2796, + "step": 1300 + }, + { + "epoch": 1.41, + "eval_loss": 1.4054052829742432, + "eval_runtime": 10.6703, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1300 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005442378553861545, + "loss": 1.3107, + "step": 1305 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005414088199552319, + "loss": 1.3665, + "step": 1310 + }, + { + "epoch": 1.43, + "learning_rate": 0.000538578449075978, + "loss": 1.3326, + "step": 1315 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005357468340288031, + "loss": 1.3383, + "step": 1320 + }, + { + "epoch": 1.44, + "learning_rate": 0.000532914066134242, + "loss": 1.336, + "step": 1325 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005300802367500093, + "loss": 1.3949, + "step": 1330 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005272454372680532, + "loss": 1.3214, + "step": 1335 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005244097591116077, + "loss": 1.376, + "step": 1340 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005215732937322439, + "loss": 1.2345, + "step": 1345 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005187361326069224, + "loss": 1.4495, + "step": 1350 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005158983672350405, + "loss": 1.3978, + "step": 1355 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005130600891354833, + "loss": 1.2517, + "step": 1360 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005102213898436715, + "loss": 1.3823, + "step": 1365 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005073823609086091, + "loss": 1.3219, + "step": 1370 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005045430938899315, + "loss": 1.3354, + "step": 1375 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005017036803549523, + "loss": 1.3054, + "step": 1380 + }, + { + "epoch": 1.5, + "learning_rate": 0.0004988642118757102, + "loss": 1.2346, + "step": 1385 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004960247800260161, + "loss": 1.274, + "step": 1390 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004931854763784994, + "loss": 1.4231, + "step": 1395 + }, + { + "epoch": 1.52, + "learning_rate": 0.000490346392501655, + "loss": 1.2872, + "step": 1400 + }, + { + "epoch": 1.52, + "eval_loss": 1.3990795612335205, + "eval_runtime": 10.6706, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1400 + }, + { + "epoch": 1.52, + "learning_rate": 0.00048750761995688984, + "loss": 1.4041, + "step": 1405 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004846692502955709, + "loss": 1.3405, + "step": 1410 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048183137505607154, + "loss": 1.3198, + "step": 1415 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047899408576082016, + "loss": 1.3528, + "step": 1420 + }, + { + "epoch": 1.54, + "learning_rate": 0.0004761574739133478, + "loss": 1.3095, + "step": 1425 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047332163099533787, + "loss": 1.3278, + "step": 1430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00047048664846367587, + "loss": 1.3305, + "step": 1435 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004676526177474991, + "loss": 1.3997, + "step": 1440 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046481963024524846, + "loss": 1.341, + "step": 1445 + }, + { + "epoch": 1.57, + "learning_rate": 0.00046198777732172133, + "loss": 1.3008, + "step": 1450 + }, + { + "epoch": 1.58, + "learning_rate": 0.00045915715030512405, + "loss": 1.2643, + "step": 1455 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004563278404841273, + "loss": 1.3169, + "step": 1460 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045349993910492154, + "loss": 1.3062, + "step": 1465 + }, + { + "epoch": 1.59, + "learning_rate": 0.00045067353736827495, + "loss": 1.2876, + "step": 1470 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004478487264265913, + "loss": 1.3534, + "step": 1475 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004450255973809707, + "loss": 1.318, + "step": 1480 + }, + { + "epoch": 1.61, + "learning_rate": 0.000442204241278272, + "loss": 1.3195, + "step": 1485 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004393847491081756, + "loss": 1.3208, + "step": 1490 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004365672118002494, + "loss": 1.3879, + "step": 1495 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004337517202210168, + "loss": 1.3356, + "step": 1500 + }, + { + "epoch": 1.63, + "eval_loss": 1.3873966932296753, + "eval_runtime": 10.6704, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 1500 + }, + { + "epoch": 1.63, + "learning_rate": 0.0004309383651710254, + "loss": 1.3163, + "step": 1505 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042812723738191896, + "loss": 1.3119, + "step": 1510 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004253184275135116, + "loss": 1.2777, + "step": 1515 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004225120261508637, + "loss": 1.3624, + "step": 1520 + }, + { + "epoch": 1.65, + "learning_rate": 0.0004197081238013602, + "loss": 1.3231, + "step": 1525 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004169068108917924, + "loss": 1.3807, + "step": 1530 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004141081777654412, + "loss": 1.3301, + "step": 1535 + }, + { + "epoch": 1.67, + "learning_rate": 0.0004113123146791633, + "loss": 1.3032, + "step": 1540 + }, + { + "epoch": 1.67, + "learning_rate": 0.000408519311800481, + "loss": 1.2957, + "step": 1545 + }, + { + "epoch": 1.68, + "learning_rate": 0.00040572925920467375, + "loss": 1.3138, + "step": 1550 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004029422468718737, + "loss": 1.2496, + "step": 1555 + }, + { + "epoch": 1.69, + "learning_rate": 0.0004001583646841632, + "loss": 1.3796, + "step": 1560 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039737770242267637, + "loss": 1.3492, + "step": 1565 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039460034976470396, + "loss": 1.3138, + "step": 1570 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003918263962808004, + "loss": 1.3172, + "step": 1575 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003890559314318959, + "loss": 1.3446, + "step": 1580 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038628904456641116, + "loss": 1.3062, + "step": 1585 + }, + { + "epoch": 1.72, + "learning_rate": 0.00038352582491737547, + "loss": 1.2899, + "step": 1590 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003807663615995491, + "loss": 1.2942, + "step": 1595 + }, + { + "epoch": 1.73, + "learning_rate": 0.0003780107436065498, + "loss": 1.2902, + "step": 1600 + }, + { + "epoch": 1.73, + "eval_loss": 1.379552960395813, + "eval_runtime": 10.6705, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1600 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037525905980798183, + "loss": 1.3213, + "step": 1605 + }, + { + "epoch": 1.74, + "learning_rate": 0.0003725113989465705, + "loss": 1.2286, + "step": 1610 + }, + { + "epoch": 1.75, + "learning_rate": 0.00036976784963530017, + "loss": 1.3394, + "step": 1615 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003670285003545564, + "loss": 1.2879, + "step": 1620 + }, + { + "epoch": 1.76, + "learning_rate": 0.00036429343944927196, + "loss": 1.3369, + "step": 1625 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003615627551260785, + "loss": 1.3393, + "step": 1630 + }, + { + "epoch": 1.77, + "learning_rate": 0.0003588365354504612, + "loss": 1.3437, + "step": 1635 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035611486834391894, + "loss": 1.2843, + "step": 1640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00035339784158112893, + "loss": 1.3463, + "step": 1645 + }, + { + "epoch": 1.79, + "learning_rate": 0.00035068554278711494, + "loss": 1.2847, + "step": 1650 + }, + { + "epoch": 1.79, + "learning_rate": 0.00034797805943442313, + "loss": 1.2493, + "step": 1655 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003452754788402996, + "loss": 1.3471, + "step": 1660 + }, + { + "epoch": 1.8, + "learning_rate": 0.00034257788816387475, + "loss": 1.2983, + "step": 1665 + }, + { + "epoch": 1.81, + "learning_rate": 0.0003398853744033529, + "loss": 1.3259, + "step": 1670 + }, + { + "epoch": 1.82, + "learning_rate": 0.0003371980243932056, + "loss": 1.333, + "step": 1675 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033451592480137195, + "loss": 1.3071, + "step": 1680 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033183916212646346, + "loss": 1.3238, + "step": 1685 + }, + { + "epoch": 1.83, + "learning_rate": 0.0003291678226949741, + "loss": 1.3129, + "step": 1690 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003265019926584964, + "loss": 1.3235, + "step": 1695 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032384175799094297, + "loss": 1.3016, + "step": 1700 + }, + { + "epoch": 1.84, + "eval_loss": 1.3693352937698364, + "eval_runtime": 10.6712, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.78, + "step": 1700 + }, + { + "epoch": 1.85, + "learning_rate": 0.0003211872044857743, + "loss": 1.2658, + "step": 1705 + }, + { + "epoch": 1.85, + "learning_rate": 0.00031853841775323103, + "loss": 1.274, + "step": 1710 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031589548321757366, + "loss": 1.2629, + "step": 1715 + }, + { + "epoch": 1.86, + "learning_rate": 0.0003132584861143274, + "loss": 1.3052, + "step": 1720 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003106275114875332, + "loss": 1.3099, + "step": 1725 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003080026441870051, + "loss": 1.2878, + "step": 1730 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030538396886559393, + "loss": 1.2815, + "step": 1735 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030277156997645706, + "loss": 1.2896, + "step": 1740 + }, + { + "epoch": 1.89, + "learning_rate": 0.00030016553177033466, + "loss": 1.3545, + "step": 1745 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002975659382928332, + "loss": 1.29, + "step": 1750 + }, + { + "epoch": 1.9, + "learning_rate": 0.00029497287338171385, + "loss": 1.3543, + "step": 1755 + }, + { + "epoch": 1.91, + "learning_rate": 0.00029238642066418995, + "loss": 1.3202, + "step": 1760 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002898066635542288, + "loss": 1.3261, + "step": 1765 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002872336852498627, + "loss": 1.2256, + "step": 1770 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002846675687305045, + "loss": 1.3423, + "step": 1775 + }, + { + "epoch": 1.93, + "learning_rate": 0.0002821083967542727, + "loss": 1.2896, + "step": 1780 + }, + { + "epoch": 1.93, + "learning_rate": 0.00027955625185532217, + "loss": 1.2312, + "step": 1785 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027701121634118143, + "loss": 1.2822, + "step": 1790 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027447337229009937, + "loss": 1.319, + "step": 1795 + }, + { + "epoch": 1.95, + "learning_rate": 0.00027194280154839824, + "loss": 1.3727, + "step": 1800 + }, + { + "epoch": 1.95, + "eval_loss": 1.3620151281356812, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1800 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002694195857278326, + "loss": 1.251, + "step": 1805 + }, + { + "epoch": 1.96, + "learning_rate": 0.0002669038062029592, + "loss": 1.2324, + "step": 1810 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002643955441085115, + "loss": 1.2644, + "step": 1815 + }, + { + "epoch": 1.97, + "learning_rate": 0.000261894880336783, + "loss": 1.2582, + "step": 1820 + }, + { + "epoch": 1.98, + "learning_rate": 0.0002594018955350191, + "loss": 1.3433, + "step": 1825 + }, + { + "epoch": 1.98, + "learning_rate": 0.00025691667010281616, + "loss": 1.3069, + "step": 1830 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025443928418952724, + "loss": 1.2895, + "step": 1835 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002519698176916791, + "loss": 1.2799, + "step": 1840 + }, + { + "epoch": 2.0, + "learning_rate": 0.000249508350250395, + "loss": 1.3044, + "step": 1845 + }, + { + "epoch": 2.0, + "learning_rate": 0.0002470549612488247, + "loss": 1.2324, + "step": 1850 + }, + { + "epoch": 2.01, + "learning_rate": 0.0002446097298095867, + "loss": 1.2357, + "step": 1855 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024217273479221514, + "loss": 1.2329, + "step": 1860 + }, + { + "epoch": 2.02, + "learning_rate": 0.00023974405479061623, + "loss": 1.2486, + "step": 1865 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002373237681305348, + "loss": 1.1959, + "step": 1870 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023491195286702777, + "loss": 1.2485, + "step": 1875 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023250868678194536, + "loss": 1.2585, + "step": 1880 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023011404738142532, + "loss": 1.2108, + "step": 1885 + }, + { + "epoch": 2.05, + "learning_rate": 0.0002277281118933916, + "loss": 1.188, + "step": 1890 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022535095726506344, + "loss": 1.2197, + "step": 1895 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022298266016047513, + "loss": 1.1352, + "step": 1900 + }, + { + "epoch": 2.06, + "eval_loss": 1.3637186288833618, + "eval_runtime": 10.671, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 1900 + }, + { + "epoch": 2.06, + "learning_rate": 0.0002206232969580027, + "loss": 1.2265, + "step": 1905 + }, + { + "epoch": 2.07, + "learning_rate": 0.00021827294374790034, + "loss": 1.2631, + "step": 1910 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021593167632984756, + "loss": 1.1309, + "step": 1915 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021359957021050392, + "loss": 1.2877, + "step": 1920 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021127670060107362, + "loss": 1.2993, + "step": 1925 + }, + { + "epoch": 2.09, + "learning_rate": 0.00020896314241488075, + "loss": 1.2244, + "step": 1930 + }, + { + "epoch": 2.1, + "learning_rate": 0.0002066589702649529, + "loss": 1.1812, + "step": 1935 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020436425846161437, + "loss": 1.2113, + "step": 1940 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020207908101009054, + "loss": 1.1754, + "step": 1945 + }, + { + "epoch": 2.11, + "learning_rate": 0.00019980351160812083, + "loss": 1.1897, + "step": 1950 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001975376236435813, + "loss": 1.1978, + "step": 1955 + }, + { + "epoch": 2.12, + "learning_rate": 0.00019528149019211883, + "loss": 1.1937, + "step": 1960 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019303518401479414, + "loss": 1.2093, + "step": 1965 + }, + { + "epoch": 2.13, + "learning_rate": 0.00019079877755573442, + "loss": 1.2119, + "step": 1970 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001885723429397983, + "loss": 1.1933, + "step": 1975 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018635595197024886, + "loss": 1.2046, + "step": 1980 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018414967612643814, + "loss": 1.1605, + "step": 1985 + }, + { + "epoch": 2.16, + "learning_rate": 0.0001819535865615018, + "loss": 1.1764, + "step": 1990 + }, + { + "epoch": 2.16, + "learning_rate": 0.00017976775410006508, + "loss": 1.2094, + "step": 1995 + }, + { + "epoch": 2.17, + "learning_rate": 0.000177592249235958, + "loss": 1.146, + "step": 2000 + }, + { + "epoch": 2.17, + "eval_loss": 1.3614530563354492, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2000 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017542714212994188, + "loss": 1.2674, + "step": 2005 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017327250260744698, + "loss": 1.2817, + "step": 2010 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017112840015632086, + "loss": 1.2693, + "step": 2015 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016899490392458628, + "loss": 1.253, + "step": 2020 + }, + { + "epoch": 2.19, + "learning_rate": 0.00016687208271821253, + "loss": 1.208, + "step": 2025 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016476000499889514, + "loss": 1.1818, + "step": 2030 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001626587388818491, + "loss": 1.1945, + "step": 2035 + }, + { + "epoch": 2.21, + "learning_rate": 0.0001605683521336116, + "loss": 1.2225, + "step": 2040 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015848891216985596, + "loss": 1.1726, + "step": 2045 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015642048605321856, + "loss": 1.1651, + "step": 2050 + }, + { + "epoch": 2.23, + "learning_rate": 0.0001543631404911356, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015231694183369106, + "loss": 1.191, + "step": 2060 + }, + { + "epoch": 2.24, + "learning_rate": 0.0001502819560714781, + "loss": 1.2421, + "step": 2065 + }, + { + "epoch": 2.24, + "learning_rate": 0.00014825824883347018, + "loss": 1.1924, + "step": 2070 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014624588538490413, + "loss": 1.1714, + "step": 2075 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014424493062517623, + "loss": 1.2641, + "step": 2080 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014225544908574872, + "loss": 1.2721, + "step": 2085 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014027750492806817, + "loss": 1.2431, + "step": 2090 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013831116194149712, + "loss": 1.2983, + "step": 2095 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013635648354125662, + "loss": 1.2144, + "step": 2100 + }, + { + "epoch": 2.28, + "eval_loss": 1.3538448810577393, + "eval_runtime": 10.6717, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2100 + }, + { + "epoch": 2.28, + "learning_rate": 0.0001344135327663804, + "loss": 1.1463, + "step": 2105 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013248237227768246, + "loss": 1.2751, + "step": 2110 + }, + { + "epoch": 2.29, + "learning_rate": 0.00013056306435573633, + "loss": 1.2196, + "step": 2115 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012865567089886642, + "loss": 1.1964, + "step": 2120 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012676025342115105, + "loss": 1.1749, + "step": 2125 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012487687305043978, + "loss": 1.2615, + "step": 2130 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012300559052638122, + "loss": 1.2064, + "step": 2135 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012114646619846425, + "loss": 1.1642, + "step": 2140 + }, + { + "epoch": 2.32, + "learning_rate": 0.00011929956002407194, + "loss": 1.1704, + "step": 2145 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011746493156654814, + "loss": 1.1668, + "step": 2150 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011564263999327546, + "loss": 1.1584, + "step": 2155 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011383274407376848, + "loss": 1.2412, + "step": 2160 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001120353021777778, + "loss": 1.1688, + "step": 2165 + }, + { + "epoch": 2.35, + "learning_rate": 0.00011025037227340711, + "loss": 1.2097, + "step": 2170 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010847801192524454, + "loss": 1.2057, + "step": 2175 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010671827829250585, + "loss": 1.2296, + "step": 2180 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010497122812719068, + "loss": 1.2547, + "step": 2185 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010323691777225286, + "loss": 1.1746, + "step": 2190 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010151540315978314, + "loss": 1.1466, + "step": 2195 + }, + { + "epoch": 2.38, + "learning_rate": 9.98067398092049e-05, + "loss": 1.1551, + "step": 2200 + }, + { + "epoch": 2.38, + "eval_loss": 1.349250316619873, + "eval_runtime": 10.6708, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2200 + }, + { + "epoch": 2.39, + "learning_rate": 9.811098282548447e-05, + "loss": 1.158, + "step": 2205 + }, + { + "epoch": 2.4, + "learning_rate": 9.642818689735305e-05, + "loss": 1.1444, + "step": 2210 + }, + { + "epoch": 2.4, + "learning_rate": 9.475840629554394e-05, + "loss": 1.2504, + "step": 2215 + }, + { + "epoch": 2.41, + "learning_rate": 9.310169487104131e-05, + "loss": 1.1439, + "step": 2220 + }, + { + "epoch": 2.41, + "learning_rate": 9.145810605334454e-05, + "loss": 1.2758, + "step": 2225 + }, + { + "epoch": 2.42, + "learning_rate": 8.982769284874386e-05, + "loss": 1.1992, + "step": 2230 + }, + { + "epoch": 2.42, + "learning_rate": 8.821050783861212e-05, + "loss": 1.2177, + "step": 2235 + }, + { + "epoch": 2.43, + "learning_rate": 8.660660317770841e-05, + "loss": 1.1942, + "step": 2240 + }, + { + "epoch": 2.43, + "learning_rate": 8.501603059249563e-05, + "loss": 1.163, + "step": 2245 + }, + { + "epoch": 2.44, + "learning_rate": 8.343884137947333e-05, + "loss": 1.239, + "step": 2250 + }, + { + "epoch": 2.44, + "learning_rate": 8.187508640352265e-05, + "loss": 1.1455, + "step": 2255 + }, + { + "epoch": 2.45, + "learning_rate": 8.032481609626575e-05, + "loss": 1.2165, + "step": 2260 + }, + { + "epoch": 2.45, + "learning_rate": 7.878808045444014e-05, + "loss": 1.1982, + "step": 2265 + }, + { + "epoch": 2.46, + "learning_rate": 7.726492903828575e-05, + "loss": 1.212, + "step": 2270 + }, + { + "epoch": 2.47, + "learning_rate": 7.575541096994637e-05, + "loss": 1.2453, + "step": 2275 + }, + { + "epoch": 2.47, + "learning_rate": 7.4259574931886e-05, + "loss": 1.2607, + "step": 2280 + }, + { + "epoch": 2.48, + "learning_rate": 7.27774691653188e-05, + "loss": 1.1936, + "step": 2285 + }, + { + "epoch": 2.48, + "learning_rate": 7.130914146865247e-05, + "loss": 1.2702, + "step": 2290 + }, + { + "epoch": 2.49, + "learning_rate": 6.985463919594781e-05, + "loss": 1.133, + "step": 2295 + }, + { + "epoch": 2.49, + "learning_rate": 6.841400925539104e-05, + "loss": 1.2135, + "step": 2300 + }, + { + "epoch": 2.49, + "eval_loss": 1.3470078706741333, + "eval_runtime": 10.6711, + "eval_samples_per_second": 14.057, + "eval_steps_per_second": 1.781, + "step": 2300 + }, + { + "epoch": 2.5, + "learning_rate": 6.698729810778065e-05, + "loss": 1.1986, + "step": 2305 + }, + { + "epoch": 2.5, + "learning_rate": 6.557455176502986e-05, + "loss": 1.2254, + "step": 2310 + }, + { + "epoch": 2.51, + "learning_rate": 6.417581578868198e-05, + "loss": 1.212, + "step": 2315 + }, + { + "epoch": 2.51, + "learning_rate": 6.279113528844127e-05, + "loss": 1.1517, + "step": 2320 + }, + { + "epoch": 2.52, + "learning_rate": 6.14205549207184e-05, + "loss": 1.1889, + "step": 2325 + }, + { + "epoch": 2.53, + "learning_rate": 6.006411888718982e-05, + "loss": 1.2348, + "step": 2330 + }, + { + "epoch": 2.53, + "learning_rate": 5.872187093337239e-05, + "loss": 1.1862, + "step": 2335 + }, + { + "epoch": 2.54, + "learning_rate": 5.739385434721295e-05, + "loss": 1.2143, + "step": 2340 + }, + { + "epoch": 2.54, + "learning_rate": 5.608011195769186e-05, + "loss": 1.242, + "step": 2345 + }, + { + "epoch": 2.55, + "learning_rate": 5.478068613344151e-05, + "loss": 1.1817, + "step": 2350 + }, + { + "epoch": 2.55, + "learning_rate": 5.3495618781380764e-05, + "loss": 1.1916, + "step": 2355 + }, + { + "epoch": 2.56, + "learning_rate": 5.2224951345362703e-05, + "loss": 1.1231, + "step": 2360 + }, + { + "epoch": 2.56, + "learning_rate": 5.096872480483816e-05, + "loss": 1.2113, + "step": 2365 + }, + { + "epoch": 2.57, + "learning_rate": 4.972697967353445e-05, + "loss": 1.164, + "step": 2370 + }, + { + "epoch": 2.57, + "learning_rate": 4.8499755998148656e-05, + "loss": 1.1947, + "step": 2375 + }, + { + "epoch": 2.58, + "learning_rate": 4.728709335705561e-05, + "loss": 1.2219, + "step": 2380 + }, + { + "epoch": 2.58, + "learning_rate": 4.6089030859032376e-05, + "loss": 1.2104, + "step": 2385 + }, + { + "epoch": 2.59, + "learning_rate": 4.490560714199637e-05, + "loss": 1.2077, + "step": 2390 + }, + { + "epoch": 2.6, + "learning_rate": 4.373686037175917e-05, + "loss": 1.1758, + "step": 2395 + }, + { + "epoch": 2.6, + "learning_rate": 4.258282824079618e-05, + "loss": 1.2094, + "step": 2400 + }, + { + "epoch": 2.6, + "eval_loss": 1.3436678647994995, + "eval_runtime": 10.6699, + "eval_samples_per_second": 14.058, + "eval_steps_per_second": 1.781, + "step": 2400 + }, + { + "epoch": 2.61, + "learning_rate": 4.1443547967030816e-05, + "loss": 1.2, + "step": 2405 + }, + { + "epoch": 2.61, + "learning_rate": 4.031905629263371e-05, + "loss": 1.2246, + "step": 2410 + }, + { + "epoch": 2.62, + "learning_rate": 3.92093894828387e-05, + "loss": 1.198, + "step": 2415 + }, + { + "epoch": 2.62, + "learning_rate": 3.811458332477252e-05, + "loss": 1.269, + "step": 2420 + }, + { + "epoch": 2.63, + "learning_rate": 3.703467312630088e-05, + "loss": 1.189, + "step": 2425 + }, + { + "epoch": 2.63, + "learning_rate": 3.596969371488995e-05, + "loss": 1.1938, + "step": 2430 + }, + { + "epoch": 2.64, + "learning_rate": 3.491967943648289e-05, + "loss": 1.2421, + "step": 2435 + }, + { + "epoch": 2.64, + "learning_rate": 3.388466415439234e-05, + "loss": 1.1145, + "step": 2440 + }, + { + "epoch": 2.65, + "learning_rate": 3.2864681248208184e-05, + "loss": 1.1678, + "step": 2445 + }, + { + "epoch": 2.66, + "learning_rate": 3.185976361272125e-05, + "loss": 1.2627, + "step": 2450 + }, + { + "epoch": 2.66, + "learning_rate": 3.086994365686246e-05, + "loss": 1.1344, + "step": 2455 + }, + { + "epoch": 2.67, + "learning_rate": 2.9895253302657188e-05, + "loss": 1.2325, + "step": 2460 + }, + { + "epoch": 2.67, + "learning_rate": 2.8935723984196304e-05, + "loss": 1.2533, + "step": 2465 + }, + { + "epoch": 2.68, + "learning_rate": 2.7991386646622207e-05, + "loss": 1.226, + "step": 2470 + }, + { + "epoch": 2.68, + "learning_rate": 2.7062271745130595e-05, + "loss": 1.0925, + "step": 2475 + }, + { + "epoch": 2.69, + "learning_rate": 2.614840924398876e-05, + "loss": 1.1444, + "step": 2480 + }, + { + "epoch": 2.69, + "learning_rate": 2.5249828615568794e-05, + "loss": 1.214, + "step": 2485 + }, + { + "epoch": 2.7, + "learning_rate": 2.436655883939737e-05, + "loss": 1.2427, + "step": 2490 + }, + { + "epoch": 2.7, + "learning_rate": 2.3498628401221078e-05, + "loss": 1.2447, + "step": 2495 + }, + { + "epoch": 2.71, + "learning_rate": 2.2646065292087403e-05, + "loss": 1.1835, + "step": 2500 + }, + { + "epoch": 2.71, + "eval_loss": 1.343565821647644, + "eval_runtime": 10.6718, + "eval_samples_per_second": 14.056, + "eval_steps_per_second": 1.78, + "step": 2500 + }, + { + "epoch": 2.71, + "learning_rate": 2.1808897007442762e-05, + "loss": 1.2457, + "step": 2505 + }, + { + "epoch": 2.72, + "learning_rate": 2.098715054624506e-05, + "loss": 1.2248, + "step": 2510 + }, + { + "epoch": 2.73, + "learning_rate": 2.0180852410093153e-05, + "loss": 1.2419, + "step": 2515 + }, + { + "epoch": 2.73, + "learning_rate": 1.939002860237249e-05, + "loss": 1.1763, + "step": 2520 + }, + { + "epoch": 2.74, + "learning_rate": 1.8614704627416045e-05, + "loss": 1.2035, + "step": 2525 + }, + { + "epoch": 2.74, + "learning_rate": 1.7854905489681993e-05, + "loss": 1.1767, + "step": 2530 + }, + { + "epoch": 2.75, + "learning_rate": 1.7110655692947397e-05, + "loss": 1.254, + "step": 2535 + }, + { + "epoch": 2.75, + "learning_rate": 1.638197923951784e-05, + "loss": 1.1941, + "step": 2540 + }, + { + "epoch": 2.76, + "learning_rate": 1.5668899629453225e-05, + "loss": 1.2568, + "step": 2545 + }, + { + "epoch": 2.76, + "learning_rate": 1.4971439859810199e-05, + "loss": 1.2237, + "step": 2550 + }, + { + "epoch": 2.77, + "learning_rate": 1.428962242390025e-05, + "loss": 1.172, + "step": 2555 + }, + { + "epoch": 2.77, + "learning_rate": 1.3623469310564408e-05, + "loss": 1.1835, + "step": 2560 + }, + { + "epoch": 2.78, + "learning_rate": 1.2973002003463797e-05, + "loss": 1.1335, + "step": 2565 + }, + { + "epoch": 2.79, + "learning_rate": 1.2338241480387369e-05, + "loss": 1.1968, + "step": 2570 + }, + { + "epoch": 2.79, + "learning_rate": 1.1719208212574939e-05, + "loss": 1.1962, + "step": 2575 + }, + { + "epoch": 2.8, + "learning_rate": 1.111592216405688e-05, + "loss": 1.2107, + "step": 2580 + }, + { + "epoch": 2.8, + "learning_rate": 1.0528402791010582e-05, + "loss": 1.2148, + "step": 2585 + }, + { + "epoch": 2.81, + "learning_rate": 9.956669041133015e-06, + "loss": 1.1443, + "step": 2590 + }, + { + "epoch": 2.81, + "learning_rate": 9.400739353029209e-06, + "loss": 1.1805, + "step": 2595 + }, + { + "epoch": 2.82, + "learning_rate": 8.860631655618124e-06, + "loss": 1.2061, + "step": 2600 + }, + { + "epoch": 2.82, + "eval_loss": 1.3423666954040527, + "eval_runtime": 10.6722, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 2600 + }, + { + "epoch": 2.82, + "learning_rate": 8.336363367554112e-06, + "loss": 1.2418, + "step": 2605 + }, + { + "epoch": 2.83, + "learning_rate": 7.827951396665312e-06, + "loss": 1.2532, + "step": 2610 + }, + { + "epoch": 2.83, + "learning_rate": 7.335412139408248e-06, + "loss": 1.2836, + "step": 2615 + }, + { + "epoch": 2.84, + "learning_rate": 6.85876148033926e-06, + "loss": 1.2918, + "step": 2620 + }, + { + "epoch": 2.84, + "learning_rate": 6.398014791601847e-06, + "loss": 1.1381, + "step": 2625 + }, + { + "epoch": 2.85, + "learning_rate": 5.953186932431298e-06, + "loss": 1.1686, + "step": 2630 + }, + { + "epoch": 2.86, + "learning_rate": 5.524292248675289e-06, + "loss": 1.2104, + "step": 2635 + }, + { + "epoch": 2.86, + "learning_rate": 5.111344572331145e-06, + "loss": 1.2651, + "step": 2640 + }, + { + "epoch": 2.87, + "learning_rate": 4.714357221099974e-06, + "loss": 1.1296, + "step": 2645 + }, + { + "epoch": 2.87, + "learning_rate": 4.333342997957013e-06, + "loss": 1.1534, + "step": 2650 + }, + { + "epoch": 2.88, + "learning_rate": 3.96831419073862e-06, + "loss": 1.2065, + "step": 2655 + }, + { + "epoch": 2.88, + "learning_rate": 3.6192825717464294e-06, + "loss": 1.2118, + "step": 2660 + }, + { + "epoch": 2.89, + "learning_rate": 3.2862593973670975e-06, + "loss": 1.1139, + "step": 2665 + }, + { + "epoch": 2.89, + "learning_rate": 2.969255407709648e-06, + "loss": 1.1702, + "step": 2670 + }, + { + "epoch": 2.9, + "learning_rate": 2.668280826259195e-06, + "loss": 1.1821, + "step": 2675 + }, + { + "epoch": 2.9, + "learning_rate": 2.383345359546818e-06, + "loss": 1.1826, + "step": 2680 + }, + { + "epoch": 2.91, + "learning_rate": 2.1144581968369213e-06, + "loss": 1.236, + "step": 2685 + }, + { + "epoch": 2.92, + "learning_rate": 1.861628009830696e-06, + "loss": 1.2308, + "step": 2690 + }, + { + "epoch": 2.92, + "learning_rate": 1.6248629523865077e-06, + "loss": 1.1957, + "step": 2695 + }, + { + "epoch": 2.93, + "learning_rate": 1.4041706602567206e-06, + "loss": 1.1613, + "step": 2700 + }, + { + "epoch": 2.93, + "eval_loss": 1.3419121503829956, + "eval_runtime": 10.672, + "eval_samples_per_second": 14.055, + "eval_steps_per_second": 1.78, + "step": 2700 + }, + { + "epoch": 2.93, + "learning_rate": 1.1995582508418924e-06, + "loss": 1.152, + "step": 2705 + }, + { + "epoch": 2.94, + "learning_rate": 1.0110323229608476e-06, + "loss": 1.2173, + "step": 2710 + }, + { + "epoch": 2.94, + "learning_rate": 8.385989566379593e-07, + "loss": 1.2202, + "step": 2715 + }, + { + "epoch": 2.95, + "learning_rate": 6.82263712907083e-07, + "loss": 1.1492, + "step": 2720 + }, + { + "epoch": 2.95, + "learning_rate": 5.420316336323117e-07, + "loss": 1.2548, + "step": 2725 + }, + { + "epoch": 2.96, + "learning_rate": 4.1790724134521676e-07, + "loss": 1.2085, + "step": 2730 + }, + { + "epoch": 2.96, + "learning_rate": 3.098945390991315e-07, + "loss": 1.2253, + "step": 2735 + }, + { + "epoch": 2.97, + "learning_rate": 2.179970103398654e-07, + "loss": 1.1604, + "step": 2740 + }, + { + "epoch": 2.97, + "learning_rate": 1.4221761879351648e-07, + "loss": 1.2269, + "step": 2745 + }, + { + "epoch": 2.98, + "learning_rate": 8.25588083709361e-08, + "loss": 1.2413, + "step": 2750 + }, + { + "epoch": 2.99, + "learning_rate": 3.9022503088737006e-08, + "loss": 1.2255, + "step": 2755 + }, + { + "epoch": 2.99, + "learning_rate": 1.1610107007398175e-08, + "loss": 1.2192, + "step": 2760 + }, + { + "epoch": 3.0, + "learning_rate": 3.2250418585677564e-10, + "loss": 1.1758, + "step": 2765 + }, + { + "epoch": 3.0, + "step": 2766, + "total_flos": 9.118407110185452e+17, + "train_loss": 1.3472231076148138, + "train_runtime": 10021.615, + "train_samples_per_second": 4.42, + "train_steps_per_second": 0.276 + } + ], + "logging_steps": 5, + "max_steps": 2766, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 9.118407110185452e+17, + "trial_name": null, + "trial_params": null +} diff --git a/PT/training_args.bin b/PT/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..37ccaa868292a344ecf406147baf26160efd3673 --- /dev/null +++ b/PT/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2166a4c94c80da923c6c349bc51e8b46840abb71e780a855ae721f59f1371d47 +size 3466 diff --git a/SFT-1600/README.md b/SFT-1600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT-1600/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT-1600/adapter_config.json b/SFT-1600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT-1600/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT-1600/adapter_model.bin b/SFT-1600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ac2178f771dfaa2ef7bd6f184d7881f87dde1f4e --- /dev/null +++ b/SFT-1600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e471f59d56ece56a7e719a7d1c7b2d967e26f97a8475729615e787d02ee8552 +size 16821197 diff --git a/SFT-1600/finetuning_args.json b/SFT-1600/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT-1600/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT-1600/optimizer.pt b/SFT-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..34f92952d0b3f10f2b14241228963f9f5def6cda --- /dev/null +++ b/SFT-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aafa03638ff643cef96b1aefe18758a901c316297c96887435a8b83101f6294 +size 33661637 diff --git a/SFT-1600/rng_state.pth b/SFT-1600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ce30a636a5441b6aaba5952c5080a2716b971ad2 --- /dev/null +++ b/SFT-1600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45bdfee9d7d1723ef5a06a32158dbf12b12c5f7c3ea6f28389512704065131f6 +size 14575 diff --git a/SFT-1600/scheduler.pt b/SFT-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..64c15014f782acf7108b8179e3ca368f6a681510 --- /dev/null +++ b/SFT-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77470147fc4fe853892edfcab3168fa02fc978668c512ef042e655580cfc0bff +size 627 diff --git a/SFT-1600/trainer_state.json b/SFT-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..afb3659efe5f2c2275e7af39c0fc1f1705ab9c0f --- /dev/null +++ b/SFT-1600/trainer_state.json @@ -0,0 +1,2067 @@ +{ + "best_metric": 2.005824327468872, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1500", + "epoch": 2.191780821917808, + "eval_steps": 100, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022378234341619019, + "loss": 1.6531, + "step": 1505 + }, + { + "epoch": 2.07, + "learning_rate": 0.00022080009894693948, + "loss": 1.6599, + "step": 1510 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021783221813127473, + "loss": 1.7998, + "step": 1515 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002148788536541064, + "loss": 1.6775, + "step": 1520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021194015745354123, + "loss": 1.5615, + "step": 1525 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020901628071306455, + "loss": 1.7347, + "step": 1530 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020610737385376348, + "loss": 1.7952, + "step": 1535 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020321358652658806, + "loss": 1.6405, + "step": 1540 + }, + { + "epoch": 2.12, + "learning_rate": 0.00020033506760465237, + "loss": 1.5677, + "step": 1545 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001974719651755756, + "loss": 1.7295, + "step": 1550 + }, + { + "epoch": 2.13, + "learning_rate": 0.0001946244265338637, + "loss": 1.6888, + "step": 1555 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019179259817333133, + "loss": 1.6395, + "step": 1560 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001889766257795663, + "loss": 1.6953, + "step": 1565 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018617665422243336, + "loss": 1.7618, + "step": 1570 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018339282754862223, + "loss": 1.6839, + "step": 1575 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018062528897423643, + "loss": 1.654, + "step": 1580 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017787418087742614, + "loss": 1.7326, + "step": 1585 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017513964479106266, + "loss": 1.7163, + "step": 1590 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017242182139545742, + "loss": 1.6667, + "step": 1595 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001697208505111249, + "loss": 1.7881, + "step": 1600 + }, + { + "epoch": 2.19, + "eval_loss": 2.008232831954956, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1600 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 4.735206482632704e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT-1600/training_args.bin b/SFT-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/README.md b/SFT/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/adapter_config.json b/SFT/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/adapter_model.bin b/SFT/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..686bbb6210494c9f5918ddafeb3d0b62da71884e --- /dev/null +++ b/SFT/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77033327d358030e6779b97afc5e7382636d6f679c223d1faef04ea679c7e2d +size 16821197 diff --git a/SFT/all_results.json b/SFT/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..14181d9db9737077d0e13f0344ed965971828ccf --- /dev/null +++ b/SFT/all_results.json @@ -0,0 +1,11 @@ +{ + "epoch": 3.0, + "eval_loss": 1.98636794090271, + "eval_runtime": 8.8384, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "train_loss": 1.969855450712927, + "train_runtime": 7239.9426, + "train_samples_per_second": 4.84, + "train_steps_per_second": 0.302 +} \ No newline at end of file diff --git a/SFT/checkpoint-100/README.md b/SFT/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-100/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-100/adapter_config.json b/SFT/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-100/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-100/adapter_model.bin b/SFT/checkpoint-100/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3a1817644d9c805805353ff1120fa40826d1de9e --- /dev/null +++ b/SFT/checkpoint-100/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c0d091ca73311b380b3779848a1b9ffdb7fa719e1520300a73b985b88dc0e2 +size 16821197 diff --git a/SFT/checkpoint-100/finetuning_args.json b/SFT/checkpoint-100/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-100/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-100/optimizer.pt b/SFT/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9850b29e3be30a308494fe5b1c57ee1f2c540422 --- /dev/null +++ b/SFT/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ba139d5cd0ceed3f1719c2f1d9ca5f9a142d81f913b3ebb9521366ad5d6d032 +size 33661637 diff --git a/SFT/checkpoint-100/rng_state.pth b/SFT/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b733ecd53211bcb11fd54e804120f88a37b2956f --- /dev/null +++ b/SFT/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4c7ce5e5f2b9faea3985fa993cab0e0f1d3eefabe458f9f8109dd2e3eb3e912 +size 14575 diff --git a/SFT/checkpoint-100/scheduler.pt b/SFT/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..01fe361a963b3366977cfd23a23365e5ef3e5f83 --- /dev/null +++ b/SFT/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:447ba1df53e2c56686cb0ab379c97e22904a88acbc0a22fc914eee0ebbf5c815 +size 627 diff --git a/SFT/checkpoint-100/trainer_state.json b/SFT/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d12c9558e38c8bb5480c9ab545639607501ec914 --- /dev/null +++ b/SFT/checkpoint-100/trainer_state.json @@ -0,0 +1,147 @@ +{ + "best_metric": 2.3112449645996094, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-100", + "epoch": 0.136986301369863, + "eval_steps": 100, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 2.953615677574349e+16, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-100/training_args.bin b/SFT/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1000/README.md b/SFT/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1000/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1000/adapter_config.json b/SFT/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1000/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1000/adapter_model.bin b/SFT/checkpoint-1000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..b00c2c7e82baa9ab7296eb8f8495b7f23eac5ace --- /dev/null +++ b/SFT/checkpoint-1000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bbe352501fb78b7319a2a39466ceb9f210340b9a0fe0e00e56fb25036e5d6b1 +size 16821197 diff --git a/SFT/checkpoint-1000/finetuning_args.json b/SFT/checkpoint-1000/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1000/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1000/optimizer.pt b/SFT/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c10587d98c01e893663088c44e47fd8db427b69 --- /dev/null +++ b/SFT/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b99ac512081c55c04eef3417577435b90e82d08308cc53a28fc9823c0608694d +size 33661637 diff --git a/SFT/checkpoint-1000/rng_state.pth b/SFT/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..82f27557dbdf113b75fcb8e1c955041b9d390cbc --- /dev/null +++ b/SFT/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b003156d6f70a21c7b9439cfbcbde86139307fef860612dab5b6716c05380a34 +size 14575 diff --git a/SFT/checkpoint-1000/scheduler.pt b/SFT/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cafdd245e31a1d9925762375319c49bace89380c --- /dev/null +++ b/SFT/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7d3013e7640b352c1586392ad90d0a413160d09b74e6570c5811203eb293839 +size 627 diff --git a/SFT/checkpoint-1000/trainer_state.json b/SFT/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7386b2fa154c3dc696f6324ce401d5b67ef1e32e --- /dev/null +++ b/SFT/checkpoint-1000/trainer_state.json @@ -0,0 +1,1299 @@ +{ + "best_metric": 2.0705747604370117, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1000", + "epoch": 1.36986301369863, + "eval_steps": 100, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 2.9523198932238336e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1000/training_args.bin b/SFT/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1100/README.md b/SFT/checkpoint-1100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1100/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1100/adapter_config.json b/SFT/checkpoint-1100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1100/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1100/adapter_model.bin b/SFT/checkpoint-1100/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..84ce8eeaae611e39ca5ae76079ae38b9a0a02166 --- /dev/null +++ b/SFT/checkpoint-1100/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70abc000f68532545c1defb6d6de8b8bd49e53fabb58dd2d5fd9192812ad1953 +size 16821197 diff --git a/SFT/checkpoint-1100/finetuning_args.json b/SFT/checkpoint-1100/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1100/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1100/optimizer.pt b/SFT/checkpoint-1100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..66734d19d7b3bdf17a64f265b3d0a2042d9b0c21 --- /dev/null +++ b/SFT/checkpoint-1100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab670c1be59d2d8d13de4c1b5c2591c7aaaece99188148e3628680871e1667e8 +size 33661637 diff --git a/SFT/checkpoint-1100/rng_state.pth b/SFT/checkpoint-1100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6ea8839ff718e9302b4afd65a393ba04e310b35 --- /dev/null +++ b/SFT/checkpoint-1100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d9fa67f61999beb5cb13425a972b95f112bb8e07c20ba9aee1129dc77534dba +size 14575 diff --git a/SFT/checkpoint-1100/scheduler.pt b/SFT/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b4b5ee3eb37a245779c44e26afc1787ad395ad42 --- /dev/null +++ b/SFT/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f16c98cd4c984b3aba7d4d99ae100a1d8bb439cd5400d7467341466d18cf2284 +size 627 diff --git a/SFT/checkpoint-1100/trainer_state.json b/SFT/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2672891ac9cd2e5d207a482cde9fefb22c0e1c32 --- /dev/null +++ b/SFT/checkpoint-1100/trainer_state.json @@ -0,0 +1,1427 @@ +{ + "best_metric": 2.0488994121551514, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1100", + "epoch": 1.5068493150684932, + "eval_steps": 100, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 3.251724952022876e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1100/training_args.bin b/SFT/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1200/README.md b/SFT/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1200/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1200/adapter_config.json b/SFT/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1200/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1200/adapter_model.bin b/SFT/checkpoint-1200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..4af90d2f484d913a2f8f519302710501e1ba840c --- /dev/null +++ b/SFT/checkpoint-1200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5733322854196fddcb7864936f64f93cfbfef067a1932f14ced5f43dd449349c +size 16821197 diff --git a/SFT/checkpoint-1200/finetuning_args.json b/SFT/checkpoint-1200/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1200/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1200/optimizer.pt b/SFT/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..581c844b962d60754ffc62b0bc572fe7f4cfca96 --- /dev/null +++ b/SFT/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ccab8dd7426e6d2af3b1264ff82fdf9fdb19d3f061ac469ed72f4149b4b9f3c4 +size 33661637 diff --git a/SFT/checkpoint-1200/rng_state.pth b/SFT/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..235ef601fce06485748da992eb94b28f3b1fffa5 --- /dev/null +++ b/SFT/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa72349b3df004a1fb2b070d3dac98e7a61846374917b67013e9855ac235a15e +size 14575 diff --git a/SFT/checkpoint-1200/scheduler.pt b/SFT/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ba7562abc1a1f763c6c914aa599cfc786ab5c86 --- /dev/null +++ b/SFT/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:653edd8fee2d2046a8c4d6858484ea4c61c63259c78855e9a38efbc5cbbd24ae +size 627 diff --git a/SFT/checkpoint-1200/trainer_state.json b/SFT/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..db3ff00753e7289026591379af9d303ef8f97d32 --- /dev/null +++ b/SFT/checkpoint-1200/trainer_state.json @@ -0,0 +1,1555 @@ +{ + "best_metric": 2.030912160873413, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1200", + "epoch": 1.643835616438356, + "eval_steps": 100, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 3.543426130198856e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1200/training_args.bin b/SFT/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1300/README.md b/SFT/checkpoint-1300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1300/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1300/adapter_config.json b/SFT/checkpoint-1300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1300/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1300/adapter_model.bin b/SFT/checkpoint-1300/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..9c984e5405b515f8ce577d5f57e96276cad85db4 --- /dev/null +++ b/SFT/checkpoint-1300/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78205d775d068f0b0518bc8cb32b468d770705c190a4cc41fcb106fd7bc5ff0 +size 16821197 diff --git a/SFT/checkpoint-1300/finetuning_args.json b/SFT/checkpoint-1300/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1300/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1300/optimizer.pt b/SFT/checkpoint-1300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4750fd293f273812f193930e06aed13a40a72c3d --- /dev/null +++ b/SFT/checkpoint-1300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ac35c539f06df44016ed339f0332ae5f661ad731e3568155672c175d11bba77 +size 33661637 diff --git a/SFT/checkpoint-1300/rng_state.pth b/SFT/checkpoint-1300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b28e13cfd580948fc0ceffe38e201fe0c629deec --- /dev/null +++ b/SFT/checkpoint-1300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f5c2f0be19a807f2bfd1084e200064bd95dff1b91c1605684a40b19cca8747f +size 14575 diff --git a/SFT/checkpoint-1300/scheduler.pt b/SFT/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2bc018e800fd202e43f22d1184a205ba8c4b762c --- /dev/null +++ b/SFT/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cad426e26bec593368ae9059439f40ec6efcbe937de3f428337373f6284270c3 +size 627 diff --git a/SFT/checkpoint-1300/trainer_state.json b/SFT/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c6722beb53cb6dbd6263c47f3a9db58bf1f305d0 --- /dev/null +++ b/SFT/checkpoint-1300/trainer_state.json @@ -0,0 +1,1683 @@ +{ + "best_metric": 2.012606382369995, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1300", + "epoch": 1.7808219178082192, + "eval_steps": 100, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 3.8384303512161485e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1300/training_args.bin b/SFT/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1400/README.md b/SFT/checkpoint-1400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1400/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1400/adapter_config.json b/SFT/checkpoint-1400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1400/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1400/adapter_model.bin b/SFT/checkpoint-1400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed30e9a895a7cce3ca2748180526aa7ebb0146da --- /dev/null +++ b/SFT/checkpoint-1400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5229336644c2bdcc5b409313e8638d7f3a7731929fa72c0c658c950297ae9edf +size 16821197 diff --git a/SFT/checkpoint-1400/finetuning_args.json b/SFT/checkpoint-1400/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1400/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1400/optimizer.pt b/SFT/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..778ced9b9cd01c727eff7c8367d1ccefd9290e07 --- /dev/null +++ b/SFT/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f595479f555c6f39529ff133a194ed7a0e96e41256d24bc626fecdcb6eb21c +size 33661637 diff --git a/SFT/checkpoint-1400/rng_state.pth b/SFT/checkpoint-1400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d6da10ac94dd0cb44032409c02c49e7aa9d87fee --- /dev/null +++ b/SFT/checkpoint-1400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f348340c7384044321c00d1c5ce20b1aea0b0c99b35e505b29c75890e4ecccf +size 14575 diff --git a/SFT/checkpoint-1400/scheduler.pt b/SFT/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f4b5197ee41d3cd4fd9985cb773cccca983f4e7 --- /dev/null +++ b/SFT/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39be319fda44b48e6e325eddd0a574a0df501835694476dc05b84498fc6a0d47 +size 627 diff --git a/SFT/checkpoint-1400/trainer_state.json b/SFT/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..92bd0d34ac5c5e9ba1bee74778a627666e1af858 --- /dev/null +++ b/SFT/checkpoint-1400/trainer_state.json @@ -0,0 +1,1811 @@ +{ + "best_metric": 2.006899833679199, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1400", + "epoch": 1.9178082191780823, + "eval_steps": 100, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 4.1257564463303885e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1400/training_args.bin b/SFT/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1500/README.md b/SFT/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1500/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1500/adapter_config.json b/SFT/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1500/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1500/adapter_model.bin b/SFT/checkpoint-1500/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..f3611028033b155a478041a551108e0cf7f01e42 --- /dev/null +++ b/SFT/checkpoint-1500/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c036246009c9a2b342e1d5c5755ee840c20f9d81e06affa66f8b6599e3aaec87 +size 16821197 diff --git a/SFT/checkpoint-1500/finetuning_args.json b/SFT/checkpoint-1500/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1500/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1500/optimizer.pt b/SFT/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9ba7d61fc0ea4d1f391553d0ce7792576e941a84 --- /dev/null +++ b/SFT/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2446c450df650734dc666621d6f1cee43b947739b0a8e7eb1e04dee047629d98 +size 33661637 diff --git a/SFT/checkpoint-1500/rng_state.pth b/SFT/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..53f50e4d2cf621fd1b8c82ac8cf6ef9c829ca497 --- /dev/null +++ b/SFT/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f5dd27b9efba39225ccb19106d436db5ec8f9e628cb5f31f2b5fdc0c9e7e269 +size 14575 diff --git a/SFT/checkpoint-1500/scheduler.pt b/SFT/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a888fcda9f0d6ed780ebe51c4efd9f18becb5c36 --- /dev/null +++ b/SFT/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:521b83f48f4c3cc230e09ae6f50469cd68819a69769426774dee53961f0aa592 +size 627 diff --git a/SFT/checkpoint-1500/trainer_state.json b/SFT/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..29f925d6d9e599ee4c74f02503a8296b08e1190a --- /dev/null +++ b/SFT/checkpoint-1500/trainer_state.json @@ -0,0 +1,1939 @@ +{ + "best_metric": 2.005824327468872, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1500", + "epoch": 2.0547945205479454, + "eval_steps": 100, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 4.432472626272338e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1500/training_args.bin b/SFT/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1600/README.md b/SFT/checkpoint-1600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1600/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1600/adapter_config.json b/SFT/checkpoint-1600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1600/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1600/adapter_model.bin b/SFT/checkpoint-1600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ac2178f771dfaa2ef7bd6f184d7881f87dde1f4e --- /dev/null +++ b/SFT/checkpoint-1600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e471f59d56ece56a7e719a7d1c7b2d967e26f97a8475729615e787d02ee8552 +size 16821197 diff --git a/SFT/checkpoint-1600/finetuning_args.json b/SFT/checkpoint-1600/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1600/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1600/optimizer.pt b/SFT/checkpoint-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..34f92952d0b3f10f2b14241228963f9f5def6cda --- /dev/null +++ b/SFT/checkpoint-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aafa03638ff643cef96b1aefe18758a901c316297c96887435a8b83101f6294 +size 33661637 diff --git a/SFT/checkpoint-1600/rng_state.pth b/SFT/checkpoint-1600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ce30a636a5441b6aaba5952c5080a2716b971ad2 --- /dev/null +++ b/SFT/checkpoint-1600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45bdfee9d7d1723ef5a06a32158dbf12b12c5f7c3ea6f28389512704065131f6 +size 14575 diff --git a/SFT/checkpoint-1600/scheduler.pt b/SFT/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..64c15014f782acf7108b8179e3ca368f6a681510 --- /dev/null +++ b/SFT/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77470147fc4fe853892edfcab3168fa02fc978668c512ef042e655580cfc0bff +size 627 diff --git a/SFT/checkpoint-1600/trainer_state.json b/SFT/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..afb3659efe5f2c2275e7af39c0fc1f1705ab9c0f --- /dev/null +++ b/SFT/checkpoint-1600/trainer_state.json @@ -0,0 +1,2067 @@ +{ + "best_metric": 2.005824327468872, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1500", + "epoch": 2.191780821917808, + "eval_steps": 100, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022378234341619019, + "loss": 1.6531, + "step": 1505 + }, + { + "epoch": 2.07, + "learning_rate": 0.00022080009894693948, + "loss": 1.6599, + "step": 1510 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021783221813127473, + "loss": 1.7998, + "step": 1515 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002148788536541064, + "loss": 1.6775, + "step": 1520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021194015745354123, + "loss": 1.5615, + "step": 1525 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020901628071306455, + "loss": 1.7347, + "step": 1530 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020610737385376348, + "loss": 1.7952, + "step": 1535 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020321358652658806, + "loss": 1.6405, + "step": 1540 + }, + { + "epoch": 2.12, + "learning_rate": 0.00020033506760465237, + "loss": 1.5677, + "step": 1545 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001974719651755756, + "loss": 1.7295, + "step": 1550 + }, + { + "epoch": 2.13, + "learning_rate": 0.0001946244265338637, + "loss": 1.6888, + "step": 1555 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019179259817333133, + "loss": 1.6395, + "step": 1560 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001889766257795663, + "loss": 1.6953, + "step": 1565 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018617665422243336, + "loss": 1.7618, + "step": 1570 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018339282754862223, + "loss": 1.6839, + "step": 1575 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018062528897423643, + "loss": 1.654, + "step": 1580 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017787418087742614, + "loss": 1.7326, + "step": 1585 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017513964479106266, + "loss": 1.7163, + "step": 1590 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017242182139545742, + "loss": 1.6667, + "step": 1595 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001697208505111249, + "loss": 1.7881, + "step": 1600 + }, + { + "epoch": 2.19, + "eval_loss": 2.008232831954956, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1600 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 4.735206482632704e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1600/training_args.bin b/SFT/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1700/README.md b/SFT/checkpoint-1700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1700/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1700/adapter_config.json b/SFT/checkpoint-1700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1700/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1700/adapter_model.bin b/SFT/checkpoint-1700/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3fd713a4f0c5ad43d34fdfd195fed81c98a9adb2 --- /dev/null +++ b/SFT/checkpoint-1700/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c386ca4baa863428532d6ec909e7cd1a8e6976959384cdb5403e2ce94d20ec9 +size 16821197 diff --git a/SFT/checkpoint-1700/finetuning_args.json b/SFT/checkpoint-1700/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1700/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1700/optimizer.pt b/SFT/checkpoint-1700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..54f8d553f6a9c2179a9f3ecc32e164abd3838101 --- /dev/null +++ b/SFT/checkpoint-1700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81ee2362785d8791b9f626281645372542a41d63eebe21edd35f05395477508d +size 33661637 diff --git a/SFT/checkpoint-1700/rng_state.pth b/SFT/checkpoint-1700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a13916e314ccce0d37b580e22940aa0f8bb32069 --- /dev/null +++ b/SFT/checkpoint-1700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8614ed9d07437e987c211bde01c6791da7a727ea5076029783d557e69f2dda90 +size 14575 diff --git a/SFT/checkpoint-1700/scheduler.pt b/SFT/checkpoint-1700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b781baac37ab1d93a0645607a121b752f3b4162d --- /dev/null +++ b/SFT/checkpoint-1700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2a5811c3aa6ab21c4e7f1b0c0a822546b028a8350dc1a5d82a11cb636c5026d +size 627 diff --git a/SFT/checkpoint-1700/trainer_state.json b/SFT/checkpoint-1700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..53beb9f9bfcbf88df7ac3ea159d1e4cc0b249381 --- /dev/null +++ b/SFT/checkpoint-1700/trainer_state.json @@ -0,0 +1,2195 @@ +{ + "best_metric": 2.0000417232513428, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1700", + "epoch": 2.328767123287671, + "eval_steps": 100, + "global_step": 1700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022378234341619019, + "loss": 1.6531, + "step": 1505 + }, + { + "epoch": 2.07, + "learning_rate": 0.00022080009894693948, + "loss": 1.6599, + "step": 1510 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021783221813127473, + "loss": 1.7998, + "step": 1515 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002148788536541064, + "loss": 1.6775, + "step": 1520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021194015745354123, + "loss": 1.5615, + "step": 1525 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020901628071306455, + "loss": 1.7347, + "step": 1530 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020610737385376348, + "loss": 1.7952, + "step": 1535 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020321358652658806, + "loss": 1.6405, + "step": 1540 + }, + { + "epoch": 2.12, + "learning_rate": 0.00020033506760465237, + "loss": 1.5677, + "step": 1545 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001974719651755756, + "loss": 1.7295, + "step": 1550 + }, + { + "epoch": 2.13, + "learning_rate": 0.0001946244265338637, + "loss": 1.6888, + "step": 1555 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019179259817333133, + "loss": 1.6395, + "step": 1560 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001889766257795663, + "loss": 1.6953, + "step": 1565 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018617665422243336, + "loss": 1.7618, + "step": 1570 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018339282754862223, + "loss": 1.6839, + "step": 1575 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018062528897423643, + "loss": 1.654, + "step": 1580 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017787418087742614, + "loss": 1.7326, + "step": 1585 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017513964479106266, + "loss": 1.7163, + "step": 1590 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017242182139545742, + "loss": 1.6667, + "step": 1595 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001697208505111249, + "loss": 1.7881, + "step": 1600 + }, + { + "epoch": 2.19, + "eval_loss": 2.008232831954956, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1600 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016703687109158888, + "loss": 1.6769, + "step": 1605 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016437002121623434, + "loss": 1.7811, + "step": 1610 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016172043808320368, + "loss": 1.6699, + "step": 1615 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015908825800233824, + "loss": 1.7141, + "step": 1620 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015647361638816655, + "loss": 1.7672, + "step": 1625 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015387664775293658, + "loss": 1.7043, + "step": 1630 + }, + { + "epoch": 2.24, + "learning_rate": 0.00015129748569969663, + "loss": 1.8051, + "step": 1635 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014873626291542148, + "loss": 1.7196, + "step": 1640 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014619311116418693, + "loss": 1.6664, + "step": 1645 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014366816128039007, + "loss": 1.6404, + "step": 1650 + }, + { + "epoch": 2.27, + "learning_rate": 0.00014116154316201908, + "loss": 1.7127, + "step": 1655 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013867338576397043, + "loss": 1.6906, + "step": 1660 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013620381709141455, + "loss": 1.5734, + "step": 1665 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001337529641932107, + "loss": 1.7073, + "step": 1670 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001313209531553707, + "loss": 1.7745, + "step": 1675 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012890790909457213, + "loss": 1.6972, + "step": 1680 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012651395615172239, + "loss": 1.6176, + "step": 1685 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012413921748557127, + "loss": 1.6887, + "step": 1690 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012178381526637533, + "loss": 1.8215, + "step": 1695 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011944787066961266, + "loss": 1.6511, + "step": 1700 + }, + { + "epoch": 2.33, + "eval_loss": 2.0000417232513428, + "eval_runtime": 8.8396, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1700 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 5.037734301232988e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1700/training_args.bin b/SFT/checkpoint-1700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1800/README.md b/SFT/checkpoint-1800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1800/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1800/adapter_config.json b/SFT/checkpoint-1800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1800/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1800/adapter_model.bin b/SFT/checkpoint-1800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..e03fbff18e30c2504b4c23054523dc4b2b5e069d --- /dev/null +++ b/SFT/checkpoint-1800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0236bd211bcba2876558e26f4d4a17b23940da94baeef2d21451f378a6da607 +size 16821197 diff --git a/SFT/checkpoint-1800/finetuning_args.json b/SFT/checkpoint-1800/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1800/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1800/optimizer.pt b/SFT/checkpoint-1800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3aca64510c419c431a82ab30ec56d6546db3e05 --- /dev/null +++ b/SFT/checkpoint-1800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:323a6cdef03cf134667156e0e8d79d821e888a857fde9a340bc36d56494bcc16 +size 33661637 diff --git a/SFT/checkpoint-1800/rng_state.pth b/SFT/checkpoint-1800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e63ade8e9775c02da529165effaf73874a4ee91a --- /dev/null +++ b/SFT/checkpoint-1800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d52c0acafa9202769514ae2f06500e064f8e5d70cc1e15b3388a2aa1f4d9b79 +size 14575 diff --git a/SFT/checkpoint-1800/scheduler.pt b/SFT/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe68843ed64db6d6b2813b179e40813e0223d8fe --- /dev/null +++ b/SFT/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26237ce9acc6a8e63186d3bfe3bae1287b6846b623a83485f68de68654e1e6c +size 627 diff --git a/SFT/checkpoint-1800/trainer_state.json b/SFT/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a99f0577cf68944bbd31f3380a1c5d6e676628ac --- /dev/null +++ b/SFT/checkpoint-1800/trainer_state.json @@ -0,0 +1,2323 @@ +{ + "best_metric": 1.9944177865982056, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1800", + "epoch": 2.4657534246575343, + "eval_steps": 100, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022378234341619019, + "loss": 1.6531, + "step": 1505 + }, + { + "epoch": 2.07, + "learning_rate": 0.00022080009894693948, + "loss": 1.6599, + "step": 1510 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021783221813127473, + "loss": 1.7998, + "step": 1515 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002148788536541064, + "loss": 1.6775, + "step": 1520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021194015745354123, + "loss": 1.5615, + "step": 1525 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020901628071306455, + "loss": 1.7347, + "step": 1530 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020610737385376348, + "loss": 1.7952, + "step": 1535 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020321358652658806, + "loss": 1.6405, + "step": 1540 + }, + { + "epoch": 2.12, + "learning_rate": 0.00020033506760465237, + "loss": 1.5677, + "step": 1545 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001974719651755756, + "loss": 1.7295, + "step": 1550 + }, + { + "epoch": 2.13, + "learning_rate": 0.0001946244265338637, + "loss": 1.6888, + "step": 1555 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019179259817333133, + "loss": 1.6395, + "step": 1560 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001889766257795663, + "loss": 1.6953, + "step": 1565 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018617665422243336, + "loss": 1.7618, + "step": 1570 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018339282754862223, + "loss": 1.6839, + "step": 1575 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018062528897423643, + "loss": 1.654, + "step": 1580 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017787418087742614, + "loss": 1.7326, + "step": 1585 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017513964479106266, + "loss": 1.7163, + "step": 1590 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017242182139545742, + "loss": 1.6667, + "step": 1595 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001697208505111249, + "loss": 1.7881, + "step": 1600 + }, + { + "epoch": 2.19, + "eval_loss": 2.008232831954956, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1600 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016703687109158888, + "loss": 1.6769, + "step": 1605 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016437002121623434, + "loss": 1.7811, + "step": 1610 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016172043808320368, + "loss": 1.6699, + "step": 1615 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015908825800233824, + "loss": 1.7141, + "step": 1620 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015647361638816655, + "loss": 1.7672, + "step": 1625 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015387664775293658, + "loss": 1.7043, + "step": 1630 + }, + { + "epoch": 2.24, + "learning_rate": 0.00015129748569969663, + "loss": 1.8051, + "step": 1635 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014873626291542148, + "loss": 1.7196, + "step": 1640 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014619311116418693, + "loss": 1.6664, + "step": 1645 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014366816128039007, + "loss": 1.6404, + "step": 1650 + }, + { + "epoch": 2.27, + "learning_rate": 0.00014116154316201908, + "loss": 1.7127, + "step": 1655 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013867338576397043, + "loss": 1.6906, + "step": 1660 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013620381709141455, + "loss": 1.5734, + "step": 1665 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001337529641932107, + "loss": 1.7073, + "step": 1670 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001313209531553707, + "loss": 1.7745, + "step": 1675 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012890790909457213, + "loss": 1.6972, + "step": 1680 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012651395615172239, + "loss": 1.6176, + "step": 1685 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012413921748557127, + "loss": 1.6887, + "step": 1690 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012178381526637533, + "loss": 1.8215, + "step": 1695 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011944787066961266, + "loss": 1.6511, + "step": 1700 + }, + { + "epoch": 2.33, + "eval_loss": 2.0000417232513428, + "eval_runtime": 8.8396, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1700 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011713150386974947, + "loss": 1.7458, + "step": 1705 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011483483403405659, + "loss": 1.6685, + "step": 1710 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001125579793164797, + "loss": 1.733, + "step": 1715 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011030105685156039, + "loss": 1.7281, + "step": 1720 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010806418274841024, + "loss": 1.7581, + "step": 1725 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010584747208473738, + "loss": 1.6238, + "step": 1730 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010365103890092636, + "loss": 1.7462, + "step": 1735 + }, + { + "epoch": 2.38, + "learning_rate": 0.000101474996194171, + "loss": 1.7897, + "step": 1740 + }, + { + "epoch": 2.39, + "learning_rate": 9.931945591266172e-05, + "loss": 1.8603, + "step": 1745 + }, + { + "epoch": 2.4, + "learning_rate": 9.718452894982571e-05, + "loss": 1.8379, + "step": 1750 + }, + { + "epoch": 2.4, + "learning_rate": 9.507032513862195e-05, + "loss": 1.7378, + "step": 1755 + }, + { + "epoch": 2.41, + "learning_rate": 9.297695324589106e-05, + "loss": 1.6022, + "step": 1760 + }, + { + "epoch": 2.42, + "learning_rate": 9.090452096675993e-05, + "loss": 1.7144, + "step": 1765 + }, + { + "epoch": 2.42, + "learning_rate": 8.885313491910052e-05, + "loss": 1.6529, + "step": 1770 + }, + { + "epoch": 2.43, + "learning_rate": 8.682290063804527e-05, + "loss": 1.7523, + "step": 1775 + }, + { + "epoch": 2.44, + "learning_rate": 8.48139225705578e-05, + "loss": 1.6469, + "step": 1780 + }, + { + "epoch": 2.45, + "learning_rate": 8.28263040700598e-05, + "loss": 1.7107, + "step": 1785 + }, + { + "epoch": 2.45, + "learning_rate": 8.086014739111297e-05, + "loss": 1.5931, + "step": 1790 + }, + { + "epoch": 2.46, + "learning_rate": 7.891555368415947e-05, + "loss": 1.7049, + "step": 1795 + }, + { + "epoch": 2.47, + "learning_rate": 7.699262299031778e-05, + "loss": 1.6175, + "step": 1800 + }, + { + "epoch": 2.47, + "eval_loss": 1.9944177865982056, + "eval_runtime": 8.8381, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1800 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 5.326528415387812e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1800/training_args.bin b/SFT/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-1900/README.md b/SFT/checkpoint-1900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-1900/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-1900/adapter_config.json b/SFT/checkpoint-1900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-1900/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-1900/adapter_model.bin b/SFT/checkpoint-1900/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..ec9f4d0037915a45ca3abbe33102c6fea0ae9461 --- /dev/null +++ b/SFT/checkpoint-1900/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e168cb3379b0fc5ee03cad2d05e40aca5894504bd593e7832564ea2f1ae10e4 +size 16821197 diff --git a/SFT/checkpoint-1900/finetuning_args.json b/SFT/checkpoint-1900/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-1900/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-1900/optimizer.pt b/SFT/checkpoint-1900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f6dedb4f96243e76cb8275ab74d970a4742f548 --- /dev/null +++ b/SFT/checkpoint-1900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08cf976afc862053fbe5b8f8a2159d700867af2bb88572e8fddc78d3c012eaff +size 33661637 diff --git a/SFT/checkpoint-1900/rng_state.pth b/SFT/checkpoint-1900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ceb5b2bd2ac33def35cb501d0ae771e95404309d --- /dev/null +++ b/SFT/checkpoint-1900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc3317e6c7ff8e54a1f2781972e5d357c22aff05e8e96bba2c25c7eb91949f78 +size 14575 diff --git a/SFT/checkpoint-1900/scheduler.pt b/SFT/checkpoint-1900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..42020b566c64bc5db98b537af4bf548bb29202e3 --- /dev/null +++ b/SFT/checkpoint-1900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d0a7428fc4b22f29cfbb49af90af3b1578a8a5425b07aabd3db30a150209bb +size 627 diff --git a/SFT/checkpoint-1900/trainer_state.json b/SFT/checkpoint-1900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2f995295f7227af1cfb855a37fd207d9333b1cd6 --- /dev/null +++ b/SFT/checkpoint-1900/trainer_state.json @@ -0,0 +1,2451 @@ +{ + "best_metric": 1.991329312324524, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-1900", + "epoch": 2.602739726027397, + "eval_steps": 100, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022378234341619019, + "loss": 1.6531, + "step": 1505 + }, + { + "epoch": 2.07, + "learning_rate": 0.00022080009894693948, + "loss": 1.6599, + "step": 1510 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021783221813127473, + "loss": 1.7998, + "step": 1515 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002148788536541064, + "loss": 1.6775, + "step": 1520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021194015745354123, + "loss": 1.5615, + "step": 1525 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020901628071306455, + "loss": 1.7347, + "step": 1530 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020610737385376348, + "loss": 1.7952, + "step": 1535 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020321358652658806, + "loss": 1.6405, + "step": 1540 + }, + { + "epoch": 2.12, + "learning_rate": 0.00020033506760465237, + "loss": 1.5677, + "step": 1545 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001974719651755756, + "loss": 1.7295, + "step": 1550 + }, + { + "epoch": 2.13, + "learning_rate": 0.0001946244265338637, + "loss": 1.6888, + "step": 1555 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019179259817333133, + "loss": 1.6395, + "step": 1560 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001889766257795663, + "loss": 1.6953, + "step": 1565 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018617665422243336, + "loss": 1.7618, + "step": 1570 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018339282754862223, + "loss": 1.6839, + "step": 1575 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018062528897423643, + "loss": 1.654, + "step": 1580 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017787418087742614, + "loss": 1.7326, + "step": 1585 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017513964479106266, + "loss": 1.7163, + "step": 1590 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017242182139545742, + "loss": 1.6667, + "step": 1595 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001697208505111249, + "loss": 1.7881, + "step": 1600 + }, + { + "epoch": 2.19, + "eval_loss": 2.008232831954956, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1600 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016703687109158888, + "loss": 1.6769, + "step": 1605 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016437002121623434, + "loss": 1.7811, + "step": 1610 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016172043808320368, + "loss": 1.6699, + "step": 1615 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015908825800233824, + "loss": 1.7141, + "step": 1620 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015647361638816655, + "loss": 1.7672, + "step": 1625 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015387664775293658, + "loss": 1.7043, + "step": 1630 + }, + { + "epoch": 2.24, + "learning_rate": 0.00015129748569969663, + "loss": 1.8051, + "step": 1635 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014873626291542148, + "loss": 1.7196, + "step": 1640 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014619311116418693, + "loss": 1.6664, + "step": 1645 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014366816128039007, + "loss": 1.6404, + "step": 1650 + }, + { + "epoch": 2.27, + "learning_rate": 0.00014116154316201908, + "loss": 1.7127, + "step": 1655 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013867338576397043, + "loss": 1.6906, + "step": 1660 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013620381709141455, + "loss": 1.5734, + "step": 1665 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001337529641932107, + "loss": 1.7073, + "step": 1670 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001313209531553707, + "loss": 1.7745, + "step": 1675 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012890790909457213, + "loss": 1.6972, + "step": 1680 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012651395615172239, + "loss": 1.6176, + "step": 1685 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012413921748557127, + "loss": 1.6887, + "step": 1690 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012178381526637533, + "loss": 1.8215, + "step": 1695 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011944787066961266, + "loss": 1.6511, + "step": 1700 + }, + { + "epoch": 2.33, + "eval_loss": 2.0000417232513428, + "eval_runtime": 8.8396, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1700 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011713150386974947, + "loss": 1.7458, + "step": 1705 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011483483403405659, + "loss": 1.6685, + "step": 1710 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001125579793164797, + "loss": 1.733, + "step": 1715 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011030105685156039, + "loss": 1.7281, + "step": 1720 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010806418274841024, + "loss": 1.7581, + "step": 1725 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010584747208473738, + "loss": 1.6238, + "step": 1730 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010365103890092636, + "loss": 1.7462, + "step": 1735 + }, + { + "epoch": 2.38, + "learning_rate": 0.000101474996194171, + "loss": 1.7897, + "step": 1740 + }, + { + "epoch": 2.39, + "learning_rate": 9.931945591266172e-05, + "loss": 1.8603, + "step": 1745 + }, + { + "epoch": 2.4, + "learning_rate": 9.718452894982571e-05, + "loss": 1.8379, + "step": 1750 + }, + { + "epoch": 2.4, + "learning_rate": 9.507032513862195e-05, + "loss": 1.7378, + "step": 1755 + }, + { + "epoch": 2.41, + "learning_rate": 9.297695324589106e-05, + "loss": 1.6022, + "step": 1760 + }, + { + "epoch": 2.42, + "learning_rate": 9.090452096675993e-05, + "loss": 1.7144, + "step": 1765 + }, + { + "epoch": 2.42, + "learning_rate": 8.885313491910052e-05, + "loss": 1.6529, + "step": 1770 + }, + { + "epoch": 2.43, + "learning_rate": 8.682290063804527e-05, + "loss": 1.7523, + "step": 1775 + }, + { + "epoch": 2.44, + "learning_rate": 8.48139225705578e-05, + "loss": 1.6469, + "step": 1780 + }, + { + "epoch": 2.45, + "learning_rate": 8.28263040700598e-05, + "loss": 1.7107, + "step": 1785 + }, + { + "epoch": 2.45, + "learning_rate": 8.086014739111297e-05, + "loss": 1.5931, + "step": 1790 + }, + { + "epoch": 2.46, + "learning_rate": 7.891555368415947e-05, + "loss": 1.7049, + "step": 1795 + }, + { + "epoch": 2.47, + "learning_rate": 7.699262299031778e-05, + "loss": 1.6175, + "step": 1800 + }, + { + "epoch": 2.47, + "eval_loss": 1.9944177865982056, + "eval_runtime": 8.8381, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1800 + }, + { + "epoch": 2.47, + "learning_rate": 7.509145423623608e-05, + "loss": 1.5914, + "step": 1805 + }, + { + "epoch": 2.48, + "learning_rate": 7.321214522900271e-05, + "loss": 1.5903, + "step": 1810 + }, + { + "epoch": 2.49, + "learning_rate": 7.13547926511145e-05, + "loss": 1.5715, + "step": 1815 + }, + { + "epoch": 2.49, + "learning_rate": 6.951949205550284e-05, + "loss": 1.5976, + "step": 1820 + }, + { + "epoch": 2.5, + "learning_rate": 6.770633786061819e-05, + "loss": 1.7028, + "step": 1825 + }, + { + "epoch": 2.51, + "learning_rate": 6.591542334557222e-05, + "loss": 1.7763, + "step": 1830 + }, + { + "epoch": 2.51, + "learning_rate": 6.41468406453391e-05, + "loss": 1.5076, + "step": 1835 + }, + { + "epoch": 2.52, + "learning_rate": 6.240068074601568e-05, + "loss": 1.665, + "step": 1840 + }, + { + "epoch": 2.53, + "learning_rate": 6.067703348014086e-05, + "loss": 1.7332, + "step": 1845 + }, + { + "epoch": 2.53, + "learning_rate": 5.897598752207328e-05, + "loss": 1.7152, + "step": 1850 + }, + { + "epoch": 2.54, + "learning_rate": 5.729763038343022e-05, + "loss": 1.7113, + "step": 1855 + }, + { + "epoch": 2.55, + "learning_rate": 5.564204840858511e-05, + "loss": 1.8046, + "step": 1860 + }, + { + "epoch": 2.55, + "learning_rate": 5.40093267702258e-05, + "loss": 1.652, + "step": 1865 + }, + { + "epoch": 2.56, + "learning_rate": 5.239954946497227e-05, + "loss": 1.7188, + "step": 1870 + }, + { + "epoch": 2.57, + "learning_rate": 5.0812799309055746e-05, + "loss": 1.7157, + "step": 1875 + }, + { + "epoch": 2.58, + "learning_rate": 4.9249157934057985e-05, + "loss": 1.7516, + "step": 1880 + }, + { + "epoch": 2.58, + "learning_rate": 4.770870578271197e-05, + "loss": 1.7406, + "step": 1885 + }, + { + "epoch": 2.59, + "learning_rate": 4.619152210476296e-05, + "loss": 1.6949, + "step": 1890 + }, + { + "epoch": 2.6, + "learning_rate": 4.469768495289189e-05, + "loss": 1.6593, + "step": 1895 + }, + { + "epoch": 2.6, + "learning_rate": 4.322727117869951e-05, + "loss": 1.762, + "step": 1900 + }, + { + "epoch": 2.6, + "eval_loss": 1.991329312324524, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1900 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 5.616352718343045e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-1900/training_args.bin b/SFT/checkpoint-1900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-1900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-200/README.md b/SFT/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-200/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-200/adapter_config.json b/SFT/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-200/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-200/adapter_model.bin b/SFT/checkpoint-200/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b46ae9f97cec168b4e1956649b9acd35e946daa --- /dev/null +++ b/SFT/checkpoint-200/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d55d349eb824527d1ef06b1fede12e72c4d727bab1aaaa25c3ada43eb8d2a84d +size 16821197 diff --git a/SFT/checkpoint-200/finetuning_args.json b/SFT/checkpoint-200/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-200/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-200/optimizer.pt b/SFT/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6d24982e38381ce3fe3f9c3201fff723c9dcee68 --- /dev/null +++ b/SFT/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3785217d6c2eeaf3db1e4c792623c61dcab4327e9286fa1cc249e59aa9ea97cb +size 33661637 diff --git a/SFT/checkpoint-200/rng_state.pth b/SFT/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e5cf4a1dda6d8a18a9d44b678871595d04af9d3d --- /dev/null +++ b/SFT/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ec0c40661b8eab19a51b75c2c322390927a119739061ad396d1aeef9a4658d3 +size 14575 diff --git a/SFT/checkpoint-200/scheduler.pt b/SFT/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad7f26ae9475a469b9816fab3046057319a39643 --- /dev/null +++ b/SFT/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fded78f552247cb693034fe9a3a0d6cae40d6f161d1b18c60c9d63437c181f84 +size 627 diff --git a/SFT/checkpoint-200/trainer_state.json b/SFT/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..55ff1bddb82c4ebb842450f6170bb1369282cc58 --- /dev/null +++ b/SFT/checkpoint-200/trainer_state.json @@ -0,0 +1,275 @@ +{ + "best_metric": 2.234222650527954, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-200", + "epoch": 0.273972602739726, + "eval_steps": 100, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 5.897476754944819e+16, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-200/training_args.bin b/SFT/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-2000/README.md b/SFT/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-2000/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-2000/adapter_config.json b/SFT/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-2000/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-2000/adapter_model.bin b/SFT/checkpoint-2000/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..19fb6cb124f968e6c14a5c64b2acf2da330955ee --- /dev/null +++ b/SFT/checkpoint-2000/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85b721ea61ff1dc3b974d2260aa8bf867d0f769a19d11adc824e6da15fd43016 +size 16821197 diff --git a/SFT/checkpoint-2000/finetuning_args.json b/SFT/checkpoint-2000/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-2000/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-2000/optimizer.pt b/SFT/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f9fda3fab1b1db251fbed0aa9d2d8ea1d4cbc93 --- /dev/null +++ b/SFT/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f942a03ffd97f268837218d3ea20f128355998b33675c6c5e59f387b8ad99a +size 33661637 diff --git a/SFT/checkpoint-2000/rng_state.pth b/SFT/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2247360628bb7688cac5fa33a4136e15b6767866 --- /dev/null +++ b/SFT/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301ddbdb6ca9ed41de5d9ffd43175cb9614fe28ae8a095f3dccde7a80001425e +size 14575 diff --git a/SFT/checkpoint-2000/scheduler.pt b/SFT/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd14578d3af16a6bc47cf7a618c04d5f184153fc --- /dev/null +++ b/SFT/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b5a87ebdb1517c9fdafac08ee46630dd8da3d3cd4e9dcc32bf9f29d88fc97ce +size 627 diff --git a/SFT/checkpoint-2000/trainer_state.json b/SFT/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..15600985cc0ca837c6db1bceeffe732636d92571 --- /dev/null +++ b/SFT/checkpoint-2000/trainer_state.json @@ -0,0 +1,2579 @@ +{ + "best_metric": 1.988287329673767, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-2000", + "epoch": 2.73972602739726, + "eval_steps": 100, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022378234341619019, + "loss": 1.6531, + "step": 1505 + }, + { + "epoch": 2.07, + "learning_rate": 0.00022080009894693948, + "loss": 1.6599, + "step": 1510 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021783221813127473, + "loss": 1.7998, + "step": 1515 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002148788536541064, + "loss": 1.6775, + "step": 1520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021194015745354123, + "loss": 1.5615, + "step": 1525 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020901628071306455, + "loss": 1.7347, + "step": 1530 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020610737385376348, + "loss": 1.7952, + "step": 1535 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020321358652658806, + "loss": 1.6405, + "step": 1540 + }, + { + "epoch": 2.12, + "learning_rate": 0.00020033506760465237, + "loss": 1.5677, + "step": 1545 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001974719651755756, + "loss": 1.7295, + "step": 1550 + }, + { + "epoch": 2.13, + "learning_rate": 0.0001946244265338637, + "loss": 1.6888, + "step": 1555 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019179259817333133, + "loss": 1.6395, + "step": 1560 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001889766257795663, + "loss": 1.6953, + "step": 1565 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018617665422243336, + "loss": 1.7618, + "step": 1570 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018339282754862223, + "loss": 1.6839, + "step": 1575 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018062528897423643, + "loss": 1.654, + "step": 1580 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017787418087742614, + "loss": 1.7326, + "step": 1585 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017513964479106266, + "loss": 1.7163, + "step": 1590 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017242182139545742, + "loss": 1.6667, + "step": 1595 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001697208505111249, + "loss": 1.7881, + "step": 1600 + }, + { + "epoch": 2.19, + "eval_loss": 2.008232831954956, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1600 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016703687109158888, + "loss": 1.6769, + "step": 1605 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016437002121623434, + "loss": 1.7811, + "step": 1610 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016172043808320368, + "loss": 1.6699, + "step": 1615 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015908825800233824, + "loss": 1.7141, + "step": 1620 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015647361638816655, + "loss": 1.7672, + "step": 1625 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015387664775293658, + "loss": 1.7043, + "step": 1630 + }, + { + "epoch": 2.24, + "learning_rate": 0.00015129748569969663, + "loss": 1.8051, + "step": 1635 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014873626291542148, + "loss": 1.7196, + "step": 1640 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014619311116418693, + "loss": 1.6664, + "step": 1645 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014366816128039007, + "loss": 1.6404, + "step": 1650 + }, + { + "epoch": 2.27, + "learning_rate": 0.00014116154316201908, + "loss": 1.7127, + "step": 1655 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013867338576397043, + "loss": 1.6906, + "step": 1660 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013620381709141455, + "loss": 1.5734, + "step": 1665 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001337529641932107, + "loss": 1.7073, + "step": 1670 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001313209531553707, + "loss": 1.7745, + "step": 1675 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012890790909457213, + "loss": 1.6972, + "step": 1680 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012651395615172239, + "loss": 1.6176, + "step": 1685 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012413921748557127, + "loss": 1.6887, + "step": 1690 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012178381526637533, + "loss": 1.8215, + "step": 1695 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011944787066961266, + "loss": 1.6511, + "step": 1700 + }, + { + "epoch": 2.33, + "eval_loss": 2.0000417232513428, + "eval_runtime": 8.8396, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1700 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011713150386974947, + "loss": 1.7458, + "step": 1705 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011483483403405659, + "loss": 1.6685, + "step": 1710 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001125579793164797, + "loss": 1.733, + "step": 1715 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011030105685156039, + "loss": 1.7281, + "step": 1720 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010806418274841024, + "loss": 1.7581, + "step": 1725 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010584747208473738, + "loss": 1.6238, + "step": 1730 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010365103890092636, + "loss": 1.7462, + "step": 1735 + }, + { + "epoch": 2.38, + "learning_rate": 0.000101474996194171, + "loss": 1.7897, + "step": 1740 + }, + { + "epoch": 2.39, + "learning_rate": 9.931945591266172e-05, + "loss": 1.8603, + "step": 1745 + }, + { + "epoch": 2.4, + "learning_rate": 9.718452894982571e-05, + "loss": 1.8379, + "step": 1750 + }, + { + "epoch": 2.4, + "learning_rate": 9.507032513862195e-05, + "loss": 1.7378, + "step": 1755 + }, + { + "epoch": 2.41, + "learning_rate": 9.297695324589106e-05, + "loss": 1.6022, + "step": 1760 + }, + { + "epoch": 2.42, + "learning_rate": 9.090452096675993e-05, + "loss": 1.7144, + "step": 1765 + }, + { + "epoch": 2.42, + "learning_rate": 8.885313491910052e-05, + "loss": 1.6529, + "step": 1770 + }, + { + "epoch": 2.43, + "learning_rate": 8.682290063804527e-05, + "loss": 1.7523, + "step": 1775 + }, + { + "epoch": 2.44, + "learning_rate": 8.48139225705578e-05, + "loss": 1.6469, + "step": 1780 + }, + { + "epoch": 2.45, + "learning_rate": 8.28263040700598e-05, + "loss": 1.7107, + "step": 1785 + }, + { + "epoch": 2.45, + "learning_rate": 8.086014739111297e-05, + "loss": 1.5931, + "step": 1790 + }, + { + "epoch": 2.46, + "learning_rate": 7.891555368415947e-05, + "loss": 1.7049, + "step": 1795 + }, + { + "epoch": 2.47, + "learning_rate": 7.699262299031778e-05, + "loss": 1.6175, + "step": 1800 + }, + { + "epoch": 2.47, + "eval_loss": 1.9944177865982056, + "eval_runtime": 8.8381, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1800 + }, + { + "epoch": 2.47, + "learning_rate": 7.509145423623608e-05, + "loss": 1.5914, + "step": 1805 + }, + { + "epoch": 2.48, + "learning_rate": 7.321214522900271e-05, + "loss": 1.5903, + "step": 1810 + }, + { + "epoch": 2.49, + "learning_rate": 7.13547926511145e-05, + "loss": 1.5715, + "step": 1815 + }, + { + "epoch": 2.49, + "learning_rate": 6.951949205550284e-05, + "loss": 1.5976, + "step": 1820 + }, + { + "epoch": 2.5, + "learning_rate": 6.770633786061819e-05, + "loss": 1.7028, + "step": 1825 + }, + { + "epoch": 2.51, + "learning_rate": 6.591542334557222e-05, + "loss": 1.7763, + "step": 1830 + }, + { + "epoch": 2.51, + "learning_rate": 6.41468406453391e-05, + "loss": 1.5076, + "step": 1835 + }, + { + "epoch": 2.52, + "learning_rate": 6.240068074601568e-05, + "loss": 1.665, + "step": 1840 + }, + { + "epoch": 2.53, + "learning_rate": 6.067703348014086e-05, + "loss": 1.7332, + "step": 1845 + }, + { + "epoch": 2.53, + "learning_rate": 5.897598752207328e-05, + "loss": 1.7152, + "step": 1850 + }, + { + "epoch": 2.54, + "learning_rate": 5.729763038343022e-05, + "loss": 1.7113, + "step": 1855 + }, + { + "epoch": 2.55, + "learning_rate": 5.564204840858511e-05, + "loss": 1.8046, + "step": 1860 + }, + { + "epoch": 2.55, + "learning_rate": 5.40093267702258e-05, + "loss": 1.652, + "step": 1865 + }, + { + "epoch": 2.56, + "learning_rate": 5.239954946497227e-05, + "loss": 1.7188, + "step": 1870 + }, + { + "epoch": 2.57, + "learning_rate": 5.0812799309055746e-05, + "loss": 1.7157, + "step": 1875 + }, + { + "epoch": 2.58, + "learning_rate": 4.9249157934057985e-05, + "loss": 1.7516, + "step": 1880 + }, + { + "epoch": 2.58, + "learning_rate": 4.770870578271197e-05, + "loss": 1.7406, + "step": 1885 + }, + { + "epoch": 2.59, + "learning_rate": 4.619152210476296e-05, + "loss": 1.6949, + "step": 1890 + }, + { + "epoch": 2.6, + "learning_rate": 4.469768495289189e-05, + "loss": 1.6593, + "step": 1895 + }, + { + "epoch": 2.6, + "learning_rate": 4.322727117869951e-05, + "loss": 1.762, + "step": 1900 + }, + { + "epoch": 2.6, + "eval_loss": 1.991329312324524, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1900 + }, + { + "epoch": 2.61, + "learning_rate": 4.178035642875322e-05, + "loss": 1.8229, + "step": 1905 + }, + { + "epoch": 2.62, + "learning_rate": 4.0357015140694843e-05, + "loss": 1.7235, + "step": 1910 + }, + { + "epoch": 2.62, + "learning_rate": 3.89573205394112e-05, + "loss": 1.6724, + "step": 1915 + }, + { + "epoch": 2.63, + "learning_rate": 3.758134463326729e-05, + "loss": 1.576, + "step": 1920 + }, + { + "epoch": 2.64, + "learning_rate": 3.622915821040174e-05, + "loss": 1.6975, + "step": 1925 + }, + { + "epoch": 2.64, + "learning_rate": 3.4900830835084604e-05, + "loss": 1.8066, + "step": 1930 + }, + { + "epoch": 2.65, + "learning_rate": 3.3596430844139216e-05, + "loss": 1.6242, + "step": 1935 + }, + { + "epoch": 2.66, + "learning_rate": 3.231602534342587e-05, + "loss": 1.6191, + "step": 1940 + }, + { + "epoch": 2.66, + "learning_rate": 3.105968020439026e-05, + "loss": 1.7211, + "step": 1945 + }, + { + "epoch": 2.67, + "learning_rate": 2.9827460060673938e-05, + "loss": 1.7227, + "step": 1950 + }, + { + "epoch": 2.68, + "learning_rate": 2.8619428304789697e-05, + "loss": 1.7096, + "step": 1955 + }, + { + "epoch": 2.68, + "learning_rate": 2.743564708485996e-05, + "loss": 1.7527, + "step": 1960 + }, + { + "epoch": 2.69, + "learning_rate": 2.6276177301419955e-05, + "loss": 1.6941, + "step": 1965 + }, + { + "epoch": 2.7, + "learning_rate": 2.5141078604284108e-05, + "loss": 1.6471, + "step": 1970 + }, + { + "epoch": 2.71, + "learning_rate": 2.4030409389477757e-05, + "loss": 1.7648, + "step": 1975 + }, + { + "epoch": 2.71, + "learning_rate": 2.2944226796232537e-05, + "loss": 1.5269, + "step": 1980 + }, + { + "epoch": 2.72, + "learning_rate": 2.188258670404719e-05, + "loss": 1.6842, + "step": 1985 + }, + { + "epoch": 2.73, + "learning_rate": 2.0845543729812566e-05, + "loss": 1.8391, + "step": 1990 + }, + { + "epoch": 2.73, + "learning_rate": 1.9833151225001734e-05, + "loss": 1.6511, + "step": 1995 + }, + { + "epoch": 2.74, + "learning_rate": 1.884546127292569e-05, + "loss": 1.6843, + "step": 2000 + }, + { + "epoch": 2.74, + "eval_loss": 1.988287329673767, + "eval_runtime": 8.8383, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 2000 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 5.906114244168253e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-2000/training_args.bin b/SFT/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-2100/README.md b/SFT/checkpoint-2100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-2100/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-2100/adapter_config.json b/SFT/checkpoint-2100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-2100/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-2100/adapter_model.bin b/SFT/checkpoint-2100/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..686bbb6210494c9f5918ddafeb3d0b62da71884e --- /dev/null +++ b/SFT/checkpoint-2100/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c77033327d358030e6779b97afc5e7382636d6f679c223d1faef04ea679c7e2d +size 16821197 diff --git a/SFT/checkpoint-2100/finetuning_args.json b/SFT/checkpoint-2100/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-2100/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-2100/optimizer.pt b/SFT/checkpoint-2100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b324c09dc1a42f93f38e9ba98201a96cd858d22f --- /dev/null +++ b/SFT/checkpoint-2100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16c5f918b1c65b302f64de860c22a85958ed5a17710396de24aad9f9b6bbbd74 +size 33661637 diff --git a/SFT/checkpoint-2100/rng_state.pth b/SFT/checkpoint-2100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ceed93c795da5890ca8838f4e9deec41c6920123 --- /dev/null +++ b/SFT/checkpoint-2100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96006247b173d6de3c06d36d0d1dfa25e02aed53488e1d76ac0dc27ba52d5d90 +size 14575 diff --git a/SFT/checkpoint-2100/scheduler.pt b/SFT/checkpoint-2100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..037b9562c5bd892f2d0dbc42460433df6dc3590e --- /dev/null +++ b/SFT/checkpoint-2100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f860f2cc88430c7a4dd7f6dedefae0ade49b8738e0d070ea02cab7e75489ab67 +size 627 diff --git a/SFT/checkpoint-2100/trainer_state.json b/SFT/checkpoint-2100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..558aae316b356d2306d3c4fe19a48004925cd57b --- /dev/null +++ b/SFT/checkpoint-2100/trainer_state.json @@ -0,0 +1,2707 @@ +{ + "best_metric": 1.98636794090271, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-2100", + "epoch": 2.8767123287671232, + "eval_steps": 100, + "global_step": 2100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022378234341619019, + "loss": 1.6531, + "step": 1505 + }, + { + "epoch": 2.07, + "learning_rate": 0.00022080009894693948, + "loss": 1.6599, + "step": 1510 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021783221813127473, + "loss": 1.7998, + "step": 1515 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002148788536541064, + "loss": 1.6775, + "step": 1520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021194015745354123, + "loss": 1.5615, + "step": 1525 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020901628071306455, + "loss": 1.7347, + "step": 1530 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020610737385376348, + "loss": 1.7952, + "step": 1535 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020321358652658806, + "loss": 1.6405, + "step": 1540 + }, + { + "epoch": 2.12, + "learning_rate": 0.00020033506760465237, + "loss": 1.5677, + "step": 1545 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001974719651755756, + "loss": 1.7295, + "step": 1550 + }, + { + "epoch": 2.13, + "learning_rate": 0.0001946244265338637, + "loss": 1.6888, + "step": 1555 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019179259817333133, + "loss": 1.6395, + "step": 1560 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001889766257795663, + "loss": 1.6953, + "step": 1565 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018617665422243336, + "loss": 1.7618, + "step": 1570 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018339282754862223, + "loss": 1.6839, + "step": 1575 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018062528897423643, + "loss": 1.654, + "step": 1580 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017787418087742614, + "loss": 1.7326, + "step": 1585 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017513964479106266, + "loss": 1.7163, + "step": 1590 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017242182139545742, + "loss": 1.6667, + "step": 1595 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001697208505111249, + "loss": 1.7881, + "step": 1600 + }, + { + "epoch": 2.19, + "eval_loss": 2.008232831954956, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1600 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016703687109158888, + "loss": 1.6769, + "step": 1605 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016437002121623434, + "loss": 1.7811, + "step": 1610 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016172043808320368, + "loss": 1.6699, + "step": 1615 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015908825800233824, + "loss": 1.7141, + "step": 1620 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015647361638816655, + "loss": 1.7672, + "step": 1625 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015387664775293658, + "loss": 1.7043, + "step": 1630 + }, + { + "epoch": 2.24, + "learning_rate": 0.00015129748569969663, + "loss": 1.8051, + "step": 1635 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014873626291542148, + "loss": 1.7196, + "step": 1640 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014619311116418693, + "loss": 1.6664, + "step": 1645 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014366816128039007, + "loss": 1.6404, + "step": 1650 + }, + { + "epoch": 2.27, + "learning_rate": 0.00014116154316201908, + "loss": 1.7127, + "step": 1655 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013867338576397043, + "loss": 1.6906, + "step": 1660 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013620381709141455, + "loss": 1.5734, + "step": 1665 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001337529641932107, + "loss": 1.7073, + "step": 1670 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001313209531553707, + "loss": 1.7745, + "step": 1675 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012890790909457213, + "loss": 1.6972, + "step": 1680 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012651395615172239, + "loss": 1.6176, + "step": 1685 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012413921748557127, + "loss": 1.6887, + "step": 1690 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012178381526637533, + "loss": 1.8215, + "step": 1695 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011944787066961266, + "loss": 1.6511, + "step": 1700 + }, + { + "epoch": 2.33, + "eval_loss": 2.0000417232513428, + "eval_runtime": 8.8396, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1700 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011713150386974947, + "loss": 1.7458, + "step": 1705 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011483483403405659, + "loss": 1.6685, + "step": 1710 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001125579793164797, + "loss": 1.733, + "step": 1715 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011030105685156039, + "loss": 1.7281, + "step": 1720 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010806418274841024, + "loss": 1.7581, + "step": 1725 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010584747208473738, + "loss": 1.6238, + "step": 1730 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010365103890092636, + "loss": 1.7462, + "step": 1735 + }, + { + "epoch": 2.38, + "learning_rate": 0.000101474996194171, + "loss": 1.7897, + "step": 1740 + }, + { + "epoch": 2.39, + "learning_rate": 9.931945591266172e-05, + "loss": 1.8603, + "step": 1745 + }, + { + "epoch": 2.4, + "learning_rate": 9.718452894982571e-05, + "loss": 1.8379, + "step": 1750 + }, + { + "epoch": 2.4, + "learning_rate": 9.507032513862195e-05, + "loss": 1.7378, + "step": 1755 + }, + { + "epoch": 2.41, + "learning_rate": 9.297695324589106e-05, + "loss": 1.6022, + "step": 1760 + }, + { + "epoch": 2.42, + "learning_rate": 9.090452096675993e-05, + "loss": 1.7144, + "step": 1765 + }, + { + "epoch": 2.42, + "learning_rate": 8.885313491910052e-05, + "loss": 1.6529, + "step": 1770 + }, + { + "epoch": 2.43, + "learning_rate": 8.682290063804527e-05, + "loss": 1.7523, + "step": 1775 + }, + { + "epoch": 2.44, + "learning_rate": 8.48139225705578e-05, + "loss": 1.6469, + "step": 1780 + }, + { + "epoch": 2.45, + "learning_rate": 8.28263040700598e-05, + "loss": 1.7107, + "step": 1785 + }, + { + "epoch": 2.45, + "learning_rate": 8.086014739111297e-05, + "loss": 1.5931, + "step": 1790 + }, + { + "epoch": 2.46, + "learning_rate": 7.891555368415947e-05, + "loss": 1.7049, + "step": 1795 + }, + { + "epoch": 2.47, + "learning_rate": 7.699262299031778e-05, + "loss": 1.6175, + "step": 1800 + }, + { + "epoch": 2.47, + "eval_loss": 1.9944177865982056, + "eval_runtime": 8.8381, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1800 + }, + { + "epoch": 2.47, + "learning_rate": 7.509145423623608e-05, + "loss": 1.5914, + "step": 1805 + }, + { + "epoch": 2.48, + "learning_rate": 7.321214522900271e-05, + "loss": 1.5903, + "step": 1810 + }, + { + "epoch": 2.49, + "learning_rate": 7.13547926511145e-05, + "loss": 1.5715, + "step": 1815 + }, + { + "epoch": 2.49, + "learning_rate": 6.951949205550284e-05, + "loss": 1.5976, + "step": 1820 + }, + { + "epoch": 2.5, + "learning_rate": 6.770633786061819e-05, + "loss": 1.7028, + "step": 1825 + }, + { + "epoch": 2.51, + "learning_rate": 6.591542334557222e-05, + "loss": 1.7763, + "step": 1830 + }, + { + "epoch": 2.51, + "learning_rate": 6.41468406453391e-05, + "loss": 1.5076, + "step": 1835 + }, + { + "epoch": 2.52, + "learning_rate": 6.240068074601568e-05, + "loss": 1.665, + "step": 1840 + }, + { + "epoch": 2.53, + "learning_rate": 6.067703348014086e-05, + "loss": 1.7332, + "step": 1845 + }, + { + "epoch": 2.53, + "learning_rate": 5.897598752207328e-05, + "loss": 1.7152, + "step": 1850 + }, + { + "epoch": 2.54, + "learning_rate": 5.729763038343022e-05, + "loss": 1.7113, + "step": 1855 + }, + { + "epoch": 2.55, + "learning_rate": 5.564204840858511e-05, + "loss": 1.8046, + "step": 1860 + }, + { + "epoch": 2.55, + "learning_rate": 5.40093267702258e-05, + "loss": 1.652, + "step": 1865 + }, + { + "epoch": 2.56, + "learning_rate": 5.239954946497227e-05, + "loss": 1.7188, + "step": 1870 + }, + { + "epoch": 2.57, + "learning_rate": 5.0812799309055746e-05, + "loss": 1.7157, + "step": 1875 + }, + { + "epoch": 2.58, + "learning_rate": 4.9249157934057985e-05, + "loss": 1.7516, + "step": 1880 + }, + { + "epoch": 2.58, + "learning_rate": 4.770870578271197e-05, + "loss": 1.7406, + "step": 1885 + }, + { + "epoch": 2.59, + "learning_rate": 4.619152210476296e-05, + "loss": 1.6949, + "step": 1890 + }, + { + "epoch": 2.6, + "learning_rate": 4.469768495289189e-05, + "loss": 1.6593, + "step": 1895 + }, + { + "epoch": 2.6, + "learning_rate": 4.322727117869951e-05, + "loss": 1.762, + "step": 1900 + }, + { + "epoch": 2.6, + "eval_loss": 1.991329312324524, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1900 + }, + { + "epoch": 2.61, + "learning_rate": 4.178035642875322e-05, + "loss": 1.8229, + "step": 1905 + }, + { + "epoch": 2.62, + "learning_rate": 4.0357015140694843e-05, + "loss": 1.7235, + "step": 1910 + }, + { + "epoch": 2.62, + "learning_rate": 3.89573205394112e-05, + "loss": 1.6724, + "step": 1915 + }, + { + "epoch": 2.63, + "learning_rate": 3.758134463326729e-05, + "loss": 1.576, + "step": 1920 + }, + { + "epoch": 2.64, + "learning_rate": 3.622915821040174e-05, + "loss": 1.6975, + "step": 1925 + }, + { + "epoch": 2.64, + "learning_rate": 3.4900830835084604e-05, + "loss": 1.8066, + "step": 1930 + }, + { + "epoch": 2.65, + "learning_rate": 3.3596430844139216e-05, + "loss": 1.6242, + "step": 1935 + }, + { + "epoch": 2.66, + "learning_rate": 3.231602534342587e-05, + "loss": 1.6191, + "step": 1940 + }, + { + "epoch": 2.66, + "learning_rate": 3.105968020439026e-05, + "loss": 1.7211, + "step": 1945 + }, + { + "epoch": 2.67, + "learning_rate": 2.9827460060673938e-05, + "loss": 1.7227, + "step": 1950 + }, + { + "epoch": 2.68, + "learning_rate": 2.8619428304789697e-05, + "loss": 1.7096, + "step": 1955 + }, + { + "epoch": 2.68, + "learning_rate": 2.743564708485996e-05, + "loss": 1.7527, + "step": 1960 + }, + { + "epoch": 2.69, + "learning_rate": 2.6276177301419955e-05, + "loss": 1.6941, + "step": 1965 + }, + { + "epoch": 2.7, + "learning_rate": 2.5141078604284108e-05, + "loss": 1.6471, + "step": 1970 + }, + { + "epoch": 2.71, + "learning_rate": 2.4030409389477757e-05, + "loss": 1.7648, + "step": 1975 + }, + { + "epoch": 2.71, + "learning_rate": 2.2944226796232537e-05, + "loss": 1.5269, + "step": 1980 + }, + { + "epoch": 2.72, + "learning_rate": 2.188258670404719e-05, + "loss": 1.6842, + "step": 1985 + }, + { + "epoch": 2.73, + "learning_rate": 2.0845543729812566e-05, + "loss": 1.8391, + "step": 1990 + }, + { + "epoch": 2.73, + "learning_rate": 1.9833151225001734e-05, + "loss": 1.6511, + "step": 1995 + }, + { + "epoch": 2.74, + "learning_rate": 1.884546127292569e-05, + "loss": 1.6843, + "step": 2000 + }, + { + "epoch": 2.74, + "eval_loss": 1.988287329673767, + "eval_runtime": 8.8383, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 2000 + }, + { + "epoch": 2.75, + "learning_rate": 1.7882524686053393e-05, + "loss": 1.6908, + "step": 2005 + }, + { + "epoch": 2.75, + "learning_rate": 1.6944391003397895e-05, + "loss": 1.7248, + "step": 2010 + }, + { + "epoch": 2.76, + "learning_rate": 1.603110848796785e-05, + "loss": 1.6644, + "step": 2015 + }, + { + "epoch": 2.77, + "learning_rate": 1.5142724124284579e-05, + "loss": 1.674, + "step": 2020 + }, + { + "epoch": 2.77, + "learning_rate": 1.42792836159647e-05, + "loss": 1.7485, + "step": 2025 + }, + { + "epoch": 2.78, + "learning_rate": 1.3440831383369045e-05, + "loss": 1.7741, + "step": 2030 + }, + { + "epoch": 2.79, + "learning_rate": 1.2627410561317387e-05, + "loss": 1.7444, + "step": 2035 + }, + { + "epoch": 2.79, + "learning_rate": 1.1839062996869377e-05, + "loss": 1.7367, + "step": 2040 + }, + { + "epoch": 2.8, + "learning_rate": 1.1075829247171598e-05, + "loss": 1.6514, + "step": 2045 + }, + { + "epoch": 2.81, + "learning_rate": 1.0337748577371186e-05, + "loss": 1.5851, + "step": 2050 + }, + { + "epoch": 2.82, + "learning_rate": 9.624858958595716e-06, + "loss": 1.735, + "step": 2055 + }, + { + "epoch": 2.82, + "learning_rate": 8.937197065999714e-06, + "loss": 1.7278, + "step": 2060 + }, + { + "epoch": 2.83, + "learning_rate": 8.274798276878049e-06, + "loss": 1.4815, + "step": 2065 + }, + { + "epoch": 2.84, + "learning_rate": 7.637696668845728e-06, + "loss": 1.7065, + "step": 2070 + }, + { + "epoch": 2.84, + "learning_rate": 7.0259250180848e-06, + "loss": 1.6512, + "step": 2075 + }, + { + "epoch": 2.85, + "learning_rate": 6.439514797658308e-06, + "loss": 1.6191, + "step": 2080 + }, + { + "epoch": 2.86, + "learning_rate": 5.8784961758908685e-06, + "loss": 1.613, + "step": 2085 + }, + { + "epoch": 2.86, + "learning_rate": 5.342898014816855e-06, + "loss": 1.5417, + "step": 2090 + }, + { + "epoch": 2.87, + "learning_rate": 4.832747868695475e-06, + "loss": 1.6967, + "step": 2095 + }, + { + "epoch": 2.88, + "learning_rate": 4.348071982593293e-06, + "loss": 1.7315, + "step": 2100 + }, + { + "epoch": 2.88, + "eval_loss": 1.98636794090271, + "eval_runtime": 8.8386, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 2100 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 6.199349437854843e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-2100/training_args.bin b/SFT/checkpoint-2100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-2100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-300/README.md b/SFT/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-300/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-300/adapter_config.json b/SFT/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-300/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-300/adapter_model.bin b/SFT/checkpoint-300/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..435098f21fd7e8c2b2423caaa06b0739fd878817 --- /dev/null +++ b/SFT/checkpoint-300/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf7264c7b922b223f92f5a00458dac2dcdd080b3c4ebfdb88255337d2fcd616 +size 16821197 diff --git a/SFT/checkpoint-300/finetuning_args.json b/SFT/checkpoint-300/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-300/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-300/optimizer.pt b/SFT/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b69f4187ddb2d76b3fdc7d068b44340aaccc111 --- /dev/null +++ b/SFT/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f4fd271e3f35aad797430a0f2005c749c3af70ea63d91ec8cab93f66e822599 +size 33661637 diff --git a/SFT/checkpoint-300/rng_state.pth b/SFT/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bcfe1c4d9970712806ccb143ff89e5f8f584775d --- /dev/null +++ b/SFT/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ee31f4e8ccf03669cc613e5d858c3984c8e3c970b1a9ba434908c29fd0aac39 +size 14575 diff --git a/SFT/checkpoint-300/scheduler.pt b/SFT/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a30ac8d71053940779c1b2cdeca116250ed4e72c --- /dev/null +++ b/SFT/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e62fb217d4f433c5218c2eb7a93e48263c5471fb5ecd1e2f448132c605bafbc +size 627 diff --git a/SFT/checkpoint-300/trainer_state.json b/SFT/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..884dfa3fd6e6195c75ae1e58eb1b228b5ad0e36d --- /dev/null +++ b/SFT/checkpoint-300/trainer_state.json @@ -0,0 +1,403 @@ +{ + "best_metric": 2.184567451477051, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-300", + "epoch": 0.410958904109589, + "eval_steps": 100, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 8.864822917624627e+16, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-300/training_args.bin b/SFT/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-400/README.md b/SFT/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-400/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-400/adapter_config.json b/SFT/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-400/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-400/adapter_model.bin b/SFT/checkpoint-400/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..c0f82bbe818e10dcfa429c0c9f2b944bd976c3fd --- /dev/null +++ b/SFT/checkpoint-400/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce1dd8a5189a3cb2e39883ba19146d7935d32a09721615c650af0995640860d9 +size 16821197 diff --git a/SFT/checkpoint-400/finetuning_args.json b/SFT/checkpoint-400/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-400/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-400/optimizer.pt b/SFT/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..757fa7f1e47df0f527c42e85eefc6bfeebe525f0 --- /dev/null +++ b/SFT/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c676b41dd9644920d3318021ddc996f1facc981390b524762f67ee148ec0421 +size 33661637 diff --git a/SFT/checkpoint-400/rng_state.pth b/SFT/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e3f955cce60c782a9dfb39d78e5927cd5a0a57a9 --- /dev/null +++ b/SFT/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b3f46e917a8781ca7c66db61abab174b5b496de3010eac5757054f728a96ca6 +size 14575 diff --git a/SFT/checkpoint-400/scheduler.pt b/SFT/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..546a531f83f2dda52fdec8dc8126089d2592979d --- /dev/null +++ b/SFT/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ac5d3f43ed69446fad9930029d8fae9a1a7ffa8bbc4be2c259c862806fc36ef +size 627 diff --git a/SFT/checkpoint-400/trainer_state.json b/SFT/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..308c49eb9173079f3bcb6078b97dfb4a29dbe6b3 --- /dev/null +++ b/SFT/checkpoint-400/trainer_state.json @@ -0,0 +1,531 @@ +{ + "best_metric": 2.1513657569885254, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-400", + "epoch": 0.547945205479452, + "eval_steps": 100, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 1.1819710234499482e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-400/training_args.bin b/SFT/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-500/README.md b/SFT/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-500/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-500/adapter_config.json b/SFT/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-500/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-500/adapter_model.bin b/SFT/checkpoint-500/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..51055ae497dda5e7d5382584c7e4ebff3d3d2c7c --- /dev/null +++ b/SFT/checkpoint-500/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f173813bf19288e92c2b5be67ffb61c8b03f98a294b8110101e3abc218e2610 +size 16821197 diff --git a/SFT/checkpoint-500/finetuning_args.json b/SFT/checkpoint-500/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-500/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-500/optimizer.pt b/SFT/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3954004fba2a7f9d82d203136bc213368261207e --- /dev/null +++ b/SFT/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daaef6c63b1e5004ec4a80d49b4fbee6901c71eb2376fa80422d65d1b9cc848e +size 33661637 diff --git a/SFT/checkpoint-500/rng_state.pth b/SFT/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4284e9d73ffdc6ca4430c88fa395f7a40e8324a3 --- /dev/null +++ b/SFT/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3677a4fccdce55d183b8bc09adb74bb89d2b29bfb9398ee43ebc840ec2992e5e +size 14575 diff --git a/SFT/checkpoint-500/scheduler.pt b/SFT/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bede0d6dc943b945544ced964468b456c9f1183e --- /dev/null +++ b/SFT/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f74d397706a0ea1524d9b5b10c103cae47ffea1ad7fc83e3ef4470c2feb1dffb +size 627 diff --git a/SFT/checkpoint-500/trainer_state.json b/SFT/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d07fcc772160305e3bd908c47f869ec1f75fcaca --- /dev/null +++ b/SFT/checkpoint-500/trainer_state.json @@ -0,0 +1,659 @@ +{ + "best_metric": 2.1377201080322266, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-500", + "epoch": 0.684931506849315, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 1.4744657689362432e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-500/training_args.bin b/SFT/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-600/README.md b/SFT/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-600/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-600/adapter_config.json b/SFT/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-600/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-600/adapter_model.bin b/SFT/checkpoint-600/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0bee1ca3cbda3045defdd28845b7e38fb2dcccb0 --- /dev/null +++ b/SFT/checkpoint-600/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3336198add616e2180956c42280cecf2625fcfa9c26880f30ebe78f241795481 +size 16821197 diff --git a/SFT/checkpoint-600/finetuning_args.json b/SFT/checkpoint-600/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-600/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-600/optimizer.pt b/SFT/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2058ab88cef58af62e6cac4dba37bfb4301a3b9d --- /dev/null +++ b/SFT/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331f9d68e5fb23cc3b3003560faa73570c2e4418fcb30a27505b62dfe60bde06 +size 33661637 diff --git a/SFT/checkpoint-600/rng_state.pth b/SFT/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4cb254b0e8d8f341e8924017e2f394e7416b05b5 --- /dev/null +++ b/SFT/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541c6546d013a1e9213e668d930152dc34cc0939b9d3159058f37df42860ffc1 +size 14575 diff --git a/SFT/checkpoint-600/scheduler.pt b/SFT/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a0cbe6575bcad9a5c86ffda04ca5ed91943ebbe --- /dev/null +++ b/SFT/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430f3b776a44554544b9578c01edda38b938bbbe8d035ee2bd1f17b36b31b3c3 +size 627 diff --git a/SFT/checkpoint-600/trainer_state.json b/SFT/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6842191420c6e4452def76315d500f5a808020ef --- /dev/null +++ b/SFT/checkpoint-600/trainer_state.json @@ -0,0 +1,787 @@ +{ + "best_metric": 2.1165168285369873, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-600", + "epoch": 0.821917808219178, + "eval_steps": 100, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 1.7713291588042752e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-600/training_args.bin b/SFT/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-700/README.md b/SFT/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-700/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-700/adapter_config.json b/SFT/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-700/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-700/adapter_model.bin b/SFT/checkpoint-700/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..12f09e8b43f6b6465e597f0d3578792eae75f67c --- /dev/null +++ b/SFT/checkpoint-700/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de8fa61fd601e28210ffd256372fd0e63febceabc334c8182c8e064d118cbe2 +size 16821197 diff --git a/SFT/checkpoint-700/finetuning_args.json b/SFT/checkpoint-700/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-700/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-700/optimizer.pt b/SFT/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f416a9e5e0bc0ef971899344920db9409369688 --- /dev/null +++ b/SFT/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eabced39a5d88798db0d01626b283c4626daf12874e2af691cd8b915ca027c0b +size 33661637 diff --git a/SFT/checkpoint-700/rng_state.pth b/SFT/checkpoint-700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f942e6561c1a4dcaf9eb6b7136c1c18fd4a38e17 --- /dev/null +++ b/SFT/checkpoint-700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9205f0458b166c2c357b9ff86dd842ef7e5d8c6c7eff85f76acef0f0c6476d5 +size 14575 diff --git a/SFT/checkpoint-700/scheduler.pt b/SFT/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f9317dcc4cbd0643dcbeb519d818888757f58bf --- /dev/null +++ b/SFT/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a677467ef0a2bb886a1c8ff0091996edb1537eb68c4f32e08e2ec3edbba69730 +size 627 diff --git a/SFT/checkpoint-700/trainer_state.json b/SFT/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9781b787f8fb6f0bb981a4ba08efa813806f7726 --- /dev/null +++ b/SFT/checkpoint-700/trainer_state.json @@ -0,0 +1,915 @@ +{ + "best_metric": 2.0954582691192627, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-700", + "epoch": 0.958904109589041, + "eval_steps": 100, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 2.0651100306210816e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-700/training_args.bin b/SFT/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-800/README.md b/SFT/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-800/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-800/adapter_config.json b/SFT/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-800/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-800/adapter_model.bin b/SFT/checkpoint-800/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..181ad72a2fb5ecbbd851d45475ad24638259cb51 --- /dev/null +++ b/SFT/checkpoint-800/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b7100d23a997418396835f57044d3cc53bfda13b068bba631bd3a7fbd74c47 +size 16821197 diff --git a/SFT/checkpoint-800/finetuning_args.json b/SFT/checkpoint-800/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-800/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-800/optimizer.pt b/SFT/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..55ff62b95558f844296846894bcc45d8ad4c7278 --- /dev/null +++ b/SFT/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f87c8aa7af4d6c6a23c3f32ae51e843b17416b0cd9b2bf8e4a314a91b77e1ca +size 33661637 diff --git a/SFT/checkpoint-800/rng_state.pth b/SFT/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..805613dbc9f9d30adbcac69b5f2ae03d5f6969d6 --- /dev/null +++ b/SFT/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5506f73a72272c93605af2b549b5fcefdf5fe30b64be61c2c76a2a7a6dd2c51d +size 14575 diff --git a/SFT/checkpoint-800/scheduler.pt b/SFT/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0da30e963d492645ccc1d69938158ac728d306e4 --- /dev/null +++ b/SFT/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eba8ba02dc556cfedf0768ba9f80866dc0271cde0f53fe174576c8a78bd5281 +size 627 diff --git a/SFT/checkpoint-800/trainer_state.json b/SFT/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..575dd9289faa986279c047bf7b810fece7a2fe31 --- /dev/null +++ b/SFT/checkpoint-800/trainer_state.json @@ -0,0 +1,1043 @@ +{ + "best_metric": 2.081965923309326, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-800", + "epoch": 1.095890410958904, + "eval_steps": 100, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 2.3608466514886656e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-800/training_args.bin b/SFT/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/checkpoint-900/README.md b/SFT/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/SFT/checkpoint-900/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/SFT/checkpoint-900/adapter_config.json b/SFT/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c871f46929829c2765f5be7eede499cfb9a19a31 --- /dev/null +++ b/SFT/checkpoint-900/adapter_config.json @@ -0,0 +1,21 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "ziqingyang/chinese-alpaca-2-7b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/SFT/checkpoint-900/adapter_model.bin b/SFT/checkpoint-900/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..0524d7cd8f8850845cd13ba0e9bb8779d3ef89f6 --- /dev/null +++ b/SFT/checkpoint-900/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54c41322f65a5964cc697c17a103b6464666cfc1b75b7fd8bf9b081d5d872057 +size 16821197 diff --git a/SFT/checkpoint-900/finetuning_args.json b/SFT/checkpoint-900/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/checkpoint-900/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/checkpoint-900/optimizer.pt b/SFT/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d54af775a0b12de6e518c2fc079e7ab141ad652 --- /dev/null +++ b/SFT/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1e770ee0df2c41e1cfd43d80c02f16bd9e6479b9f0ba042087342844fc625a +size 33661637 diff --git a/SFT/checkpoint-900/rng_state.pth b/SFT/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..96df7f8b478c7081a0e8485a7c91157ed32c572e --- /dev/null +++ b/SFT/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40fcbf45884b49742c6abb9f8899213ad59dbbcf65d267b73f2cb3f8f5e2270c +size 14575 diff --git a/SFT/checkpoint-900/scheduler.pt b/SFT/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a29c7081590d5c3d8b7b6569e0745db458bb175f --- /dev/null +++ b/SFT/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0219ace2af72839d4140343fca73a9ef3451eab6382d38108c6978bb1686af47 +size 627 diff --git a/SFT/checkpoint-900/trainer_state.json b/SFT/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9a181e6ce2ba4198f2176481db5a3b60887c29f9 --- /dev/null +++ b/SFT/checkpoint-900/trainer_state.json @@ -0,0 +1,1171 @@ +{ + "best_metric": 2.0750818252563477, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-900", + "epoch": 1.2328767123287672, + "eval_steps": 100, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 2.65399331332522e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/checkpoint-900/training_args.bin b/SFT/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/SFT/eval_results.json b/SFT/eval_results.json new file mode 100644 index 0000000000000000000000000000000000000000..12d4e023e2b537fd52b4ef93d887f2e4560fa1c5 --- /dev/null +++ b/SFT/eval_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.0, + "eval_loss": 1.98636794090271, + "eval_runtime": 8.8384, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697 +} \ No newline at end of file diff --git a/SFT/finetuning_args.json b/SFT/finetuning_args.json new file mode 100644 index 0000000000000000000000000000000000000000..d01efc206b59c6f88548e8f3940579f2ed2af33b --- /dev/null +++ b/SFT/finetuning_args.json @@ -0,0 +1,16 @@ +{ + "dpo_beta": 0.1, + "finetuning_type": "lora", + "lora_alpha": 32.0, + "lora_dropout": 0.1, + "lora_rank": 8, + "lora_target": [ + "q_proj", + "v_proj" + ], + "name_module_trainable": "mlp", + "num_hidden_layers": 32, + "num_layer_trainable": 3, + "ppo_score_norm": false, + "resume_lora_training": true +} diff --git a/SFT/train_results.json b/SFT/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2f56d29f718eb131b194b8c7a1b27a407bca25a7 --- /dev/null +++ b/SFT/train_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 3.0, + "train_loss": 1.969855450712927, + "train_runtime": 7239.9426, + "train_samples_per_second": 4.84, + "train_steps_per_second": 0.302 +} \ No newline at end of file diff --git a/SFT/trainer_log.jsonl b/SFT/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..59d584b762ecde5312acfcd087aab07edd649889 --- /dev/null +++ b/SFT/trainer_log.jsonl @@ -0,0 +1,461 @@ +{"current_steps": 5, "total_steps": 2190, "loss": 4.0099, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000999995369868095, "epoch": 0.01, "percentage": 0.23, "elapsed_time": "0:00:14", "remaining_time": "1:43:41"} +{"current_steps": 10, "total_steps": 2190, "loss": 2.887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009999670749281082, "epoch": 0.01, "percentage": 0.46, "elapsed_time": "0:00:32", "remaining_time": "1:56:26"} +{"current_steps": 15, "total_steps": 2190, "loss": 2.6742, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000999913058797528, "epoch": 0.02, "percentage": 0.68, "elapsed_time": "0:00:48", "remaining_time": "1:57:33"} +{"current_steps": 20, "total_steps": 2190, "loss": 2.7939, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009998333242552556, "epoch": 0.03, "percentage": 0.91, "elapsed_time": "0:01:01", "remaining_time": "1:52:04"} +{"current_steps": 25, "total_steps": 2190, "loss": 2.6062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009997278754032958, "epoch": 0.03, "percentage": 1.14, "elapsed_time": "0:01:17", "remaining_time": "1:51:51"} +{"current_steps": 30, "total_steps": 2190, "loss": 2.7508, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009995967176665461, "epoch": 0.04, "percentage": 1.37, "elapsed_time": "0:01:32", "remaining_time": "1:50:50"} +{"current_steps": 35, "total_steps": 2190, "loss": 2.547, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009994398577925167, "epoch": 0.05, "percentage": 1.6, "elapsed_time": "0:01:48", "remaining_time": "1:51:19"} +{"current_steps": 40, "total_steps": 2190, "loss": 2.5281, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009992573038509849, "epoch": 0.05, "percentage": 1.83, "elapsed_time": "0:02:04", "remaining_time": "1:51:24"} +{"current_steps": 45, "total_steps": 2190, "loss": 2.6397, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009990490652335784, "epoch": 0.06, "percentage": 2.05, "elapsed_time": "0:02:19", "remaining_time": "1:50:52"} +{"current_steps": 50, "total_steps": 2190, "loss": 2.608, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000998815152653293, "epoch": 0.07, "percentage": 2.28, "elapsed_time": "0:02:35", "remaining_time": "1:50:34"} +{"current_steps": 55, "total_steps": 2190, "loss": 2.4076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000998555578143941, "epoch": 0.08, "percentage": 2.51, "elapsed_time": "0:02:52", "remaining_time": "1:51:50"} +{"current_steps": 60, "total_steps": 2190, "loss": 2.5914, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009982703550595329, "epoch": 0.08, "percentage": 2.74, "elapsed_time": "0:03:11", "remaining_time": "1:53:17"} +{"current_steps": 65, "total_steps": 2190, "loss": 2.3517, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009979594980735896, "epoch": 0.09, "percentage": 2.97, "elapsed_time": "0:03:27", "remaining_time": "1:52:58"} +{"current_steps": 70, "total_steps": 2190, "loss": 2.4019, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009976230231783876, "epoch": 0.1, "percentage": 3.2, "elapsed_time": "0:03:42", "remaining_time": "1:52:22"} +{"current_steps": 75, "total_steps": 2190, "loss": 2.4481, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009972609476841367, "epoch": 0.1, "percentage": 3.42, "elapsed_time": "0:03:59", "remaining_time": "1:52:34"} +{"current_steps": 80, "total_steps": 2190, "loss": 2.3244, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000996873290218089, "epoch": 0.11, "percentage": 3.65, "elapsed_time": "0:04:16", "remaining_time": "1:52:55"} +{"current_steps": 85, "total_steps": 2190, "loss": 2.4821, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000996460070723581, "epoch": 0.12, "percentage": 3.88, "elapsed_time": "0:04:32", "remaining_time": "1:52:24"} +{"current_steps": 90, "total_steps": 2190, "loss": 2.438, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009960213104590074, "epoch": 0.12, "percentage": 4.11, "elapsed_time": "0:04:50", "remaining_time": "1:53:02"} +{"current_steps": 95, "total_steps": 2190, "loss": 2.2925, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009955570319967273, "epoch": 0.13, "percentage": 4.34, "elapsed_time": "0:05:06", "remaining_time": "1:52:30"} +{"current_steps": 100, "total_steps": 2190, "loss": 2.3052, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009950672592219031, "epoch": 0.14, "percentage": 4.57, "elapsed_time": "0:05:22", "remaining_time": "1:52:23"} +{"current_steps": 100, "total_steps": 2190, "loss": null, "eval_loss": 2.3112449645996094, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.14, "percentage": 4.57, "elapsed_time": "0:05:22", "remaining_time": "1:52:23"} +{"current_steps": 105, "total_steps": 2190, "loss": 2.5839, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000994552017331272, "epoch": 0.14, "percentage": 4.79, "elapsed_time": "0:05:49", "remaining_time": "1:55:37"} +{"current_steps": 110, "total_steps": 2190, "loss": 2.4983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009940113328318488, "epoch": 0.15, "percentage": 5.02, "elapsed_time": "0:06:06", "remaining_time": "1:55:36"} +{"current_steps": 115, "total_steps": 2190, "loss": 2.4598, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009934452335395637, "epoch": 0.16, "percentage": 5.25, "elapsed_time": "0:06:22", "remaining_time": "1:55:08"} +{"current_steps": 120, "total_steps": 2190, "loss": 2.4336, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009928537485778299, "epoch": 0.16, "percentage": 5.48, "elapsed_time": "0:06:38", "remaining_time": "1:54:35"} +{"current_steps": 125, "total_steps": 2190, "loss": 2.3799, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009922369083760461, "epoch": 0.17, "percentage": 5.71, "elapsed_time": "0:06:51", "remaining_time": "1:53:24"} +{"current_steps": 130, "total_steps": 2190, "loss": 2.5443, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009915947446680307, "epoch": 0.18, "percentage": 5.94, "elapsed_time": "0:07:09", "remaining_time": "1:53:23"} +{"current_steps": 135, "total_steps": 2190, "loss": 2.3701, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009909272904903897, "epoch": 0.18, "percentage": 6.16, "elapsed_time": "0:07:24", "remaining_time": "1:52:41"} +{"current_steps": 140, "total_steps": 2190, "loss": 2.2718, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009902345801808161, "epoch": 0.19, "percentage": 6.39, "elapsed_time": "0:07:40", "remaining_time": "1:52:29"} +{"current_steps": 145, "total_steps": 2190, "loss": 2.3402, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009895166493763246, "epoch": 0.2, "percentage": 6.62, "elapsed_time": "0:07:56", "remaining_time": "1:52:03"} +{"current_steps": 150, "total_steps": 2190, "loss": 2.4072, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009887735350114174, "epoch": 0.21, "percentage": 6.85, "elapsed_time": "0:08:11", "remaining_time": "1:51:22"} +{"current_steps": 155, "total_steps": 2190, "loss": 2.3807, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000988005275316184, "epoch": 0.21, "percentage": 7.08, "elapsed_time": "0:08:27", "remaining_time": "1:51:08"} +{"current_steps": 160, "total_steps": 2190, "loss": 2.2149, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000987211909814336, "epoch": 0.22, "percentage": 7.31, "elapsed_time": "0:08:45", "remaining_time": "1:51:01"} +{"current_steps": 165, "total_steps": 2190, "loss": 2.2713, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000986393479321171, "epoch": 0.23, "percentage": 7.53, "elapsed_time": "0:09:01", "remaining_time": "1:50:48"} +{"current_steps": 170, "total_steps": 2190, "loss": 2.08, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009855500259414753, "epoch": 0.23, "percentage": 7.76, "elapsed_time": "0:09:18", "remaining_time": "1:50:31"} +{"current_steps": 175, "total_steps": 2190, "loss": 2.4682, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009846815930673563, "epoch": 0.24, "percentage": 7.99, "elapsed_time": "0:09:35", "remaining_time": "1:50:24"} +{"current_steps": 180, "total_steps": 2190, "loss": 2.3328, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000983788225376011, "epoch": 0.25, "percentage": 8.22, "elapsed_time": "0:09:50", "remaining_time": "1:49:48"} +{"current_steps": 185, "total_steps": 2190, "loss": 2.3671, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009828699688274275, "epoch": 0.25, "percentage": 8.45, "elapsed_time": "0:10:04", "remaining_time": "1:49:12"} +{"current_steps": 190, "total_steps": 2190, "loss": 2.3251, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009819268706620196, "epoch": 0.26, "percentage": 8.68, "elapsed_time": "0:10:20", "remaining_time": "1:48:53"} +{"current_steps": 195, "total_steps": 2190, "loss": 2.4429, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009809589793981978, "epoch": 0.27, "percentage": 8.9, "elapsed_time": "0:10:34", "remaining_time": "1:48:08"} +{"current_steps": 200, "total_steps": 2190, "loss": 2.2499, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009799663448298724, "epoch": 0.27, "percentage": 9.13, "elapsed_time": "0:10:52", "remaining_time": "1:48:13"} +{"current_steps": 200, "total_steps": 2190, "loss": null, "eval_loss": 2.234222650527954, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.27, "percentage": 9.13, "elapsed_time": "0:10:52", "remaining_time": "1:48:13"} +{"current_steps": 205, "total_steps": 2190, "loss": 2.2279, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009789490180238916, "epoch": 0.28, "percentage": 9.36, "elapsed_time": "0:11:17", "remaining_time": "1:49:21"} +{"current_steps": 210, "total_steps": 2190, "loss": 2.4875, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009779070513174157, "epoch": 0.29, "percentage": 9.59, "elapsed_time": "0:11:34", "remaining_time": "1:49:10"} +{"current_steps": 215, "total_steps": 2190, "loss": 2.3507, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009768404983152227, "epoch": 0.29, "percentage": 9.82, "elapsed_time": "0:11:50", "remaining_time": "1:48:43"} +{"current_steps": 220, "total_steps": 2190, "loss": 2.2189, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009757494138869523, "epoch": 0.3, "percentage": 10.05, "elapsed_time": "0:12:06", "remaining_time": "1:48:21"} +{"current_steps": 225, "total_steps": 2190, "loss": 2.2597, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009746338541642812, "epoch": 0.31, "percentage": 10.27, "elapsed_time": "0:12:22", "remaining_time": "1:48:03"} +{"current_steps": 230, "total_steps": 2190, "loss": 2.334, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009734938765380377, "epoch": 0.32, "percentage": 10.5, "elapsed_time": "0:12:38", "remaining_time": "1:47:44"} +{"current_steps": 235, "total_steps": 2190, "loss": 2.3133, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000972329539655247, "epoch": 0.32, "percentage": 10.73, "elapsed_time": "0:12:55", "remaining_time": "1:47:32"} +{"current_steps": 240, "total_steps": 2190, "loss": 2.2286, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009711409034161151, "epoch": 0.33, "percentage": 10.96, "elapsed_time": "0:13:11", "remaining_time": "1:47:11"} +{"current_steps": 245, "total_steps": 2190, "loss": 2.224, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009699280289709478, "epoch": 0.34, "percentage": 11.19, "elapsed_time": "0:13:25", "remaining_time": "1:46:37"} +{"current_steps": 250, "total_steps": 2190, "loss": 2.3772, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009686909787170031, "epoch": 0.34, "percentage": 11.42, "elapsed_time": "0:13:42", "remaining_time": "1:46:21"} +{"current_steps": 255, "total_steps": 2190, "loss": 2.3606, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009674298162952826, "epoch": 0.35, "percentage": 11.64, "elapsed_time": "0:13:59", "remaining_time": "1:46:11"} +{"current_steps": 260, "total_steps": 2190, "loss": 2.3207, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009661446065872568, "epoch": 0.36, "percentage": 11.87, "elapsed_time": "0:14:14", "remaining_time": "1:45:42"} +{"current_steps": 265, "total_steps": 2190, "loss": 2.2505, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009648354157115271, "epoch": 0.36, "percentage": 12.1, "elapsed_time": "0:14:30", "remaining_time": "1:45:26"} +{"current_steps": 270, "total_steps": 2190, "loss": 2.2192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009635023110204253, "epoch": 0.37, "percentage": 12.33, "elapsed_time": "0:14:45", "remaining_time": "1:44:57"} +{"current_steps": 275, "total_steps": 2190, "loss": 2.3082, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009621453610965467, "epoch": 0.38, "percentage": 12.56, "elapsed_time": "0:15:00", "remaining_time": "1:44:30"} +{"current_steps": 280, "total_steps": 2190, "loss": 2.2913, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009607646357492237, "epoch": 0.38, "percentage": 12.79, "elapsed_time": "0:15:18", "remaining_time": "1:44:28"} +{"current_steps": 285, "total_steps": 2190, "loss": 2.1133, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009593602060109334, "epoch": 0.39, "percentage": 13.01, "elapsed_time": "0:15:34", "remaining_time": "1:44:09"} +{"current_steps": 290, "total_steps": 2190, "loss": 2.1855, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009579321441336436, "epoch": 0.4, "percentage": 13.24, "elapsed_time": "0:15:52", "remaining_time": "1:43:58"} +{"current_steps": 295, "total_steps": 2190, "loss": 2.3691, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009564805235850955, "epoch": 0.4, "percentage": 13.47, "elapsed_time": "0:16:08", "remaining_time": "1:43:43"} +{"current_steps": 300, "total_steps": 2190, "loss": 2.1942, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009550054190450246, "epoch": 0.41, "percentage": 13.7, "elapsed_time": "0:16:26", "remaining_time": "1:43:33"} +{"current_steps": 300, "total_steps": 2190, "loss": null, "eval_loss": 2.184567451477051, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.41, "percentage": 13.7, "elapsed_time": "0:16:26", "remaining_time": "1:43:33"} +{"current_steps": 305, "total_steps": 2190, "loss": 2.2661, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000953506906401318, "epoch": 0.42, "percentage": 13.93, "elapsed_time": "0:16:49", "remaining_time": "1:43:58"} +{"current_steps": 310, "total_steps": 2190, "loss": 2.1222, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009519850627461109, "epoch": 0.42, "percentage": 14.16, "elapsed_time": "0:17:05", "remaining_time": "1:43:38"} +{"current_steps": 315, "total_steps": 2190, "loss": 2.2852, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009504399663718202, "epoch": 0.43, "percentage": 14.38, "elapsed_time": "0:17:21", "remaining_time": "1:43:20"} +{"current_steps": 320, "total_steps": 2190, "loss": 2.2543, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009488716967671169, "epoch": 0.44, "percentage": 14.61, "elapsed_time": "0:17:40", "remaining_time": "1:43:17"} +{"current_steps": 325, "total_steps": 2190, "loss": 2.4232, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009472803346128368, "epoch": 0.45, "percentage": 14.84, "elapsed_time": "0:17:56", "remaining_time": "1:42:58"} +{"current_steps": 330, "total_steps": 2190, "loss": 2.3455, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009456659617778294, "epoch": 0.45, "percentage": 15.07, "elapsed_time": "0:18:13", "remaining_time": "1:42:44"} +{"current_steps": 335, "total_steps": 2190, "loss": 2.2071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009440286613147466, "epoch": 0.46, "percentage": 15.3, "elapsed_time": "0:18:29", "remaining_time": "1:42:24"} +{"current_steps": 340, "total_steps": 2190, "loss": 2.2561, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009423685174557695, "epoch": 0.47, "percentage": 15.53, "elapsed_time": "0:18:44", "remaining_time": "1:41:56"} +{"current_steps": 345, "total_steps": 2190, "loss": 2.2257, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009406856156082755, "epoch": 0.47, "percentage": 15.75, "elapsed_time": "0:19:00", "remaining_time": "1:41:37"} +{"current_steps": 350, "total_steps": 2190, "loss": 2.0418, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009389800423504441, "epoch": 0.48, "percentage": 15.98, "elapsed_time": "0:19:15", "remaining_time": "1:41:17"} +{"current_steps": 355, "total_steps": 2190, "loss": 2.1139, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000937251885426803, "epoch": 0.49, "percentage": 16.21, "elapsed_time": "0:19:30", "remaining_time": "1:40:51"} +{"current_steps": 360, "total_steps": 2190, "loss": 2.0349, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009355012337437138, "epoch": 0.49, "percentage": 16.44, "elapsed_time": "0:19:47", "remaining_time": "1:40:36"} +{"current_steps": 365, "total_steps": 2190, "loss": 2.1056, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009337281773647985, "epoch": 0.5, "percentage": 16.67, "elapsed_time": "0:20:02", "remaining_time": "1:40:14"} +{"current_steps": 370, "total_steps": 2190, "loss": 2.0944, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009319328075063059, "epoch": 0.51, "percentage": 16.89, "elapsed_time": "0:20:20", "remaining_time": "1:40:02"} +{"current_steps": 375, "total_steps": 2190, "loss": 2.0468, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009301152165324185, "epoch": 0.51, "percentage": 17.12, "elapsed_time": "0:20:33", "remaining_time": "1:39:32"} +{"current_steps": 380, "total_steps": 2190, "loss": 2.4125, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009282754979505018, "epoch": 0.52, "percentage": 17.35, "elapsed_time": "0:20:52", "remaining_time": "1:39:23"} +{"current_steps": 385, "total_steps": 2190, "loss": 2.14, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009264137464062927, "epoch": 0.53, "percentage": 17.58, "elapsed_time": "0:21:08", "remaining_time": "1:39:07"} +{"current_steps": 390, "total_steps": 2190, "loss": 2.0077, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009245300576790309, "epoch": 0.53, "percentage": 17.81, "elapsed_time": "0:21:24", "remaining_time": "1:38:50"} +{"current_steps": 395, "total_steps": 2190, "loss": 2.0926, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009226245286765316, "epoch": 0.54, "percentage": 18.04, "elapsed_time": "0:21:41", "remaining_time": "1:38:32"} +{"current_steps": 400, "total_steps": 2190, "loss": 2.2612, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009206972574301991, "epoch": 0.55, "percentage": 18.26, "elapsed_time": "0:21:56", "remaining_time": "1:38:11"} +{"current_steps": 400, "total_steps": 2190, "loss": null, "eval_loss": 2.1513657569885254, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.55, "percentage": 18.26, "elapsed_time": "0:21:56", "remaining_time": "1:38:11"} +{"current_steps": 405, "total_steps": 2190, "loss": 2.1961, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009187483430899845, "epoch": 0.55, "percentage": 18.49, "elapsed_time": "0:22:21", "remaining_time": "1:38:32"} +{"current_steps": 410, "total_steps": 2190, "loss": 2.21, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000916777885919285, "epoch": 0.56, "percentage": 18.72, "elapsed_time": "0:22:36", "remaining_time": "1:38:09"} +{"current_steps": 415, "total_steps": 2190, "loss": 2.1734, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009147859872897843, "epoch": 0.57, "percentage": 18.95, "elapsed_time": "0:22:52", "remaining_time": "1:37:49"} +{"current_steps": 420, "total_steps": 2190, "loss": 2.2751, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009127727496762394, "epoch": 0.58, "percentage": 19.18, "elapsed_time": "0:23:07", "remaining_time": "1:37:28"} +{"current_steps": 425, "total_steps": 2190, "loss": 2.165, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009107382766512072, "epoch": 0.58, "percentage": 19.41, "elapsed_time": "0:23:25", "remaining_time": "1:37:16"} +{"current_steps": 430, "total_steps": 2190, "loss": 1.9987, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009086826728797165, "epoch": 0.59, "percentage": 19.63, "elapsed_time": "0:23:41", "remaining_time": "1:36:58"} +{"current_steps": 435, "total_steps": 2190, "loss": 2.2766, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009066060441138841, "epoch": 0.6, "percentage": 19.86, "elapsed_time": "0:23:57", "remaining_time": "1:36:37"} +{"current_steps": 440, "total_steps": 2190, "loss": 2.2556, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009045084971874737, "epoch": 0.6, "percentage": 20.09, "elapsed_time": "0:24:13", "remaining_time": "1:36:20"} +{"current_steps": 445, "total_steps": 2190, "loss": 2.2067, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009023901400103995, "epoch": 0.61, "percentage": 20.32, "elapsed_time": "0:24:27", "remaining_time": "1:35:54"} +{"current_steps": 450, "total_steps": 2190, "loss": 2.1062, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0009002510815631754, "epoch": 0.62, "percentage": 20.55, "elapsed_time": "0:24:45", "remaining_time": "1:35:43"} +{"current_steps": 455, "total_steps": 2190, "loss": 2.2781, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008980914318913078, "epoch": 0.62, "percentage": 20.78, "elapsed_time": "0:25:00", "remaining_time": "1:35:23"} +{"current_steps": 460, "total_steps": 2190, "loss": 2.2977, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008959113020996348, "epoch": 0.63, "percentage": 21.0, "elapsed_time": "0:25:18", "remaining_time": "1:35:10"} +{"current_steps": 465, "total_steps": 2190, "loss": 2.1739, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008937108043466098, "epoch": 0.64, "percentage": 21.23, "elapsed_time": "0:25:33", "remaining_time": "1:34:48"} +{"current_steps": 470, "total_steps": 2190, "loss": 2.2108, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008914900518385314, "epoch": 0.64, "percentage": 21.46, "elapsed_time": "0:25:47", "remaining_time": "1:34:23"} +{"current_steps": 475, "total_steps": 2190, "loss": 2.204, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008892491588237203, "epoch": 0.65, "percentage": 21.69, "elapsed_time": "0:26:03", "remaining_time": "1:34:05"} +{"current_steps": 480, "total_steps": 2190, "loss": 2.0958, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008869882405866404, "epoch": 0.66, "percentage": 21.92, "elapsed_time": "0:26:20", "remaining_time": "1:33:51"} +{"current_steps": 485, "total_steps": 2190, "loss": 2.1794, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000884707413441969, "epoch": 0.66, "percentage": 22.15, "elapsed_time": "0:26:35", "remaining_time": "1:33:30"} +{"current_steps": 490, "total_steps": 2190, "loss": 2.2172, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008824067947286121, "epoch": 0.67, "percentage": 22.37, "elapsed_time": "0:26:52", "remaining_time": "1:33:12"} +{"current_steps": 495, "total_steps": 2190, "loss": 2.1321, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008800865028036685, "epoch": 0.68, "percentage": 22.6, "elapsed_time": "0:27:07", "remaining_time": "1:32:54"} +{"current_steps": 500, "total_steps": 2190, "loss": 2.1153, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008777466570363402, "epoch": 0.68, "percentage": 22.83, "elapsed_time": "0:27:24", "remaining_time": "1:32:36"} +{"current_steps": 500, "total_steps": 2190, "loss": null, "eval_loss": 2.1377201080322266, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.68, "percentage": 22.83, "elapsed_time": "0:27:24", "remaining_time": "1:32:36"} +{"current_steps": 505, "total_steps": 2190, "loss": 2.0652, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008753873778017918, "epoch": 0.69, "percentage": 23.06, "elapsed_time": "0:27:46", "remaining_time": "1:32:40"} +{"current_steps": 510, "total_steps": 2190, "loss": 2.1858, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008730087864749578, "epoch": 0.7, "percentage": 23.29, "elapsed_time": "0:28:03", "remaining_time": "1:32:25"} +{"current_steps": 515, "total_steps": 2190, "loss": 2.3533, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008706110054242979, "epoch": 0.71, "percentage": 23.52, "elapsed_time": "0:28:19", "remaining_time": "1:32:07"} +{"current_steps": 520, "total_steps": 2190, "loss": 2.0069, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008681941580055016, "epoch": 0.71, "percentage": 23.74, "elapsed_time": "0:28:37", "remaining_time": "1:31:55"} +{"current_steps": 525, "total_steps": 2190, "loss": 2.2022, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008657583685551429, "epoch": 0.72, "percentage": 23.97, "elapsed_time": "0:28:53", "remaining_time": "1:31:37"} +{"current_steps": 530, "total_steps": 2190, "loss": 2.2126, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008633037623842828, "epoch": 0.73, "percentage": 24.2, "elapsed_time": "0:29:10", "remaining_time": "1:31:21"} +{"current_steps": 535, "total_steps": 2190, "loss": 2.1354, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008608304657720232, "epoch": 0.73, "percentage": 24.43, "elapsed_time": "0:29:25", "remaining_time": "1:31:01"} +{"current_steps": 540, "total_steps": 2190, "loss": 2.3017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00085833860595901, "epoch": 0.74, "percentage": 24.66, "elapsed_time": "0:29:39", "remaining_time": "1:30:36"} +{"current_steps": 545, "total_steps": 2190, "loss": 2.1492, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008558283111408874, "epoch": 0.75, "percentage": 24.89, "elapsed_time": "0:29:56", "remaining_time": "1:30:21"} +{"current_steps": 550, "total_steps": 2190, "loss": 2.3461, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008532997104617022, "epoch": 0.75, "percentage": 25.11, "elapsed_time": "0:30:12", "remaining_time": "1:30:05"} +{"current_steps": 555, "total_steps": 2190, "loss": 2.2892, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008507529340072608, "epoch": 0.76, "percentage": 25.34, "elapsed_time": "0:30:28", "remaining_time": "1:29:45"} +{"current_steps": 560, "total_steps": 2190, "loss": 2.1384, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008481881127984361, "epoch": 0.77, "percentage": 25.57, "elapsed_time": "0:30:46", "remaining_time": "1:29:34"} +{"current_steps": 565, "total_steps": 2190, "loss": 2.2022, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008456053787844274, "epoch": 0.77, "percentage": 25.8, "elapsed_time": "0:31:01", "remaining_time": "1:29:15"} +{"current_steps": 570, "total_steps": 2190, "loss": 2.1867, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008430048648359713, "epoch": 0.78, "percentage": 26.03, "elapsed_time": "0:31:19", "remaining_time": "1:29:01"} +{"current_steps": 575, "total_steps": 2190, "loss": 2.0888, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000840386704738508, "epoch": 0.79, "percentage": 26.26, "elapsed_time": "0:31:35", "remaining_time": "1:28:43"} +{"current_steps": 580, "total_steps": 2190, "loss": 2.1194, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008377510331852969, "epoch": 0.79, "percentage": 26.48, "elapsed_time": "0:31:51", "remaining_time": "1:28:26"} +{"current_steps": 585, "total_steps": 2190, "loss": 2.1946, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008350979857704872, "epoch": 0.8, "percentage": 26.71, "elapsed_time": "0:32:07", "remaining_time": "1:28:07"} +{"current_steps": 590, "total_steps": 2190, "loss": 2.2366, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008324276989821433, "epoch": 0.81, "percentage": 26.94, "elapsed_time": "0:32:23", "remaining_time": "1:27:49"} +{"current_steps": 595, "total_steps": 2190, "loss": 2.2807, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008297403101952221, "epoch": 0.82, "percentage": 27.17, "elapsed_time": "0:32:41", "remaining_time": "1:27:36"} +{"current_steps": 600, "total_steps": 2190, "loss": 2.2787, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008270359576645061, "epoch": 0.82, "percentage": 27.4, "elapsed_time": "0:32:57", "remaining_time": "1:27:19"} +{"current_steps": 600, "total_steps": 2190, "loss": null, "eval_loss": 2.1165168285369873, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.82, "percentage": 27.4, "elapsed_time": "0:32:57", "remaining_time": "1:27:19"} +{"current_steps": 605, "total_steps": 2190, "loss": 2.1761, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008243147805174907, "epoch": 0.83, "percentage": 27.63, "elapsed_time": "0:33:22", "remaining_time": "1:27:25"} +{"current_steps": 610, "total_steps": 2190, "loss": 2.036, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008215769187472266, "epoch": 0.84, "percentage": 27.85, "elapsed_time": "0:33:40", "remaining_time": "1:27:12"} +{"current_steps": 615, "total_steps": 2190, "loss": 2.3055, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008188225132051175, "epoch": 0.84, "percentage": 28.08, "elapsed_time": "0:33:56", "remaining_time": "1:26:55"} +{"current_steps": 620, "total_steps": 2190, "loss": 2.1386, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008160517055936743, "epoch": 0.85, "percentage": 28.31, "elapsed_time": "0:34:11", "remaining_time": "1:26:36"} +{"current_steps": 625, "total_steps": 2190, "loss": 2.2383, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008132646384592254, "epoch": 0.86, "percentage": 28.54, "elapsed_time": "0:34:27", "remaining_time": "1:26:17"} +{"current_steps": 630, "total_steps": 2190, "loss": 2.0414, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008104614551845823, "epoch": 0.86, "percentage": 28.77, "elapsed_time": "0:34:40", "remaining_time": "1:25:52"} +{"current_steps": 635, "total_steps": 2190, "loss": 2.1539, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000807642299981664, "epoch": 0.87, "percentage": 29.0, "elapsed_time": "0:34:57", "remaining_time": "1:25:35"} +{"current_steps": 640, "total_steps": 2190, "loss": 2.2804, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008048073178840773, "epoch": 0.88, "percentage": 29.22, "elapsed_time": "0:35:11", "remaining_time": "1:25:14"} +{"current_steps": 645, "total_steps": 2190, "loss": 2.0363, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0008019566547396563, "epoch": 0.88, "percentage": 29.45, "elapsed_time": "0:35:27", "remaining_time": "1:24:56"} +{"current_steps": 650, "total_steps": 2190, "loss": 2.152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007990904572029582, "epoch": 0.89, "percentage": 29.68, "elapsed_time": "0:35:43", "remaining_time": "1:24:37"} +{"current_steps": 655, "total_steps": 2190, "loss": 2.1571, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007962088727277193, "epoch": 0.9, "percentage": 29.91, "elapsed_time": "0:35:56", "remaining_time": "1:24:14"} +{"current_steps": 660, "total_steps": 2190, "loss": 2.1813, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007933120495592682, "epoch": 0.9, "percentage": 30.14, "elapsed_time": "0:36:13", "remaining_time": "1:23:58"} +{"current_steps": 665, "total_steps": 2190, "loss": 2.2906, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007904001367269004, "epoch": 0.91, "percentage": 30.37, "elapsed_time": "0:36:30", "remaining_time": "1:23:42"} +{"current_steps": 670, "total_steps": 2190, "loss": 2.2132, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007874732840362107, "epoch": 0.92, "percentage": 30.59, "elapsed_time": "0:36:48", "remaining_time": "1:23:30"} +{"current_steps": 675, "total_steps": 2190, "loss": 2.0774, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007845316420613859, "epoch": 0.92, "percentage": 30.82, "elapsed_time": "0:37:01", "remaining_time": "1:23:05"} +{"current_steps": 680, "total_steps": 2190, "loss": 2.1264, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007815753621374593, "epoch": 0.93, "percentage": 31.05, "elapsed_time": "0:37:16", "remaining_time": "1:22:46"} +{"current_steps": 685, "total_steps": 2190, "loss": 2.1156, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007786045963525249, "epoch": 0.94, "percentage": 31.28, "elapsed_time": "0:37:33", "remaining_time": "1:22:30"} +{"current_steps": 690, "total_steps": 2190, "loss": 2.0804, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007756194975399123, "epoch": 0.95, "percentage": 31.51, "elapsed_time": "0:37:51", "remaining_time": "1:22:18"} +{"current_steps": 695, "total_steps": 2190, "loss": 2.2225, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007726202192703255, "epoch": 0.95, "percentage": 31.74, "elapsed_time": "0:38:09", "remaining_time": "1:22:04"} +{"current_steps": 700, "total_steps": 2190, "loss": 2.0872, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007696069158439412, "epoch": 0.96, "percentage": 31.96, "elapsed_time": "0:38:25", "remaining_time": "1:21:48"} +{"current_steps": 700, "total_steps": 2190, "loss": null, "eval_loss": 2.0954582691192627, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.96, "percentage": 31.96, "elapsed_time": "0:38:25", "remaining_time": "1:21:48"} +{"current_steps": 705, "total_steps": 2190, "loss": 2.0316, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007665797422824708, "epoch": 0.97, "percentage": 32.19, "elapsed_time": "0:38:50", "remaining_time": "1:21:48"} +{"current_steps": 710, "total_steps": 2190, "loss": 2.1545, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007635388543211861, "epoch": 0.97, "percentage": 32.42, "elapsed_time": "0:39:07", "remaining_time": "1:21:33"} +{"current_steps": 715, "total_steps": 2190, "loss": 2.082, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007604844084009063, "epoch": 0.98, "percentage": 32.65, "elapsed_time": "0:39:23", "remaining_time": "1:21:14"} +{"current_steps": 720, "total_steps": 2190, "loss": 2.1494, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007574165616599501, "epoch": 0.99, "percentage": 32.88, "elapsed_time": "0:39:40", "remaining_time": "1:20:59"} +{"current_steps": 725, "total_steps": 2190, "loss": 2.2864, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007543354719260522, "epoch": 0.99, "percentage": 33.11, "elapsed_time": "0:39:57", "remaining_time": "1:20:44"} +{"current_steps": 730, "total_steps": 2190, "loss": 2.122, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000751241297708243, "epoch": 1.0, "percentage": 33.33, "elapsed_time": "0:40:13", "remaining_time": "1:20:26"} +{"current_steps": 735, "total_steps": 2190, "loss": 1.9816, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007481341981886942, "epoch": 1.01, "percentage": 33.56, "elapsed_time": "0:40:28", "remaining_time": "1:20:06"} +{"current_steps": 740, "total_steps": 2190, "loss": 1.8529, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007450143332145297, "epoch": 1.01, "percentage": 33.79, "elapsed_time": "0:40:42", "remaining_time": "1:19:45"} +{"current_steps": 745, "total_steps": 2190, "loss": 1.936, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007418818632896017, "epoch": 1.02, "percentage": 34.02, "elapsed_time": "0:40:59", "remaining_time": "1:19:30"} +{"current_steps": 750, "total_steps": 2190, "loss": 1.9031, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007387369495662343, "epoch": 1.03, "percentage": 34.25, "elapsed_time": "0:41:13", "remaining_time": "1:19:09"} +{"current_steps": 755, "total_steps": 2190, "loss": 1.8006, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000735579753836932, "epoch": 1.03, "percentage": 34.47, "elapsed_time": "0:41:31", "remaining_time": "1:18:54"} +{"current_steps": 760, "total_steps": 2190, "loss": 1.9221, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007324104385260566, "epoch": 1.04, "percentage": 34.7, "elapsed_time": "0:41:48", "remaining_time": "1:18:39"} +{"current_steps": 765, "total_steps": 2190, "loss": 1.9734, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007292291666814713, "epoch": 1.05, "percentage": 34.93, "elapsed_time": "0:42:03", "remaining_time": "1:18:20"} +{"current_steps": 770, "total_steps": 2190, "loss": 2.0162, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007260361019661522, "epoch": 1.05, "percentage": 35.16, "elapsed_time": "0:42:19", "remaining_time": "1:18:03"} +{"current_steps": 775, "total_steps": 2190, "loss": 1.755, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007228314086497686, "epoch": 1.06, "percentage": 35.39, "elapsed_time": "0:42:35", "remaining_time": "1:17:46"} +{"current_steps": 780, "total_steps": 2190, "loss": 2.0059, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007196152516002323, "epoch": 1.07, "percentage": 35.62, "elapsed_time": "0:42:51", "remaining_time": "1:17:28"} +{"current_steps": 785, "total_steps": 2190, "loss": 2.0842, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007163877962752157, "epoch": 1.08, "percentage": 35.84, "elapsed_time": "0:43:08", "remaining_time": "1:17:12"} +{"current_steps": 790, "total_steps": 2190, "loss": 2.0298, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007131492087136393, "epoch": 1.08, "percentage": 36.07, "elapsed_time": "0:43:23", "remaining_time": "1:16:54"} +{"current_steps": 795, "total_steps": 2190, "loss": 2.0099, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007098996555271309, "epoch": 1.09, "percentage": 36.3, "elapsed_time": "0:43:39", "remaining_time": "1:16:36"} +{"current_steps": 800, "total_steps": 2190, "loss": 2.0318, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007066393038914522, "epoch": 1.1, "percentage": 36.53, "elapsed_time": "0:43:57", "remaining_time": "1:16:21"} +{"current_steps": 800, "total_steps": 2190, "loss": null, "eval_loss": 2.081965923309326, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.1, "percentage": 36.53, "elapsed_time": "0:43:57", "remaining_time": "1:16:21"} +{"current_steps": 805, "total_steps": 2190, "loss": 2.0159, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007033683215379002, "epoch": 1.1, "percentage": 36.76, "elapsed_time": "0:44:20", "remaining_time": "1:16:17"} +{"current_steps": 810, "total_steps": 2190, "loss": 2.0416, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0007000868767446771, "epoch": 1.11, "percentage": 36.99, "elapsed_time": "0:44:37", "remaining_time": "1:16:00"} +{"current_steps": 815, "total_steps": 2190, "loss": 1.9853, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006967951383282334, "epoch": 1.12, "percentage": 37.21, "elapsed_time": "0:44:53", "remaining_time": "1:15:44"} +{"current_steps": 820, "total_steps": 2190, "loss": 2.0131, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000693493275634583, "epoch": 1.12, "percentage": 37.44, "elapsed_time": "0:45:11", "remaining_time": "1:15:30"} +{"current_steps": 825, "total_steps": 2190, "loss": 1.9054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006901814585305909, "epoch": 1.13, "percentage": 37.67, "elapsed_time": "0:45:26", "remaining_time": "1:15:10"} +{"current_steps": 830, "total_steps": 2190, "loss": 1.9213, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006868598573952345, "epoch": 1.14, "percentage": 37.9, "elapsed_time": "0:45:44", "remaining_time": "1:14:56"} +{"current_steps": 835, "total_steps": 2190, "loss": 1.8538, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006835286431108383, "epoch": 1.14, "percentage": 38.13, "elapsed_time": "0:45:59", "remaining_time": "1:14:38"} +{"current_steps": 840, "total_steps": 2190, "loss": 2.0632, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006801879870542821, "epoch": 1.15, "percentage": 38.36, "elapsed_time": "0:46:14", "remaining_time": "1:14:18"} +{"current_steps": 845, "total_steps": 2190, "loss": 2.0617, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006768380610881859, "epoch": 1.16, "percentage": 38.58, "elapsed_time": "0:46:30", "remaining_time": "1:14:02"} +{"current_steps": 850, "total_steps": 2190, "loss": 1.9131, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006734790375520663, "epoch": 1.16, "percentage": 38.81, "elapsed_time": "0:46:47", "remaining_time": "1:13:45"} +{"current_steps": 855, "total_steps": 2190, "loss": 1.969, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006701110892534723, "epoch": 1.17, "percentage": 39.04, "elapsed_time": "0:47:01", "remaining_time": "1:13:25"} +{"current_steps": 860, "total_steps": 2190, "loss": 2.1041, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006667343894590934, "epoch": 1.18, "percentage": 39.27, "elapsed_time": "0:47:15", "remaining_time": "1:13:05"} +{"current_steps": 865, "total_steps": 2190, "loss": 1.9544, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006633491118858471, "epoch": 1.18, "percentage": 39.5, "elapsed_time": "0:47:31", "remaining_time": "1:12:48"} +{"current_steps": 870, "total_steps": 2190, "loss": 1.9392, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006599554306919408, "epoch": 1.19, "percentage": 39.73, "elapsed_time": "0:47:47", "remaining_time": "1:12:31"} +{"current_steps": 875, "total_steps": 2190, "loss": 1.9857, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006565535204679134, "epoch": 1.2, "percentage": 39.95, "elapsed_time": "0:48:06", "remaining_time": "1:12:18"} +{"current_steps": 880, "total_steps": 2190, "loss": 2.0771, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006531435562276514, "epoch": 1.21, "percentage": 40.18, "elapsed_time": "0:48:20", "remaining_time": "1:11:57"} +{"current_steps": 885, "total_steps": 2190, "loss": 1.9266, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006497257133993877, "epoch": 1.21, "percentage": 40.41, "elapsed_time": "0:48:37", "remaining_time": "1:11:41"} +{"current_steps": 890, "total_steps": 2190, "loss": 1.9898, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006463001678166743, "epoch": 1.22, "percentage": 40.64, "elapsed_time": "0:48:53", "remaining_time": "1:11:24"} +{"current_steps": 895, "total_steps": 2190, "loss": 1.9723, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006428670957093375, "epoch": 1.23, "percentage": 40.87, "elapsed_time": "0:49:08", "remaining_time": "1:11:06"} +{"current_steps": 900, "total_steps": 2190, "loss": 1.9746, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006394266736944118, "epoch": 1.23, "percentage": 41.1, "elapsed_time": "0:49:24", "remaining_time": "1:10:49"} +{"current_steps": 900, "total_steps": 2190, "loss": null, "eval_loss": 2.0750818252563477, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.23, "percentage": 41.1, "elapsed_time": "0:49:24", "remaining_time": "1:10:49"} +{"current_steps": 905, "total_steps": 2190, "loss": 1.8354, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006359790787670527, "epoch": 1.24, "percentage": 41.32, "elapsed_time": "0:49:51", "remaining_time": "1:10:47"} +{"current_steps": 910, "total_steps": 2190, "loss": 1.8571, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006325244882914327, "epoch": 1.25, "percentage": 41.55, "elapsed_time": "0:50:07", "remaining_time": "1:10:30"} +{"current_steps": 915, "total_steps": 2190, "loss": 2.0376, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006290630799916144, "epoch": 1.25, "percentage": 41.78, "elapsed_time": "0:50:25", "remaining_time": "1:10:15"} +{"current_steps": 920, "total_steps": 2190, "loss": 2.0141, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006255950319424097, "epoch": 1.26, "percentage": 42.01, "elapsed_time": "0:50:39", "remaining_time": "1:09:55"} +{"current_steps": 925, "total_steps": 2190, "loss": 1.982, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006221205225602169, "epoch": 1.27, "percentage": 42.24, "elapsed_time": "0:50:57", "remaining_time": "1:09:40"} +{"current_steps": 930, "total_steps": 2190, "loss": 1.9456, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006186397305938427, "epoch": 1.27, "percentage": 42.47, "elapsed_time": "0:51:12", "remaining_time": "1:09:23"} +{"current_steps": 935, "total_steps": 2190, "loss": 1.8855, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006151528351153061, "epoch": 1.28, "percentage": 42.69, "elapsed_time": "0:51:28", "remaining_time": "1:09:05"} +{"current_steps": 940, "total_steps": 2190, "loss": 1.9335, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006116600155106263, "epoch": 1.29, "percentage": 42.92, "elapsed_time": "0:51:43", "remaining_time": "1:08:47"} +{"current_steps": 945, "total_steps": 2190, "loss": 1.9459, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006081614514705933, "epoch": 1.29, "percentage": 43.15, "elapsed_time": "0:52:00", "remaining_time": "1:08:31"} +{"current_steps": 950, "total_steps": 2190, "loss": 1.9753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006046573229815243, "epoch": 1.3, "percentage": 43.38, "elapsed_time": "0:52:16", "remaining_time": "1:08:14"} +{"current_steps": 955, "total_steps": 2190, "loss": 1.969, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0006011478103160037, "epoch": 1.31, "percentage": 43.61, "elapsed_time": "0:52:33", "remaining_time": "1:07:58"} +{"current_steps": 960, "total_steps": 2190, "loss": 1.9184, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005976330940236089, "epoch": 1.32, "percentage": 43.84, "elapsed_time": "0:52:51", "remaining_time": "1:07:43"} +{"current_steps": 965, "total_steps": 2190, "loss": 1.9287, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005941133549216221, "epoch": 1.32, "percentage": 44.06, "elapsed_time": "0:53:08", "remaining_time": "1:07:27"} +{"current_steps": 970, "total_steps": 2190, "loss": 1.9373, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005905887740857279, "epoch": 1.33, "percentage": 44.29, "elapsed_time": "0:53:23", "remaining_time": "1:07:08"} +{"current_steps": 975, "total_steps": 2190, "loss": 1.9323, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005870595328406971, "epoch": 1.34, "percentage": 44.52, "elapsed_time": "0:53:39", "remaining_time": "1:06:51"} +{"current_steps": 980, "total_steps": 2190, "loss": 1.9249, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005835258127510597, "epoch": 1.34, "percentage": 44.75, "elapsed_time": "0:53:54", "remaining_time": "1:06:33"} +{"current_steps": 985, "total_steps": 2190, "loss": 2.0135, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005799877956117621, "epoch": 1.35, "percentage": 44.98, "elapsed_time": "0:54:11", "remaining_time": "1:06:17"} +{"current_steps": 990, "total_steps": 2190, "loss": 1.9741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005764456634388171, "epoch": 1.36, "percentage": 45.21, "elapsed_time": "0:54:28", "remaining_time": "1:06:01"} +{"current_steps": 995, "total_steps": 2190, "loss": 1.9028, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005728995984599373, "epoch": 1.36, "percentage": 45.43, "elapsed_time": "0:54:43", "remaining_time": "1:05:43"} +{"current_steps": 1000, "total_steps": 2190, "loss": 1.9647, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005693497831051624, "epoch": 1.37, "percentage": 45.66, "elapsed_time": "0:54:59", "remaining_time": "1:05:25"} +{"current_steps": 1000, "total_steps": 2190, "loss": null, "eval_loss": 2.0705747604370117, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.37, "percentage": 45.66, "elapsed_time": "0:54:59", "remaining_time": "1:05:25"} +{"current_steps": 1005, "total_steps": 2190, "loss": 1.9312, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005657963999974728, "epoch": 1.38, "percentage": 45.89, "elapsed_time": "0:55:24", "remaining_time": "1:05:19"} +{"current_steps": 1010, "total_steps": 2190, "loss": 1.9319, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005622396319433947, "epoch": 1.38, "percentage": 46.12, "elapsed_time": "0:55:39", "remaining_time": "1:05:01"} +{"current_steps": 1015, "total_steps": 2190, "loss": 2.0215, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005586796619235951, "epoch": 1.39, "percentage": 46.35, "elapsed_time": "0:55:54", "remaining_time": "1:04:43"} +{"current_steps": 1020, "total_steps": 2190, "loss": 1.9109, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005551166730834692, "epoch": 1.4, "percentage": 46.58, "elapsed_time": "0:56:11", "remaining_time": "1:04:27"} +{"current_steps": 1025, "total_steps": 2190, "loss": 1.9534, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005515508487237174, "epoch": 1.4, "percentage": 46.8, "elapsed_time": "0:56:28", "remaining_time": "1:04:11"} +{"current_steps": 1030, "total_steps": 2190, "loss": 1.9559, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005479823722909158, "epoch": 1.41, "percentage": 47.03, "elapsed_time": "0:56:45", "remaining_time": "1:03:55"} +{"current_steps": 1035, "total_steps": 2190, "loss": 1.9402, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005444114273680778, "epoch": 1.42, "percentage": 47.26, "elapsed_time": "0:57:03", "remaining_time": "1:03:39"} +{"current_steps": 1040, "total_steps": 2190, "loss": 1.8844, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005408381976652113, "epoch": 1.42, "percentage": 47.49, "elapsed_time": "0:57:18", "remaining_time": "1:03:22"} +{"current_steps": 1045, "total_steps": 2190, "loss": 1.8787, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005372628670098654, "epoch": 1.43, "percentage": 47.72, "elapsed_time": "0:57:35", "remaining_time": "1:03:06"} +{"current_steps": 1050, "total_steps": 2190, "loss": 2.0642, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005336856193376748, "epoch": 1.44, "percentage": 47.95, "elapsed_time": "0:57:52", "remaining_time": "1:02:50"} +{"current_steps": 1055, "total_steps": 2190, "loss": 2.0661, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005301066386828965, "epoch": 1.45, "percentage": 48.17, "elapsed_time": "0:58:11", "remaining_time": "1:02:35"} +{"current_steps": 1060, "total_steps": 2190, "loss": 1.8911, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005265261091689423, "epoch": 1.45, "percentage": 48.4, "elapsed_time": "0:58:27", "remaining_time": "1:02:19"} +{"current_steps": 1065, "total_steps": 2190, "loss": 1.9742, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005229442149989058, "epoch": 1.46, "percentage": 48.63, "elapsed_time": "0:58:43", "remaining_time": "1:02:02"} +{"current_steps": 1070, "total_steps": 2190, "loss": 1.8662, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005193611404460872, "epoch": 1.47, "percentage": 48.86, "elapsed_time": "0:58:59", "remaining_time": "1:01:44"} +{"current_steps": 1075, "total_steps": 2190, "loss": 1.9766, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005157770698445116, "epoch": 1.47, "percentage": 49.09, "elapsed_time": "0:59:15", "remaining_time": "1:01:27"} +{"current_steps": 1080, "total_steps": 2190, "loss": 1.8823, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005121921875794468, "epoch": 1.48, "percentage": 49.32, "elapsed_time": "0:59:30", "remaining_time": "1:01:10"} +{"current_steps": 1085, "total_steps": 2190, "loss": 2.0215, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005086066780779174, "epoch": 1.49, "percentage": 49.54, "elapsed_time": "0:59:47", "remaining_time": "1:00:53"} +{"current_steps": 1090, "total_steps": 2190, "loss": 2.002, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005050207257992166, "epoch": 1.49, "percentage": 49.77, "elapsed_time": "1:00:03", "remaining_time": "1:00:36"} +{"current_steps": 1095, "total_steps": 2190, "loss": 2.0568, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0005014345152254166, "epoch": 1.5, "percentage": 50.0, "elapsed_time": "1:00:22", "remaining_time": "1:00:22"} +{"current_steps": 1100, "total_steps": 2190, "loss": 2.0477, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004978482308518779, "epoch": 1.51, "percentage": 50.23, "elapsed_time": "1:00:35", "remaining_time": "1:00:02"} +{"current_steps": 1100, "total_steps": 2190, "loss": null, "eval_loss": 2.0488994121551514, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.51, "percentage": 50.23, "elapsed_time": "1:00:35", "remaining_time": "1:00:02"} +{"current_steps": 1105, "total_steps": 2190, "loss": 1.9615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004942620571777576, "epoch": 1.51, "percentage": 50.46, "elapsed_time": "1:00:58", "remaining_time": "0:59:52"} +{"current_steps": 1110, "total_steps": 2190, "loss": 1.9747, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004906761786965175, "epoch": 1.52, "percentage": 50.68, "elapsed_time": "1:01:15", "remaining_time": "0:59:36"} +{"current_steps": 1115, "total_steps": 2190, "loss": 2.0174, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00048709077988643367, "epoch": 1.53, "percentage": 50.91, "elapsed_time": "1:01:30", "remaining_time": "0:59:18"} +{"current_steps": 1120, "total_steps": 2190, "loss": 1.927, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004835060452011041, "epoch": 1.53, "percentage": 51.14, "elapsed_time": "1:01:46", "remaining_time": "0:59:01"} +{"current_steps": 1125, "total_steps": 2190, "loss": 1.8728, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00047992215905996163, "epoch": 1.54, "percentage": 51.37, "elapsed_time": "1:02:04", "remaining_time": "0:58:46"} +{"current_steps": 1130, "total_steps": 2190, "loss": 2.0364, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004763393058387841, "epoch": 1.55, "percentage": 51.6, "elapsed_time": "1:02:21", "remaining_time": "0:58:29"} +{"current_steps": 1135, "total_steps": 2190, "loss": 1.9834, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00047275766986021046, "epoch": 1.55, "percentage": 51.83, "elapsed_time": "1:02:34", "remaining_time": "0:58:09"} +{"current_steps": 1140, "total_steps": 2190, "loss": 1.838, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004691774353842571, "epoch": 1.56, "percentage": 52.05, "elapsed_time": "1:02:50", "remaining_time": "0:57:52"} +{"current_steps": 1145, "total_steps": 2190, "loss": 1.8644, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004655987865988401, "epoch": 1.57, "percentage": 52.28, "elapsed_time": "1:03:04", "remaining_time": "0:57:34"} +{"current_steps": 1150, "total_steps": 2190, "loss": 1.9723, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004620219076102975, "epoch": 1.58, "percentage": 52.51, "elapsed_time": "1:03:20", "remaining_time": "0:57:16"} +{"current_steps": 1155, "total_steps": 2190, "loss": 1.7848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004584469824339192, "epoch": 1.58, "percentage": 52.74, "elapsed_time": "1:03:36", "remaining_time": "0:57:00"} +{"current_steps": 1160, "total_steps": 2190, "loss": 1.9848, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004548741949844795, "epoch": 1.59, "percentage": 52.97, "elapsed_time": "1:03:51", "remaining_time": "0:56:42"} +{"current_steps": 1165, "total_steps": 2190, "loss": 1.9545, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004513037290667761, "epoch": 1.6, "percentage": 53.2, "elapsed_time": "1:04:07", "remaining_time": "0:56:24"} +{"current_steps": 1170, "total_steps": 2190, "loss": 2.0322, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00044773576836617336, "epoch": 1.6, "percentage": 53.42, "elapsed_time": "1:04:25", "remaining_time": "0:56:09"} +{"current_steps": 1175, "total_steps": 2190, "loss": 1.933, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004441704964391529, "epoch": 1.61, "percentage": 53.65, "elapsed_time": "1:04:41", "remaining_time": "0:55:53"} +{"current_steps": 1180, "total_steps": 2190, "loss": 1.8019, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004406080967038701, "epoch": 1.62, "percentage": 53.88, "elapsed_time": "1:04:58", "remaining_time": "0:55:36"} +{"current_steps": 1185, "total_steps": 2190, "loss": 1.9489, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004370487524307189, "epoch": 1.62, "percentage": 54.11, "elapsed_time": "1:05:14", "remaining_time": "0:55:20"} +{"current_steps": 1190, "total_steps": 2190, "loss": 1.9982, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00043349264673290204, "epoch": 1.63, "percentage": 54.34, "elapsed_time": "1:05:30", "remaining_time": "0:55:03"} +{"current_steps": 1195, "total_steps": 2190, "loss": 1.9536, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004299399625570114, "epoch": 1.64, "percentage": 54.57, "elapsed_time": "1:05:45", "remaining_time": "0:54:45"} +{"current_steps": 1200, "total_steps": 2190, "loss": 1.849, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00042639088267361596, "epoch": 1.64, "percentage": 54.79, "elapsed_time": "1:06:01", "remaining_time": "0:54:28"} +{"current_steps": 1200, "total_steps": 2190, "loss": null, "eval_loss": 2.030912160873413, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.64, "percentage": 54.79, "elapsed_time": "1:06:01", "remaining_time": "0:54:28"} +{"current_steps": 1205, "total_steps": 2190, "loss": 1.9273, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00042284558966785944, "epoch": 1.65, "percentage": 55.02, "elapsed_time": "1:06:26", "remaining_time": "0:54:18"} +{"current_steps": 1210, "total_steps": 2190, "loss": 1.9215, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00041930426593006633, "epoch": 1.66, "percentage": 55.25, "elapsed_time": "1:06:42", "remaining_time": "0:54:01"} +{"current_steps": 1215, "total_steps": 2190, "loss": 2.0457, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004157670936463592, "epoch": 1.66, "percentage": 55.48, "elapsed_time": "1:06:56", "remaining_time": "0:53:42"} +{"current_steps": 1220, "total_steps": 2190, "loss": 1.9452, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00041223425478928595, "epoch": 1.67, "percentage": 55.71, "elapsed_time": "1:07:11", "remaining_time": "0:53:25"} +{"current_steps": 1225, "total_steps": 2190, "loss": 2.033, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004087059311084581, "epoch": 1.68, "percentage": 55.94, "elapsed_time": "1:07:28", "remaining_time": "0:53:08"} +{"current_steps": 1230, "total_steps": 2190, "loss": 1.9818, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0004051823041212002, "epoch": 1.68, "percentage": 56.16, "elapsed_time": "1:07:44", "remaining_time": "0:52:52"} +{"current_steps": 1235, "total_steps": 2190, "loss": 2.0649, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00040166355510321195, "epoch": 1.69, "percentage": 56.39, "elapsed_time": "1:08:00", "remaining_time": "0:52:35"} +{"current_steps": 1240, "total_steps": 2190, "loss": 1.8362, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00039814986507924195, "epoch": 1.7, "percentage": 56.62, "elapsed_time": "1:08:17", "remaining_time": "0:52:19"} +{"current_steps": 1245, "total_steps": 2190, "loss": 2.017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003946414148137756, "epoch": 1.71, "percentage": 56.85, "elapsed_time": "1:08:33", "remaining_time": "0:52:02"} +{"current_steps": 1250, "total_steps": 2190, "loss": 1.871, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003911383848017341, "epoch": 1.71, "percentage": 57.08, "elapsed_time": "1:08:49", "remaining_time": "0:51:45"} +{"current_steps": 1255, "total_steps": 2190, "loss": 1.9843, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003876409552591901, "epoch": 1.72, "percentage": 57.31, "elapsed_time": "1:09:05", "remaining_time": "0:51:28"} +{"current_steps": 1260, "total_steps": 2190, "loss": 1.7489, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00038414930611409525, "epoch": 1.73, "percentage": 57.53, "elapsed_time": "1:09:21", "remaining_time": "0:51:11"} +{"current_steps": 1265, "total_steps": 2190, "loss": 2.071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000380663616997025, "epoch": 1.73, "percentage": 57.76, "elapsed_time": "1:09:37", "remaining_time": "0:50:54"} +{"current_steps": 1270, "total_steps": 2190, "loss": 2.0351, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00037718406723193576, "epoch": 1.74, "percentage": 57.99, "elapsed_time": "1:09:54", "remaining_time": "0:50:38"} +{"current_steps": 1275, "total_steps": 2190, "loss": 1.9891, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003737108358269408, "epoch": 1.75, "percentage": 58.22, "elapsed_time": "1:10:11", "remaining_time": "0:50:22"} +{"current_steps": 1280, "total_steps": 2190, "loss": 2.0055, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00037024410146510014, "epoch": 1.75, "percentage": 58.45, "elapsed_time": "1:10:29", "remaining_time": "0:50:06"} +{"current_steps": 1285, "total_steps": 2190, "loss": 1.9104, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003667840424952288, "epoch": 1.76, "percentage": 58.68, "elapsed_time": "1:10:44", "remaining_time": "0:49:49"} +{"current_steps": 1290, "total_steps": 2190, "loss": 1.9239, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00036333083692272083, "epoch": 1.77, "percentage": 58.9, "elapsed_time": "1:11:00", "remaining_time": "0:49:32"} +{"current_steps": 1295, "total_steps": 2190, "loss": 1.8592, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00035988466240039206, "epoch": 1.77, "percentage": 59.13, "elapsed_time": "1:11:15", "remaining_time": "0:49:14"} +{"current_steps": 1300, "total_steps": 2190, "loss": 2.0883, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003564456962193403, "epoch": 1.78, "percentage": 59.36, "elapsed_time": "1:11:33", "remaining_time": "0:48:59"} +{"current_steps": 1300, "total_steps": 2190, "loss": null, "eval_loss": 2.012606382369995, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.78, "percentage": 59.36, "elapsed_time": "1:11:33", "remaining_time": "0:48:59"} +{"current_steps": 1305, "total_steps": 2190, "loss": 1.8061, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003530141152998255, "epoch": 1.79, "percentage": 59.59, "elapsed_time": "1:11:57", "remaining_time": "0:48:47"} +{"current_steps": 1310, "total_steps": 2190, "loss": 1.8564, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003495900961821662, "epoch": 1.79, "percentage": 59.82, "elapsed_time": "1:12:12", "remaining_time": "0:48:30"} +{"current_steps": 1315, "total_steps": 2190, "loss": 2.0208, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003461738150176588, "epoch": 1.8, "percentage": 60.05, "elapsed_time": "1:12:28", "remaining_time": "0:48:13"} +{"current_steps": 1320, "total_steps": 2190, "loss": 2.0325, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00034276544755951444, "epoch": 1.81, "percentage": 60.27, "elapsed_time": "1:12:44", "remaining_time": "0:47:56"} +{"current_steps": 1325, "total_steps": 2190, "loss": 1.9853, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00033936516915381774, "epoch": 1.82, "percentage": 60.5, "elapsed_time": "1:13:02", "remaining_time": "0:47:40"} +{"current_steps": 1330, "total_steps": 2190, "loss": 1.8627, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00033597315473050596, "epoch": 1.82, "percentage": 60.73, "elapsed_time": "1:13:16", "remaining_time": "0:47:22"} +{"current_steps": 1335, "total_steps": 2190, "loss": 1.7516, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00033258957879436893, "epoch": 1.83, "percentage": 60.96, "elapsed_time": "1:13:31", "remaining_time": "0:47:05"} +{"current_steps": 1340, "total_steps": 2190, "loss": 1.8258, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00032921461541607225, "epoch": 1.84, "percentage": 61.19, "elapsed_time": "1:13:45", "remaining_time": "0:46:47"} +{"current_steps": 1345, "total_steps": 2190, "loss": 1.9302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003258484382232023, "epoch": 1.84, "percentage": 61.42, "elapsed_time": "1:14:02", "remaining_time": "0:46:31"} +{"current_steps": 1350, "total_steps": 2190, "loss": 1.8517, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00032249122039133273, "epoch": 1.85, "percentage": 61.64, "elapsed_time": "1:14:20", "remaining_time": "0:46:15"} +{"current_steps": 1355, "total_steps": 2190, "loss": 1.9234, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00031914313463511635, "epoch": 1.86, "percentage": 61.87, "elapsed_time": "1:14:35", "remaining_time": "0:45:57"} +{"current_steps": 1360, "total_steps": 2190, "loss": 1.7208, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000315804353199399, "epoch": 1.86, "percentage": 62.1, "elapsed_time": "1:14:50", "remaining_time": "0:45:40"} +{"current_steps": 1365, "total_steps": 2190, "loss": 1.9718, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003124750478503593, "epoch": 1.87, "percentage": 62.33, "elapsed_time": "1:15:05", "remaining_time": "0:45:22"} +{"current_steps": 1370, "total_steps": 2190, "loss": 1.9739, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003091553898666705, "epoch": 1.88, "percentage": 62.56, "elapsed_time": "1:15:22", "remaining_time": "0:45:07"} +{"current_steps": 1375, "total_steps": 2190, "loss": 1.9678, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00030584555003069017, "epoch": 1.88, "percentage": 62.79, "elapsed_time": "1:15:35", "remaining_time": "0:44:48"} +{"current_steps": 1380, "total_steps": 2190, "loss": 1.859, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0003025456986196734, "epoch": 1.89, "percentage": 63.01, "elapsed_time": "1:15:51", "remaining_time": "0:44:31"} +{"current_steps": 1385, "total_steps": 2190, "loss": 1.866, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002992560053970135, "epoch": 1.9, "percentage": 63.24, "elapsed_time": "1:16:06", "remaining_time": "0:44:14"} +{"current_steps": 1390, "total_steps": 2190, "loss": 1.8789, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002959766396035077, "epoch": 1.9, "percentage": 63.47, "elapsed_time": "1:16:23", "remaining_time": "0:43:58"} +{"current_steps": 1395, "total_steps": 2190, "loss": 1.9049, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002927077699486507, "epoch": 1.91, "percentage": 63.7, "elapsed_time": "1:16:38", "remaining_time": "0:43:40"} +{"current_steps": 1400, "total_steps": 2190, "loss": 1.9501, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00028944956460195514, "epoch": 1.92, "percentage": 63.93, "elapsed_time": "1:16:53", "remaining_time": "0:43:23"} +{"current_steps": 1400, "total_steps": 2190, "loss": null, "eval_loss": 2.006899833679199, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 1.92, "percentage": 63.93, "elapsed_time": "1:16:53", "remaining_time": "0:43:23"} +{"current_steps": 1405, "total_steps": 2190, "loss": 1.9477, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002862021911843008, "epoch": 1.92, "percentage": 64.16, "elapsed_time": "1:17:21", "remaining_time": "0:43:13"} +{"current_steps": 1410, "total_steps": 2190, "loss": 1.8011, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00028296581675930964, "epoch": 1.93, "percentage": 64.38, "elapsed_time": "1:17:37", "remaining_time": "0:42:56"} +{"current_steps": 1415, "total_steps": 2190, "loss": 1.9646, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00027974060782475255, "epoch": 1.94, "percentage": 64.61, "elapsed_time": "1:17:54", "remaining_time": "0:42:40"} +{"current_steps": 1420, "total_steps": 2190, "loss": 1.7943, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000276526730303983, "epoch": 1.95, "percentage": 64.84, "elapsed_time": "1:18:11", "remaining_time": "0:42:24"} +{"current_steps": 1425, "total_steps": 2190, "loss": 1.8871, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002733243495374013, "epoch": 1.95, "percentage": 65.07, "elapsed_time": "1:18:30", "remaining_time": "0:42:08"} +{"current_steps": 1430, "total_steps": 2190, "loss": 1.9161, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000270133630273948, "epoch": 1.96, "percentage": 65.3, "elapsed_time": "1:18:48", "remaining_time": "0:41:53"} +{"current_steps": 1435, "total_steps": 2190, "loss": 1.7969, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00026695473666262925, "epoch": 1.97, "percentage": 65.53, "elapsed_time": "1:19:06", "remaining_time": "0:41:37"} +{"current_steps": 1440, "total_steps": 2190, "loss": 1.9717, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002637878322440708, "epoch": 1.97, "percentage": 65.75, "elapsed_time": "1:19:22", "remaining_time": "0:41:20"} +{"current_steps": 1445, "total_steps": 2190, "loss": 1.8904, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00026063307994210586, "epoch": 1.98, "percentage": 65.98, "elapsed_time": "1:19:38", "remaining_time": "0:41:03"} +{"current_steps": 1450, "total_steps": 2190, "loss": 1.8843, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00025749064205539206, "epoch": 1.99, "percentage": 66.21, "elapsed_time": "1:19:56", "remaining_time": "0:40:47"} +{"current_steps": 1455, "total_steps": 2190, "loss": 1.9602, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002543606802490628, "epoch": 1.99, "percentage": 66.44, "elapsed_time": "1:20:13", "remaining_time": "0:40:31"} +{"current_steps": 1460, "total_steps": 2190, "loss": 1.8936, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00025124335554640965, "epoch": 2.0, "percentage": 66.67, "elapsed_time": "1:20:30", "remaining_time": "0:40:15"} +{"current_steps": 1465, "total_steps": 2190, "loss": 1.6483, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00024813882832059914, "epoch": 2.01, "percentage": 66.89, "elapsed_time": "1:20:46", "remaining_time": "0:39:58"} +{"current_steps": 1470, "total_steps": 2190, "loss": 1.7576, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00024504725828642125, "epoch": 2.01, "percentage": 67.12, "elapsed_time": "1:21:03", "remaining_time": "0:39:42"} +{"current_steps": 1475, "total_steps": 2190, "loss": 1.7134, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00024196880449207364, "epoch": 2.02, "percentage": 67.35, "elapsed_time": "1:21:19", "remaining_time": "0:39:25"} +{"current_steps": 1480, "total_steps": 2190, "loss": 1.7071, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002389036253109787, "epoch": 2.03, "percentage": 67.58, "elapsed_time": "1:21:34", "remaining_time": "0:39:07"} +{"current_steps": 1485, "total_steps": 2190, "loss": 1.7951, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023585187843363614, "epoch": 2.03, "percentage": 67.81, "elapsed_time": "1:21:52", "remaining_time": "0:38:52"} +{"current_steps": 1490, "total_steps": 2190, "loss": 1.6741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00023281372085951068, "epoch": 2.04, "percentage": 68.04, "elapsed_time": "1:22:07", "remaining_time": "0:38:35"} +{"current_steps": 1495, "total_steps": 2190, "loss": 1.6294, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00022978930888895466, "epoch": 2.05, "percentage": 68.26, "elapsed_time": "1:22:24", "remaining_time": "0:38:18"} +{"current_steps": 1500, "total_steps": 2190, "loss": 1.7268, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00022677879811516715, "epoch": 2.05, "percentage": 68.49, "elapsed_time": "1:22:40", "remaining_time": "0:38:01"} +{"current_steps": 1500, "total_steps": 2190, "loss": null, "eval_loss": 2.005824327468872, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.05, "percentage": 68.49, "elapsed_time": "1:22:40", "remaining_time": "0:38:01"} +{"current_steps": 1505, "total_steps": 2190, "loss": 1.6531, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00022378234341619019, "epoch": 2.06, "percentage": 68.72, "elapsed_time": "1:23:04", "remaining_time": "0:37:48"} +{"current_steps": 1510, "total_steps": 2190, "loss": 1.6599, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00022080009894693948, "epoch": 2.07, "percentage": 68.95, "elapsed_time": "1:23:20", "remaining_time": "0:37:31"} +{"current_steps": 1515, "total_steps": 2190, "loss": 1.7998, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021783221813127473, "epoch": 2.08, "percentage": 69.18, "elapsed_time": "1:23:36", "remaining_time": "0:37:14"} +{"current_steps": 1520, "total_steps": 2190, "loss": 1.6775, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0002148788536541064, "epoch": 2.08, "percentage": 69.41, "elapsed_time": "1:23:51", "remaining_time": "0:36:58"} +{"current_steps": 1525, "total_steps": 2190, "loss": 1.5615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00021194015745354123, "epoch": 2.09, "percentage": 69.63, "elapsed_time": "1:24:10", "remaining_time": "0:36:42"} +{"current_steps": 1530, "total_steps": 2190, "loss": 1.7347, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020901628071306455, "epoch": 2.1, "percentage": 69.86, "elapsed_time": "1:24:25", "remaining_time": "0:36:25"} +{"current_steps": 1535, "total_steps": 2190, "loss": 1.7952, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020610737385376348, "epoch": 2.1, "percentage": 70.09, "elapsed_time": "1:24:46", "remaining_time": "0:36:10"} +{"current_steps": 1540, "total_steps": 2190, "loss": 1.6405, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020321358652658806, "epoch": 2.11, "percentage": 70.32, "elapsed_time": "1:25:00", "remaining_time": "0:35:52"} +{"current_steps": 1545, "total_steps": 2190, "loss": 1.5677, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00020033506760465237, "epoch": 2.12, "percentage": 70.55, "elapsed_time": "1:25:17", "remaining_time": "0:35:36"} +{"current_steps": 1550, "total_steps": 2190, "loss": 1.7295, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001974719651755756, "epoch": 2.12, "percentage": 70.78, "elapsed_time": "1:25:33", "remaining_time": "0:35:19"} +{"current_steps": 1555, "total_steps": 2190, "loss": 1.6888, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001946244265338637, "epoch": 2.13, "percentage": 71.0, "elapsed_time": "1:25:50", "remaining_time": "0:35:03"} +{"current_steps": 1560, "total_steps": 2190, "loss": 1.6395, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00019179259817333133, "epoch": 2.14, "percentage": 71.23, "elapsed_time": "1:26:09", "remaining_time": "0:34:47"} +{"current_steps": 1565, "total_steps": 2190, "loss": 1.6953, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001889766257795663, "epoch": 2.14, "percentage": 71.46, "elapsed_time": "1:26:26", "remaining_time": "0:34:31"} +{"current_steps": 1570, "total_steps": 2190, "loss": 1.7618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018617665422243336, "epoch": 2.15, "percentage": 71.69, "elapsed_time": "1:26:42", "remaining_time": "0:34:14"} +{"current_steps": 1575, "total_steps": 2190, "loss": 1.6839, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018339282754862223, "epoch": 2.16, "percentage": 71.92, "elapsed_time": "1:26:58", "remaining_time": "0:33:57"} +{"current_steps": 1580, "total_steps": 2190, "loss": 1.654, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00018062528897423643, "epoch": 2.16, "percentage": 72.15, "elapsed_time": "1:27:15", "remaining_time": "0:33:41"} +{"current_steps": 1585, "total_steps": 2190, "loss": 1.7326, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017787418087742614, "epoch": 2.17, "percentage": 72.37, "elapsed_time": "1:27:32", "remaining_time": "0:33:25"} +{"current_steps": 1590, "total_steps": 2190, "loss": 1.7163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017513964479106266, "epoch": 2.18, "percentage": 72.6, "elapsed_time": "1:27:48", "remaining_time": "0:33:08"} +{"current_steps": 1595, "total_steps": 2190, "loss": 1.6667, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00017242182139545742, "epoch": 2.18, "percentage": 72.83, "elapsed_time": "1:28:03", "remaining_time": "0:32:50"} +{"current_steps": 1600, "total_steps": 2190, "loss": 1.7881, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001697208505111249, "epoch": 2.19, "percentage": 73.06, "elapsed_time": "1:28:20", "remaining_time": "0:32:34"} +{"current_steps": 1600, "total_steps": 2190, "loss": null, "eval_loss": 2.008232831954956, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.19, "percentage": 73.06, "elapsed_time": "1:28:20", "remaining_time": "0:32:34"} +{"current_steps": 1605, "total_steps": 2190, "loss": 1.6769, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016703687109158888, "epoch": 2.2, "percentage": 73.29, "elapsed_time": "1:28:45", "remaining_time": "0:32:20"} +{"current_steps": 1610, "total_steps": 2190, "loss": 1.7811, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016437002121623434, "epoch": 2.21, "percentage": 73.52, "elapsed_time": "1:29:00", "remaining_time": "0:32:03"} +{"current_steps": 1615, "total_steps": 2190, "loss": 1.6699, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00016172043808320368, "epoch": 2.21, "percentage": 73.74, "elapsed_time": "1:29:15", "remaining_time": "0:31:46"} +{"current_steps": 1620, "total_steps": 2190, "loss": 1.7141, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015908825800233824, "epoch": 2.22, "percentage": 73.97, "elapsed_time": "1:29:31", "remaining_time": "0:31:29"} +{"current_steps": 1625, "total_steps": 2190, "loss": 1.7672, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015647361638816655, "epoch": 2.23, "percentage": 74.2, "elapsed_time": "1:29:48", "remaining_time": "0:31:13"} +{"current_steps": 1630, "total_steps": 2190, "loss": 1.7043, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015387664775293658, "epoch": 2.23, "percentage": 74.43, "elapsed_time": "1:30:05", "remaining_time": "0:30:57"} +{"current_steps": 1635, "total_steps": 2190, "loss": 1.8051, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00015129748569969663, "epoch": 2.24, "percentage": 74.66, "elapsed_time": "1:30:22", "remaining_time": "0:30:40"} +{"current_steps": 1640, "total_steps": 2190, "loss": 1.7196, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014873626291542148, "epoch": 2.25, "percentage": 74.89, "elapsed_time": "1:30:39", "remaining_time": "0:30:24"} +{"current_steps": 1645, "total_steps": 2190, "loss": 1.6664, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014619311116418693, "epoch": 2.25, "percentage": 75.11, "elapsed_time": "1:30:56", "remaining_time": "0:30:07"} +{"current_steps": 1650, "total_steps": 2190, "loss": 1.6404, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014366816128039007, "epoch": 2.26, "percentage": 75.34, "elapsed_time": "1:31:15", "remaining_time": "0:29:52"} +{"current_steps": 1655, "total_steps": 2190, "loss": 1.7127, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00014116154316201908, "epoch": 2.27, "percentage": 75.57, "elapsed_time": "1:31:32", "remaining_time": "0:29:35"} +{"current_steps": 1660, "total_steps": 2190, "loss": 1.6906, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013867338576397043, "epoch": 2.27, "percentage": 75.8, "elapsed_time": "1:31:47", "remaining_time": "0:29:18"} +{"current_steps": 1665, "total_steps": 2190, "loss": 1.5734, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00013620381709141455, "epoch": 2.28, "percentage": 76.03, "elapsed_time": "1:32:03", "remaining_time": "0:29:01"} +{"current_steps": 1670, "total_steps": 2190, "loss": 1.7073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001337529641932107, "epoch": 2.29, "percentage": 76.26, "elapsed_time": "1:32:20", "remaining_time": "0:28:45"} +{"current_steps": 1675, "total_steps": 2190, "loss": 1.7745, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001313209531553707, "epoch": 2.29, "percentage": 76.48, "elapsed_time": "1:32:35", "remaining_time": "0:28:28"} +{"current_steps": 1680, "total_steps": 2190, "loss": 1.6972, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012890790909457213, "epoch": 2.3, "percentage": 76.71, "elapsed_time": "1:32:52", "remaining_time": "0:28:11"} +{"current_steps": 1685, "total_steps": 2190, "loss": 1.6176, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012651395615172239, "epoch": 2.31, "percentage": 76.94, "elapsed_time": "1:33:08", "remaining_time": "0:27:55"} +{"current_steps": 1690, "total_steps": 2190, "loss": 1.6887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012413921748557127, "epoch": 2.32, "percentage": 77.17, "elapsed_time": "1:33:27", "remaining_time": "0:27:39"} +{"current_steps": 1695, "total_steps": 2190, "loss": 1.8215, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00012178381526637533, "epoch": 2.32, "percentage": 77.4, "elapsed_time": "1:33:45", "remaining_time": "0:27:22"} +{"current_steps": 1700, "total_steps": 2190, "loss": 1.6511, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011944787066961266, "epoch": 2.33, "percentage": 77.63, "elapsed_time": "1:34:00", "remaining_time": "0:27:05"} +{"current_steps": 1700, "total_steps": 2190, "loss": null, "eval_loss": 2.0000417232513428, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.33, "percentage": 77.63, "elapsed_time": "1:34:00", "remaining_time": "0:27:05"} +{"current_steps": 1705, "total_steps": 2190, "loss": 1.7458, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011713150386974947, "epoch": 2.34, "percentage": 77.85, "elapsed_time": "1:34:23", "remaining_time": "0:26:51"} +{"current_steps": 1710, "total_steps": 2190, "loss": 1.6685, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011483483403405659, "epoch": 2.34, "percentage": 78.08, "elapsed_time": "1:34:40", "remaining_time": "0:26:34"} +{"current_steps": 1715, "total_steps": 2190, "loss": 1.733, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.0001125579793164797, "epoch": 2.35, "percentage": 78.31, "elapsed_time": "1:34:55", "remaining_time": "0:26:17"} +{"current_steps": 1720, "total_steps": 2190, "loss": 1.7281, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00011030105685156039, "epoch": 2.36, "percentage": 78.54, "elapsed_time": "1:35:10", "remaining_time": "0:26:00"} +{"current_steps": 1725, "total_steps": 2190, "loss": 1.7581, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010806418274841024, "epoch": 2.36, "percentage": 78.77, "elapsed_time": "1:35:26", "remaining_time": "0:25:43"} +{"current_steps": 1730, "total_steps": 2190, "loss": 1.6238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010584747208473738, "epoch": 2.37, "percentage": 79.0, "elapsed_time": "1:35:42", "remaining_time": "0:25:26"} +{"current_steps": 1735, "total_steps": 2190, "loss": 1.7462, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.00010365103890092636, "epoch": 2.38, "percentage": 79.22, "elapsed_time": "1:35:58", "remaining_time": "0:25:10"} +{"current_steps": 1740, "total_steps": 2190, "loss": 1.7897, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 0.000101474996194171, "epoch": 2.38, "percentage": 79.45, "elapsed_time": "1:36:13", "remaining_time": "0:24:53"} +{"current_steps": 1745, "total_steps": 2190, "loss": 1.8603, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.931945591266172e-05, "epoch": 2.39, "percentage": 79.68, "elapsed_time": "1:36:28", "remaining_time": "0:24:36"} +{"current_steps": 1750, "total_steps": 2190, "loss": 1.8379, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.718452894982571e-05, "epoch": 2.4, "percentage": 79.91, "elapsed_time": "1:36:44", "remaining_time": "0:24:19"} +{"current_steps": 1755, "total_steps": 2190, "loss": 1.7378, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.507032513862195e-05, "epoch": 2.4, "percentage": 80.14, "elapsed_time": "1:37:00", "remaining_time": "0:24:02"} +{"current_steps": 1760, "total_steps": 2190, "loss": 1.6022, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.297695324589106e-05, "epoch": 2.41, "percentage": 80.37, "elapsed_time": "1:37:18", "remaining_time": "0:23:46"} +{"current_steps": 1765, "total_steps": 2190, "loss": 1.7144, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.090452096675993e-05, "epoch": 2.42, "percentage": 80.59, "elapsed_time": "1:37:34", "remaining_time": "0:23:29"} +{"current_steps": 1770, "total_steps": 2190, "loss": 1.6529, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.885313491910052e-05, "epoch": 2.42, "percentage": 80.82, "elapsed_time": "1:37:49", "remaining_time": "0:23:12"} +{"current_steps": 1775, "total_steps": 2190, "loss": 1.7523, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.682290063804527e-05, "epoch": 2.43, "percentage": 81.05, "elapsed_time": "1:38:04", "remaining_time": "0:22:55"} +{"current_steps": 1780, "total_steps": 2190, "loss": 1.6469, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.48139225705578e-05, "epoch": 2.44, "percentage": 81.28, "elapsed_time": "1:38:20", "remaining_time": "0:22:39"} +{"current_steps": 1785, "total_steps": 2190, "loss": 1.7107, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.28263040700598e-05, "epoch": 2.45, "percentage": 81.51, "elapsed_time": "1:38:36", "remaining_time": "0:22:22"} +{"current_steps": 1790, "total_steps": 2190, "loss": 1.5931, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.086014739111297e-05, "epoch": 2.45, "percentage": 81.74, "elapsed_time": "1:38:53", "remaining_time": "0:22:05"} +{"current_steps": 1795, "total_steps": 2190, "loss": 1.7049, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.891555368415947e-05, "epoch": 2.46, "percentage": 81.96, "elapsed_time": "1:39:08", "remaining_time": "0:21:49"} +{"current_steps": 1800, "total_steps": 2190, "loss": 1.6175, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.699262299031778e-05, "epoch": 2.47, "percentage": 82.19, "elapsed_time": "1:39:23", "remaining_time": "0:21:32"} +{"current_steps": 1800, "total_steps": 2190, "loss": null, "eval_loss": 1.9944177865982056, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.47, "percentage": 82.19, "elapsed_time": "1:39:23", "remaining_time": "0:21:32"} +{"current_steps": 1805, "total_steps": 2190, "loss": 1.5914, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.509145423623608e-05, "epoch": 2.47, "percentage": 82.42, "elapsed_time": "1:39:50", "remaining_time": "0:21:17"} +{"current_steps": 1810, "total_steps": 2190, "loss": 1.5903, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.321214522900271e-05, "epoch": 2.48, "percentage": 82.65, "elapsed_time": "1:40:07", "remaining_time": "0:21:01"} +{"current_steps": 1815, "total_steps": 2190, "loss": 1.5715, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.13547926511145e-05, "epoch": 2.49, "percentage": 82.88, "elapsed_time": "1:40:22", "remaining_time": "0:20:44"} +{"current_steps": 1820, "total_steps": 2190, "loss": 1.5976, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.951949205550284e-05, "epoch": 2.49, "percentage": 83.11, "elapsed_time": "1:40:37", "remaining_time": "0:20:27"} +{"current_steps": 1825, "total_steps": 2190, "loss": 1.7028, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.770633786061819e-05, "epoch": 2.5, "percentage": 83.33, "elapsed_time": "1:40:55", "remaining_time": "0:20:11"} +{"current_steps": 1830, "total_steps": 2190, "loss": 1.7763, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.591542334557222e-05, "epoch": 2.51, "percentage": 83.56, "elapsed_time": "1:41:09", "remaining_time": "0:19:53"} +{"current_steps": 1835, "total_steps": 2190, "loss": 1.5076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.41468406453391e-05, "epoch": 2.51, "percentage": 83.79, "elapsed_time": "1:41:26", "remaining_time": "0:19:37"} +{"current_steps": 1840, "total_steps": 2190, "loss": 1.665, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.240068074601568e-05, "epoch": 2.52, "percentage": 84.02, "elapsed_time": "1:41:42", "remaining_time": "0:19:20"} +{"current_steps": 1845, "total_steps": 2190, "loss": 1.7332, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.067703348014086e-05, "epoch": 2.53, "percentage": 84.25, "elapsed_time": "1:41:59", "remaining_time": "0:19:04"} +{"current_steps": 1850, "total_steps": 2190, "loss": 1.7152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.897598752207328e-05, "epoch": 2.53, "percentage": 84.47, "elapsed_time": "1:42:16", "remaining_time": "0:18:47"} +{"current_steps": 1855, "total_steps": 2190, "loss": 1.7113, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.729763038343022e-05, "epoch": 2.54, "percentage": 84.7, "elapsed_time": "1:42:33", "remaining_time": "0:18:31"} +{"current_steps": 1860, "total_steps": 2190, "loss": 1.8046, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.564204840858511e-05, "epoch": 2.55, "percentage": 84.93, "elapsed_time": "1:42:49", "remaining_time": "0:18:14"} +{"current_steps": 1865, "total_steps": 2190, "loss": 1.652, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.40093267702258e-05, "epoch": 2.55, "percentage": 85.16, "elapsed_time": "1:43:04", "remaining_time": "0:17:57"} +{"current_steps": 1870, "total_steps": 2190, "loss": 1.7188, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.239954946497227e-05, "epoch": 2.56, "percentage": 85.39, "elapsed_time": "1:43:19", "remaining_time": "0:17:40"} +{"current_steps": 1875, "total_steps": 2190, "loss": 1.7157, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.0812799309055746e-05, "epoch": 2.57, "percentage": 85.62, "elapsed_time": "1:43:33", "remaining_time": "0:17:23"} +{"current_steps": 1880, "total_steps": 2190, "loss": 1.7516, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9249157934057985e-05, "epoch": 2.58, "percentage": 85.84, "elapsed_time": "1:43:46", "remaining_time": "0:17:06"} +{"current_steps": 1885, "total_steps": 2190, "loss": 1.7406, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.770870578271197e-05, "epoch": 2.58, "percentage": 86.07, "elapsed_time": "1:44:01", "remaining_time": "0:16:49"} +{"current_steps": 1890, "total_steps": 2190, "loss": 1.6949, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.619152210476296e-05, "epoch": 2.59, "percentage": 86.3, "elapsed_time": "1:44:16", "remaining_time": "0:16:33"} +{"current_steps": 1895, "total_steps": 2190, "loss": 1.6593, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.469768495289189e-05, "epoch": 2.6, "percentage": 86.53, "elapsed_time": "1:44:30", "remaining_time": "0:16:16"} +{"current_steps": 1900, "total_steps": 2190, "loss": 1.762, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.322727117869951e-05, "epoch": 2.6, "percentage": 86.76, "elapsed_time": "1:44:47", "remaining_time": "0:15:59"} +{"current_steps": 1900, "total_steps": 2190, "loss": null, "eval_loss": 1.991329312324524, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.6, "percentage": 86.76, "elapsed_time": "1:44:47", "remaining_time": "0:15:59"} +{"current_steps": 1905, "total_steps": 2190, "loss": 1.8229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.178035642875322e-05, "epoch": 2.61, "percentage": 86.99, "elapsed_time": "1:45:13", "remaining_time": "0:15:44"} +{"current_steps": 1910, "total_steps": 2190, "loss": 1.7235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0357015140694843e-05, "epoch": 2.62, "percentage": 87.21, "elapsed_time": "1:45:28", "remaining_time": "0:15:27"} +{"current_steps": 1915, "total_steps": 2190, "loss": 1.6724, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.89573205394112e-05, "epoch": 2.62, "percentage": 87.44, "elapsed_time": "1:45:42", "remaining_time": "0:15:10"} +{"current_steps": 1920, "total_steps": 2190, "loss": 1.576, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.758134463326729e-05, "epoch": 2.63, "percentage": 87.67, "elapsed_time": "1:45:56", "remaining_time": "0:14:53"} +{"current_steps": 1925, "total_steps": 2190, "loss": 1.6975, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.622915821040174e-05, "epoch": 2.64, "percentage": 87.9, "elapsed_time": "1:46:11", "remaining_time": "0:14:37"} +{"current_steps": 1930, "total_steps": 2190, "loss": 1.8066, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.4900830835084604e-05, "epoch": 2.64, "percentage": 88.13, "elapsed_time": "1:46:28", "remaining_time": "0:14:20"} +{"current_steps": 1935, "total_steps": 2190, "loss": 1.6242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3596430844139216e-05, "epoch": 2.65, "percentage": 88.36, "elapsed_time": "1:46:47", "remaining_time": "0:14:04"} +{"current_steps": 1940, "total_steps": 2190, "loss": 1.6191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.231602534342587e-05, "epoch": 2.66, "percentage": 88.58, "elapsed_time": "1:47:02", "remaining_time": "0:13:47"} +{"current_steps": 1945, "total_steps": 2190, "loss": 1.7211, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.105968020439026e-05, "epoch": 2.66, "percentage": 88.81, "elapsed_time": "1:47:17", "remaining_time": "0:13:30"} +{"current_steps": 1950, "total_steps": 2190, "loss": 1.7227, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9827460060673938e-05, "epoch": 2.67, "percentage": 89.04, "elapsed_time": "1:47:32", "remaining_time": "0:13:14"} +{"current_steps": 1955, "total_steps": 2190, "loss": 1.7096, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8619428304789697e-05, "epoch": 2.68, "percentage": 89.27, "elapsed_time": "1:47:49", "remaining_time": "0:12:57"} +{"current_steps": 1960, "total_steps": 2190, "loss": 1.7527, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.743564708485996e-05, "epoch": 2.68, "percentage": 89.5, "elapsed_time": "1:48:06", "remaining_time": "0:12:41"} +{"current_steps": 1965, "total_steps": 2190, "loss": 1.6941, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.6276177301419955e-05, "epoch": 2.69, "percentage": 89.73, "elapsed_time": "1:48:23", "remaining_time": "0:12:24"} +{"current_steps": 1970, "total_steps": 2190, "loss": 1.6471, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5141078604284108e-05, "epoch": 2.7, "percentage": 89.95, "elapsed_time": "1:48:40", "remaining_time": "0:12:08"} +{"current_steps": 1975, "total_steps": 2190, "loss": 1.7648, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4030409389477757e-05, "epoch": 2.71, "percentage": 90.18, "elapsed_time": "1:48:58", "remaining_time": "0:11:51"} +{"current_steps": 1980, "total_steps": 2190, "loss": 1.5269, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.2944226796232537e-05, "epoch": 2.71, "percentage": 90.41, "elapsed_time": "1:49:13", "remaining_time": "0:11:35"} +{"current_steps": 1985, "total_steps": 2190, "loss": 1.6842, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.188258670404719e-05, "epoch": 2.72, "percentage": 90.64, "elapsed_time": "1:49:26", "remaining_time": "0:11:18"} +{"current_steps": 1990, "total_steps": 2190, "loss": 1.8391, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0845543729812566e-05, "epoch": 2.73, "percentage": 90.87, "elapsed_time": "1:49:42", "remaining_time": "0:11:01"} +{"current_steps": 1995, "total_steps": 2190, "loss": 1.6511, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.9833151225001734e-05, "epoch": 2.73, "percentage": 91.1, "elapsed_time": "1:49:56", "remaining_time": "0:10:44"} +{"current_steps": 2000, "total_steps": 2190, "loss": 1.6843, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.884546127292569e-05, "epoch": 2.74, "percentage": 91.32, "elapsed_time": "1:50:11", "remaining_time": "0:10:28"} +{"current_steps": 2000, "total_steps": 2190, "loss": null, "eval_loss": 1.988287329673767, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.74, "percentage": 91.32, "elapsed_time": "1:50:11", "remaining_time": "0:10:28"} +{"current_steps": 2005, "total_steps": 2190, "loss": 1.6908, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.7882524686053393e-05, "epoch": 2.75, "percentage": 91.55, "elapsed_time": "1:50:36", "remaining_time": "0:10:12"} +{"current_steps": 2010, "total_steps": 2190, "loss": 1.7248, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6944391003397895e-05, "epoch": 2.75, "percentage": 91.78, "elapsed_time": "1:50:53", "remaining_time": "0:09:55"} +{"current_steps": 2015, "total_steps": 2190, "loss": 1.6644, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.603110848796785e-05, "epoch": 2.76, "percentage": 92.01, "elapsed_time": "1:51:08", "remaining_time": "0:09:39"} +{"current_steps": 2020, "total_steps": 2190, "loss": 1.674, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.5142724124284579e-05, "epoch": 2.77, "percentage": 92.24, "elapsed_time": "1:51:25", "remaining_time": "0:09:22"} +{"current_steps": 2025, "total_steps": 2190, "loss": 1.7485, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.42792836159647e-05, "epoch": 2.77, "percentage": 92.47, "elapsed_time": "1:51:40", "remaining_time": "0:09:05"} +{"current_steps": 2030, "total_steps": 2190, "loss": 1.7741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3440831383369045e-05, "epoch": 2.78, "percentage": 92.69, "elapsed_time": "1:51:56", "remaining_time": "0:08:49"} +{"current_steps": 2035, "total_steps": 2190, "loss": 1.7444, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.2627410561317387e-05, "epoch": 2.79, "percentage": 92.92, "elapsed_time": "1:52:12", "remaining_time": "0:08:32"} +{"current_steps": 2040, "total_steps": 2190, "loss": 1.7367, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1839062996869377e-05, "epoch": 2.79, "percentage": 93.15, "elapsed_time": "1:52:29", "remaining_time": "0:08:16"} +{"current_steps": 2045, "total_steps": 2190, "loss": 1.6514, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1075829247171598e-05, "epoch": 2.8, "percentage": 93.38, "elapsed_time": "1:52:45", "remaining_time": "0:07:59"} +{"current_steps": 2050, "total_steps": 2190, "loss": 1.5851, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.0337748577371186e-05, "epoch": 2.81, "percentage": 93.61, "elapsed_time": "1:52:58", "remaining_time": "0:07:42"} +{"current_steps": 2055, "total_steps": 2190, "loss": 1.735, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.624858958595716e-06, "epoch": 2.82, "percentage": 93.84, "elapsed_time": "1:53:15", "remaining_time": "0:07:26"} +{"current_steps": 2060, "total_steps": 2190, "loss": 1.7278, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.937197065999714e-06, "epoch": 2.82, "percentage": 94.06, "elapsed_time": "1:53:31", "remaining_time": "0:07:09"} +{"current_steps": 2065, "total_steps": 2190, "loss": 1.4815, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 8.274798276878049e-06, "epoch": 2.83, "percentage": 94.29, "elapsed_time": "1:53:45", "remaining_time": "0:06:53"} +{"current_steps": 2070, "total_steps": 2190, "loss": 1.7065, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.637696668845728e-06, "epoch": 2.84, "percentage": 94.52, "elapsed_time": "1:54:00", "remaining_time": "0:06:36"} +{"current_steps": 2075, "total_steps": 2190, "loss": 1.6512, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.0259250180848e-06, "epoch": 2.84, "percentage": 94.75, "elapsed_time": "1:54:18", "remaining_time": "0:06:20"} +{"current_steps": 2080, "total_steps": 2190, "loss": 1.6191, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 6.439514797658308e-06, "epoch": 2.85, "percentage": 94.98, "elapsed_time": "1:54:33", "remaining_time": "0:06:03"} +{"current_steps": 2085, "total_steps": 2190, "loss": 1.613, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.8784961758908685e-06, "epoch": 2.86, "percentage": 95.21, "elapsed_time": "1:54:49", "remaining_time": "0:05:46"} +{"current_steps": 2090, "total_steps": 2190, "loss": 1.5417, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.342898014816855e-06, "epoch": 2.86, "percentage": 95.43, "elapsed_time": "1:55:06", "remaining_time": "0:05:30"} +{"current_steps": 2095, "total_steps": 2190, "loss": 1.6967, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.832747868695475e-06, "epoch": 2.87, "percentage": 95.66, "elapsed_time": "1:55:23", "remaining_time": "0:05:13"} +{"current_steps": 2100, "total_steps": 2190, "loss": 1.7315, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.348071982593293e-06, "epoch": 2.88, "percentage": 95.89, "elapsed_time": "1:55:39", "remaining_time": "0:04:57"} +{"current_steps": 2100, "total_steps": 2190, "loss": null, "eval_loss": 1.98636794090271, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 2.88, "percentage": 95.89, "elapsed_time": "1:55:39", "remaining_time": "0:04:57"} +{"current_steps": 2105, "total_steps": 2190, "loss": 1.6421, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.888895291033867e-06, "epoch": 2.88, "percentage": 96.12, "elapsed_time": "1:56:04", "remaining_time": "0:04:41"} +{"current_steps": 2110, "total_steps": 2190, "loss": 1.7282, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.455241416715216e-06, "epoch": 2.89, "percentage": 96.35, "elapsed_time": "1:56:19", "remaining_time": "0:04:24"} +{"current_steps": 2115, "total_steps": 2190, "loss": 1.551, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0471326692942947e-06, "epoch": 2.9, "percentage": 96.58, "elapsed_time": "1:56:35", "remaining_time": "0:04:08"} +{"current_steps": 2120, "total_steps": 2190, "loss": 1.6687, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.664590044239468e-06, "epoch": 2.9, "percentage": 96.8, "elapsed_time": "1:56:53", "remaining_time": "0:03:51"} +{"current_steps": 2125, "total_steps": 2190, "loss": 1.6605, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.3076332217501496e-06, "epoch": 2.91, "percentage": 97.03, "elapsed_time": "1:57:10", "remaining_time": "0:03:35"} +{"current_steps": 2130, "total_steps": 2190, "loss": 1.5612, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.976280565744615e-06, "epoch": 2.92, "percentage": 97.26, "elapsed_time": "1:57:25", "remaining_time": "0:03:18"} +{"current_steps": 2135, "total_steps": 2190, "loss": 1.6839, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.6705491229149216e-06, "epoch": 2.92, "percentage": 97.49, "elapsed_time": "1:57:41", "remaining_time": "0:03:01"} +{"current_steps": 2140, "total_steps": 2190, "loss": 1.615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.3904546218503344e-06, "epoch": 2.93, "percentage": 97.72, "elapsed_time": "1:57:57", "remaining_time": "0:02:45"} +{"current_steps": 2145, "total_steps": 2190, "loss": 1.6923, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.1360114722277493e-06, "epoch": 2.94, "percentage": 97.95, "elapsed_time": "1:58:13", "remaining_time": "0:02:28"} +{"current_steps": 2150, "total_steps": 2190, "loss": 1.6513, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 9.072327640706757e-07, "epoch": 2.95, "percentage": 98.17, "elapsed_time": "1:58:29", "remaining_time": "0:02:12"} +{"current_steps": 2155, "total_steps": 2190, "loss": 1.6785, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.041302670756645e-07, "epoch": 2.95, "percentage": 98.4, "elapsed_time": "1:58:46", "remaining_time": "0:01:55"} +{"current_steps": 2160, "total_steps": 2190, "loss": 1.5907, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 5.267144300067916e-07, "epoch": 2.96, "percentage": 98.63, "elapsed_time": "1:59:02", "remaining_time": "0:01:39"} +{"current_steps": 2165, "total_steps": 2190, "loss": 1.6464, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.749943801582556e-07, "epoch": 2.97, "percentage": 98.86, "elapsed_time": "1:59:20", "remaining_time": "0:01:22"} +{"current_steps": 2170, "total_steps": 2190, "loss": 1.6572, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.4897792288469667e-07, "epoch": 2.97, "percentage": 99.09, "elapsed_time": "1:59:37", "remaining_time": "0:01:06"} +{"current_steps": 2175, "total_steps": 2190, "loss": 1.6339, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 1.4867154119957428e-07, "epoch": 2.98, "percentage": 99.32, "elapsed_time": "1:59:51", "remaining_time": "0:00:49"} +{"current_steps": 2180, "total_steps": 2190, "loss": 1.615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 7.408039544187783e-08, "epoch": 2.99, "percentage": 99.54, "elapsed_time": "2:00:05", "remaining_time": "0:00:33"} +{"current_steps": 2185, "total_steps": 2190, "loss": 1.7196, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.5208323010450507e-08, "epoch": 2.99, "percentage": 99.77, "elapsed_time": "2:00:23", "remaining_time": "0:00:16"} +{"current_steps": 2190, "total_steps": 2190, "loss": 1.6404, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.0578381666469526e-09, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "2:00:39", "remaining_time": "0:00:00"} +{"current_steps": 2190, "total_steps": 2190, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "2:00:39", "remaining_time": "0:00:00"} +{"current_steps": 15, "total_steps": 15, "loss": null, "eval_loss": 1.98636794090271, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 3.0, "percentage": 100.0, "elapsed_time": "2:00:48", "remaining_time": "0:00:00"} diff --git a/SFT/trainer_state.json b/SFT/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..324592dbc9ed58dbf181a5faa3736390320cf318 --- /dev/null +++ b/SFT/trainer_state.json @@ -0,0 +1,2824 @@ +{ + "best_metric": 1.98636794090271, + "best_model_checkpoint": "saves/ChineseLLaMA2-7B-Chat/lora/SFT/checkpoint-2100", + "epoch": 3.0, + "eval_steps": 100, + "global_step": 2190, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 0.000999995369868095, + "loss": 4.0099, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 0.0009999670749281082, + "loss": 2.887, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 0.000999913058797528, + "loss": 2.6742, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009998333242552556, + "loss": 2.7939, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 0.0009997278754032958, + "loss": 2.6062, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 0.0009995967176665461, + "loss": 2.7508, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009994398577925167, + "loss": 2.547, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 0.0009992573038509849, + "loss": 2.5281, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 0.0009990490652335784, + "loss": 2.6397, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 0.000998815152653293, + "loss": 2.608, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 0.000998555578143941, + "loss": 2.4076, + "step": 55 + }, + { + "epoch": 0.08, + "learning_rate": 0.0009982703550595329, + "loss": 2.5914, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 0.0009979594980735896, + "loss": 2.3517, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009976230231783876, + "loss": 2.4019, + "step": 70 + }, + { + "epoch": 0.1, + "learning_rate": 0.0009972609476841367, + "loss": 2.4481, + "step": 75 + }, + { + "epoch": 0.11, + "learning_rate": 0.000996873290218089, + "loss": 2.3244, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 0.000996460070723581, + "loss": 2.4821, + "step": 85 + }, + { + "epoch": 0.12, + "learning_rate": 0.0009960213104590074, + "loss": 2.438, + "step": 90 + }, + { + "epoch": 0.13, + "learning_rate": 0.0009955570319967273, + "loss": 2.2925, + "step": 95 + }, + { + "epoch": 0.14, + "learning_rate": 0.0009950672592219031, + "loss": 2.3052, + "step": 100 + }, + { + "epoch": 0.14, + "eval_loss": 2.3112449645996094, + "eval_runtime": 8.9668, + "eval_samples_per_second": 13.16, + "eval_steps_per_second": 1.673, + "step": 100 + }, + { + "epoch": 0.14, + "learning_rate": 0.000994552017331272, + "loss": 2.5839, + "step": 105 + }, + { + "epoch": 0.15, + "learning_rate": 0.0009940113328318488, + "loss": 2.4983, + "step": 110 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009934452335395637, + "loss": 2.4598, + "step": 115 + }, + { + "epoch": 0.16, + "learning_rate": 0.0009928537485778299, + "loss": 2.4336, + "step": 120 + }, + { + "epoch": 0.17, + "learning_rate": 0.0009922369083760461, + "loss": 2.3799, + "step": 125 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009915947446680307, + "loss": 2.5443, + "step": 130 + }, + { + "epoch": 0.18, + "learning_rate": 0.0009909272904903897, + "loss": 2.3701, + "step": 135 + }, + { + "epoch": 0.19, + "learning_rate": 0.0009902345801808161, + "loss": 2.2718, + "step": 140 + }, + { + "epoch": 0.2, + "learning_rate": 0.0009895166493763246, + "loss": 2.3402, + "step": 145 + }, + { + "epoch": 0.21, + "learning_rate": 0.0009887735350114174, + "loss": 2.4072, + "step": 150 + }, + { + "epoch": 0.21, + "learning_rate": 0.000988005275316184, + "loss": 2.3807, + "step": 155 + }, + { + "epoch": 0.22, + "learning_rate": 0.000987211909814336, + "loss": 2.2149, + "step": 160 + }, + { + "epoch": 0.23, + "learning_rate": 0.000986393479321171, + "loss": 2.2713, + "step": 165 + }, + { + "epoch": 0.23, + "learning_rate": 0.0009855500259414753, + "loss": 2.08, + "step": 170 + }, + { + "epoch": 0.24, + "learning_rate": 0.0009846815930673563, + "loss": 2.4682, + "step": 175 + }, + { + "epoch": 0.25, + "learning_rate": 0.000983788225376011, + "loss": 2.3328, + "step": 180 + }, + { + "epoch": 0.25, + "learning_rate": 0.0009828699688274275, + "loss": 2.3671, + "step": 185 + }, + { + "epoch": 0.26, + "learning_rate": 0.0009819268706620196, + "loss": 2.3251, + "step": 190 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009809589793981978, + "loss": 2.4429, + "step": 195 + }, + { + "epoch": 0.27, + "learning_rate": 0.0009799663448298724, + "loss": 2.2499, + "step": 200 + }, + { + "epoch": 0.27, + "eval_loss": 2.234222650527954, + "eval_runtime": 8.8394, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0009789490180238916, + "loss": 2.2279, + "step": 205 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009779070513174157, + "loss": 2.4875, + "step": 210 + }, + { + "epoch": 0.29, + "learning_rate": 0.0009768404983152227, + "loss": 2.3507, + "step": 215 + }, + { + "epoch": 0.3, + "learning_rate": 0.0009757494138869523, + "loss": 2.2189, + "step": 220 + }, + { + "epoch": 0.31, + "learning_rate": 0.0009746338541642812, + "loss": 2.2597, + "step": 225 + }, + { + "epoch": 0.32, + "learning_rate": 0.0009734938765380377, + "loss": 2.334, + "step": 230 + }, + { + "epoch": 0.32, + "learning_rate": 0.000972329539655247, + "loss": 2.3133, + "step": 235 + }, + { + "epoch": 0.33, + "learning_rate": 0.0009711409034161151, + "loss": 2.2286, + "step": 240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009699280289709478, + "loss": 2.224, + "step": 245 + }, + { + "epoch": 0.34, + "learning_rate": 0.0009686909787170031, + "loss": 2.3772, + "step": 250 + }, + { + "epoch": 0.35, + "learning_rate": 0.0009674298162952826, + "loss": 2.3606, + "step": 255 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009661446065872568, + "loss": 2.3207, + "step": 260 + }, + { + "epoch": 0.36, + "learning_rate": 0.0009648354157115271, + "loss": 2.2505, + "step": 265 + }, + { + "epoch": 0.37, + "learning_rate": 0.0009635023110204253, + "loss": 2.2192, + "step": 270 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009621453610965467, + "loss": 2.3082, + "step": 275 + }, + { + "epoch": 0.38, + "learning_rate": 0.0009607646357492237, + "loss": 2.2913, + "step": 280 + }, + { + "epoch": 0.39, + "learning_rate": 0.0009593602060109334, + "loss": 2.1133, + "step": 285 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009579321441336436, + "loss": 2.1855, + "step": 290 + }, + { + "epoch": 0.4, + "learning_rate": 0.0009564805235850955, + "loss": 2.3691, + "step": 295 + }, + { + "epoch": 0.41, + "learning_rate": 0.0009550054190450246, + "loss": 2.1942, + "step": 300 + }, + { + "epoch": 0.41, + "eval_loss": 2.184567451477051, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 300 + }, + { + "epoch": 0.42, + "learning_rate": 0.000953506906401318, + "loss": 2.2661, + "step": 305 + }, + { + "epoch": 0.42, + "learning_rate": 0.0009519850627461109, + "loss": 2.1222, + "step": 310 + }, + { + "epoch": 0.43, + "learning_rate": 0.0009504399663718202, + "loss": 2.2852, + "step": 315 + }, + { + "epoch": 0.44, + "learning_rate": 0.0009488716967671169, + "loss": 2.2543, + "step": 320 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009472803346128368, + "loss": 2.4232, + "step": 325 + }, + { + "epoch": 0.45, + "learning_rate": 0.0009456659617778294, + "loss": 2.3455, + "step": 330 + }, + { + "epoch": 0.46, + "learning_rate": 0.0009440286613147466, + "loss": 2.2071, + "step": 335 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009423685174557695, + "loss": 2.2561, + "step": 340 + }, + { + "epoch": 0.47, + "learning_rate": 0.0009406856156082755, + "loss": 2.2257, + "step": 345 + }, + { + "epoch": 0.48, + "learning_rate": 0.0009389800423504441, + "loss": 2.0418, + "step": 350 + }, + { + "epoch": 0.49, + "learning_rate": 0.000937251885426803, + "loss": 2.1139, + "step": 355 + }, + { + "epoch": 0.49, + "learning_rate": 0.0009355012337437138, + "loss": 2.0349, + "step": 360 + }, + { + "epoch": 0.5, + "learning_rate": 0.0009337281773647985, + "loss": 2.1056, + "step": 365 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009319328075063059, + "loss": 2.0944, + "step": 370 + }, + { + "epoch": 0.51, + "learning_rate": 0.0009301152165324185, + "loss": 2.0468, + "step": 375 + }, + { + "epoch": 0.52, + "learning_rate": 0.0009282754979505018, + "loss": 2.4125, + "step": 380 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009264137464062927, + "loss": 2.14, + "step": 385 + }, + { + "epoch": 0.53, + "learning_rate": 0.0009245300576790309, + "loss": 2.0077, + "step": 390 + }, + { + "epoch": 0.54, + "learning_rate": 0.0009226245286765316, + "loss": 2.0926, + "step": 395 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009206972574301991, + "loss": 2.2612, + "step": 400 + }, + { + "epoch": 0.55, + "eval_loss": 2.1513657569885254, + "eval_runtime": 8.8395, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 400 + }, + { + "epoch": 0.55, + "learning_rate": 0.0009187483430899845, + "loss": 2.1961, + "step": 405 + }, + { + "epoch": 0.56, + "learning_rate": 0.000916777885919285, + "loss": 2.21, + "step": 410 + }, + { + "epoch": 0.57, + "learning_rate": 0.0009147859872897843, + "loss": 2.1734, + "step": 415 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009127727496762394, + "loss": 2.2751, + "step": 420 + }, + { + "epoch": 0.58, + "learning_rate": 0.0009107382766512072, + "loss": 2.165, + "step": 425 + }, + { + "epoch": 0.59, + "learning_rate": 0.0009086826728797165, + "loss": 1.9987, + "step": 430 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009066060441138841, + "loss": 2.2766, + "step": 435 + }, + { + "epoch": 0.6, + "learning_rate": 0.0009045084971874737, + "loss": 2.2556, + "step": 440 + }, + { + "epoch": 0.61, + "learning_rate": 0.0009023901400103995, + "loss": 2.2067, + "step": 445 + }, + { + "epoch": 0.62, + "learning_rate": 0.0009002510815631754, + "loss": 2.1062, + "step": 450 + }, + { + "epoch": 0.62, + "learning_rate": 0.0008980914318913078, + "loss": 2.2781, + "step": 455 + }, + { + "epoch": 0.63, + "learning_rate": 0.0008959113020996348, + "loss": 2.2977, + "step": 460 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008937108043466098, + "loss": 2.1739, + "step": 465 + }, + { + "epoch": 0.64, + "learning_rate": 0.0008914900518385314, + "loss": 2.2108, + "step": 470 + }, + { + "epoch": 0.65, + "learning_rate": 0.0008892491588237203, + "loss": 2.204, + "step": 475 + }, + { + "epoch": 0.66, + "learning_rate": 0.0008869882405866404, + "loss": 2.0958, + "step": 480 + }, + { + "epoch": 0.66, + "learning_rate": 0.000884707413441969, + "loss": 2.1794, + "step": 485 + }, + { + "epoch": 0.67, + "learning_rate": 0.0008824067947286121, + "loss": 2.2172, + "step": 490 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008800865028036685, + "loss": 2.1321, + "step": 495 + }, + { + "epoch": 0.68, + "learning_rate": 0.0008777466570363402, + "loss": 2.1153, + "step": 500 + }, + { + "epoch": 0.68, + "eval_loss": 2.1377201080322266, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 500 + }, + { + "epoch": 0.69, + "learning_rate": 0.0008753873778017918, + "loss": 2.0652, + "step": 505 + }, + { + "epoch": 0.7, + "learning_rate": 0.0008730087864749578, + "loss": 2.1858, + "step": 510 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008706110054242979, + "loss": 2.3533, + "step": 515 + }, + { + "epoch": 0.71, + "learning_rate": 0.0008681941580055016, + "loss": 2.0069, + "step": 520 + }, + { + "epoch": 0.72, + "learning_rate": 0.0008657583685551429, + "loss": 2.2022, + "step": 525 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008633037623842828, + "loss": 2.2126, + "step": 530 + }, + { + "epoch": 0.73, + "learning_rate": 0.0008608304657720232, + "loss": 2.1354, + "step": 535 + }, + { + "epoch": 0.74, + "learning_rate": 0.00085833860595901, + "loss": 2.3017, + "step": 540 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008558283111408874, + "loss": 2.1492, + "step": 545 + }, + { + "epoch": 0.75, + "learning_rate": 0.0008532997104617022, + "loss": 2.3461, + "step": 550 + }, + { + "epoch": 0.76, + "learning_rate": 0.0008507529340072608, + "loss": 2.2892, + "step": 555 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008481881127984361, + "loss": 2.1384, + "step": 560 + }, + { + "epoch": 0.77, + "learning_rate": 0.0008456053787844274, + "loss": 2.2022, + "step": 565 + }, + { + "epoch": 0.78, + "learning_rate": 0.0008430048648359713, + "loss": 2.1867, + "step": 570 + }, + { + "epoch": 0.79, + "learning_rate": 0.000840386704738508, + "loss": 2.0888, + "step": 575 + }, + { + "epoch": 0.79, + "learning_rate": 0.0008377510331852969, + "loss": 2.1194, + "step": 580 + }, + { + "epoch": 0.8, + "learning_rate": 0.0008350979857704872, + "loss": 2.1946, + "step": 585 + }, + { + "epoch": 0.81, + "learning_rate": 0.0008324276989821433, + "loss": 2.2366, + "step": 590 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008297403101952221, + "loss": 2.2807, + "step": 595 + }, + { + "epoch": 0.82, + "learning_rate": 0.0008270359576645061, + "loss": 2.2787, + "step": 600 + }, + { + "epoch": 0.82, + "eval_loss": 2.1165168285369873, + "eval_runtime": 8.8529, + "eval_samples_per_second": 13.329, + "eval_steps_per_second": 1.694, + "step": 600 + }, + { + "epoch": 0.83, + "learning_rate": 0.0008243147805174907, + "loss": 2.1761, + "step": 605 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008215769187472266, + "loss": 2.036, + "step": 610 + }, + { + "epoch": 0.84, + "learning_rate": 0.0008188225132051175, + "loss": 2.3055, + "step": 615 + }, + { + "epoch": 0.85, + "learning_rate": 0.0008160517055936743, + "loss": 2.1386, + "step": 620 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008132646384592254, + "loss": 2.2383, + "step": 625 + }, + { + "epoch": 0.86, + "learning_rate": 0.0008104614551845823, + "loss": 2.0414, + "step": 630 + }, + { + "epoch": 0.87, + "learning_rate": 0.000807642299981664, + "loss": 2.1539, + "step": 635 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008048073178840773, + "loss": 2.2804, + "step": 640 + }, + { + "epoch": 0.88, + "learning_rate": 0.0008019566547396563, + "loss": 2.0363, + "step": 645 + }, + { + "epoch": 0.89, + "learning_rate": 0.0007990904572029582, + "loss": 2.152, + "step": 650 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007962088727277193, + "loss": 2.1571, + "step": 655 + }, + { + "epoch": 0.9, + "learning_rate": 0.0007933120495592682, + "loss": 2.1813, + "step": 660 + }, + { + "epoch": 0.91, + "learning_rate": 0.0007904001367269004, + "loss": 2.2906, + "step": 665 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007874732840362107, + "loss": 2.2132, + "step": 670 + }, + { + "epoch": 0.92, + "learning_rate": 0.0007845316420613859, + "loss": 2.0774, + "step": 675 + }, + { + "epoch": 0.93, + "learning_rate": 0.0007815753621374593, + "loss": 2.1264, + "step": 680 + }, + { + "epoch": 0.94, + "learning_rate": 0.0007786045963525249, + "loss": 2.1156, + "step": 685 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007756194975399123, + "loss": 2.0804, + "step": 690 + }, + { + "epoch": 0.95, + "learning_rate": 0.0007726202192703255, + "loss": 2.2225, + "step": 695 + }, + { + "epoch": 0.96, + "learning_rate": 0.0007696069158439412, + "loss": 2.0872, + "step": 700 + }, + { + "epoch": 0.96, + "eval_loss": 2.0954582691192627, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 700 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007665797422824708, + "loss": 2.0316, + "step": 705 + }, + { + "epoch": 0.97, + "learning_rate": 0.0007635388543211861, + "loss": 2.1545, + "step": 710 + }, + { + "epoch": 0.98, + "learning_rate": 0.0007604844084009063, + "loss": 2.082, + "step": 715 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007574165616599501, + "loss": 2.1494, + "step": 720 + }, + { + "epoch": 0.99, + "learning_rate": 0.0007543354719260522, + "loss": 2.2864, + "step": 725 + }, + { + "epoch": 1.0, + "learning_rate": 0.000751241297708243, + "loss": 2.122, + "step": 730 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007481341981886942, + "loss": 1.9816, + "step": 735 + }, + { + "epoch": 1.01, + "learning_rate": 0.0007450143332145297, + "loss": 1.8529, + "step": 740 + }, + { + "epoch": 1.02, + "learning_rate": 0.0007418818632896017, + "loss": 1.936, + "step": 745 + }, + { + "epoch": 1.03, + "learning_rate": 0.0007387369495662343, + "loss": 1.9031, + "step": 750 + }, + { + "epoch": 1.03, + "learning_rate": 0.000735579753836932, + "loss": 1.8006, + "step": 755 + }, + { + "epoch": 1.04, + "learning_rate": 0.0007324104385260566, + "loss": 1.9221, + "step": 760 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007292291666814713, + "loss": 1.9734, + "step": 765 + }, + { + "epoch": 1.05, + "learning_rate": 0.0007260361019661522, + "loss": 2.0162, + "step": 770 + }, + { + "epoch": 1.06, + "learning_rate": 0.0007228314086497686, + "loss": 1.755, + "step": 775 + }, + { + "epoch": 1.07, + "learning_rate": 0.0007196152516002323, + "loss": 2.0059, + "step": 780 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007163877962752157, + "loss": 2.0842, + "step": 785 + }, + { + "epoch": 1.08, + "learning_rate": 0.0007131492087136393, + "loss": 2.0298, + "step": 790 + }, + { + "epoch": 1.09, + "learning_rate": 0.0007098996555271309, + "loss": 2.0099, + "step": 795 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007066393038914522, + "loss": 2.0318, + "step": 800 + }, + { + "epoch": 1.1, + "eval_loss": 2.081965923309326, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 800 + }, + { + "epoch": 1.1, + "learning_rate": 0.0007033683215379002, + "loss": 2.0159, + "step": 805 + }, + { + "epoch": 1.11, + "learning_rate": 0.0007000868767446771, + "loss": 2.0416, + "step": 810 + }, + { + "epoch": 1.12, + "learning_rate": 0.0006967951383282334, + "loss": 1.9853, + "step": 815 + }, + { + "epoch": 1.12, + "learning_rate": 0.000693493275634583, + "loss": 2.0131, + "step": 820 + }, + { + "epoch": 1.13, + "learning_rate": 0.0006901814585305909, + "loss": 1.9054, + "step": 825 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006868598573952345, + "loss": 1.9213, + "step": 830 + }, + { + "epoch": 1.14, + "learning_rate": 0.0006835286431108383, + "loss": 1.8538, + "step": 835 + }, + { + "epoch": 1.15, + "learning_rate": 0.0006801879870542821, + "loss": 2.0632, + "step": 840 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006768380610881859, + "loss": 2.0617, + "step": 845 + }, + { + "epoch": 1.16, + "learning_rate": 0.0006734790375520663, + "loss": 1.9131, + "step": 850 + }, + { + "epoch": 1.17, + "learning_rate": 0.0006701110892534723, + "loss": 1.969, + "step": 855 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006667343894590934, + "loss": 2.1041, + "step": 860 + }, + { + "epoch": 1.18, + "learning_rate": 0.0006633491118858471, + "loss": 1.9544, + "step": 865 + }, + { + "epoch": 1.19, + "learning_rate": 0.0006599554306919408, + "loss": 1.9392, + "step": 870 + }, + { + "epoch": 1.2, + "learning_rate": 0.0006565535204679134, + "loss": 1.9857, + "step": 875 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006531435562276514, + "loss": 2.0771, + "step": 880 + }, + { + "epoch": 1.21, + "learning_rate": 0.0006497257133993877, + "loss": 1.9266, + "step": 885 + }, + { + "epoch": 1.22, + "learning_rate": 0.0006463001678166743, + "loss": 1.9898, + "step": 890 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006428670957093375, + "loss": 1.9723, + "step": 895 + }, + { + "epoch": 1.23, + "learning_rate": 0.0006394266736944118, + "loss": 1.9746, + "step": 900 + }, + { + "epoch": 1.23, + "eval_loss": 2.0750818252563477, + "eval_runtime": 8.84, + "eval_samples_per_second": 13.348, + "eval_steps_per_second": 1.697, + "step": 900 + }, + { + "epoch": 1.24, + "learning_rate": 0.0006359790787670527, + "loss": 1.8354, + "step": 905 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006325244882914327, + "loss": 1.8571, + "step": 910 + }, + { + "epoch": 1.25, + "learning_rate": 0.0006290630799916144, + "loss": 2.0376, + "step": 915 + }, + { + "epoch": 1.26, + "learning_rate": 0.0006255950319424097, + "loss": 2.0141, + "step": 920 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006221205225602169, + "loss": 1.982, + "step": 925 + }, + { + "epoch": 1.27, + "learning_rate": 0.0006186397305938427, + "loss": 1.9456, + "step": 930 + }, + { + "epoch": 1.28, + "learning_rate": 0.0006151528351153061, + "loss": 1.8855, + "step": 935 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006116600155106263, + "loss": 1.9335, + "step": 940 + }, + { + "epoch": 1.29, + "learning_rate": 0.0006081614514705933, + "loss": 1.9459, + "step": 945 + }, + { + "epoch": 1.3, + "learning_rate": 0.0006046573229815243, + "loss": 1.9753, + "step": 950 + }, + { + "epoch": 1.31, + "learning_rate": 0.0006011478103160037, + "loss": 1.969, + "step": 955 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005976330940236089, + "loss": 1.9184, + "step": 960 + }, + { + "epoch": 1.32, + "learning_rate": 0.0005941133549216221, + "loss": 1.9287, + "step": 965 + }, + { + "epoch": 1.33, + "learning_rate": 0.0005905887740857279, + "loss": 1.9373, + "step": 970 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005870595328406971, + "loss": 1.9323, + "step": 975 + }, + { + "epoch": 1.34, + "learning_rate": 0.0005835258127510597, + "loss": 1.9249, + "step": 980 + }, + { + "epoch": 1.35, + "learning_rate": 0.0005799877956117621, + "loss": 2.0135, + "step": 985 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005764456634388171, + "loss": 1.9741, + "step": 990 + }, + { + "epoch": 1.36, + "learning_rate": 0.0005728995984599373, + "loss": 1.9028, + "step": 995 + }, + { + "epoch": 1.37, + "learning_rate": 0.0005693497831051624, + "loss": 1.9647, + "step": 1000 + }, + { + "epoch": 1.37, + "eval_loss": 2.0705747604370117, + "eval_runtime": 8.8388, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1000 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005657963999974728, + "loss": 1.9312, + "step": 1005 + }, + { + "epoch": 1.38, + "learning_rate": 0.0005622396319433947, + "loss": 1.9319, + "step": 1010 + }, + { + "epoch": 1.39, + "learning_rate": 0.0005586796619235951, + "loss": 2.0215, + "step": 1015 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005551166730834692, + "loss": 1.9109, + "step": 1020 + }, + { + "epoch": 1.4, + "learning_rate": 0.0005515508487237174, + "loss": 1.9534, + "step": 1025 + }, + { + "epoch": 1.41, + "learning_rate": 0.0005479823722909158, + "loss": 1.9559, + "step": 1030 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005444114273680778, + "loss": 1.9402, + "step": 1035 + }, + { + "epoch": 1.42, + "learning_rate": 0.0005408381976652113, + "loss": 1.8844, + "step": 1040 + }, + { + "epoch": 1.43, + "learning_rate": 0.0005372628670098654, + "loss": 1.8787, + "step": 1045 + }, + { + "epoch": 1.44, + "learning_rate": 0.0005336856193376748, + "loss": 2.0642, + "step": 1050 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005301066386828965, + "loss": 2.0661, + "step": 1055 + }, + { + "epoch": 1.45, + "learning_rate": 0.0005265261091689423, + "loss": 1.8911, + "step": 1060 + }, + { + "epoch": 1.46, + "learning_rate": 0.0005229442149989058, + "loss": 1.9742, + "step": 1065 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005193611404460872, + "loss": 1.8662, + "step": 1070 + }, + { + "epoch": 1.47, + "learning_rate": 0.0005157770698445116, + "loss": 1.9766, + "step": 1075 + }, + { + "epoch": 1.48, + "learning_rate": 0.0005121921875794468, + "loss": 1.8823, + "step": 1080 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005086066780779174, + "loss": 2.0215, + "step": 1085 + }, + { + "epoch": 1.49, + "learning_rate": 0.0005050207257992166, + "loss": 2.002, + "step": 1090 + }, + { + "epoch": 1.5, + "learning_rate": 0.0005014345152254166, + "loss": 2.0568, + "step": 1095 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004978482308518779, + "loss": 2.0477, + "step": 1100 + }, + { + "epoch": 1.51, + "eval_loss": 2.0488994121551514, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1100 + }, + { + "epoch": 1.51, + "learning_rate": 0.0004942620571777576, + "loss": 1.9615, + "step": 1105 + }, + { + "epoch": 1.52, + "learning_rate": 0.0004906761786965175, + "loss": 1.9747, + "step": 1110 + }, + { + "epoch": 1.53, + "learning_rate": 0.00048709077988643367, + "loss": 2.0174, + "step": 1115 + }, + { + "epoch": 1.53, + "learning_rate": 0.0004835060452011041, + "loss": 1.927, + "step": 1120 + }, + { + "epoch": 1.54, + "learning_rate": 0.00047992215905996163, + "loss": 1.8728, + "step": 1125 + }, + { + "epoch": 1.55, + "learning_rate": 0.0004763393058387841, + "loss": 2.0364, + "step": 1130 + }, + { + "epoch": 1.55, + "learning_rate": 0.00047275766986021046, + "loss": 1.9834, + "step": 1135 + }, + { + "epoch": 1.56, + "learning_rate": 0.0004691774353842571, + "loss": 1.838, + "step": 1140 + }, + { + "epoch": 1.57, + "learning_rate": 0.0004655987865988401, + "loss": 1.8644, + "step": 1145 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004620219076102975, + "loss": 1.9723, + "step": 1150 + }, + { + "epoch": 1.58, + "learning_rate": 0.0004584469824339192, + "loss": 1.7848, + "step": 1155 + }, + { + "epoch": 1.59, + "learning_rate": 0.0004548741949844795, + "loss": 1.9848, + "step": 1160 + }, + { + "epoch": 1.6, + "learning_rate": 0.0004513037290667761, + "loss": 1.9545, + "step": 1165 + }, + { + "epoch": 1.6, + "learning_rate": 0.00044773576836617336, + "loss": 2.0322, + "step": 1170 + }, + { + "epoch": 1.61, + "learning_rate": 0.0004441704964391529, + "loss": 1.933, + "step": 1175 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004406080967038701, + "loss": 1.8019, + "step": 1180 + }, + { + "epoch": 1.62, + "learning_rate": 0.0004370487524307189, + "loss": 1.9489, + "step": 1185 + }, + { + "epoch": 1.63, + "learning_rate": 0.00043349264673290204, + "loss": 1.9982, + "step": 1190 + }, + { + "epoch": 1.64, + "learning_rate": 0.0004299399625570114, + "loss": 1.9536, + "step": 1195 + }, + { + "epoch": 1.64, + "learning_rate": 0.00042639088267361596, + "loss": 1.849, + "step": 1200 + }, + { + "epoch": 1.64, + "eval_loss": 2.030912160873413, + "eval_runtime": 8.838, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1200 + }, + { + "epoch": 1.65, + "learning_rate": 0.00042284558966785944, + "loss": 1.9273, + "step": 1205 + }, + { + "epoch": 1.66, + "learning_rate": 0.00041930426593006633, + "loss": 1.9215, + "step": 1210 + }, + { + "epoch": 1.66, + "learning_rate": 0.0004157670936463592, + "loss": 2.0457, + "step": 1215 + }, + { + "epoch": 1.67, + "learning_rate": 0.00041223425478928595, + "loss": 1.9452, + "step": 1220 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004087059311084581, + "loss": 2.033, + "step": 1225 + }, + { + "epoch": 1.68, + "learning_rate": 0.0004051823041212002, + "loss": 1.9818, + "step": 1230 + }, + { + "epoch": 1.69, + "learning_rate": 0.00040166355510321195, + "loss": 2.0649, + "step": 1235 + }, + { + "epoch": 1.7, + "learning_rate": 0.00039814986507924195, + "loss": 1.8362, + "step": 1240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003946414148137756, + "loss": 2.017, + "step": 1245 + }, + { + "epoch": 1.71, + "learning_rate": 0.0003911383848017341, + "loss": 1.871, + "step": 1250 + }, + { + "epoch": 1.72, + "learning_rate": 0.0003876409552591901, + "loss": 1.9843, + "step": 1255 + }, + { + "epoch": 1.73, + "learning_rate": 0.00038414930611409525, + "loss": 1.7489, + "step": 1260 + }, + { + "epoch": 1.73, + "learning_rate": 0.000380663616997025, + "loss": 2.071, + "step": 1265 + }, + { + "epoch": 1.74, + "learning_rate": 0.00037718406723193576, + "loss": 2.0351, + "step": 1270 + }, + { + "epoch": 1.75, + "learning_rate": 0.0003737108358269408, + "loss": 1.9891, + "step": 1275 + }, + { + "epoch": 1.75, + "learning_rate": 0.00037024410146510014, + "loss": 2.0055, + "step": 1280 + }, + { + "epoch": 1.76, + "learning_rate": 0.0003667840424952288, + "loss": 1.9104, + "step": 1285 + }, + { + "epoch": 1.77, + "learning_rate": 0.00036333083692272083, + "loss": 1.9239, + "step": 1290 + }, + { + "epoch": 1.77, + "learning_rate": 0.00035988466240039206, + "loss": 1.8592, + "step": 1295 + }, + { + "epoch": 1.78, + "learning_rate": 0.0003564456962193403, + "loss": 2.0883, + "step": 1300 + }, + { + "epoch": 1.78, + "eval_loss": 2.012606382369995, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1300 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003530141152998255, + "loss": 1.8061, + "step": 1305 + }, + { + "epoch": 1.79, + "learning_rate": 0.0003495900961821662, + "loss": 1.8564, + "step": 1310 + }, + { + "epoch": 1.8, + "learning_rate": 0.0003461738150176588, + "loss": 2.0208, + "step": 1315 + }, + { + "epoch": 1.81, + "learning_rate": 0.00034276544755951444, + "loss": 2.0325, + "step": 1320 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033936516915381774, + "loss": 1.9853, + "step": 1325 + }, + { + "epoch": 1.82, + "learning_rate": 0.00033597315473050596, + "loss": 1.8627, + "step": 1330 + }, + { + "epoch": 1.83, + "learning_rate": 0.00033258957879436893, + "loss": 1.7516, + "step": 1335 + }, + { + "epoch": 1.84, + "learning_rate": 0.00032921461541607225, + "loss": 1.8258, + "step": 1340 + }, + { + "epoch": 1.84, + "learning_rate": 0.0003258484382232023, + "loss": 1.9302, + "step": 1345 + }, + { + "epoch": 1.85, + "learning_rate": 0.00032249122039133273, + "loss": 1.8517, + "step": 1350 + }, + { + "epoch": 1.86, + "learning_rate": 0.00031914313463511635, + "loss": 1.9234, + "step": 1355 + }, + { + "epoch": 1.86, + "learning_rate": 0.000315804353199399, + "loss": 1.7208, + "step": 1360 + }, + { + "epoch": 1.87, + "learning_rate": 0.0003124750478503593, + "loss": 1.9718, + "step": 1365 + }, + { + "epoch": 1.88, + "learning_rate": 0.0003091553898666705, + "loss": 1.9739, + "step": 1370 + }, + { + "epoch": 1.88, + "learning_rate": 0.00030584555003069017, + "loss": 1.9678, + "step": 1375 + }, + { + "epoch": 1.89, + "learning_rate": 0.0003025456986196734, + "loss": 1.859, + "step": 1380 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002992560053970135, + "loss": 1.866, + "step": 1385 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002959766396035077, + "loss": 1.8789, + "step": 1390 + }, + { + "epoch": 1.91, + "learning_rate": 0.0002927077699486507, + "loss": 1.9049, + "step": 1395 + }, + { + "epoch": 1.92, + "learning_rate": 0.00028944956460195514, + "loss": 1.9501, + "step": 1400 + }, + { + "epoch": 1.92, + "eval_loss": 2.006899833679199, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1400 + }, + { + "epoch": 1.92, + "learning_rate": 0.0002862021911843008, + "loss": 1.9477, + "step": 1405 + }, + { + "epoch": 1.93, + "learning_rate": 0.00028296581675930964, + "loss": 1.8011, + "step": 1410 + }, + { + "epoch": 1.94, + "learning_rate": 0.00027974060782475255, + "loss": 1.9646, + "step": 1415 + }, + { + "epoch": 1.95, + "learning_rate": 0.000276526730303983, + "loss": 1.7943, + "step": 1420 + }, + { + "epoch": 1.95, + "learning_rate": 0.0002733243495374013, + "loss": 1.8871, + "step": 1425 + }, + { + "epoch": 1.96, + "learning_rate": 0.000270133630273948, + "loss": 1.9161, + "step": 1430 + }, + { + "epoch": 1.97, + "learning_rate": 0.00026695473666262925, + "loss": 1.7969, + "step": 1435 + }, + { + "epoch": 1.97, + "learning_rate": 0.0002637878322440708, + "loss": 1.9717, + "step": 1440 + }, + { + "epoch": 1.98, + "learning_rate": 0.00026063307994210586, + "loss": 1.8904, + "step": 1445 + }, + { + "epoch": 1.99, + "learning_rate": 0.00025749064205539206, + "loss": 1.8843, + "step": 1450 + }, + { + "epoch": 1.99, + "learning_rate": 0.0002543606802490628, + "loss": 1.9602, + "step": 1455 + }, + { + "epoch": 2.0, + "learning_rate": 0.00025124335554640965, + "loss": 1.8936, + "step": 1460 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024813882832059914, + "loss": 1.6483, + "step": 1465 + }, + { + "epoch": 2.01, + "learning_rate": 0.00024504725828642125, + "loss": 1.7576, + "step": 1470 + }, + { + "epoch": 2.02, + "learning_rate": 0.00024196880449207364, + "loss": 1.7134, + "step": 1475 + }, + { + "epoch": 2.03, + "learning_rate": 0.0002389036253109787, + "loss": 1.7071, + "step": 1480 + }, + { + "epoch": 2.03, + "learning_rate": 0.00023585187843363614, + "loss": 1.7951, + "step": 1485 + }, + { + "epoch": 2.04, + "learning_rate": 0.00023281372085951068, + "loss": 1.6741, + "step": 1490 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022978930888895466, + "loss": 1.6294, + "step": 1495 + }, + { + "epoch": 2.05, + "learning_rate": 0.00022677879811516715, + "loss": 1.7268, + "step": 1500 + }, + { + "epoch": 2.05, + "eval_loss": 2.005824327468872, + "eval_runtime": 8.8391, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1500 + }, + { + "epoch": 2.06, + "learning_rate": 0.00022378234341619019, + "loss": 1.6531, + "step": 1505 + }, + { + "epoch": 2.07, + "learning_rate": 0.00022080009894693948, + "loss": 1.6599, + "step": 1510 + }, + { + "epoch": 2.08, + "learning_rate": 0.00021783221813127473, + "loss": 1.7998, + "step": 1515 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002148788536541064, + "loss": 1.6775, + "step": 1520 + }, + { + "epoch": 2.09, + "learning_rate": 0.00021194015745354123, + "loss": 1.5615, + "step": 1525 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020901628071306455, + "loss": 1.7347, + "step": 1530 + }, + { + "epoch": 2.1, + "learning_rate": 0.00020610737385376348, + "loss": 1.7952, + "step": 1535 + }, + { + "epoch": 2.11, + "learning_rate": 0.00020321358652658806, + "loss": 1.6405, + "step": 1540 + }, + { + "epoch": 2.12, + "learning_rate": 0.00020033506760465237, + "loss": 1.5677, + "step": 1545 + }, + { + "epoch": 2.12, + "learning_rate": 0.0001974719651755756, + "loss": 1.7295, + "step": 1550 + }, + { + "epoch": 2.13, + "learning_rate": 0.0001946244265338637, + "loss": 1.6888, + "step": 1555 + }, + { + "epoch": 2.14, + "learning_rate": 0.00019179259817333133, + "loss": 1.6395, + "step": 1560 + }, + { + "epoch": 2.14, + "learning_rate": 0.0001889766257795663, + "loss": 1.6953, + "step": 1565 + }, + { + "epoch": 2.15, + "learning_rate": 0.00018617665422243336, + "loss": 1.7618, + "step": 1570 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018339282754862223, + "loss": 1.6839, + "step": 1575 + }, + { + "epoch": 2.16, + "learning_rate": 0.00018062528897423643, + "loss": 1.654, + "step": 1580 + }, + { + "epoch": 2.17, + "learning_rate": 0.00017787418087742614, + "loss": 1.7326, + "step": 1585 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017513964479106266, + "loss": 1.7163, + "step": 1590 + }, + { + "epoch": 2.18, + "learning_rate": 0.00017242182139545742, + "loss": 1.6667, + "step": 1595 + }, + { + "epoch": 2.19, + "learning_rate": 0.0001697208505111249, + "loss": 1.7881, + "step": 1600 + }, + { + "epoch": 2.19, + "eval_loss": 2.008232831954956, + "eval_runtime": 8.8393, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1600 + }, + { + "epoch": 2.2, + "learning_rate": 0.00016703687109158888, + "loss": 1.6769, + "step": 1605 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016437002121623434, + "loss": 1.7811, + "step": 1610 + }, + { + "epoch": 2.21, + "learning_rate": 0.00016172043808320368, + "loss": 1.6699, + "step": 1615 + }, + { + "epoch": 2.22, + "learning_rate": 0.00015908825800233824, + "loss": 1.7141, + "step": 1620 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015647361638816655, + "loss": 1.7672, + "step": 1625 + }, + { + "epoch": 2.23, + "learning_rate": 0.00015387664775293658, + "loss": 1.7043, + "step": 1630 + }, + { + "epoch": 2.24, + "learning_rate": 0.00015129748569969663, + "loss": 1.8051, + "step": 1635 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014873626291542148, + "loss": 1.7196, + "step": 1640 + }, + { + "epoch": 2.25, + "learning_rate": 0.00014619311116418693, + "loss": 1.6664, + "step": 1645 + }, + { + "epoch": 2.26, + "learning_rate": 0.00014366816128039007, + "loss": 1.6404, + "step": 1650 + }, + { + "epoch": 2.27, + "learning_rate": 0.00014116154316201908, + "loss": 1.7127, + "step": 1655 + }, + { + "epoch": 2.27, + "learning_rate": 0.00013867338576397043, + "loss": 1.6906, + "step": 1660 + }, + { + "epoch": 2.28, + "learning_rate": 0.00013620381709141455, + "loss": 1.5734, + "step": 1665 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001337529641932107, + "loss": 1.7073, + "step": 1670 + }, + { + "epoch": 2.29, + "learning_rate": 0.0001313209531553707, + "loss": 1.7745, + "step": 1675 + }, + { + "epoch": 2.3, + "learning_rate": 0.00012890790909457213, + "loss": 1.6972, + "step": 1680 + }, + { + "epoch": 2.31, + "learning_rate": 0.00012651395615172239, + "loss": 1.6176, + "step": 1685 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012413921748557127, + "loss": 1.6887, + "step": 1690 + }, + { + "epoch": 2.32, + "learning_rate": 0.00012178381526637533, + "loss": 1.8215, + "step": 1695 + }, + { + "epoch": 2.33, + "learning_rate": 0.00011944787066961266, + "loss": 1.6511, + "step": 1700 + }, + { + "epoch": 2.33, + "eval_loss": 2.0000417232513428, + "eval_runtime": 8.8396, + "eval_samples_per_second": 13.349, + "eval_steps_per_second": 1.697, + "step": 1700 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011713150386974947, + "loss": 1.7458, + "step": 1705 + }, + { + "epoch": 2.34, + "learning_rate": 0.00011483483403405659, + "loss": 1.6685, + "step": 1710 + }, + { + "epoch": 2.35, + "learning_rate": 0.0001125579793164797, + "loss": 1.733, + "step": 1715 + }, + { + "epoch": 2.36, + "learning_rate": 0.00011030105685156039, + "loss": 1.7281, + "step": 1720 + }, + { + "epoch": 2.36, + "learning_rate": 0.00010806418274841024, + "loss": 1.7581, + "step": 1725 + }, + { + "epoch": 2.37, + "learning_rate": 0.00010584747208473738, + "loss": 1.6238, + "step": 1730 + }, + { + "epoch": 2.38, + "learning_rate": 0.00010365103890092636, + "loss": 1.7462, + "step": 1735 + }, + { + "epoch": 2.38, + "learning_rate": 0.000101474996194171, + "loss": 1.7897, + "step": 1740 + }, + { + "epoch": 2.39, + "learning_rate": 9.931945591266172e-05, + "loss": 1.8603, + "step": 1745 + }, + { + "epoch": 2.4, + "learning_rate": 9.718452894982571e-05, + "loss": 1.8379, + "step": 1750 + }, + { + "epoch": 2.4, + "learning_rate": 9.507032513862195e-05, + "loss": 1.7378, + "step": 1755 + }, + { + "epoch": 2.41, + "learning_rate": 9.297695324589106e-05, + "loss": 1.6022, + "step": 1760 + }, + { + "epoch": 2.42, + "learning_rate": 9.090452096675993e-05, + "loss": 1.7144, + "step": 1765 + }, + { + "epoch": 2.42, + "learning_rate": 8.885313491910052e-05, + "loss": 1.6529, + "step": 1770 + }, + { + "epoch": 2.43, + "learning_rate": 8.682290063804527e-05, + "loss": 1.7523, + "step": 1775 + }, + { + "epoch": 2.44, + "learning_rate": 8.48139225705578e-05, + "loss": 1.6469, + "step": 1780 + }, + { + "epoch": 2.45, + "learning_rate": 8.28263040700598e-05, + "loss": 1.7107, + "step": 1785 + }, + { + "epoch": 2.45, + "learning_rate": 8.086014739111297e-05, + "loss": 1.5931, + "step": 1790 + }, + { + "epoch": 2.46, + "learning_rate": 7.891555368415947e-05, + "loss": 1.7049, + "step": 1795 + }, + { + "epoch": 2.47, + "learning_rate": 7.699262299031778e-05, + "loss": 1.6175, + "step": 1800 + }, + { + "epoch": 2.47, + "eval_loss": 1.9944177865982056, + "eval_runtime": 8.8381, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 1800 + }, + { + "epoch": 2.47, + "learning_rate": 7.509145423623608e-05, + "loss": 1.5914, + "step": 1805 + }, + { + "epoch": 2.48, + "learning_rate": 7.321214522900271e-05, + "loss": 1.5903, + "step": 1810 + }, + { + "epoch": 2.49, + "learning_rate": 7.13547926511145e-05, + "loss": 1.5715, + "step": 1815 + }, + { + "epoch": 2.49, + "learning_rate": 6.951949205550284e-05, + "loss": 1.5976, + "step": 1820 + }, + { + "epoch": 2.5, + "learning_rate": 6.770633786061819e-05, + "loss": 1.7028, + "step": 1825 + }, + { + "epoch": 2.51, + "learning_rate": 6.591542334557222e-05, + "loss": 1.7763, + "step": 1830 + }, + { + "epoch": 2.51, + "learning_rate": 6.41468406453391e-05, + "loss": 1.5076, + "step": 1835 + }, + { + "epoch": 2.52, + "learning_rate": 6.240068074601568e-05, + "loss": 1.665, + "step": 1840 + }, + { + "epoch": 2.53, + "learning_rate": 6.067703348014086e-05, + "loss": 1.7332, + "step": 1845 + }, + { + "epoch": 2.53, + "learning_rate": 5.897598752207328e-05, + "loss": 1.7152, + "step": 1850 + }, + { + "epoch": 2.54, + "learning_rate": 5.729763038343022e-05, + "loss": 1.7113, + "step": 1855 + }, + { + "epoch": 2.55, + "learning_rate": 5.564204840858511e-05, + "loss": 1.8046, + "step": 1860 + }, + { + "epoch": 2.55, + "learning_rate": 5.40093267702258e-05, + "loss": 1.652, + "step": 1865 + }, + { + "epoch": 2.56, + "learning_rate": 5.239954946497227e-05, + "loss": 1.7188, + "step": 1870 + }, + { + "epoch": 2.57, + "learning_rate": 5.0812799309055746e-05, + "loss": 1.7157, + "step": 1875 + }, + { + "epoch": 2.58, + "learning_rate": 4.9249157934057985e-05, + "loss": 1.7516, + "step": 1880 + }, + { + "epoch": 2.58, + "learning_rate": 4.770870578271197e-05, + "loss": 1.7406, + "step": 1885 + }, + { + "epoch": 2.59, + "learning_rate": 4.619152210476296e-05, + "loss": 1.6949, + "step": 1890 + }, + { + "epoch": 2.6, + "learning_rate": 4.469768495289189e-05, + "loss": 1.6593, + "step": 1895 + }, + { + "epoch": 2.6, + "learning_rate": 4.322727117869951e-05, + "loss": 1.762, + "step": 1900 + }, + { + "epoch": 2.6, + "eval_loss": 1.991329312324524, + "eval_runtime": 8.8389, + "eval_samples_per_second": 13.35, + "eval_steps_per_second": 1.697, + "step": 1900 + }, + { + "epoch": 2.61, + "learning_rate": 4.178035642875322e-05, + "loss": 1.8229, + "step": 1905 + }, + { + "epoch": 2.62, + "learning_rate": 4.0357015140694843e-05, + "loss": 1.7235, + "step": 1910 + }, + { + "epoch": 2.62, + "learning_rate": 3.89573205394112e-05, + "loss": 1.6724, + "step": 1915 + }, + { + "epoch": 2.63, + "learning_rate": 3.758134463326729e-05, + "loss": 1.576, + "step": 1920 + }, + { + "epoch": 2.64, + "learning_rate": 3.622915821040174e-05, + "loss": 1.6975, + "step": 1925 + }, + { + "epoch": 2.64, + "learning_rate": 3.4900830835084604e-05, + "loss": 1.8066, + "step": 1930 + }, + { + "epoch": 2.65, + "learning_rate": 3.3596430844139216e-05, + "loss": 1.6242, + "step": 1935 + }, + { + "epoch": 2.66, + "learning_rate": 3.231602534342587e-05, + "loss": 1.6191, + "step": 1940 + }, + { + "epoch": 2.66, + "learning_rate": 3.105968020439026e-05, + "loss": 1.7211, + "step": 1945 + }, + { + "epoch": 2.67, + "learning_rate": 2.9827460060673938e-05, + "loss": 1.7227, + "step": 1950 + }, + { + "epoch": 2.68, + "learning_rate": 2.8619428304789697e-05, + "loss": 1.7096, + "step": 1955 + }, + { + "epoch": 2.68, + "learning_rate": 2.743564708485996e-05, + "loss": 1.7527, + "step": 1960 + }, + { + "epoch": 2.69, + "learning_rate": 2.6276177301419955e-05, + "loss": 1.6941, + "step": 1965 + }, + { + "epoch": 2.7, + "learning_rate": 2.5141078604284108e-05, + "loss": 1.6471, + "step": 1970 + }, + { + "epoch": 2.71, + "learning_rate": 2.4030409389477757e-05, + "loss": 1.7648, + "step": 1975 + }, + { + "epoch": 2.71, + "learning_rate": 2.2944226796232537e-05, + "loss": 1.5269, + "step": 1980 + }, + { + "epoch": 2.72, + "learning_rate": 2.188258670404719e-05, + "loss": 1.6842, + "step": 1985 + }, + { + "epoch": 2.73, + "learning_rate": 2.0845543729812566e-05, + "loss": 1.8391, + "step": 1990 + }, + { + "epoch": 2.73, + "learning_rate": 1.9833151225001734e-05, + "loss": 1.6511, + "step": 1995 + }, + { + "epoch": 2.74, + "learning_rate": 1.884546127292569e-05, + "loss": 1.6843, + "step": 2000 + }, + { + "epoch": 2.74, + "eval_loss": 1.988287329673767, + "eval_runtime": 8.8383, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 2000 + }, + { + "epoch": 2.75, + "learning_rate": 1.7882524686053393e-05, + "loss": 1.6908, + "step": 2005 + }, + { + "epoch": 2.75, + "learning_rate": 1.6944391003397895e-05, + "loss": 1.7248, + "step": 2010 + }, + { + "epoch": 2.76, + "learning_rate": 1.603110848796785e-05, + "loss": 1.6644, + "step": 2015 + }, + { + "epoch": 2.77, + "learning_rate": 1.5142724124284579e-05, + "loss": 1.674, + "step": 2020 + }, + { + "epoch": 2.77, + "learning_rate": 1.42792836159647e-05, + "loss": 1.7485, + "step": 2025 + }, + { + "epoch": 2.78, + "learning_rate": 1.3440831383369045e-05, + "loss": 1.7741, + "step": 2030 + }, + { + "epoch": 2.79, + "learning_rate": 1.2627410561317387e-05, + "loss": 1.7444, + "step": 2035 + }, + { + "epoch": 2.79, + "learning_rate": 1.1839062996869377e-05, + "loss": 1.7367, + "step": 2040 + }, + { + "epoch": 2.8, + "learning_rate": 1.1075829247171598e-05, + "loss": 1.6514, + "step": 2045 + }, + { + "epoch": 2.81, + "learning_rate": 1.0337748577371186e-05, + "loss": 1.5851, + "step": 2050 + }, + { + "epoch": 2.82, + "learning_rate": 9.624858958595716e-06, + "loss": 1.735, + "step": 2055 + }, + { + "epoch": 2.82, + "learning_rate": 8.937197065999714e-06, + "loss": 1.7278, + "step": 2060 + }, + { + "epoch": 2.83, + "learning_rate": 8.274798276878049e-06, + "loss": 1.4815, + "step": 2065 + }, + { + "epoch": 2.84, + "learning_rate": 7.637696668845728e-06, + "loss": 1.7065, + "step": 2070 + }, + { + "epoch": 2.84, + "learning_rate": 7.0259250180848e-06, + "loss": 1.6512, + "step": 2075 + }, + { + "epoch": 2.85, + "learning_rate": 6.439514797658308e-06, + "loss": 1.6191, + "step": 2080 + }, + { + "epoch": 2.86, + "learning_rate": 5.8784961758908685e-06, + "loss": 1.613, + "step": 2085 + }, + { + "epoch": 2.86, + "learning_rate": 5.342898014816855e-06, + "loss": 1.5417, + "step": 2090 + }, + { + "epoch": 2.87, + "learning_rate": 4.832747868695475e-06, + "loss": 1.6967, + "step": 2095 + }, + { + "epoch": 2.88, + "learning_rate": 4.348071982593293e-06, + "loss": 1.7315, + "step": 2100 + }, + { + "epoch": 2.88, + "eval_loss": 1.98636794090271, + "eval_runtime": 8.8386, + "eval_samples_per_second": 13.351, + "eval_steps_per_second": 1.697, + "step": 2100 + }, + { + "epoch": 2.88, + "learning_rate": 3.888895291033867e-06, + "loss": 1.6421, + "step": 2105 + }, + { + "epoch": 2.89, + "learning_rate": 3.455241416715216e-06, + "loss": 1.7282, + "step": 2110 + }, + { + "epoch": 2.9, + "learning_rate": 3.0471326692942947e-06, + "loss": 1.551, + "step": 2115 + }, + { + "epoch": 2.9, + "learning_rate": 2.664590044239468e-06, + "loss": 1.6687, + "step": 2120 + }, + { + "epoch": 2.91, + "learning_rate": 2.3076332217501496e-06, + "loss": 1.6605, + "step": 2125 + }, + { + "epoch": 2.92, + "learning_rate": 1.976280565744615e-06, + "loss": 1.5612, + "step": 2130 + }, + { + "epoch": 2.92, + "learning_rate": 1.6705491229149216e-06, + "loss": 1.6839, + "step": 2135 + }, + { + "epoch": 2.93, + "learning_rate": 1.3904546218503344e-06, + "loss": 1.615, + "step": 2140 + }, + { + "epoch": 2.94, + "learning_rate": 1.1360114722277493e-06, + "loss": 1.6923, + "step": 2145 + }, + { + "epoch": 2.95, + "learning_rate": 9.072327640706757e-07, + "loss": 1.6513, + "step": 2150 + }, + { + "epoch": 2.95, + "learning_rate": 7.041302670756645e-07, + "loss": 1.6785, + "step": 2155 + }, + { + "epoch": 2.96, + "learning_rate": 5.267144300067916e-07, + "loss": 1.5907, + "step": 2160 + }, + { + "epoch": 2.97, + "learning_rate": 3.749943801582556e-07, + "loss": 1.6464, + "step": 2165 + }, + { + "epoch": 2.97, + "learning_rate": 2.4897792288469667e-07, + "loss": 1.6572, + "step": 2170 + }, + { + "epoch": 2.98, + "learning_rate": 1.4867154119957428e-07, + "loss": 1.6339, + "step": 2175 + }, + { + "epoch": 2.99, + "learning_rate": 7.408039544187783e-08, + "loss": 1.615, + "step": 2180 + }, + { + "epoch": 2.99, + "learning_rate": 2.5208323010450507e-08, + "loss": 1.7196, + "step": 2185 + }, + { + "epoch": 3.0, + "learning_rate": 2.0578381666469526e-09, + "loss": 1.6404, + "step": 2190 + }, + { + "epoch": 3.0, + "step": 2190, + "total_flos": 6.467055265331282e+17, + "train_loss": 1.969855450712927, + "train_runtime": 7239.9426, + "train_samples_per_second": 4.84, + "train_steps_per_second": 0.302 + } + ], + "logging_steps": 5, + "max_steps": 2190, + "num_train_epochs": 3, + "save_steps": 100, + "total_flos": 6.467055265331282e+17, + "trial_name": null, + "trial_params": null +} diff --git a/SFT/training_args.bin b/SFT/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..51a2ab270395295bb9db3fd20d266b643e4157e5 --- /dev/null +++ b/SFT/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed3f466c279726e7beb9d2076fbc17a0b973f5a0cab1836fdae001b646086a5 +size 3417 diff --git a/Untitled.ipynb b/Untitled.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..226b5dbd5394a190ab6320aed284aa2db6c8971e --- /dev/null +++ b/Untitled.ipynb @@ -0,0 +1,326 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "35767163-4bf3-4784-841e-6d4ea9b30c26", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bf1612ab9e194571b6dbb6054c2b7eea", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Upload 249 LFS files: 0%| | 0/249 [00:00