{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 1.728904366493225,
      "learning_rate": 5e-05,
      "loss": 0.7312,
      "step": 1
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.7350375652313232,
      "learning_rate": 0.0001,
      "loss": 0.7843,
      "step": 2
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.4805001020431519,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.6772,
      "step": 3
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.0534298419952393,
      "learning_rate": 0.0002,
      "loss": 0.6122,
      "step": 4
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.8123345375061035,
      "learning_rate": 0.00019990212265199738,
      "loss": 0.5329,
      "step": 5
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.6635419130325317,
      "learning_rate": 0.00019960868220749448,
      "loss": 0.4878,
      "step": 6
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.6072973608970642,
      "learning_rate": 0.00019912025308994148,
      "loss": 0.4836,
      "step": 7
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.5697150826454163,
      "learning_rate": 0.00019843779142227256,
      "loss": 0.5162,
      "step": 8
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.5166112780570984,
      "learning_rate": 0.0001975626331552507,
      "loss": 0.4825,
      "step": 9
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.5054742097854614,
      "learning_rate": 0.00019649649145228102,
      "loss": 0.4564,
      "step": 10
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.4537505805492401,
      "learning_rate": 0.00019524145333581317,
      "loss": 0.4383,
      "step": 11
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.4296068251132965,
      "learning_rate": 0.00019379997560189675,
      "loss": 0.4529,
      "step": 12
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.44367527961730957,
      "learning_rate": 0.00019217488001088784,
      "loss": 0.4545,
      "step": 13
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.4168623685836792,
      "learning_rate": 0.0001903693477637204,
      "loss": 0.418,
      "step": 14
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.4231944680213928,
      "learning_rate": 0.0001883869132745561,
      "loss": 0.435,
      "step": 15
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.4075939953327179,
      "learning_rate": 0.00018623145725200278,
      "loss": 0.4274,
      "step": 16
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.36284372210502625,
      "learning_rate": 0.00018390719910244487,
      "loss": 0.3972,
      "step": 17
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.3902932405471802,
      "learning_rate": 0.00018141868867035745,
      "loss": 0.3953,
      "step": 18
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.38837161660194397,
      "learning_rate": 0.00017877079733177184,
      "loss": 0.4294,
      "step": 19
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.38330700993537903,
      "learning_rate": 0.0001759687084583285,
      "loss": 0.4155,
      "step": 20
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.3897230625152588,
      "learning_rate": 0.00017301790727058345,
      "loss": 0.3791,
      "step": 21
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.40314897894859314,
      "learning_rate": 0.00016992417010043142,
      "loss": 0.4018,
      "step": 22
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.4068446755409241,
      "learning_rate": 0.0001666935530836651,
      "loss": 0.3801,
      "step": 23
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.40863823890686035,
      "learning_rate": 0.0001633323803048047,
      "loss": 0.3879,
      "step": 24
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.3958357274532318,
      "learning_rate": 0.00015984723141740576,
      "loss": 0.3929,
      "step": 25
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.3210630416870117,
      "learning_rate": 0.0001562449287640781,
      "loss": 0.2722,
      "step": 26
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.34723371267318726,
      "learning_rate": 0.00015253252402142988,
      "loss": 0.2701,
      "step": 27
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.3267146050930023,
      "learning_rate": 0.00014871728439607966,
      "loss": 0.2654,
      "step": 28
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.3217560350894928,
      "learning_rate": 0.00014480667839875786,
      "loss": 0.2653,
      "step": 29
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.3129405975341797,
      "learning_rate": 0.0001408083612243465,
      "loss": 0.2495,
      "step": 30
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.3169604241847992,
      "learning_rate": 0.00013673015976647568,
      "loss": 0.2783,
      "step": 31
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.3302832543849945,
      "learning_rate": 0.00013258005729601177,
      "loss": 0.2589,
      "step": 32
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.3463418781757355,
      "learning_rate": 0.0001283661778334297,
      "loss": 0.2453,
      "step": 33
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 0.3463260531425476,
      "learning_rate": 0.00012409677024566144,
      "loss": 0.242,
      "step": 34
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.3702252209186554,
      "learning_rate": 0.00011978019209855174,
      "loss": 0.264,
      "step": 35
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.3509206771850586,
      "learning_rate": 0.00011542489329653024,
      "loss": 0.2593,
      "step": 36
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.3612159490585327,
      "learning_rate": 0.000111039399541527,
      "loss": 0.2411,
      "step": 37
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.3651520609855652,
      "learning_rate": 0.00010663229564351041,
      "loss": 0.2378,
      "step": 38
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.3665476441383362,
      "learning_rate": 0.00010221220871531869,
      "loss": 0.2334,
      "step": 39
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.34961438179016113,
      "learning_rate": 9.778779128468132e-05,
      "loss": 0.2307,
      "step": 40
    },
    {
      "epoch": 1.6400000000000001,
      "grad_norm": 0.379111111164093,
      "learning_rate": 9.336770435648964e-05,
      "loss": 0.2212,
      "step": 41
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 0.38593631982803345,
      "learning_rate": 8.896060045847304e-05,
      "loss": 0.2335,
      "step": 42
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.37961545586586,
      "learning_rate": 8.457510670346976e-05,
      "loss": 0.2306,
      "step": 43
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.3735259771347046,
      "learning_rate": 8.021980790144827e-05,
      "loss": 0.2499,
      "step": 44
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.40170496702194214,
      "learning_rate": 7.590322975433857e-05,
      "loss": 0.2275,
      "step": 45
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 0.3875046372413635,
      "learning_rate": 7.163382216657034e-05,
      "loss": 0.218,
      "step": 46
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.37028905749320984,
      "learning_rate": 6.741994270398826e-05,
      "loss": 0.2422,
      "step": 47
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.3737669289112091,
      "learning_rate": 6.326984023352435e-05,
      "loss": 0.2195,
      "step": 48
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.3924426734447479,
      "learning_rate": 5.91916387756535e-05,
      "loss": 0.2235,
      "step": 49
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.36921918392181396,
      "learning_rate": 5.5193321601242156e-05,
      "loss": 0.2141,
      "step": 50
    },
    {
      "epoch": 2.04,
      "grad_norm": 0.28800830245018005,
      "learning_rate": 5.1282715603920374e-05,
      "loss": 0.1736,
      "step": 51
    },
    {
      "epoch": 2.08,
      "grad_norm": 0.31100502610206604,
      "learning_rate": 4.746747597857014e-05,
      "loss": 0.1578,
      "step": 52
    },
    {
      "epoch": 2.12,
      "grad_norm": 0.3049222528934479,
      "learning_rate": 4.375507123592194e-05,
      "loss": 0.1771,
      "step": 53
    },
    {
      "epoch": 2.16,
      "grad_norm": 0.28219300508499146,
      "learning_rate": 4.015276858259427e-05,
      "loss": 0.1476,
      "step": 54
    },
    {
      "epoch": 2.2,
      "grad_norm": 0.3022613525390625,
      "learning_rate": 3.6667619695195285e-05,
      "loss": 0.1779,
      "step": 55
    },
    {
      "epoch": 2.24,
      "grad_norm": 0.2823966443538666,
      "learning_rate": 3.330644691633492e-05,
      "loss": 0.1501,
      "step": 56
    },
    {
      "epoch": 2.2800000000000002,
      "grad_norm": 0.28174689412117004,
      "learning_rate": 3.0075829899568597e-05,
      "loss": 0.1511,
      "step": 57
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.2776714861392975,
      "learning_rate": 2.6982092729416587e-05,
      "loss": 0.1568,
      "step": 58
    },
    {
      "epoch": 2.36,
      "grad_norm": 0.2745690643787384,
      "learning_rate": 2.403129154167153e-05,
      "loss": 0.1393,
      "step": 59
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.2862659692764282,
      "learning_rate": 2.1229202668228197e-05,
      "loss": 0.1568,
      "step": 60
    },
    {
      "epoch": 2.44,
      "grad_norm": 0.30168795585632324,
      "learning_rate": 1.858131132964259e-05,
      "loss": 0.164,
      "step": 61
    },
    {
      "epoch": 2.48,
      "grad_norm": 0.30739548802375793,
      "learning_rate": 1.609280089755515e-05,
      "loss": 0.1595,
      "step": 62
    },
    {
      "epoch": 2.52,
      "grad_norm": 0.2983320355415344,
      "learning_rate": 1.3768542747997215e-05,
      "loss": 0.174,
      "step": 63
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.30526575446128845,
      "learning_rate": 1.161308672544389e-05,
      "loss": 0.168,
      "step": 64
    },
    {
      "epoch": 2.6,
      "grad_norm": 0.28905755281448364,
      "learning_rate": 9.630652236279625e-06,
      "loss": 0.1557,
      "step": 65
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.29685401916503906,
      "learning_rate": 7.825119989112173e-06,
      "loss": 0.1531,
      "step": 66
    },
    {
      "epoch": 2.68,
      "grad_norm": 0.309733510017395,
      "learning_rate": 6.200024398103255e-06,
      "loss": 0.1538,
      "step": 67
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 0.31325626373291016,
      "learning_rate": 4.758546664186869e-06,
      "loss": 0.16,
      "step": 68
    },
    {
      "epoch": 2.76,
      "grad_norm": 0.29155057668685913,
      "learning_rate": 3.5035085477190143e-06,
      "loss": 0.1612,
      "step": 69
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.31043997406959534,
      "learning_rate": 2.4373668447493224e-06,
      "loss": 0.1655,
      "step": 70
    },
    {
      "epoch": 2.84,
      "grad_norm": 0.30450770258903503,
      "learning_rate": 1.562208577727442e-06,
      "loss": 0.1581,
      "step": 71
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.2816106677055359,
      "learning_rate": 8.797469100585431e-07,
      "loss": 0.1369,
      "step": 72
    },
    {
      "epoch": 2.92,
      "grad_norm": 0.3166625201702118,
      "learning_rate": 3.913177925055189e-07,
      "loss": 0.1724,
      "step": 73
    },
    {
      "epoch": 2.96,
      "grad_norm": 0.30547070503234863,
      "learning_rate": 9.78773480026396e-08,
      "loss": 0.1555,
      "step": 74
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.29624301195144653,
      "learning_rate": 0.0,
      "loss": 0.1598,
      "step": 75
    }
  ],
  "logging_steps": 1,
  "max_steps": 75,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.106714169779814e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}