|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 6232, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016046213093709884, |
|
"grad_norm": 0.17895939946174622, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.4769, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03209242618741977, |
|
"grad_norm": 0.15942341089248657, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.4803, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04813863928112965, |
|
"grad_norm": 0.24720104038715363, |
|
"learning_rate": 6e-06, |
|
"loss": 2.438, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.06418485237483953, |
|
"grad_norm": 0.39716702699661255, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.4056, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08023106546854943, |
|
"grad_norm": 0.3956620395183563, |
|
"learning_rate": 1e-05, |
|
"loss": 2.3633, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0962772785622593, |
|
"grad_norm": 0.5021713972091675, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.3186, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.1123234916559692, |
|
"grad_norm": 0.567215085029602, |
|
"learning_rate": 1.4e-05, |
|
"loss": 2.2991, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.12836970474967907, |
|
"grad_norm": 0.6055766344070435, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.2618, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.14441591784338895, |
|
"grad_norm": 0.5610828995704651, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.2398, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.16046213093709885, |
|
"grad_norm": 0.6880731582641602, |
|
"learning_rate": 2e-05, |
|
"loss": 2.2216, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.17650834403080873, |
|
"grad_norm": 0.6766607761383057, |
|
"learning_rate": 1.9981977966686475e-05, |
|
"loss": 2.1763, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1925545571245186, |
|
"grad_norm": 0.8101206421852112, |
|
"learning_rate": 1.992797682548284e-05, |
|
"loss": 2.1818, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2086007702182285, |
|
"grad_norm": 0.8453436493873596, |
|
"learning_rate": 1.983819121846225e-05, |
|
"loss": 2.1666, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2246469833119384, |
|
"grad_norm": 0.8145064115524292, |
|
"learning_rate": 1.9712944769464864e-05, |
|
"loss": 2.1279, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.24069319640564826, |
|
"grad_norm": 0.8634337782859802, |
|
"learning_rate": 1.9552688917625927e-05, |
|
"loss": 2.1279, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.25673940949935814, |
|
"grad_norm": 1.1356147527694702, |
|
"learning_rate": 1.9358001290205542e-05, |
|
"loss": 2.1238, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.27278562259306804, |
|
"grad_norm": 0.7424245476722717, |
|
"learning_rate": 1.9129583620585137e-05, |
|
"loss": 2.1072, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.2888318356867779, |
|
"grad_norm": 1.1842221021652222, |
|
"learning_rate": 1.886825921893497e-05, |
|
"loss": 2.0933, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3048780487804878, |
|
"grad_norm": 0.9710640907287598, |
|
"learning_rate": 1.8574970004669464e-05, |
|
"loss": 2.0651, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3209242618741977, |
|
"grad_norm": 1.1665947437286377, |
|
"learning_rate": 1.8250773111386633e-05, |
|
"loss": 2.0682, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.33697047496790755, |
|
"grad_norm": 1.1069215536117554, |
|
"learning_rate": 1.7896837076528647e-05, |
|
"loss": 2.089, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.35301668806161746, |
|
"grad_norm": 1.1600021123886108, |
|
"learning_rate": 1.751443762949772e-05, |
|
"loss": 2.0485, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.36906290115532736, |
|
"grad_norm": 1.4247888326644897, |
|
"learning_rate": 1.7104953093408548e-05, |
|
"loss": 2.0807, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.3851091142490372, |
|
"grad_norm": 1.3733174800872803, |
|
"learning_rate": 1.666985941705128e-05, |
|
"loss": 2.0503, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4011553273427471, |
|
"grad_norm": 1.2357269525527954, |
|
"learning_rate": 1.6210724854971885e-05, |
|
"loss": 2.059, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.417201540436457, |
|
"grad_norm": 1.0586000680923462, |
|
"learning_rate": 1.5729204314845002e-05, |
|
"loss": 2.0246, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.43324775353016687, |
|
"grad_norm": 1.1663570404052734, |
|
"learning_rate": 1.5227033392513684e-05, |
|
"loss": 2.0421, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.4492939666238768, |
|
"grad_norm": 1.1528408527374268, |
|
"learning_rate": 1.4706022116196208e-05, |
|
"loss": 2.0494, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.4653401797175867, |
|
"grad_norm": 1.0500417947769165, |
|
"learning_rate": 1.4168048422408272e-05, |
|
"loss": 2.008, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.4813863928112965, |
|
"grad_norm": 1.2136881351470947, |
|
"learning_rate": 1.3615051387116131e-05, |
|
"loss": 2.026, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.49743260590500643, |
|
"grad_norm": 1.1245627403259277, |
|
"learning_rate": 1.3049024236518244e-05, |
|
"loss": 1.9962, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5134788189987163, |
|
"grad_norm": 1.0942673683166504, |
|
"learning_rate": 1.24720071626475e-05, |
|
"loss": 2.0102, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5295250320924262, |
|
"grad_norm": 1.087973952293396, |
|
"learning_rate": 1.1886079969689454e-05, |
|
"loss": 1.9989, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.5455712451861361, |
|
"grad_norm": 1.1616392135620117, |
|
"learning_rate": 1.1293354577522264e-05, |
|
"loss": 2.0078, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.561617458279846, |
|
"grad_norm": 1.0014444589614868, |
|
"learning_rate": 1.0695967409498614e-05, |
|
"loss": 1.9948, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5776636713735558, |
|
"grad_norm": 1.045386552810669, |
|
"learning_rate": 1.0096071691907137e-05, |
|
"loss": 2.0031, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.5937098844672657, |
|
"grad_norm": 1.7991660833358765, |
|
"learning_rate": 9.495829692869255e-06, |
|
"loss": 2.0029, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6097560975609756, |
|
"grad_norm": 1.1647707223892212, |
|
"learning_rate": 8.897404928645529e-06, |
|
"loss": 1.9819, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6258023106546855, |
|
"grad_norm": 1.0043553113937378, |
|
"learning_rate": 8.302954365443264e-06, |
|
"loss": 2.0047, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.6418485237483954, |
|
"grad_norm": 1.0303337574005127, |
|
"learning_rate": 7.71462064483311e-06, |
|
"loss": 1.986, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6578947368421053, |
|
"grad_norm": 1.3229223489761353, |
|
"learning_rate": 7.13452436079753e-06, |
|
"loss": 2.0151, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.6739409499358151, |
|
"grad_norm": 1.4595571756362915, |
|
"learning_rate": 6.564756416247712e-06, |
|
"loss": 1.9585, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.689987163029525, |
|
"grad_norm": 1.1201053857803345, |
|
"learning_rate": 6.007370486559185e-06, |
|
"loss": 1.9846, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.7060333761232349, |
|
"grad_norm": 1.0485244989395142, |
|
"learning_rate": 5.46437561729062e-06, |
|
"loss": 1.9853, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7220795892169448, |
|
"grad_norm": 1.0388679504394531, |
|
"learning_rate": 4.937728982766622e-06, |
|
"loss": 2.0268, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7381258023106547, |
|
"grad_norm": 1.3947858810424805, |
|
"learning_rate": 4.429328831625565e-06, |
|
"loss": 1.9645, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.7541720154043645, |
|
"grad_norm": 1.3646990060806274, |
|
"learning_rate": 3.941007644759535e-06, |
|
"loss": 1.9704, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.7702182284980744, |
|
"grad_norm": 1.2408106327056885, |
|
"learning_rate": 3.474525530308016e-06, |
|
"loss": 2.0122, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.7862644415917843, |
|
"grad_norm": 1.3304516077041626, |
|
"learning_rate": 3.0315638795123726e-06, |
|
"loss": 1.9989, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8023106546854942, |
|
"grad_norm": 1.200531005859375, |
|
"learning_rate": 2.6137193062980506e-06, |
|
"loss": 2.0135, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8183568677792041, |
|
"grad_norm": 1.3504362106323242, |
|
"learning_rate": 2.2224978924287243e-06, |
|
"loss": 2.0073, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.834403080872914, |
|
"grad_norm": 1.4387364387512207, |
|
"learning_rate": 1.8593097589751318e-06, |
|
"loss": 1.9344, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.8504492939666238, |
|
"grad_norm": 1.4003574848175049, |
|
"learning_rate": 1.5254639836653117e-06, |
|
"loss": 2.0001, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.8664955070603337, |
|
"grad_norm": 1.2966829538345337, |
|
"learning_rate": 1.222163882436107e-06, |
|
"loss": 1.9852, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.8825417201540436, |
|
"grad_norm": 1.3298251628875732, |
|
"learning_rate": 9.505026721931898e-07, |
|
"loss": 1.9768, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.8985879332477535, |
|
"grad_norm": 1.1031887531280518, |
|
"learning_rate": 7.114595304127536e-07, |
|
"loss": 1.9921, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.9146341463414634, |
|
"grad_norm": 1.3485647439956665, |
|
"learning_rate": 5.077834417277494e-07, |
|
"loss": 1.9674, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.9306803594351734, |
|
"grad_norm": 1.056166172027588, |
|
"learning_rate": 3.3609511784804427e-07, |
|
"loss": 2.0047, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.9467265725288831, |
|
"grad_norm": 1.2811501026153564, |
|
"learning_rate": 1.9923943711331996e-07, |
|
"loss": 1.9755, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.962772785622593, |
|
"grad_norm": 1.4452365636825562, |
|
"learning_rate": 9.770968305104822e-08, |
|
"loss": 1.9793, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.978818998716303, |
|
"grad_norm": 1.8246029615402222, |
|
"learning_rate": 3.18718101832316e-08, |
|
"loss": 2.0008, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.9948652118100129, |
|
"grad_norm": 1.3338468074798584, |
|
"learning_rate": 1.963124977494291e-09, |
|
"loss": 1.9654, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 6232, |
|
"total_flos": 1.1351431053312e+17, |
|
"train_loss": 2.079003376893422, |
|
"train_runtime": 1659.922, |
|
"train_samples_per_second": 7.509, |
|
"train_steps_per_second": 3.754 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 6232, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1351431053312e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|