{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1e-06, "loss": 3.3352, "perplexity": 28.083999462050915, "step": 1 }, { "epoch": 0.01, "eval_accuracy": 0.23046520883543017, "eval_loss": 3.173828125, "eval_perplexity": 23.898797049169467, "eval_runtime": 8.0774, "eval_samples_per_second": 45.064, "eval_steps_per_second": 5.695, "step": 1 }, { "epoch": 0.03, "learning_rate": 1e-06, "loss": 3.3091, "perplexity": 27.360489946627773, "step": 2 }, { "epoch": 0.03, "eval_accuracy": 0.23046520883543017, "eval_loss": 3.173828125, "eval_perplexity": 23.898797049169467, "eval_runtime": 7.9509, "eval_samples_per_second": 45.781, "eval_steps_per_second": 5.785, "step": 2 }, { "epoch": 0.04, "learning_rate": 1e-06, "loss": 3.3347, "perplexity": 28.06996097223482, "step": 3 }, { "epoch": 0.04, "eval_accuracy": 0.23046520883543017, "eval_loss": 3.173828125, "eval_perplexity": 23.898797049169467, "eval_runtime": 7.9602, "eval_samples_per_second": 45.728, "eval_steps_per_second": 5.779, "step": 3 }, { "epoch": 0.05, "learning_rate": 1e-06, "loss": 3.1445, "perplexity": 23.20806853752346, "step": 4 }, { "epoch": 0.05, "eval_accuracy": 0.23053154089371394, "eval_loss": 3.173828125, "eval_perplexity": 23.898797049169467, "eval_runtime": 7.9267, "eval_samples_per_second": 45.921, "eval_steps_per_second": 5.803, "step": 4 }, { "epoch": 0.07, "learning_rate": 1e-06, "loss": 2.8918, "perplexity": 18.025726725492028, "step": 5 }, { "epoch": 0.07, "eval_accuracy": 0.23053154089371394, "eval_loss": 3.173828125, "eval_perplexity": 23.898797049169467, "eval_runtime": 7.9947, "eval_samples_per_second": 45.53, "eval_steps_per_second": 5.754, "step": 5 }, { "epoch": 0.08, "learning_rate": 1e-06, "loss": 3.2068, "perplexity": 24.699919882371823, "step": 6 }, { "epoch": 0.08, "eval_accuracy": 0.23053154089371394, "eval_loss": 3.173828125, "eval_perplexity": 23.898797049169467, "eval_runtime": 7.9696, "eval_samples_per_second": 45.674, "eval_steps_per_second": 5.772, "step": 6 }, { "epoch": 0.09, "learning_rate": 1e-06, "loss": 3.6245, "perplexity": 37.50596548782992, "step": 7 }, { "epoch": 0.09, "eval_accuracy": 0.23047073650695382, "eval_loss": 3.171875, "eval_perplexity": 23.852165264858517, "eval_runtime": 7.9787, "eval_samples_per_second": 45.622, "eval_steps_per_second": 5.765, "step": 7 }, { "epoch": 0.11, "learning_rate": 1e-06, "loss": 3.2256, "perplexity": 25.168670828860865, "step": 8 }, { "epoch": 0.11, "eval_accuracy": 0.230514957879143, "eval_loss": 3.171875, "eval_perplexity": 23.852165264858517, "eval_runtime": 7.9349, "eval_samples_per_second": 45.873, "eval_steps_per_second": 5.797, "step": 8 }, { "epoch": 0.12, "learning_rate": 1e-06, "loss": 2.9991, "perplexity": 20.067468072159407, "step": 9 }, { "epoch": 0.12, "eval_accuracy": 0.23050943020761935, "eval_loss": 3.169921875, "eval_perplexity": 23.80562446936611, "eval_runtime": 7.9581, "eval_samples_per_second": 45.74, "eval_steps_per_second": 5.78, "step": 9 }, { "epoch": 0.13, "learning_rate": 1e-06, "loss": 3.3257, "perplexity": 27.818464754063648, "step": 10 }, { "epoch": 0.13, "eval_accuracy": 0.23057023459437948, "eval_loss": 3.16796875, "eval_perplexity": 23.75917448515314, "eval_runtime": 8.0497, "eval_samples_per_second": 45.219, "eval_steps_per_second": 5.715, "step": 10 }, { "epoch": 0.15, "learning_rate": 1e-06, "loss": 3.1199, "perplexity": 22.6441151184392, "step": 11 }, { "epoch": 0.15, "eval_accuracy": 0.23057023459437948, "eval_loss": 3.166015625, "eval_perplexity": 23.71281513502692, "eval_runtime": 7.9865, "eval_samples_per_second": 45.577, "eval_steps_per_second": 5.76, "step": 11 }, { "epoch": 0.16, "learning_rate": 1e-06, "loss": 3.3735, "perplexity": 29.180480216449844, "step": 12 }, { "epoch": 0.16, "eval_accuracy": 0.23062551130961595, "eval_loss": 3.166015625, "eval_perplexity": 23.71281513502692, "eval_runtime": 7.9024, "eval_samples_per_second": 46.062, "eval_steps_per_second": 5.821, "step": 12 }, { "epoch": 0.17, "learning_rate": 1e-06, "loss": 3.0051, "perplexity": 20.188234818531463, "step": 13 }, { "epoch": 0.17, "eval_accuracy": 0.2306531496672342, "eval_loss": 3.1640625, "eval_perplexity": 23.666546242140512, "eval_runtime": 7.9745, "eval_samples_per_second": 45.646, "eval_steps_per_second": 5.768, "step": 13 }, { "epoch": 0.19, "learning_rate": 1e-06, "loss": 3.2695, "perplexity": 26.298186961963587, "step": 14 }, { "epoch": 0.19, "eval_accuracy": 0.23081345214141996, "eval_loss": 3.162109375, "eval_perplexity": 23.620367629992042, "eval_runtime": 8.0031, "eval_samples_per_second": 45.483, "eval_steps_per_second": 5.748, "step": 14 }, { "epoch": 0.2, "learning_rate": 1e-06, "loss": 3.2004, "perplexity": 24.54234517205232, "step": 15 }, { "epoch": 0.2, "eval_accuracy": 0.23086872885665644, "eval_loss": 3.16015625, "eval_perplexity": 23.574279122424027, "eval_runtime": 7.96, "eval_samples_per_second": 45.729, "eval_steps_per_second": 5.779, "step": 15 }, { "epoch": 0.21, "learning_rate": 1e-06, "loss": 3.2075, "perplexity": 24.71721587918212, "step": 16 }, { "epoch": 0.21, "eval_accuracy": 0.2308300351559909, "eval_loss": 3.158203125, "eval_perplexity": 23.52828054362271, "eval_runtime": 7.9635, "eval_samples_per_second": 45.708, "eval_steps_per_second": 5.776, "step": 16 }, { "epoch": 0.23, "learning_rate": 1e-06, "loss": 3.321, "perplexity": 27.688024743861764, "step": 17 }, { "epoch": 0.23, "eval_accuracy": 0.2308300351559909, "eval_loss": 3.15625, "eval_perplexity": 23.482371718117374, "eval_runtime": 7.9658, "eval_samples_per_second": 45.695, "eval_steps_per_second": 5.775, "step": 17 }, { "epoch": 0.24, "learning_rate": 1e-06, "loss": 3.4026, "perplexity": 30.04210807401033, "step": 18 }, { "epoch": 0.24, "eval_accuracy": 0.23086872885665644, "eval_loss": 3.154296875, "eval_perplexity": 23.4365524707797, "eval_runtime": 7.934, "eval_samples_per_second": 45.878, "eval_steps_per_second": 5.798, "step": 18 }, { "epoch": 0.25, "learning_rate": 1e-06, "loss": 3.0383, "perplexity": 20.869734512558935, "step": 19 }, { "epoch": 0.25, "eval_accuracy": 0.23087425652818008, "eval_loss": 3.15234375, "eval_perplexity": 23.390822626823073, "eval_runtime": 7.6988, "eval_samples_per_second": 47.28, "eval_steps_per_second": 5.975, "step": 19 }, { "epoch": 0.27, "learning_rate": 1e-06, "loss": 3.166, "perplexity": 23.71244462518505, "step": 20 }, { "epoch": 0.27, "eval_accuracy": 0.23091847790036926, "eval_loss": 3.150390625, "eval_perplexity": 23.345182011801924, "eval_runtime": 7.9469, "eval_samples_per_second": 45.804, "eval_steps_per_second": 5.788, "step": 20 }, { "epoch": 0.28, "learning_rate": 1e-06, "loss": 3.144, "perplexity": 23.196467403779828, "step": 21 }, { "epoch": 0.28, "eval_accuracy": 0.2309626992725584, "eval_loss": 3.1484375, "eval_perplexity": 23.299630451611073, "eval_runtime": 7.957, "eval_samples_per_second": 45.746, "eval_steps_per_second": 5.781, "step": 21 }, { "epoch": 0.29, "learning_rate": 1e-06, "loss": 3.1624, "perplexity": 23.627233296953413, "step": 22 }, { "epoch": 0.29, "eval_accuracy": 0.23100139297322395, "eval_loss": 3.1484375, "eval_perplexity": 23.299630451611073, "eval_runtime": 8.0068, "eval_samples_per_second": 45.461, "eval_steps_per_second": 5.745, "step": 22 }, { "epoch": 0.31, "learning_rate": 1e-06, "loss": 3.0332, "perplexity": 20.763569816631378, "step": 23 }, { "epoch": 0.31, "eval_accuracy": 0.23103455900236583, "eval_loss": 3.146484375, "eval_perplexity": 23.25416777248505, "eval_runtime": 7.9578, "eval_samples_per_second": 45.741, "eval_steps_per_second": 5.78, "step": 23 }, { "epoch": 0.32, "learning_rate": 1e-06, "loss": 3.3745, "perplexity": 29.209675291771028, "step": 24 }, { "epoch": 0.32, "eval_accuracy": 0.23106772503150771, "eval_loss": 3.14453125, "eval_perplexity": 23.20879380099744, "eval_runtime": 7.9682, "eval_samples_per_second": 45.682, "eval_steps_per_second": 5.773, "step": 24 }, { "epoch": 0.33, "learning_rate": 1e-06, "loss": 3.0823, "perplexity": 21.808504316830465, "step": 25 }, { "epoch": 0.33, "eval_accuracy": 0.23117827846198066, "eval_loss": 3.142578125, "eval_perplexity": 23.16350836406023, "eval_runtime": 7.916, "eval_samples_per_second": 45.983, "eval_steps_per_second": 5.811, "step": 25 }, { "epoch": 0.35, "learning_rate": 1e-06, "loss": 3.6021, "perplexity": 36.67517149163571, "step": 26 }, { "epoch": 0.35, "eval_accuracy": 0.2312280275056935, "eval_loss": 3.140625, "eval_perplexity": 23.118311288923124, "eval_runtime": 7.9551, "eval_samples_per_second": 45.757, "eval_steps_per_second": 5.782, "step": 26 }, { "epoch": 0.36, "learning_rate": 1e-06, "loss": 3.1125, "perplexity": 22.477167135936607, "step": 27 }, { "epoch": 0.36, "eval_accuracy": 0.23127777654940632, "eval_loss": 3.138671875, "eval_perplexity": 23.073202403172917, "eval_runtime": 7.9108, "eval_samples_per_second": 46.013, "eval_steps_per_second": 5.815, "step": 27 }, { "epoch": 0.37, "learning_rate": 1e-06, "loss": 3.1406, "perplexity": 23.117733338365316, "step": 28 }, { "epoch": 0.37, "eval_accuracy": 0.23136621929378468, "eval_loss": 3.138671875, "eval_perplexity": 23.073202403172917, "eval_runtime": 7.9649, "eval_samples_per_second": 45.7, "eval_steps_per_second": 5.775, "step": 28 }, { "epoch": 0.39, "learning_rate": 1e-06, "loss": 3.1736, "perplexity": 23.893345757904175, "step": 29 }, { "epoch": 0.39, "eval_accuracy": 0.2314049129944502, "eval_loss": 3.13671875, "eval_perplexity": 23.028181534732802, "eval_runtime": 7.9854, "eval_samples_per_second": 45.583, "eval_steps_per_second": 5.76, "step": 29 }, { "epoch": 0.4, "learning_rate": 1e-06, "loss": 3.1104, "perplexity": 22.43001461242937, "step": 30 }, { "epoch": 0.4, "eval_accuracy": 0.23154863245406504, "eval_loss": 3.134765625, "eval_perplexity": 22.98324851186175, "eval_runtime": 7.9536, "eval_samples_per_second": 45.765, "eval_steps_per_second": 5.784, "step": 30 }, { "epoch": 0.41, "learning_rate": 1e-06, "loss": 3.1301, "perplexity": 22.876267054768768, "step": 31 }, { "epoch": 0.41, "eval_accuracy": 0.23159285382625422, "eval_loss": 3.1328125, "eval_perplexity": 22.938403163153815, "eval_runtime": 7.904, "eval_samples_per_second": 46.053, "eval_steps_per_second": 5.82, "step": 31 }, { "epoch": 0.43, "learning_rate": 1e-06, "loss": 3.3376, "perplexity": 28.151482007422672, "step": 32 }, { "epoch": 0.43, "eval_accuracy": 0.23154863245406504, "eval_loss": 3.130859375, "eval_perplexity": 22.893645317537526, "eval_runtime": 7.6495, "eval_samples_per_second": 47.585, "eval_steps_per_second": 6.013, "step": 32 }, { "epoch": 0.44, "learning_rate": 1e-06, "loss": 3.218, "perplexity": 24.978113963861347, "step": 33 }, { "epoch": 0.44, "eval_accuracy": 0.23159285382625422, "eval_loss": 3.130859375, "eval_perplexity": 22.893645317537526, "eval_runtime": 7.9297, "eval_samples_per_second": 45.904, "eval_steps_per_second": 5.801, "step": 33 }, { "epoch": 0.45, "learning_rate": 1e-06, "loss": 3.0786, "perplexity": 21.727961946129383, "step": 34 }, { "epoch": 0.45, "eval_accuracy": 0.2316481305414907, "eval_loss": 3.12890625, "eval_perplexity": 22.84897480427519, "eval_runtime": 7.9813, "eval_samples_per_second": 45.607, "eval_steps_per_second": 5.763, "step": 34 }, { "epoch": 0.47, "learning_rate": 1e-06, "loss": 3.0125, "perplexity": 20.33818187604361, "step": 35 }, { "epoch": 0.47, "eval_accuracy": 0.23167024122758528, "eval_loss": 3.126953125, "eval_perplexity": 22.80439145296227, "eval_runtime": 7.9533, "eval_samples_per_second": 45.767, "eval_steps_per_second": 5.784, "step": 35 }, { "epoch": 0.48, "learning_rate": 1e-06, "loss": 3.2634, "perplexity": 26.138256305914563, "step": 36 }, { "epoch": 0.48, "eval_accuracy": 0.2317199902712981, "eval_loss": 3.126953125, "eval_perplexity": 22.80439145296227, "eval_runtime": 6.9427, "eval_samples_per_second": 52.429, "eval_steps_per_second": 6.626, "step": 36 }, { "epoch": 0.49, "learning_rate": 1e-06, "loss": 2.9888, "perplexity": 19.861833984540883, "step": 37 }, { "epoch": 0.49, "eval_accuracy": 0.23177526698653458, "eval_loss": 3.125, "eval_perplexity": 22.75989509352673, "eval_runtime": 7.9347, "eval_samples_per_second": 45.875, "eval_steps_per_second": 5.797, "step": 37 }, { "epoch": 0.51, "learning_rate": 1e-06, "loss": 3.1624, "perplexity": 23.627233296953413, "step": 38 }, { "epoch": 0.51, "eval_accuracy": 0.23179737767262917, "eval_loss": 3.123046875, "eval_perplexity": 22.715485556228362, "eval_runtime": 7.9127, "eval_samples_per_second": 46.002, "eval_steps_per_second": 5.813, "step": 38 }, { "epoch": 0.52, "learning_rate": 1e-06, "loss": 2.9807, "perplexity": 19.7016029410545, "step": 39 }, { "epoch": 0.52, "eval_accuracy": 0.2318581820593893, "eval_loss": 3.12109375, "eval_perplexity": 22.67116267165818, "eval_runtime": 7.9324, "eval_samples_per_second": 45.888, "eval_steps_per_second": 5.799, "step": 39 }, { "epoch": 0.53, "learning_rate": 1e-06, "loss": 3.446, "perplexity": 31.374642406982968, "step": 40 }, { "epoch": 0.53, "eval_accuracy": 0.2319300417891967, "eval_loss": 3.12109375, "eval_perplexity": 22.67116267165818, "eval_runtime": 7.9334, "eval_samples_per_second": 45.882, "eval_steps_per_second": 5.798, "step": 40 }, { "epoch": 0.55, "learning_rate": 1e-06, "loss": 3.1338, "perplexity": 22.96106602422343, "step": 41 }, { "epoch": 0.55, "eval_accuracy": 0.23195768014681495, "eval_loss": 3.119140625, "eval_perplexity": 22.626926270737744, "eval_runtime": 7.926, "eval_samples_per_second": 45.925, "eval_steps_per_second": 5.804, "step": 41 }, { "epoch": 0.56, "learning_rate": 1e-06, "loss": 3.1841, "perplexity": 24.145547631095972, "step": 42 }, { "epoch": 0.56, "eval_accuracy": 0.23199084617595683, "eval_loss": 3.119140625, "eval_perplexity": 22.626926270737744, "eval_runtime": 7.9503, "eval_samples_per_second": 45.785, "eval_steps_per_second": 5.786, "step": 42 }, { "epoch": 0.57, "learning_rate": 1e-06, "loss": 3.1079, "perplexity": 22.374009611318957, "step": 43 }, { "epoch": 0.57, "eval_accuracy": 0.232035067548146, "eval_loss": 3.1171875, "eval_perplexity": 22.582776184718522, "eval_runtime": 7.7126, "eval_samples_per_second": 47.195, "eval_steps_per_second": 5.964, "step": 43 }, { "epoch": 0.59, "learning_rate": 1e-06, "loss": 3.0918, "perplexity": 22.016672340357456, "step": 44 }, { "epoch": 0.59, "eval_accuracy": 0.23214562097861896, "eval_loss": 3.115234375, "eval_perplexity": 22.538712245181248, "eval_runtime": 7.9205, "eval_samples_per_second": 45.956, "eval_steps_per_second": 5.808, "step": 44 }, { "epoch": 0.6, "learning_rate": 1e-06, "loss": 3.0302, "perplexity": 20.701372449879624, "step": 45 }, { "epoch": 0.6, "eval_accuracy": 0.23220089769385543, "eval_loss": 3.115234375, "eval_perplexity": 22.538712245181248, "eval_runtime": 7.9218, "eval_samples_per_second": 45.949, "eval_steps_per_second": 5.807, "step": 45 }, { "epoch": 0.61, "learning_rate": 1e-06, "loss": 3.1123, "perplexity": 22.472672152022792, "step": 46 }, { "epoch": 0.61, "eval_accuracy": 0.23228381276671015, "eval_loss": 3.11328125, "eval_perplexity": 22.494734284035275, "eval_runtime": 7.9089, "eval_samples_per_second": 46.024, "eval_steps_per_second": 5.816, "step": 46 }, { "epoch": 0.63, "learning_rate": 1e-06, "loss": 2.9985, "perplexity": 20.055431202738045, "step": 47 }, { "epoch": 0.63, "eval_accuracy": 0.23235567249651756, "eval_loss": 3.111328125, "eval_perplexity": 22.450842133517945, "eval_runtime": 7.9743, "eval_samples_per_second": 45.646, "eval_steps_per_second": 5.769, "step": 47 }, { "epoch": 0.64, "learning_rate": 1e-06, "loss": 3.3816, "perplexity": 29.417801961716197, "step": 48 }, { "epoch": 0.64, "eval_accuracy": 0.23235567249651756, "eval_loss": 3.111328125, "eval_perplexity": 22.450842133517945, "eval_runtime": 7.9744, "eval_samples_per_second": 45.646, "eval_steps_per_second": 5.768, "step": 48 }, { "epoch": 0.65, "learning_rate": 1e-06, "loss": 3.0813, "perplexity": 21.786706713131952, "step": 49 }, { "epoch": 0.65, "eval_accuracy": 0.23242753222632498, "eval_loss": 3.109375, "eval_perplexity": 22.40703562619394, "eval_runtime": 8.0068, "eval_samples_per_second": 45.461, "eval_steps_per_second": 5.745, "step": 49 }, { "epoch": 0.67, "learning_rate": 1e-06, "loss": 3.2024, "perplexity": 24.591478979826256, "step": 50 }, { "epoch": 0.67, "eval_accuracy": 0.2324662259269905, "eval_loss": 3.109375, "eval_perplexity": 22.40703562619394, "eval_runtime": 7.9621, "eval_samples_per_second": 45.717, "eval_steps_per_second": 5.777, "step": 50 }, { "epoch": 0.68, "learning_rate": 1e-06, "loss": 3.0178, "perplexity": 20.446260395068368, "step": 51 }, { "epoch": 0.68, "eval_accuracy": 0.23248280894156145, "eval_loss": 3.107421875, "eval_perplexity": 22.36331459495464, "eval_runtime": 7.9553, "eval_samples_per_second": 45.756, "eval_steps_per_second": 5.782, "step": 51 }, { "epoch": 0.69, "learning_rate": 1e-06, "loss": 3.1646, "perplexity": 23.67927043006483, "step": 52 }, { "epoch": 0.69, "eval_accuracy": 0.23263758374422358, "eval_loss": 3.107421875, "eval_perplexity": 22.36331459495464, "eval_runtime": 7.9632, "eval_samples_per_second": 45.71, "eval_steps_per_second": 5.777, "step": 52 }, { "epoch": 0.71, "learning_rate": 1e-06, "loss": 3.0046, "perplexity": 20.17814322423101, "step": 53 }, { "epoch": 0.71, "eval_accuracy": 0.23268180511641276, "eval_loss": 3.10546875, "eval_perplexity": 22.319678873017494, "eval_runtime": 7.9716, "eval_samples_per_second": 45.662, "eval_steps_per_second": 5.77, "step": 53 }, { "epoch": 0.72, "learning_rate": 1e-06, "loss": 3.0266, "perplexity": 20.626981493124443, "step": 54 }, { "epoch": 0.72, "eval_accuracy": 0.23273708183164923, "eval_loss": 3.10546875, "eval_perplexity": 22.319678873017494, "eval_runtime": 7.9222, "eval_samples_per_second": 45.947, "eval_steps_per_second": 5.806, "step": 54 }, { "epoch": 0.73, "learning_rate": 1e-06, "loss": 3.3857, "perplexity": 29.538662544648755, "step": 55 }, { "epoch": 0.73, "eval_accuracy": 0.23274813717469653, "eval_loss": 3.103515625, "eval_perplexity": 22.27612829392538, "eval_runtime": 8.0121, "eval_samples_per_second": 45.431, "eval_steps_per_second": 5.741, "step": 55 }, { "epoch": 0.75, "learning_rate": 1e-06, "loss": 3.064, "perplexity": 21.413038238853925, "step": 56 }, { "epoch": 0.75, "eval_accuracy": 0.23277577553231477, "eval_loss": 3.103515625, "eval_perplexity": 22.27612829392538, "eval_runtime": 7.9323, "eval_samples_per_second": 45.888, "eval_steps_per_second": 5.799, "step": 56 }, { "epoch": 0.76, "learning_rate": 1e-06, "loss": 3.176, "perplexity": 23.950758655642247, "step": 57 }, { "epoch": 0.76, "eval_accuracy": 0.2328144692329803, "eval_loss": 3.1015625, "eval_perplexity": 22.232662691545976, "eval_runtime": 7.9184, "eval_samples_per_second": 45.969, "eval_steps_per_second": 5.809, "step": 57 }, { "epoch": 0.77, "learning_rate": 1e-06, "loss": 3.1851, "perplexity": 24.169705255526146, "step": 58 }, { "epoch": 0.77, "eval_accuracy": 0.23286974594821677, "eval_loss": 3.1015625, "eval_perplexity": 22.232662691545976, "eval_runtime": 7.9669, "eval_samples_per_second": 45.689, "eval_steps_per_second": 5.774, "step": 58 }, { "epoch": 0.79, "learning_rate": 1e-06, "loss": 3.0811, "perplexity": 21.782349807494416, "step": 59 }, { "epoch": 0.79, "eval_accuracy": 0.232897384305835, "eval_loss": 3.099609375, "eval_perplexity": 22.189281900071105, "eval_runtime": 7.9339, "eval_samples_per_second": 45.879, "eval_steps_per_second": 5.798, "step": 59 }, { "epoch": 0.8, "learning_rate": 1e-06, "loss": 3.0205, "perplexity": 20.501539891873456, "step": 60 }, { "epoch": 0.8, "eval_accuracy": 0.23296924403564243, "eval_loss": 3.099609375, "eval_perplexity": 22.189281900071105, "eval_runtime": 7.9929, "eval_samples_per_second": 45.54, "eval_steps_per_second": 5.755, "step": 60 }, { "epoch": 0.81, "learning_rate": 1e-06, "loss": 3.26, "perplexity": 26.049537142518336, "step": 61 }, { "epoch": 0.81, "eval_accuracy": 0.23296371636411878, "eval_loss": 3.09765625, "eval_perplexity": 22.145985754016134, "eval_runtime": 7.9124, "eval_samples_per_second": 46.004, "eval_steps_per_second": 5.814, "step": 61 }, { "epoch": 0.83, "learning_rate": 1e-06, "loss": 3.2922, "perplexity": 26.90198296333493, "step": 62 }, { "epoch": 0.83, "eval_accuracy": 0.23305768678002078, "eval_loss": 3.09765625, "eval_perplexity": 22.145985754016134, "eval_runtime": 7.9262, "eval_samples_per_second": 45.923, "eval_steps_per_second": 5.804, "step": 62 }, { "epoch": 0.84, "learning_rate": 1e-06, "loss": 3.5349, "perplexity": 34.29158538422678, "step": 63 }, { "epoch": 0.84, "eval_accuracy": 0.2331295465098282, "eval_loss": 3.095703125, "eval_perplexity": 22.10277408821932, "eval_runtime": 7.9843, "eval_samples_per_second": 45.589, "eval_steps_per_second": 5.761, "step": 63 }, { "epoch": 0.85, "learning_rate": 1e-06, "loss": 3.3525, "perplexity": 28.57407962319162, "step": 64 }, { "epoch": 0.85, "eval_accuracy": 0.23312401883830455, "eval_loss": 3.095703125, "eval_perplexity": 22.10277408821932, "eval_runtime": 7.9231, "eval_samples_per_second": 45.942, "eval_steps_per_second": 5.806, "step": 64 }, { "epoch": 0.87, "learning_rate": 1e-06, "loss": 3.135, "perplexity": 22.988635842034803, "step": 65 }, { "epoch": 0.87, "eval_accuracy": 0.23312401883830455, "eval_loss": 3.09375, "eval_perplexity": 22.059646737841184, "eval_runtime": 7.9384, "eval_samples_per_second": 45.853, "eval_steps_per_second": 5.795, "step": 65 }, { "epoch": 0.88, "learning_rate": 1e-06, "loss": 3.1707, "perplexity": 23.824155429673073, "step": 66 }, { "epoch": 0.88, "eval_accuracy": 0.23318482322506467, "eval_loss": 3.09375, "eval_perplexity": 22.059646737841184, "eval_runtime": 7.9764, "eval_samples_per_second": 45.635, "eval_steps_per_second": 5.767, "step": 66 }, { "epoch": 0.89, "learning_rate": 1e-06, "loss": 3.0127, "perplexity": 20.342249919209575, "step": 67 }, { "epoch": 0.89, "eval_accuracy": 0.23324009994030115, "eval_loss": 3.091796875, "eval_perplexity": 22.016603538363892, "eval_runtime": 7.9476, "eval_samples_per_second": 45.8, "eval_steps_per_second": 5.788, "step": 67 }, { "epoch": 0.91, "learning_rate": 1e-06, "loss": 3.0952, "perplexity": 22.091656427027353, "step": 68 }, { "epoch": 0.91, "eval_accuracy": 0.23322904459725385, "eval_loss": 3.091796875, "eval_perplexity": 22.016603538363892, "eval_runtime": 8.0177, "eval_samples_per_second": 45.4, "eval_steps_per_second": 5.737, "step": 68 }, { "epoch": 0.92, "learning_rate": 1e-06, "loss": 3.1023, "perplexity": 22.24906532800973, "step": 69 }, { "epoch": 0.92, "eval_accuracy": 0.23338934707143963, "eval_loss": 3.08984375, "eval_perplexity": 21.973644325590612, "eval_runtime": 7.7593, "eval_samples_per_second": 46.911, "eval_steps_per_second": 5.928, "step": 69 }, { "epoch": 0.93, "learning_rate": 1e-06, "loss": 3.3821, "perplexity": 29.43251454053524, "step": 70 }, { "epoch": 0.93, "eval_accuracy": 0.23339487474296328, "eval_loss": 3.08984375, "eval_perplexity": 21.973644325590612, "eval_runtime": 7.9307, "eval_samples_per_second": 45.898, "eval_steps_per_second": 5.8, "step": 70 }, { "epoch": 0.95, "learning_rate": 1e-06, "loss": 3.1118, "perplexity": 22.461438624562685, "step": 71 }, { "epoch": 0.95, "eval_accuracy": 0.23339487474296328, "eval_loss": 3.087890625, "eval_perplexity": 21.930768935644906, "eval_runtime": 8.0017, "eval_samples_per_second": 45.49, "eval_steps_per_second": 5.749, "step": 71 }, { "epoch": 0.96, "learning_rate": 1e-06, "loss": 3.1143, "perplexity": 22.5176624716497, "step": 72 }, { "epoch": 0.96, "eval_accuracy": 0.2334667344727707, "eval_loss": 3.087890625, "eval_perplexity": 21.930768935644906, "eval_runtime": 7.9822, "eval_samples_per_second": 45.602, "eval_steps_per_second": 5.763, "step": 72 }, { "epoch": 0.97, "learning_rate": 1e-06, "loss": 3.1118, "perplexity": 22.461438624562685, "step": 73 }, { "epoch": 0.97, "eval_accuracy": 0.23352201118800717, "eval_loss": 3.087890625, "eval_perplexity": 21.930768935644906, "eval_runtime": 6.928, "eval_samples_per_second": 52.54, "eval_steps_per_second": 6.64, "step": 73 }, { "epoch": 0.99, "learning_rate": 1e-06, "loss": 3.0596, "perplexity": 21.319027845139193, "step": 74 }, { "epoch": 0.99, "eval_accuracy": 0.2335828155747673, "eval_loss": 3.0859375, "eval_perplexity": 21.887977204970085, "eval_runtime": 7.9803, "eval_samples_per_second": 45.612, "eval_steps_per_second": 5.764, "step": 74 }, { "epoch": 1.0, "learning_rate": 1e-06, "loss": 3.1033, "perplexity": 22.271325521579506, "step": 75 }, { "epoch": 1.0, "eval_accuracy": 0.23359387091781458, "eval_loss": 3.0859375, "eval_perplexity": 21.887977204970085, "eval_runtime": 7.9699, "eval_samples_per_second": 45.672, "eval_steps_per_second": 5.772, "step": 75 }, { "epoch": 1.0, "step": 75, "total_flos": 352569360384.0, "train_loss": 3.1779069010416667, "train_runtime": 632.8224, "train_samples_per_second": 0.947, "train_steps_per_second": 0.119 } ], "max_steps": 75, "num_train_epochs": 1, "total_flos": 352569360384.0, "trial_name": null, "trial_params": null }