diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.16778523489932887, + "epoch": 0.18875838926174496, "eval_steps": 199, - "global_step": 1700, + "global_step": 2600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -13679,14 +13679,7259 @@ "loss": 2.2722, "num_input_tokens_seen": 3565158400, "step": 1700 + }, + { + "epoch": 0.0, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.2666, + "num_input_tokens_seen": 3567255552, + "step": 1701 + }, + { + "epoch": 0.0, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.2513, + "num_input_tokens_seen": 3569352704, + "step": 1702 + }, + { + "epoch": 0.0, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.261, + "num_input_tokens_seen": 3571449856, + "step": 1703 + }, + { + "epoch": 0.0, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.3282, + "num_input_tokens_seen": 3573547008, + "step": 1704 + }, + { + "epoch": 0.0, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.267, + "num_input_tokens_seen": 3575644160, + "step": 1705 + }, + { + "epoch": 0.0, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.2673, + "num_input_tokens_seen": 3577741312, + "step": 1706 + }, + { + "epoch": 0.0, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.3082, + "num_input_tokens_seen": 3579838464, + "step": 1707 + }, + { + "epoch": 0.0, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2932, + "num_input_tokens_seen": 3581935616, + "step": 1708 + }, + { + "epoch": 0.0, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.3123, + "num_input_tokens_seen": 3584032768, + "step": 1709 + }, + { + "epoch": 0.0, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.2871, + "num_input_tokens_seen": 3586129920, + "step": 1710 + }, + { + "epoch": 0.0, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.268, + "num_input_tokens_seen": 3588227072, + "step": 1711 + }, + { + "epoch": 0.0, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2696, + "num_input_tokens_seen": 3590324224, + "step": 1712 + }, + { + "epoch": 0.0, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.2469, + "num_input_tokens_seen": 3592421376, + "step": 1713 + }, + { + "epoch": 0.0, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.2936, + "num_input_tokens_seen": 3594518528, + "step": 1714 + }, + { + "epoch": 0.0, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2657, + "num_input_tokens_seen": 3596615680, + "step": 1715 + }, + { + "epoch": 0.0, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.3095, + "num_input_tokens_seen": 3598712832, + "step": 1716 + }, + { + "epoch": 0.0, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.2402, + "num_input_tokens_seen": 3600809984, + "step": 1717 + }, + { + "epoch": 0.0, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2438, + "num_input_tokens_seen": 3602907136, + "step": 1718 + }, + { + "epoch": 0.0, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2802, + "num_input_tokens_seen": 3605004288, + "step": 1719 + }, + { + "epoch": 0.0, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2693, + "num_input_tokens_seen": 3607101440, + "step": 1720 + }, + { + "epoch": 0.0, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2613, + "num_input_tokens_seen": 3609198592, + "step": 1721 + }, + { + "epoch": 0.0, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.2496, + "num_input_tokens_seen": 3611295744, + "step": 1722 + }, + { + "epoch": 0.0, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2641, + "num_input_tokens_seen": 3613392896, + "step": 1723 + }, + { + "epoch": 0.01, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2621, + "num_input_tokens_seen": 3615490048, + "step": 1724 + }, + { + "epoch": 0.01, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.2827, + "num_input_tokens_seen": 3617587200, + "step": 1725 + }, + { + "epoch": 0.01, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.2275, + "num_input_tokens_seen": 3619684352, + "step": 1726 + }, + { + "epoch": 0.01, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.2833, + "num_input_tokens_seen": 3621781504, + "step": 1727 + }, + { + "epoch": 0.01, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2839, + "num_input_tokens_seen": 3623878656, + "step": 1728 + }, + { + "epoch": 0.01, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.2622, + "num_input_tokens_seen": 3625975808, + "step": 1729 + }, + { + "epoch": 0.01, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 2.3042, + "num_input_tokens_seen": 3628072960, + "step": 1730 + }, + { + "epoch": 0.01, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2832, + "num_input_tokens_seen": 3630170112, + "step": 1731 + }, + { + "epoch": 0.01, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.2657, + "num_input_tokens_seen": 3632267264, + "step": 1732 + }, + { + "epoch": 0.01, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.2854, + "num_input_tokens_seen": 3634364416, + "step": 1733 + }, + { + "epoch": 0.01, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2651, + "num_input_tokens_seen": 3636461568, + "step": 1734 + }, + { + "epoch": 0.01, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2769, + "num_input_tokens_seen": 3638558720, + "step": 1735 + }, + { + "epoch": 0.01, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2749, + "num_input_tokens_seen": 3640655872, + "step": 1736 + }, + { + "epoch": 0.01, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.261, + "num_input_tokens_seen": 3642753024, + "step": 1737 + }, + { + "epoch": 0.01, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2924, + "num_input_tokens_seen": 3644850176, + "step": 1738 + }, + { + "epoch": 0.01, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 2.254, + "num_input_tokens_seen": 3646947328, + "step": 1739 + }, + { + "epoch": 0.01, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.2607, + "num_input_tokens_seen": 3649044480, + "step": 1740 + }, + { + "epoch": 0.01, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.2747, + "num_input_tokens_seen": 3651141632, + "step": 1741 + }, + { + "epoch": 0.01, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.2796, + "num_input_tokens_seen": 3653238784, + "step": 1742 + }, + { + "epoch": 0.01, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2827, + "num_input_tokens_seen": 3655335936, + "step": 1743 + }, + { + "epoch": 0.01, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.2542, + "num_input_tokens_seen": 3657433088, + "step": 1744 + }, + { + "epoch": 0.01, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.2911, + "num_input_tokens_seen": 3659530240, + "step": 1745 + }, + { + "epoch": 0.01, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.2536, + "num_input_tokens_seen": 3661627392, + "step": 1746 + }, + { + "epoch": 0.01, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.2906, + "num_input_tokens_seen": 3663724544, + "step": 1747 + }, + { + "epoch": 0.01, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.2732, + "num_input_tokens_seen": 3665821696, + "step": 1748 + }, + { + "epoch": 0.01, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.2547, + "num_input_tokens_seen": 3667918848, + "step": 1749 + }, + { + "epoch": 0.01, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2798, + "num_input_tokens_seen": 3670016000, + "step": 1750 + }, + { + "epoch": 0.01, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2336, + "num_input_tokens_seen": 3672113152, + "step": 1751 + }, + { + "epoch": 0.01, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.2673, + "num_input_tokens_seen": 3674210304, + "step": 1752 + }, + { + "epoch": 0.01, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.2288, + "num_input_tokens_seen": 3676307456, + "step": 1753 + }, + { + "epoch": 0.01, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.2763, + "num_input_tokens_seen": 3678404608, + "step": 1754 + }, + { + "epoch": 0.01, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.2709, + "num_input_tokens_seen": 3680501760, + "step": 1755 + }, + { + "epoch": 0.01, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.2476, + "num_input_tokens_seen": 3682598912, + "step": 1756 + }, + { + "epoch": 0.01, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.2921, + "num_input_tokens_seen": 3684696064, + "step": 1757 + }, + { + "epoch": 0.01, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.256, + "num_input_tokens_seen": 3686793216, + "step": 1758 + }, + { + "epoch": 0.01, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2576, + "num_input_tokens_seen": 3688890368, + "step": 1759 + }, + { + "epoch": 0.01, + "grad_norm": 1.1875, + "learning_rate": 2e-05, + "loss": 2.2661, + "num_input_tokens_seen": 3690987520, + "step": 1760 + }, + { + "epoch": 0.01, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.274, + "num_input_tokens_seen": 3693084672, + "step": 1761 + }, + { + "epoch": 0.01, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.2661, + "num_input_tokens_seen": 3695181824, + "step": 1762 + }, + { + "epoch": 0.01, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.272, + "num_input_tokens_seen": 3697278976, + "step": 1763 + }, + { + "epoch": 0.01, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2918, + "num_input_tokens_seen": 3699376128, + "step": 1764 + }, + { + "epoch": 0.01, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2393, + "num_input_tokens_seen": 3701473280, + "step": 1765 + }, + { + "epoch": 0.01, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2422, + "num_input_tokens_seen": 3703570432, + "step": 1766 + }, + { + "epoch": 0.01, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.25, + "num_input_tokens_seen": 3705667584, + "step": 1767 + }, + { + "epoch": 0.01, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.2633, + "num_input_tokens_seen": 3707764736, + "step": 1768 + }, + { + "epoch": 0.01, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.3031, + "num_input_tokens_seen": 3709861888, + "step": 1769 + }, + { + "epoch": 0.01, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.2718, + "num_input_tokens_seen": 3711959040, + "step": 1770 + }, + { + "epoch": 0.01, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2791, + "num_input_tokens_seen": 3714056192, + "step": 1771 + }, + { + "epoch": 0.02, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.2702, + "num_input_tokens_seen": 3716153344, + "step": 1772 + }, + { + "epoch": 0.02, + "grad_norm": 0.91796875, + "learning_rate": 2e-05, + "loss": 2.2664, + "num_input_tokens_seen": 3718250496, + "step": 1773 + }, + { + "epoch": 0.02, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.2805, + "num_input_tokens_seen": 3720347648, + "step": 1774 + }, + { + "epoch": 0.02, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2621, + "num_input_tokens_seen": 3722444800, + "step": 1775 + }, + { + "epoch": 0.02, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2668, + "num_input_tokens_seen": 3724541952, + "step": 1776 + }, + { + "epoch": 0.02, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2762, + "num_input_tokens_seen": 3726639104, + "step": 1777 + }, + { + "epoch": 0.02, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.2618, + "num_input_tokens_seen": 3728736256, + "step": 1778 + }, + { + "epoch": 0.02, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2736, + "num_input_tokens_seen": 3730833408, + "step": 1779 + }, + { + "epoch": 0.02, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.2861, + "num_input_tokens_seen": 3732930560, + "step": 1780 + }, + { + "epoch": 0.02, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2288, + "num_input_tokens_seen": 3735027712, + "step": 1781 + }, + { + "epoch": 0.02, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.261, + "num_input_tokens_seen": 3737124864, + "step": 1782 + }, + { + "epoch": 0.02, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2725, + "num_input_tokens_seen": 3739222016, + "step": 1783 + }, + { + "epoch": 0.02, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.2813, + "num_input_tokens_seen": 3741319168, + "step": 1784 + }, + { + "epoch": 0.02, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2767, + "num_input_tokens_seen": 3743416320, + "step": 1785 + }, + { + "epoch": 0.02, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.2856, + "num_input_tokens_seen": 3745513472, + "step": 1786 + }, + { + "epoch": 0.02, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.2418, + "num_input_tokens_seen": 3747610624, + "step": 1787 + }, + { + "epoch": 0.02, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.3087, + "num_input_tokens_seen": 3749707776, + "step": 1788 + }, + { + "epoch": 0.02, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.2991, + "num_input_tokens_seen": 3751804928, + "step": 1789 + }, + { + "epoch": 0.02, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.267, + "num_input_tokens_seen": 3753902080, + "step": 1790 + }, + { + "epoch": 0.02, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.278, + "num_input_tokens_seen": 3755999232, + "step": 1791 + }, + { + "epoch": 0.02, + "eval_loss": 2.3124513626098633, + "eval_runtime": 1977.395, + "eval_samples_per_second": 1.994, + "eval_steps_per_second": 0.499, + "num_input_tokens_seen": 3755999232, + "step": 1791 + }, + { + "epoch": 0.02, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.2862, + "num_input_tokens_seen": 3758096384, + "step": 1792 + }, + { + "epoch": 0.02, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.2916, + "num_input_tokens_seen": 3760193536, + "step": 1793 + }, + { + "epoch": 0.02, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2969, + "num_input_tokens_seen": 3762290688, + "step": 1794 + }, + { + "epoch": 0.02, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2444, + "num_input_tokens_seen": 3764387840, + "step": 1795 + }, + { + "epoch": 0.02, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 2.2639, + "num_input_tokens_seen": 3766484992, + "step": 1796 + }, + { + "epoch": 0.02, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2653, + "num_input_tokens_seen": 3768582144, + "step": 1797 + }, + { + "epoch": 0.02, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.2386, + "num_input_tokens_seen": 3770679296, + "step": 1798 + }, + { + "epoch": 0.02, + "grad_norm": 1.4375, + "learning_rate": 2e-05, + "loss": 2.2753, + "num_input_tokens_seen": 3772776448, + "step": 1799 + }, + { + "epoch": 0.02, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.2456, + "num_input_tokens_seen": 3774873600, + "step": 1800 + }, + { + "epoch": 0.02, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2582, + "num_input_tokens_seen": 3776970752, + "step": 1801 + }, + { + "epoch": 0.02, + "grad_norm": 1.78125, + "learning_rate": 2e-05, + "loss": 2.2532, + "num_input_tokens_seen": 3779067904, + "step": 1802 + }, + { + "epoch": 0.02, + "grad_norm": 1.390625, + "learning_rate": 2e-05, + "loss": 2.2598, + "num_input_tokens_seen": 3781165056, + "step": 1803 + }, + { + "epoch": 0.02, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 2.2891, + "num_input_tokens_seen": 3783262208, + "step": 1804 + }, + { + "epoch": 0.02, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.2787, + "num_input_tokens_seen": 3785359360, + "step": 1805 + }, + { + "epoch": 0.02, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.2896, + "num_input_tokens_seen": 3787456512, + "step": 1806 + }, + { + "epoch": 0.02, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.2622, + "num_input_tokens_seen": 3789553664, + "step": 1807 + }, + { + "epoch": 0.02, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.2527, + "num_input_tokens_seen": 3791650816, + "step": 1808 + }, + { + "epoch": 0.02, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.2761, + "num_input_tokens_seen": 3793747968, + "step": 1809 + }, + { + "epoch": 0.02, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2731, + "num_input_tokens_seen": 3795845120, + "step": 1810 + }, + { + "epoch": 0.02, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.2938, + "num_input_tokens_seen": 3797942272, + "step": 1811 + }, + { + "epoch": 0.02, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.278, + "num_input_tokens_seen": 3800039424, + "step": 1812 + }, + { + "epoch": 0.02, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.2834, + "num_input_tokens_seen": 3802136576, + "step": 1813 + }, + { + "epoch": 0.02, + "grad_norm": 1.109375, + "learning_rate": 2e-05, + "loss": 2.2525, + "num_input_tokens_seen": 3804233728, + "step": 1814 + }, + { + "epoch": 0.02, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2818, + "num_input_tokens_seen": 3806330880, + "step": 1815 + }, + { + "epoch": 0.02, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.2897, + "num_input_tokens_seen": 3808428032, + "step": 1816 + }, + { + "epoch": 0.02, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2676, + "num_input_tokens_seen": 3810525184, + "step": 1817 + }, + { + "epoch": 0.02, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.2816, + "num_input_tokens_seen": 3812622336, + "step": 1818 + }, + { + "epoch": 0.02, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.308, + "num_input_tokens_seen": 3814719488, + "step": 1819 + }, + { + "epoch": 0.03, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2763, + "num_input_tokens_seen": 3816816640, + "step": 1820 + }, + { + "epoch": 0.03, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.3067, + "num_input_tokens_seen": 3818913792, + "step": 1821 + }, + { + "epoch": 0.03, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.268, + "num_input_tokens_seen": 3821010944, + "step": 1822 + }, + { + "epoch": 0.03, + "grad_norm": 0.74609375, + "learning_rate": 2e-05, + "loss": 2.308, + "num_input_tokens_seen": 3823108096, + "step": 1823 + }, + { + "epoch": 0.03, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.3235, + "num_input_tokens_seen": 3825205248, + "step": 1824 + }, + { + "epoch": 0.03, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.2718, + "num_input_tokens_seen": 3827302400, + "step": 1825 + }, + { + "epoch": 0.03, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.275, + "num_input_tokens_seen": 3829399552, + "step": 1826 + }, + { + "epoch": 0.03, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.2931, + "num_input_tokens_seen": 3831496704, + "step": 1827 + }, + { + "epoch": 0.03, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.2712, + "num_input_tokens_seen": 3833593856, + "step": 1828 + }, + { + "epoch": 0.03, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2832, + "num_input_tokens_seen": 3835691008, + "step": 1829 + }, + { + "epoch": 0.03, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.2866, + "num_input_tokens_seen": 3837788160, + "step": 1830 + }, + { + "epoch": 0.03, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.2607, + "num_input_tokens_seen": 3839885312, + "step": 1831 + }, + { + "epoch": 0.03, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.283, + "num_input_tokens_seen": 3841982464, + "step": 1832 + }, + { + "epoch": 0.03, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.317, + "num_input_tokens_seen": 3844079616, + "step": 1833 + }, + { + "epoch": 0.03, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.3115, + "num_input_tokens_seen": 3846176768, + "step": 1834 + }, + { + "epoch": 0.03, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.2787, + "num_input_tokens_seen": 3848273920, + "step": 1835 + }, + { + "epoch": 0.03, + "grad_norm": 1.625, + "learning_rate": 2e-05, + "loss": 2.2913, + "num_input_tokens_seen": 3850371072, + "step": 1836 + }, + { + "epoch": 0.03, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.281, + "num_input_tokens_seen": 3852468224, + "step": 1837 + }, + { + "epoch": 0.03, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 2.2691, + "num_input_tokens_seen": 3854565376, + "step": 1838 + }, + { + "epoch": 0.03, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 2.2422, + "num_input_tokens_seen": 3856662528, + "step": 1839 + }, + { + "epoch": 0.03, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.2639, + "num_input_tokens_seen": 3858759680, + "step": 1840 + }, + { + "epoch": 0.03, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.2793, + "num_input_tokens_seen": 3860856832, + "step": 1841 + }, + { + "epoch": 0.03, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.2604, + "num_input_tokens_seen": 3862953984, + "step": 1842 + }, + { + "epoch": 0.03, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 2.2825, + "num_input_tokens_seen": 3865051136, + "step": 1843 + }, + { + "epoch": 0.03, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.2733, + "num_input_tokens_seen": 3867148288, + "step": 1844 + }, + { + "epoch": 0.03, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.2563, + "num_input_tokens_seen": 3869245440, + "step": 1845 + }, + { + "epoch": 0.03, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.2643, + "num_input_tokens_seen": 3871342592, + "step": 1846 + }, + { + "epoch": 0.03, + "grad_norm": 1.5625, + "learning_rate": 2e-05, + "loss": 2.2678, + "num_input_tokens_seen": 3873439744, + "step": 1847 + }, + { + "epoch": 0.03, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2887, + "num_input_tokens_seen": 3875536896, + "step": 1848 + }, + { + "epoch": 0.03, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 2.2596, + "num_input_tokens_seen": 3877634048, + "step": 1849 + }, + { + "epoch": 0.03, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 2.2691, + "num_input_tokens_seen": 3879731200, + "step": 1850 + }, + { + "epoch": 0.03, + "grad_norm": 1.5390625, + "learning_rate": 2e-05, + "loss": 2.2658, + "num_input_tokens_seen": 3881828352, + "step": 1851 + }, + { + "epoch": 0.03, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.279, + "num_input_tokens_seen": 3883925504, + "step": 1852 + }, + { + "epoch": 0.03, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2648, + "num_input_tokens_seen": 3886022656, + "step": 1853 + }, + { + "epoch": 0.03, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.2621, + "num_input_tokens_seen": 3888119808, + "step": 1854 + }, + { + "epoch": 0.03, + "grad_norm": 1.4140625, + "learning_rate": 2e-05, + "loss": 2.2786, + "num_input_tokens_seen": 3890216960, + "step": 1855 + }, + { + "epoch": 0.03, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 2.2639, + "num_input_tokens_seen": 3892314112, + "step": 1856 + }, + { + "epoch": 0.03, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2535, + "num_input_tokens_seen": 3894411264, + "step": 1857 + }, + { + "epoch": 0.03, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 2.2753, + "num_input_tokens_seen": 3896508416, + "step": 1858 + }, + { + "epoch": 0.03, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.2683, + "num_input_tokens_seen": 3898605568, + "step": 1859 + }, + { + "epoch": 0.03, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2422, + "num_input_tokens_seen": 3900702720, + "step": 1860 + }, + { + "epoch": 0.03, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.2575, + "num_input_tokens_seen": 3902799872, + "step": 1861 + }, + { + "epoch": 0.03, + "grad_norm": 1.0, + "learning_rate": 2e-05, + "loss": 2.2694, + "num_input_tokens_seen": 3904897024, + "step": 1862 + }, + { + "epoch": 0.03, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.2493, + "num_input_tokens_seen": 3906994176, + "step": 1863 + }, + { + "epoch": 0.03, + "grad_norm": 1.1171875, + "learning_rate": 2e-05, + "loss": 2.2533, + "num_input_tokens_seen": 3909091328, + "step": 1864 + }, + { + "epoch": 0.03, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.2826, + "num_input_tokens_seen": 3911188480, + "step": 1865 + }, + { + "epoch": 0.03, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.2654, + "num_input_tokens_seen": 3913285632, + "step": 1866 + }, + { + "epoch": 0.04, + "grad_norm": 0.95703125, + "learning_rate": 2e-05, + "loss": 2.2886, + "num_input_tokens_seen": 3915382784, + "step": 1867 + }, + { + "epoch": 0.04, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.2768, + "num_input_tokens_seen": 3917479936, + "step": 1868 + }, + { + "epoch": 0.04, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2558, + "num_input_tokens_seen": 3919577088, + "step": 1869 + }, + { + "epoch": 0.04, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2596, + "num_input_tokens_seen": 3921674240, + "step": 1870 + }, + { + "epoch": 0.04, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2925, + "num_input_tokens_seen": 3923771392, + "step": 1871 + }, + { + "epoch": 0.04, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.257, + "num_input_tokens_seen": 3925868544, + "step": 1872 + }, + { + "epoch": 0.04, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.2747, + "num_input_tokens_seen": 3927965696, + "step": 1873 + }, + { + "epoch": 0.04, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2832, + "num_input_tokens_seen": 3930062848, + "step": 1874 + }, + { + "epoch": 0.04, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.2889, + "num_input_tokens_seen": 3932160000, + "step": 1875 + }, + { + "epoch": 0.04, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.2738, + "num_input_tokens_seen": 3934257152, + "step": 1876 + }, + { + "epoch": 0.04, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.2712, + "num_input_tokens_seen": 3936354304, + "step": 1877 + }, + { + "epoch": 0.04, + "grad_norm": 1.1171875, + "learning_rate": 2e-05, + "loss": 2.2532, + "num_input_tokens_seen": 3938451456, + "step": 1878 + }, + { + "epoch": 0.04, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.2394, + "num_input_tokens_seen": 3940548608, + "step": 1879 + }, + { + "epoch": 0.04, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2577, + "num_input_tokens_seen": 3942645760, + "step": 1880 + }, + { + "epoch": 0.04, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2637, + "num_input_tokens_seen": 3944742912, + "step": 1881 + }, + { + "epoch": 0.04, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.2607, + "num_input_tokens_seen": 3946840064, + "step": 1882 + }, + { + "epoch": 0.04, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.2183, + "num_input_tokens_seen": 3948937216, + "step": 1883 + }, + { + "epoch": 0.04, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2656, + "num_input_tokens_seen": 3951034368, + "step": 1884 + }, + { + "epoch": 0.04, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.2803, + "num_input_tokens_seen": 3953131520, + "step": 1885 + }, + { + "epoch": 0.04, + "grad_norm": 1.109375, + "learning_rate": 2e-05, + "loss": 2.2664, + "num_input_tokens_seen": 3955228672, + "step": 1886 + }, + { + "epoch": 0.04, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2447, + "num_input_tokens_seen": 3957325824, + "step": 1887 + }, + { + "epoch": 0.04, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.2562, + "num_input_tokens_seen": 3959422976, + "step": 1888 + }, + { + "epoch": 0.04, + "grad_norm": 1.46875, + "learning_rate": 2e-05, + "loss": 2.2404, + "num_input_tokens_seen": 3961520128, + "step": 1889 + }, + { + "epoch": 0.04, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2782, + "num_input_tokens_seen": 3963617280, + "step": 1890 + }, + { + "epoch": 0.04, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2295, + "num_input_tokens_seen": 3965714432, + "step": 1891 + }, + { + "epoch": 0.04, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.2327, + "num_input_tokens_seen": 3967811584, + "step": 1892 + }, + { + "epoch": 0.04, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.2564, + "num_input_tokens_seen": 3969908736, + "step": 1893 + }, + { + "epoch": 0.04, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.2454, + "num_input_tokens_seen": 3972005888, + "step": 1894 + }, + { + "epoch": 0.04, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2514, + "num_input_tokens_seen": 3974103040, + "step": 1895 + }, + { + "epoch": 0.04, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.2506, + "num_input_tokens_seen": 3976200192, + "step": 1896 + }, + { + "epoch": 0.04, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.2528, + "num_input_tokens_seen": 3978297344, + "step": 1897 + }, + { + "epoch": 0.04, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2347, + "num_input_tokens_seen": 3980394496, + "step": 1898 + }, + { + "epoch": 0.04, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2677, + "num_input_tokens_seen": 3982491648, + "step": 1899 + }, + { + "epoch": 0.04, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.2871, + "num_input_tokens_seen": 3984588800, + "step": 1900 + }, + { + "epoch": 0.04, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.2671, + "num_input_tokens_seen": 3986685952, + "step": 1901 + }, + { + "epoch": 0.04, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2609, + "num_input_tokens_seen": 3988783104, + "step": 1902 + }, + { + "epoch": 0.04, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.25, + "num_input_tokens_seen": 3990880256, + "step": 1903 + }, + { + "epoch": 0.04, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2753, + "num_input_tokens_seen": 3992977408, + "step": 1904 + }, + { + "epoch": 0.04, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2388, + "num_input_tokens_seen": 3995074560, + "step": 1905 + }, + { + "epoch": 0.04, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.225, + "num_input_tokens_seen": 3997171712, + "step": 1906 + }, + { + "epoch": 0.04, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.2758, + "num_input_tokens_seen": 3999268864, + "step": 1907 + }, + { + "epoch": 0.04, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2452, + "num_input_tokens_seen": 4001366016, + "step": 1908 + }, + { + "epoch": 0.04, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2403, + "num_input_tokens_seen": 4003463168, + "step": 1909 + }, + { + "epoch": 0.04, + "grad_norm": 0.91796875, + "learning_rate": 2e-05, + "loss": 2.2516, + "num_input_tokens_seen": 4005560320, + "step": 1910 + }, + { + "epoch": 0.04, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2475, + "num_input_tokens_seen": 4007657472, + "step": 1911 + }, + { + "epoch": 0.04, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2799, + "num_input_tokens_seen": 4009754624, + "step": 1912 + }, + { + "epoch": 0.04, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2409, + "num_input_tokens_seen": 4011851776, + "step": 1913 + }, + { + "epoch": 0.04, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2585, + "num_input_tokens_seen": 4013948928, + "step": 1914 + }, + { + "epoch": 0.05, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2801, + "num_input_tokens_seen": 4016046080, + "step": 1915 + }, + { + "epoch": 0.05, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2397, + "num_input_tokens_seen": 4018143232, + "step": 1916 + }, + { + "epoch": 0.05, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.2618, + "num_input_tokens_seen": 4020240384, + "step": 1917 + }, + { + "epoch": 0.05, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2926, + "num_input_tokens_seen": 4022337536, + "step": 1918 + }, + { + "epoch": 0.05, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2566, + "num_input_tokens_seen": 4024434688, + "step": 1919 + }, + { + "epoch": 0.05, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2638, + "num_input_tokens_seen": 4026531840, + "step": 1920 + }, + { + "epoch": 0.05, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.2595, + "num_input_tokens_seen": 4028628992, + "step": 1921 + }, + { + "epoch": 0.05, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2581, + "num_input_tokens_seen": 4030726144, + "step": 1922 + }, + { + "epoch": 0.05, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.2805, + "num_input_tokens_seen": 4032823296, + "step": 1923 + }, + { + "epoch": 0.05, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.2686, + "num_input_tokens_seen": 4034920448, + "step": 1924 + }, + { + "epoch": 0.05, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.2826, + "num_input_tokens_seen": 4037017600, + "step": 1925 + }, + { + "epoch": 0.05, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.2745, + "num_input_tokens_seen": 4039114752, + "step": 1926 + }, + { + "epoch": 0.05, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.2493, + "num_input_tokens_seen": 4041211904, + "step": 1927 + }, + { + "epoch": 0.05, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2325, + "num_input_tokens_seen": 4043309056, + "step": 1928 + }, + { + "epoch": 0.05, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.2482, + "num_input_tokens_seen": 4045406208, + "step": 1929 + }, + { + "epoch": 0.05, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 2.2656, + "num_input_tokens_seen": 4047503360, + "step": 1930 + }, + { + "epoch": 0.05, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.2079, + "num_input_tokens_seen": 4049600512, + "step": 1931 + }, + { + "epoch": 0.05, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.2537, + "num_input_tokens_seen": 4051697664, + "step": 1932 + }, + { + "epoch": 0.05, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.2515, + "num_input_tokens_seen": 4053794816, + "step": 1933 + }, + { + "epoch": 0.05, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2567, + "num_input_tokens_seen": 4055891968, + "step": 1934 + }, + { + "epoch": 0.05, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.2395, + "num_input_tokens_seen": 4057989120, + "step": 1935 + }, + { + "epoch": 0.05, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2636, + "num_input_tokens_seen": 4060086272, + "step": 1936 + }, + { + "epoch": 0.05, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.2678, + "num_input_tokens_seen": 4062183424, + "step": 1937 + }, + { + "epoch": 0.05, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.25, + "num_input_tokens_seen": 4064280576, + "step": 1938 + }, + { + "epoch": 0.05, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.2339, + "num_input_tokens_seen": 4066377728, + "step": 1939 + }, + { + "epoch": 0.05, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2674, + "num_input_tokens_seen": 4068474880, + "step": 1940 + }, + { + "epoch": 0.05, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 2.2556, + "num_input_tokens_seen": 4070572032, + "step": 1941 + }, + { + "epoch": 0.05, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.2508, + "num_input_tokens_seen": 4072669184, + "step": 1942 + }, + { + "epoch": 0.05, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.2367, + "num_input_tokens_seen": 4074766336, + "step": 1943 + }, + { + "epoch": 0.05, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.2733, + "num_input_tokens_seen": 4076863488, + "step": 1944 + }, + { + "epoch": 0.05, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.2416, + "num_input_tokens_seen": 4078960640, + "step": 1945 + }, + { + "epoch": 0.05, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2408, + "num_input_tokens_seen": 4081057792, + "step": 1946 + }, + { + "epoch": 0.05, + "grad_norm": 1.265625, + "learning_rate": 2e-05, + "loss": 2.2676, + "num_input_tokens_seen": 4083154944, + "step": 1947 + }, + { + "epoch": 0.05, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.2693, + "num_input_tokens_seen": 4085252096, + "step": 1948 + }, + { + "epoch": 0.05, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.2369, + "num_input_tokens_seen": 4087349248, + "step": 1949 + }, + { + "epoch": 0.05, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.2695, + "num_input_tokens_seen": 4089446400, + "step": 1950 + }, + { + "epoch": 0.05, + "grad_norm": 1.3046875, + "learning_rate": 2e-05, + "loss": 2.2472, + "num_input_tokens_seen": 4091543552, + "step": 1951 + }, + { + "epoch": 0.05, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.2432, + "num_input_tokens_seen": 4093640704, + "step": 1952 + }, + { + "epoch": 0.05, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2716, + "num_input_tokens_seen": 4095737856, + "step": 1953 + }, + { + "epoch": 0.05, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2521, + "num_input_tokens_seen": 4097835008, + "step": 1954 + }, + { + "epoch": 0.05, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2508, + "num_input_tokens_seen": 4099932160, + "step": 1955 + }, + { + "epoch": 0.05, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.2603, + "num_input_tokens_seen": 4102029312, + "step": 1956 + }, + { + "epoch": 0.05, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.25, + "num_input_tokens_seen": 4104126464, + "step": 1957 + }, + { + "epoch": 0.05, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.2533, + "num_input_tokens_seen": 4106223616, + "step": 1958 + }, + { + "epoch": 0.05, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.2629, + "num_input_tokens_seen": 4108320768, + "step": 1959 + }, + { + "epoch": 0.05, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.2577, + "num_input_tokens_seen": 4110417920, + "step": 1960 + }, + { + "epoch": 0.05, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2608, + "num_input_tokens_seen": 4112515072, + "step": 1961 + }, + { + "epoch": 0.05, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.2312, + "num_input_tokens_seen": 4114612224, + "step": 1962 + }, + { + "epoch": 0.06, + "grad_norm": 0.7109375, + "learning_rate": 2e-05, + "loss": 2.2245, + "num_input_tokens_seen": 4116709376, + "step": 1963 + }, + { + "epoch": 0.06, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.2604, + "num_input_tokens_seen": 4118806528, + "step": 1964 + }, + { + "epoch": 0.06, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.2348, + "num_input_tokens_seen": 4120903680, + "step": 1965 + }, + { + "epoch": 0.06, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.2435, + "num_input_tokens_seen": 4123000832, + "step": 1966 + }, + { + "epoch": 0.06, + "grad_norm": 0.6875, + "learning_rate": 2e-05, + "loss": 2.2598, + "num_input_tokens_seen": 4125097984, + "step": 1967 + }, + { + "epoch": 0.06, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.2712, + "num_input_tokens_seen": 4127195136, + "step": 1968 + }, + { + "epoch": 0.06, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2479, + "num_input_tokens_seen": 4129292288, + "step": 1969 + }, + { + "epoch": 0.06, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.2934, + "num_input_tokens_seen": 4131389440, + "step": 1970 + }, + { + "epoch": 0.06, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.2353, + "num_input_tokens_seen": 4133486592, + "step": 1971 + }, + { + "epoch": 0.06, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.3041, + "num_input_tokens_seen": 4135583744, + "step": 1972 + }, + { + "epoch": 0.06, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2473, + "num_input_tokens_seen": 4137680896, + "step": 1973 + }, + { + "epoch": 0.06, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2624, + "num_input_tokens_seen": 4139778048, + "step": 1974 + }, + { + "epoch": 0.06, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.2411, + "num_input_tokens_seen": 4141875200, + "step": 1975 + }, + { + "epoch": 0.06, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2376, + "num_input_tokens_seen": 4143972352, + "step": 1976 + }, + { + "epoch": 0.06, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2363, + "num_input_tokens_seen": 4146069504, + "step": 1977 + }, + { + "epoch": 0.06, + "grad_norm": 1.1171875, + "learning_rate": 2e-05, + "loss": 2.261, + "num_input_tokens_seen": 4148166656, + "step": 1978 + }, + { + "epoch": 0.06, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 2.2939, + "num_input_tokens_seen": 4150263808, + "step": 1979 + }, + { + "epoch": 0.06, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2356, + "num_input_tokens_seen": 4152360960, + "step": 1980 + }, + { + "epoch": 0.06, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.2609, + "num_input_tokens_seen": 4154458112, + "step": 1981 + }, + { + "epoch": 0.06, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.2245, + "num_input_tokens_seen": 4156555264, + "step": 1982 + }, + { + "epoch": 0.06, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2427, + "num_input_tokens_seen": 4158652416, + "step": 1983 + }, + { + "epoch": 0.06, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.2955, + "num_input_tokens_seen": 4160749568, + "step": 1984 + }, + { + "epoch": 0.06, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2335, + "num_input_tokens_seen": 4162846720, + "step": 1985 + }, + { + "epoch": 0.06, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2697, + "num_input_tokens_seen": 4164943872, + "step": 1986 + }, + { + "epoch": 0.06, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2625, + "num_input_tokens_seen": 4167041024, + "step": 1987 + }, + { + "epoch": 0.06, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.2812, + "num_input_tokens_seen": 4169138176, + "step": 1988 + }, + { + "epoch": 0.06, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.2425, + "num_input_tokens_seen": 4171235328, + "step": 1989 + }, + { + "epoch": 0.06, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2335, + "num_input_tokens_seen": 4173332480, + "step": 1990 + }, + { + "epoch": 0.06, + "eval_loss": 2.301752805709839, + "eval_runtime": 3367.1268, + "eval_samples_per_second": 1.171, + "eval_steps_per_second": 0.293, + "num_input_tokens_seen": 4173332480, + "step": 1990 + }, + { + "epoch": 0.06, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2671, + "num_input_tokens_seen": 4175429632, + "step": 1991 + }, + { + "epoch": 0.06, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.2731, + "num_input_tokens_seen": 4177526784, + "step": 1992 + }, + { + "epoch": 0.06, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.2155, + "num_input_tokens_seen": 4179623936, + "step": 1993 + }, + { + "epoch": 0.06, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2369, + "num_input_tokens_seen": 4181721088, + "step": 1994 + }, + { + "epoch": 0.06, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2369, + "num_input_tokens_seen": 4183818240, + "step": 1995 + }, + { + "epoch": 0.06, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 2.2395, + "num_input_tokens_seen": 4185915392, + "step": 1996 + }, + { + "epoch": 0.06, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2817, + "num_input_tokens_seen": 4188012544, + "step": 1997 + }, + { + "epoch": 0.06, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2454, + "num_input_tokens_seen": 4190109696, + "step": 1998 + }, + { + "epoch": 0.06, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 2.2574, + "num_input_tokens_seen": 4192206848, + "step": 1999 + }, + { + "epoch": 0.06, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.2191, + "num_input_tokens_seen": 4194304000, + "step": 2000 + }, + { + "epoch": 0.06, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.2409, + "num_input_tokens_seen": 4196401152, + "step": 2001 + }, + { + "epoch": 0.06, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.28, + "num_input_tokens_seen": 4198498304, + "step": 2002 + }, + { + "epoch": 0.06, + "grad_norm": 1.7109375, + "learning_rate": 2e-05, + "loss": 2.2531, + "num_input_tokens_seen": 4200595456, + "step": 2003 + }, + { + "epoch": 0.06, + "grad_norm": 1.8515625, + "learning_rate": 2e-05, + "loss": 2.257, + "num_input_tokens_seen": 4202692608, + "step": 2004 + }, + { + "epoch": 0.06, + "grad_norm": 1.5234375, + "learning_rate": 2e-05, + "loss": 2.2379, + "num_input_tokens_seen": 4204789760, + "step": 2005 + }, + { + "epoch": 0.06, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.233, + "num_input_tokens_seen": 4206886912, + "step": 2006 + }, + { + "epoch": 0.06, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.2742, + "num_input_tokens_seen": 4208984064, + "step": 2007 + }, + { + "epoch": 0.06, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2709, + "num_input_tokens_seen": 4211081216, + "step": 2008 + }, + { + "epoch": 0.06, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2703, + "num_input_tokens_seen": 4213178368, + "step": 2009 + }, + { + "epoch": 0.07, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.26, + "num_input_tokens_seen": 4215275520, + "step": 2010 + }, + { + "epoch": 0.07, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2376, + "num_input_tokens_seen": 4217372672, + "step": 2011 + }, + { + "epoch": 0.07, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.2375, + "num_input_tokens_seen": 4219469824, + "step": 2012 + }, + { + "epoch": 0.07, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2525, + "num_input_tokens_seen": 4221566976, + "step": 2013 + }, + { + "epoch": 0.07, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.2557, + "num_input_tokens_seen": 4223664128, + "step": 2014 + }, + { + "epoch": 0.07, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2603, + "num_input_tokens_seen": 4225761280, + "step": 2015 + }, + { + "epoch": 0.07, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.2631, + "num_input_tokens_seen": 4227858432, + "step": 2016 + }, + { + "epoch": 0.07, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2324, + "num_input_tokens_seen": 4229955584, + "step": 2017 + }, + { + "epoch": 0.07, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2694, + "num_input_tokens_seen": 4232052736, + "step": 2018 + }, + { + "epoch": 0.07, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2485, + "num_input_tokens_seen": 4234149888, + "step": 2019 + }, + { + "epoch": 0.07, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.2445, + "num_input_tokens_seen": 4236247040, + "step": 2020 + }, + { + "epoch": 0.07, + "grad_norm": 0.68359375, + "learning_rate": 2e-05, + "loss": 2.2537, + "num_input_tokens_seen": 4238344192, + "step": 2021 + }, + { + "epoch": 0.07, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.2574, + "num_input_tokens_seen": 4240441344, + "step": 2022 + }, + { + "epoch": 0.07, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2371, + "num_input_tokens_seen": 4242538496, + "step": 2023 + }, + { + "epoch": 0.07, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2538, + "num_input_tokens_seen": 4244635648, + "step": 2024 + }, + { + "epoch": 0.07, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.2459, + "num_input_tokens_seen": 4246732800, + "step": 2025 + }, + { + "epoch": 0.07, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2343, + "num_input_tokens_seen": 4248829952, + "step": 2026 + }, + { + "epoch": 0.07, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.2579, + "num_input_tokens_seen": 4250927104, + "step": 2027 + }, + { + "epoch": 0.07, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.2443, + "num_input_tokens_seen": 4253024256, + "step": 2028 + }, + { + "epoch": 0.07, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.2548, + "num_input_tokens_seen": 4255121408, + "step": 2029 + }, + { + "epoch": 0.07, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.2433, + "num_input_tokens_seen": 4257218560, + "step": 2030 + }, + { + "epoch": 0.07, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2678, + "num_input_tokens_seen": 4259315712, + "step": 2031 + }, + { + "epoch": 0.07, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2687, + "num_input_tokens_seen": 4261412864, + "step": 2032 + }, + { + "epoch": 0.07, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.2481, + "num_input_tokens_seen": 4263510016, + "step": 2033 + }, + { + "epoch": 0.07, + "grad_norm": 1.1171875, + "learning_rate": 2e-05, + "loss": 2.2347, + "num_input_tokens_seen": 4265607168, + "step": 2034 + }, + { + "epoch": 0.07, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2405, + "num_input_tokens_seen": 4267704320, + "step": 2035 + }, + { + "epoch": 0.07, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2299, + "num_input_tokens_seen": 4269801472, + "step": 2036 + }, + { + "epoch": 0.07, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 2.2466, + "num_input_tokens_seen": 4271898624, + "step": 2037 + }, + { + "epoch": 0.07, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.2504, + "num_input_tokens_seen": 4273995776, + "step": 2038 + }, + { + "epoch": 0.07, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.225, + "num_input_tokens_seen": 4276092928, + "step": 2039 + }, + { + "epoch": 0.07, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 2.2708, + "num_input_tokens_seen": 4278190080, + "step": 2040 + }, + { + "epoch": 0.07, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.2537, + "num_input_tokens_seen": 4280287232, + "step": 2041 + }, + { + "epoch": 0.07, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2186, + "num_input_tokens_seen": 4282384384, + "step": 2042 + }, + { + "epoch": 0.07, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2279, + "num_input_tokens_seen": 4284481536, + "step": 2043 + }, + { + "epoch": 0.07, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.2589, + "num_input_tokens_seen": 4286578688, + "step": 2044 + }, + { + "epoch": 0.07, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.234, + "num_input_tokens_seen": 4288675840, + "step": 2045 + }, + { + "epoch": 0.07, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.24, + "num_input_tokens_seen": 4290772992, + "step": 2046 + }, + { + "epoch": 0.07, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.2474, + "num_input_tokens_seen": 4292870144, + "step": 2047 + }, + { + "epoch": 0.07, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2538, + "num_input_tokens_seen": 4294967296, + "step": 2048 + }, + { + "epoch": 0.07, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2374, + "num_input_tokens_seen": 4297064448, + "step": 2049 + }, + { + "epoch": 0.07, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2482, + "num_input_tokens_seen": 4299161600, + "step": 2050 + }, + { + "epoch": 0.07, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.3018, + "num_input_tokens_seen": 4301258752, + "step": 2051 + }, + { + "epoch": 0.07, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2305, + "num_input_tokens_seen": 4303355904, + "step": 2052 + }, + { + "epoch": 0.07, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.2291, + "num_input_tokens_seen": 4305453056, + "step": 2053 + }, + { + "epoch": 0.07, + "grad_norm": 1.1953125, + "learning_rate": 2e-05, + "loss": 2.2681, + "num_input_tokens_seen": 4307550208, + "step": 2054 + }, + { + "epoch": 0.07, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2618, + "num_input_tokens_seen": 4309647360, + "step": 2055 + }, + { + "epoch": 0.07, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.2511, + "num_input_tokens_seen": 4311744512, + "step": 2056 + }, + { + "epoch": 0.07, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.2143, + "num_input_tokens_seen": 4313841664, + "step": 2057 + }, + { + "epoch": 0.08, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2648, + "num_input_tokens_seen": 4315938816, + "step": 2058 + }, + { + "epoch": 0.08, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.2365, + "num_input_tokens_seen": 4318035968, + "step": 2059 + }, + { + "epoch": 0.08, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.2296, + "num_input_tokens_seen": 4320133120, + "step": 2060 + }, + { + "epoch": 0.08, + "grad_norm": 0.91796875, + "learning_rate": 2e-05, + "loss": 2.2537, + "num_input_tokens_seen": 4322230272, + "step": 2061 + }, + { + "epoch": 0.08, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2419, + "num_input_tokens_seen": 4324327424, + "step": 2062 + }, + { + "epoch": 0.08, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.2536, + "num_input_tokens_seen": 4326424576, + "step": 2063 + }, + { + "epoch": 0.08, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.2211, + "num_input_tokens_seen": 4328521728, + "step": 2064 + }, + { + "epoch": 0.08, + "grad_norm": 1.2265625, + "learning_rate": 2e-05, + "loss": 2.2456, + "num_input_tokens_seen": 4330618880, + "step": 2065 + }, + { + "epoch": 0.08, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.2504, + "num_input_tokens_seen": 4332716032, + "step": 2066 + }, + { + "epoch": 0.08, + "grad_norm": 1.2421875, + "learning_rate": 2e-05, + "loss": 2.251, + "num_input_tokens_seen": 4334813184, + "step": 2067 + }, + { + "epoch": 0.08, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 2.2403, + "num_input_tokens_seen": 4336910336, + "step": 2068 + }, + { + "epoch": 0.08, + "grad_norm": 1.609375, + "learning_rate": 2e-05, + "loss": 2.2676, + "num_input_tokens_seen": 4339007488, + "step": 2069 + }, + { + "epoch": 0.08, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.2543, + "num_input_tokens_seen": 4341104640, + "step": 2070 + }, + { + "epoch": 0.08, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.2235, + "num_input_tokens_seen": 4343201792, + "step": 2071 + }, + { + "epoch": 0.08, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.2591, + "num_input_tokens_seen": 4345298944, + "step": 2072 + }, + { + "epoch": 0.08, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2658, + "num_input_tokens_seen": 4347396096, + "step": 2073 + }, + { + "epoch": 0.08, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2363, + "num_input_tokens_seen": 4349493248, + "step": 2074 + }, + { + "epoch": 0.08, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.2125, + "num_input_tokens_seen": 4351590400, + "step": 2075 + }, + { + "epoch": 0.08, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2348, + "num_input_tokens_seen": 4353687552, + "step": 2076 + }, + { + "epoch": 0.08, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.2434, + "num_input_tokens_seen": 4355784704, + "step": 2077 + }, + { + "epoch": 0.08, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.1948, + "num_input_tokens_seen": 4357881856, + "step": 2078 + }, + { + "epoch": 0.08, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.2597, + "num_input_tokens_seen": 4359979008, + "step": 2079 + }, + { + "epoch": 0.08, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2381, + "num_input_tokens_seen": 4362076160, + "step": 2080 + }, + { + "epoch": 0.08, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.2364, + "num_input_tokens_seen": 4364173312, + "step": 2081 + }, + { + "epoch": 0.08, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.2216, + "num_input_tokens_seen": 4366270464, + "step": 2082 + }, + { + "epoch": 0.08, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.2442, + "num_input_tokens_seen": 4368367616, + "step": 2083 + }, + { + "epoch": 0.08, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.2489, + "num_input_tokens_seen": 4370464768, + "step": 2084 + }, + { + "epoch": 0.08, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.2523, + "num_input_tokens_seen": 4372561920, + "step": 2085 + }, + { + "epoch": 0.08, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2367, + "num_input_tokens_seen": 4374659072, + "step": 2086 + }, + { + "epoch": 0.08, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.256, + "num_input_tokens_seen": 4376756224, + "step": 2087 + }, + { + "epoch": 0.08, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2477, + "num_input_tokens_seen": 4378853376, + "step": 2088 + }, + { + "epoch": 0.08, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.252, + "num_input_tokens_seen": 4380950528, + "step": 2089 + }, + { + "epoch": 0.08, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2262, + "num_input_tokens_seen": 4383047680, + "step": 2090 + }, + { + "epoch": 0.08, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.2349, + "num_input_tokens_seen": 4385144832, + "step": 2091 + }, + { + "epoch": 0.08, + "grad_norm": 0.76171875, + "learning_rate": 2e-05, + "loss": 2.2263, + "num_input_tokens_seen": 4387241984, + "step": 2092 + }, + { + "epoch": 0.08, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2351, + "num_input_tokens_seen": 4389339136, + "step": 2093 + }, + { + "epoch": 0.08, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2602, + "num_input_tokens_seen": 4391436288, + "step": 2094 + }, + { + "epoch": 0.08, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.2371, + "num_input_tokens_seen": 4393533440, + "step": 2095 + }, + { + "epoch": 0.08, + "grad_norm": 1.4296875, + "learning_rate": 2e-05, + "loss": 2.22, + "num_input_tokens_seen": 4395630592, + "step": 2096 + }, + { + "epoch": 0.08, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.2533, + "num_input_tokens_seen": 4397727744, + "step": 2097 + }, + { + "epoch": 0.08, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2633, + "num_input_tokens_seen": 4399824896, + "step": 2098 + }, + { + "epoch": 0.08, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.2472, + "num_input_tokens_seen": 4401922048, + "step": 2099 + }, + { + "epoch": 0.08, + "grad_norm": 1.546875, + "learning_rate": 2e-05, + "loss": 2.2618, + "num_input_tokens_seen": 4404019200, + "step": 2100 + }, + { + "epoch": 0.08, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.2135, + "num_input_tokens_seen": 4406116352, + "step": 2101 + }, + { + "epoch": 0.08, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2126, + "num_input_tokens_seen": 4408213504, + "step": 2102 + }, + { + "epoch": 0.08, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.2389, + "num_input_tokens_seen": 4410310656, + "step": 2103 + }, + { + "epoch": 0.08, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.2247, + "num_input_tokens_seen": 4412407808, + "step": 2104 + }, + { + "epoch": 0.08, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2196, + "num_input_tokens_seen": 4414504960, + "step": 2105 + }, + { + "epoch": 0.09, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2177, + "num_input_tokens_seen": 4416602112, + "step": 2106 + }, + { + "epoch": 0.09, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.265, + "num_input_tokens_seen": 4418699264, + "step": 2107 + }, + { + "epoch": 0.09, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.2521, + "num_input_tokens_seen": 4420796416, + "step": 2108 + }, + { + "epoch": 0.09, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2208, + "num_input_tokens_seen": 4422893568, + "step": 2109 + }, + { + "epoch": 0.09, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.2399, + "num_input_tokens_seen": 4424990720, + "step": 2110 + }, + { + "epoch": 0.09, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.2121, + "num_input_tokens_seen": 4427087872, + "step": 2111 + }, + { + "epoch": 0.09, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.2473, + "num_input_tokens_seen": 4429185024, + "step": 2112 + }, + { + "epoch": 0.09, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.2501, + "num_input_tokens_seen": 4431282176, + "step": 2113 + }, + { + "epoch": 0.09, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.2531, + "num_input_tokens_seen": 4433379328, + "step": 2114 + }, + { + "epoch": 0.09, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.233, + "num_input_tokens_seen": 4435476480, + "step": 2115 + }, + { + "epoch": 0.09, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2337, + "num_input_tokens_seen": 4437573632, + "step": 2116 + }, + { + "epoch": 0.09, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2492, + "num_input_tokens_seen": 4439670784, + "step": 2117 + }, + { + "epoch": 0.09, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.2544, + "num_input_tokens_seen": 4441767936, + "step": 2118 + }, + { + "epoch": 0.09, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2546, + "num_input_tokens_seen": 4443865088, + "step": 2119 + }, + { + "epoch": 0.09, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2508, + "num_input_tokens_seen": 4445962240, + "step": 2120 + }, + { + "epoch": 0.09, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2426, + "num_input_tokens_seen": 4448059392, + "step": 2121 + }, + { + "epoch": 0.09, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.2184, + "num_input_tokens_seen": 4450156544, + "step": 2122 + }, + { + "epoch": 0.09, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2317, + "num_input_tokens_seen": 4452253696, + "step": 2123 + }, + { + "epoch": 0.09, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.2562, + "num_input_tokens_seen": 4454350848, + "step": 2124 + }, + { + "epoch": 0.09, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2453, + "num_input_tokens_seen": 4456448000, + "step": 2125 + }, + { + "epoch": 0.09, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2139, + "num_input_tokens_seen": 4458545152, + "step": 2126 + }, + { + "epoch": 0.09, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2362, + "num_input_tokens_seen": 4460642304, + "step": 2127 + }, + { + "epoch": 0.09, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.234, + "num_input_tokens_seen": 4462739456, + "step": 2128 + }, + { + "epoch": 0.09, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2327, + "num_input_tokens_seen": 4464836608, + "step": 2129 + }, + { + "epoch": 0.09, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.22, + "num_input_tokens_seen": 4466933760, + "step": 2130 + }, + { + "epoch": 0.09, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2256, + "num_input_tokens_seen": 4469030912, + "step": 2131 + }, + { + "epoch": 0.09, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2394, + "num_input_tokens_seen": 4471128064, + "step": 2132 + }, + { + "epoch": 0.09, + "grad_norm": 1.109375, + "learning_rate": 2e-05, + "loss": 2.257, + "num_input_tokens_seen": 4473225216, + "step": 2133 + }, + { + "epoch": 0.09, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.2542, + "num_input_tokens_seen": 4475322368, + "step": 2134 + }, + { + "epoch": 0.09, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.2487, + "num_input_tokens_seen": 4477419520, + "step": 2135 + }, + { + "epoch": 0.09, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.2741, + "num_input_tokens_seen": 4479516672, + "step": 2136 + }, + { + "epoch": 0.09, + "grad_norm": 1.578125, + "learning_rate": 2e-05, + "loss": 2.2387, + "num_input_tokens_seen": 4481613824, + "step": 2137 + }, + { + "epoch": 0.09, + "grad_norm": 1.3125, + "learning_rate": 2e-05, + "loss": 2.2706, + "num_input_tokens_seen": 4483710976, + "step": 2138 + }, + { + "epoch": 0.09, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.2613, + "num_input_tokens_seen": 4485808128, + "step": 2139 + }, + { + "epoch": 0.09, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 2.2488, + "num_input_tokens_seen": 4487905280, + "step": 2140 + }, + { + "epoch": 0.09, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 2.2403, + "num_input_tokens_seen": 4490002432, + "step": 2141 + }, + { + "epoch": 0.09, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.2388, + "num_input_tokens_seen": 4492099584, + "step": 2142 + }, + { + "epoch": 0.09, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2168, + "num_input_tokens_seen": 4494196736, + "step": 2143 + }, + { + "epoch": 0.09, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.257, + "num_input_tokens_seen": 4496293888, + "step": 2144 + }, + { + "epoch": 0.09, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2504, + "num_input_tokens_seen": 4498391040, + "step": 2145 + }, + { + "epoch": 0.09, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.2443, + "num_input_tokens_seen": 4500488192, + "step": 2146 + }, + { + "epoch": 0.09, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.2134, + "num_input_tokens_seen": 4502585344, + "step": 2147 + }, + { + "epoch": 0.09, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2315, + "num_input_tokens_seen": 4504682496, + "step": 2148 + }, + { + "epoch": 0.09, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.2434, + "num_input_tokens_seen": 4506779648, + "step": 2149 + }, + { + "epoch": 0.09, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.2275, + "num_input_tokens_seen": 4508876800, + "step": 2150 + }, + { + "epoch": 0.09, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.239, + "num_input_tokens_seen": 4510973952, + "step": 2151 + }, + { + "epoch": 0.09, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2218, + "num_input_tokens_seen": 4513071104, + "step": 2152 + }, + { + "epoch": 0.1, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.2536, + "num_input_tokens_seen": 4515168256, + "step": 2153 + }, + { + "epoch": 0.1, + "grad_norm": 0.734375, + "learning_rate": 2e-05, + "loss": 2.2386, + "num_input_tokens_seen": 4517265408, + "step": 2154 + }, + { + "epoch": 0.1, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2258, + "num_input_tokens_seen": 4519362560, + "step": 2155 + }, + { + "epoch": 0.1, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2621, + "num_input_tokens_seen": 4521459712, + "step": 2156 + }, + { + "epoch": 0.1, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.243, + "num_input_tokens_seen": 4523556864, + "step": 2157 + }, + { + "epoch": 0.1, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2213, + "num_input_tokens_seen": 4525654016, + "step": 2158 + }, + { + "epoch": 0.1, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.2142, + "num_input_tokens_seen": 4527751168, + "step": 2159 + }, + { + "epoch": 0.1, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2339, + "num_input_tokens_seen": 4529848320, + "step": 2160 + }, + { + "epoch": 0.1, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.2633, + "num_input_tokens_seen": 4531945472, + "step": 2161 + }, + { + "epoch": 0.1, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.22, + "num_input_tokens_seen": 4534042624, + "step": 2162 + }, + { + "epoch": 0.1, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.2008, + "num_input_tokens_seen": 4536139776, + "step": 2163 + }, + { + "epoch": 0.1, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.236, + "num_input_tokens_seen": 4538236928, + "step": 2164 + }, + { + "epoch": 0.1, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.2376, + "num_input_tokens_seen": 4540334080, + "step": 2165 + }, + { + "epoch": 0.1, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2557, + "num_input_tokens_seen": 4542431232, + "step": 2166 + }, + { + "epoch": 0.1, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2047, + "num_input_tokens_seen": 4544528384, + "step": 2167 + }, + { + "epoch": 0.1, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.2418, + "num_input_tokens_seen": 4546625536, + "step": 2168 + }, + { + "epoch": 0.1, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2286, + "num_input_tokens_seen": 4548722688, + "step": 2169 + }, + { + "epoch": 0.1, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2395, + "num_input_tokens_seen": 4550819840, + "step": 2170 + }, + { + "epoch": 0.1, + "grad_norm": 0.94140625, + "learning_rate": 2e-05, + "loss": 2.2208, + "num_input_tokens_seen": 4552916992, + "step": 2171 + }, + { + "epoch": 0.1, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2427, + "num_input_tokens_seen": 4555014144, + "step": 2172 + }, + { + "epoch": 0.1, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2212, + "num_input_tokens_seen": 4557111296, + "step": 2173 + }, + { + "epoch": 0.1, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.2273, + "num_input_tokens_seen": 4559208448, + "step": 2174 + }, + { + "epoch": 0.1, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.2387, + "num_input_tokens_seen": 4561305600, + "step": 2175 + }, + { + "epoch": 0.1, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.2109, + "num_input_tokens_seen": 4563402752, + "step": 2176 + }, + { + "epoch": 0.1, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.229, + "num_input_tokens_seen": 4565499904, + "step": 2177 + }, + { + "epoch": 0.1, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.2521, + "num_input_tokens_seen": 4567597056, + "step": 2178 + }, + { + "epoch": 0.1, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.2126, + "num_input_tokens_seen": 4569694208, + "step": 2179 + }, + { + "epoch": 0.1, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2463, + "num_input_tokens_seen": 4571791360, + "step": 2180 + }, + { + "epoch": 0.1, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.2159, + "num_input_tokens_seen": 4573888512, + "step": 2181 + }, + { + "epoch": 0.1, + "grad_norm": 1.125, + "learning_rate": 2e-05, + "loss": 2.2124, + "num_input_tokens_seen": 4575985664, + "step": 2182 + }, + { + "epoch": 0.1, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2327, + "num_input_tokens_seen": 4578082816, + "step": 2183 + }, + { + "epoch": 0.1, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.2233, + "num_input_tokens_seen": 4580179968, + "step": 2184 + }, + { + "epoch": 0.1, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.1894, + "num_input_tokens_seen": 4582277120, + "step": 2185 + }, + { + "epoch": 0.1, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 2.2294, + "num_input_tokens_seen": 4584374272, + "step": 2186 + }, + { + "epoch": 0.1, + "grad_norm": 1.0, + "learning_rate": 2e-05, + "loss": 2.2339, + "num_input_tokens_seen": 4586471424, + "step": 2187 + }, + { + "epoch": 0.1, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.2275, + "num_input_tokens_seen": 4588568576, + "step": 2188 + }, + { + "epoch": 0.1, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.2488, + "num_input_tokens_seen": 4590665728, + "step": 2189 + }, + { + "epoch": 0.1, + "eval_loss": 2.2922277450561523, + "eval_runtime": 2101.3307, + "eval_samples_per_second": 1.876, + "eval_steps_per_second": 0.469, + "num_input_tokens_seen": 4590665728, + "step": 2189 + }, + { + "epoch": 0.1, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.2405, + "num_input_tokens_seen": 4592762880, + "step": 2190 + }, + { + "epoch": 0.1, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2444, + "num_input_tokens_seen": 4594860032, + "step": 2191 + }, + { + "epoch": 0.1, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.2438, + "num_input_tokens_seen": 4596957184, + "step": 2192 + }, + { + "epoch": 0.1, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2298, + "num_input_tokens_seen": 4599054336, + "step": 2193 + }, + { + "epoch": 0.1, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2418, + "num_input_tokens_seen": 4601151488, + "step": 2194 + }, + { + "epoch": 0.1, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.2123, + "num_input_tokens_seen": 4603248640, + "step": 2195 + }, + { + "epoch": 0.1, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2411, + "num_input_tokens_seen": 4605345792, + "step": 2196 + }, + { + "epoch": 0.1, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.2491, + "num_input_tokens_seen": 4607442944, + "step": 2197 + }, + { + "epoch": 0.1, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.2304, + "num_input_tokens_seen": 4609540096, + "step": 2198 + }, + { + "epoch": 0.1, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.212, + "num_input_tokens_seen": 4611637248, + "step": 2199 + }, + { + "epoch": 0.1, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 2.2465, + "num_input_tokens_seen": 4613734400, + "step": 2200 + }, + { + "epoch": 0.11, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.2287, + "num_input_tokens_seen": 4615831552, + "step": 2201 + }, + { + "epoch": 0.11, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2248, + "num_input_tokens_seen": 4617928704, + "step": 2202 + }, + { + "epoch": 0.11, + "grad_norm": 1.28125, + "learning_rate": 2e-05, + "loss": 2.2287, + "num_input_tokens_seen": 4620025856, + "step": 2203 + }, + { + "epoch": 0.11, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.2228, + "num_input_tokens_seen": 4622123008, + "step": 2204 + }, + { + "epoch": 0.11, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.2178, + "num_input_tokens_seen": 4624220160, + "step": 2205 + }, + { + "epoch": 0.11, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2298, + "num_input_tokens_seen": 4626317312, + "step": 2206 + }, + { + "epoch": 0.11, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.2332, + "num_input_tokens_seen": 4628414464, + "step": 2207 + }, + { + "epoch": 0.11, + "grad_norm": 1.078125, + "learning_rate": 2e-05, + "loss": 2.2124, + "num_input_tokens_seen": 4630511616, + "step": 2208 + }, + { + "epoch": 0.11, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.2237, + "num_input_tokens_seen": 4632608768, + "step": 2209 + }, + { + "epoch": 0.11, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2125, + "num_input_tokens_seen": 4634705920, + "step": 2210 + }, + { + "epoch": 0.11, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2237, + "num_input_tokens_seen": 4636803072, + "step": 2211 + }, + { + "epoch": 0.11, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.205, + "num_input_tokens_seen": 4638900224, + "step": 2212 + }, + { + "epoch": 0.11, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.2087, + "num_input_tokens_seen": 4640997376, + "step": 2213 + }, + { + "epoch": 0.11, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.218, + "num_input_tokens_seen": 4643094528, + "step": 2214 + }, + { + "epoch": 0.11, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.2039, + "num_input_tokens_seen": 4645191680, + "step": 2215 + }, + { + "epoch": 0.11, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2072, + "num_input_tokens_seen": 4647288832, + "step": 2216 + }, + { + "epoch": 0.11, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2188, + "num_input_tokens_seen": 4649385984, + "step": 2217 + }, + { + "epoch": 0.11, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2232, + "num_input_tokens_seen": 4651483136, + "step": 2218 + }, + { + "epoch": 0.11, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2337, + "num_input_tokens_seen": 4653580288, + "step": 2219 + }, + { + "epoch": 0.11, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.2159, + "num_input_tokens_seen": 4655677440, + "step": 2220 + }, + { + "epoch": 0.11, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.2206, + "num_input_tokens_seen": 4657774592, + "step": 2221 + }, + { + "epoch": 0.11, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.2128, + "num_input_tokens_seen": 4659871744, + "step": 2222 + }, + { + "epoch": 0.11, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.2072, + "num_input_tokens_seen": 4661968896, + "step": 2223 + }, + { + "epoch": 0.11, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.1864, + "num_input_tokens_seen": 4664066048, + "step": 2224 + }, + { + "epoch": 0.11, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.2103, + "num_input_tokens_seen": 4666163200, + "step": 2225 + }, + { + "epoch": 0.11, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.2004, + "num_input_tokens_seen": 4668260352, + "step": 2226 + }, + { + "epoch": 0.11, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.2474, + "num_input_tokens_seen": 4670357504, + "step": 2227 + }, + { + "epoch": 0.11, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2397, + "num_input_tokens_seen": 4672454656, + "step": 2228 + }, + { + "epoch": 0.11, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.2307, + "num_input_tokens_seen": 4674551808, + "step": 2229 + }, + { + "epoch": 0.11, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.2362, + "num_input_tokens_seen": 4676648960, + "step": 2230 + }, + { + "epoch": 0.11, + "grad_norm": 2.21875, + "learning_rate": 2e-05, + "loss": 2.2631, + "num_input_tokens_seen": 4678746112, + "step": 2231 + }, + { + "epoch": 0.11, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2201, + "num_input_tokens_seen": 4680843264, + "step": 2232 + }, + { + "epoch": 0.11, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.1976, + "num_input_tokens_seen": 4682940416, + "step": 2233 + }, + { + "epoch": 0.11, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.2425, + "num_input_tokens_seen": 4685037568, + "step": 2234 + }, + { + "epoch": 0.11, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.1857, + "num_input_tokens_seen": 4687134720, + "step": 2235 + }, + { + "epoch": 0.11, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.2355, + "num_input_tokens_seen": 4689231872, + "step": 2236 + }, + { + "epoch": 0.11, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2559, + "num_input_tokens_seen": 4691329024, + "step": 2237 + }, + { + "epoch": 0.11, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.2244, + "num_input_tokens_seen": 4693426176, + "step": 2238 + }, + { + "epoch": 0.11, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.2114, + "num_input_tokens_seen": 4695523328, + "step": 2239 + }, + { + "epoch": 0.11, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.2016, + "num_input_tokens_seen": 4697620480, + "step": 2240 + }, + { + "epoch": 0.11, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.2105, + "num_input_tokens_seen": 4699717632, + "step": 2241 + }, + { + "epoch": 0.11, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2006, + "num_input_tokens_seen": 4701814784, + "step": 2242 + }, + { + "epoch": 0.11, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 2.1822, + "num_input_tokens_seen": 4703911936, + "step": 2243 + }, + { + "epoch": 0.11, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2176, + "num_input_tokens_seen": 4706009088, + "step": 2244 + }, + { + "epoch": 0.11, + "grad_norm": 0.73046875, + "learning_rate": 2e-05, + "loss": 2.2156, + "num_input_tokens_seen": 4708106240, + "step": 2245 + }, + { + "epoch": 0.11, + "grad_norm": 1.453125, + "learning_rate": 2e-05, + "loss": 2.2331, + "num_input_tokens_seen": 4710203392, + "step": 2246 + }, + { + "epoch": 0.11, + "grad_norm": 1.5078125, + "learning_rate": 2e-05, + "loss": 2.2408, + "num_input_tokens_seen": 4712300544, + "step": 2247 + }, + { + "epoch": 0.11, + "grad_norm": 0.765625, + "learning_rate": 2e-05, + "loss": 2.249, + "num_input_tokens_seen": 4714397696, + "step": 2248 + }, + { + "epoch": 0.12, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 2.2389, + "num_input_tokens_seen": 4716494848, + "step": 2249 + }, + { + "epoch": 0.12, + "grad_norm": 1.65625, + "learning_rate": 2e-05, + "loss": 2.2193, + "num_input_tokens_seen": 4718592000, + "step": 2250 + }, + { + "epoch": 0.12, + "grad_norm": 1.2265625, + "learning_rate": 2e-05, + "loss": 2.2399, + "num_input_tokens_seen": 4720689152, + "step": 2251 + }, + { + "epoch": 0.12, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 2.2025, + "num_input_tokens_seen": 4722786304, + "step": 2252 + }, + { + "epoch": 0.12, + "grad_norm": 1.84375, + "learning_rate": 2e-05, + "loss": 2.2386, + "num_input_tokens_seen": 4724883456, + "step": 2253 + }, + { + "epoch": 0.12, + "grad_norm": 1.4453125, + "learning_rate": 2e-05, + "loss": 2.2075, + "num_input_tokens_seen": 4726980608, + "step": 2254 + }, + { + "epoch": 0.12, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.1982, + "num_input_tokens_seen": 4729077760, + "step": 2255 + }, + { + "epoch": 0.12, + "grad_norm": 1.734375, + "learning_rate": 2e-05, + "loss": 2.2462, + "num_input_tokens_seen": 4731174912, + "step": 2256 + }, + { + "epoch": 0.12, + "grad_norm": 1.6328125, + "learning_rate": 2e-05, + "loss": 2.2281, + "num_input_tokens_seen": 4733272064, + "step": 2257 + }, + { + "epoch": 0.12, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.243, + "num_input_tokens_seen": 4735369216, + "step": 2258 + }, + { + "epoch": 0.12, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.2094, + "num_input_tokens_seen": 4737466368, + "step": 2259 + }, + { + "epoch": 0.12, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 2.2054, + "num_input_tokens_seen": 4739563520, + "step": 2260 + }, + { + "epoch": 0.12, + "grad_norm": 3.6875, + "learning_rate": 2e-05, + "loss": 2.1821, + "num_input_tokens_seen": 4741660672, + "step": 2261 + }, + { + "epoch": 0.12, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.2007, + "num_input_tokens_seen": 4743757824, + "step": 2262 + }, + { + "epoch": 0.12, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.2274, + "num_input_tokens_seen": 4745854976, + "step": 2263 + }, + { + "epoch": 0.12, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.2129, + "num_input_tokens_seen": 4747952128, + "step": 2264 + }, + { + "epoch": 0.12, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2175, + "num_input_tokens_seen": 4750049280, + "step": 2265 + }, + { + "epoch": 0.12, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.2393, + "num_input_tokens_seen": 4752146432, + "step": 2266 + }, + { + "epoch": 0.12, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2158, + "num_input_tokens_seen": 4754243584, + "step": 2267 + }, + { + "epoch": 0.12, + "grad_norm": 1.140625, + "learning_rate": 2e-05, + "loss": 2.2568, + "num_input_tokens_seen": 4756340736, + "step": 2268 + }, + { + "epoch": 0.12, + "grad_norm": 0.97265625, + "learning_rate": 2e-05, + "loss": 2.2007, + "num_input_tokens_seen": 4758437888, + "step": 2269 + }, + { + "epoch": 0.12, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2047, + "num_input_tokens_seen": 4760535040, + "step": 2270 + }, + { + "epoch": 0.12, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.2197, + "num_input_tokens_seen": 4762632192, + "step": 2271 + }, + { + "epoch": 0.12, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.2206, + "num_input_tokens_seen": 4764729344, + "step": 2272 + }, + { + "epoch": 0.12, + "grad_norm": 0.94921875, + "learning_rate": 2e-05, + "loss": 2.2069, + "num_input_tokens_seen": 4766826496, + "step": 2273 + }, + { + "epoch": 0.12, + "grad_norm": 1.2109375, + "learning_rate": 2e-05, + "loss": 2.247, + "num_input_tokens_seen": 4768923648, + "step": 2274 + }, + { + "epoch": 0.12, + "grad_norm": 1.484375, + "learning_rate": 2e-05, + "loss": 2.2155, + "num_input_tokens_seen": 4771020800, + "step": 2275 + }, + { + "epoch": 0.12, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.2072, + "num_input_tokens_seen": 4773117952, + "step": 2276 + }, + { + "epoch": 0.12, + "grad_norm": 1.53125, + "learning_rate": 2e-05, + "loss": 2.2116, + "num_input_tokens_seen": 4775215104, + "step": 2277 + }, + { + "epoch": 0.12, + "grad_norm": 1.5, + "learning_rate": 2e-05, + "loss": 2.1617, + "num_input_tokens_seen": 4777312256, + "step": 2278 + }, + { + "epoch": 0.12, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.2556, + "num_input_tokens_seen": 4779409408, + "step": 2279 + }, + { + "epoch": 0.12, + "grad_norm": 1.2890625, + "learning_rate": 2e-05, + "loss": 2.2067, + "num_input_tokens_seen": 4781506560, + "step": 2280 + }, + { + "epoch": 0.12, + "grad_norm": 1.5546875, + "learning_rate": 2e-05, + "loss": 2.2204, + "num_input_tokens_seen": 4783603712, + "step": 2281 + }, + { + "epoch": 0.12, + "grad_norm": 1.0, + "learning_rate": 2e-05, + "loss": 2.1997, + "num_input_tokens_seen": 4785700864, + "step": 2282 + }, + { + "epoch": 0.12, + "grad_norm": 1.421875, + "learning_rate": 2e-05, + "loss": 2.2384, + "num_input_tokens_seen": 4787798016, + "step": 2283 + }, + { + "epoch": 0.12, + "grad_norm": 1.34375, + "learning_rate": 2e-05, + "loss": 2.2249, + "num_input_tokens_seen": 4789895168, + "step": 2284 + }, + { + "epoch": 0.12, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.226, + "num_input_tokens_seen": 4791992320, + "step": 2285 + }, + { + "epoch": 0.12, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.2345, + "num_input_tokens_seen": 4794089472, + "step": 2286 + }, + { + "epoch": 0.12, + "grad_norm": 1.0, + "learning_rate": 2e-05, + "loss": 2.2021, + "num_input_tokens_seen": 4796186624, + "step": 2287 + }, + { + "epoch": 0.12, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.2128, + "num_input_tokens_seen": 4798283776, + "step": 2288 + }, + { + "epoch": 0.12, + "grad_norm": 0.8359375, + "learning_rate": 2e-05, + "loss": 2.2279, + "num_input_tokens_seen": 4800380928, + "step": 2289 + }, + { + "epoch": 0.12, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2229, + "num_input_tokens_seen": 4802478080, + "step": 2290 + }, + { + "epoch": 0.12, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2167, + "num_input_tokens_seen": 4804575232, + "step": 2291 + }, + { + "epoch": 0.12, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2161, + "num_input_tokens_seen": 4806672384, + "step": 2292 + }, + { + "epoch": 0.12, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2243, + "num_input_tokens_seen": 4808769536, + "step": 2293 + }, + { + "epoch": 0.12, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2112, + "num_input_tokens_seen": 4810866688, + "step": 2294 + }, + { + "epoch": 0.12, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.2128, + "num_input_tokens_seen": 4812963840, + "step": 2295 + }, + { + "epoch": 0.12, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.2524, + "num_input_tokens_seen": 4815060992, + "step": 2296 + }, + { + "epoch": 0.13, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.211, + "num_input_tokens_seen": 4817158144, + "step": 2297 + }, + { + "epoch": 0.13, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.2139, + "num_input_tokens_seen": 4819255296, + "step": 2298 + }, + { + "epoch": 0.13, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.1893, + "num_input_tokens_seen": 4821352448, + "step": 2299 + }, + { + "epoch": 0.13, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2162, + "num_input_tokens_seen": 4823449600, + "step": 2300 + }, + { + "epoch": 0.13, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.2545, + "num_input_tokens_seen": 4825546752, + "step": 2301 + }, + { + "epoch": 0.13, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2129, + "num_input_tokens_seen": 4827643904, + "step": 2302 + }, + { + "epoch": 0.13, + "grad_norm": 1.203125, + "learning_rate": 2e-05, + "loss": 2.2218, + "num_input_tokens_seen": 4829741056, + "step": 2303 + }, + { + "epoch": 0.13, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.2165, + "num_input_tokens_seen": 4831838208, + "step": 2304 + }, + { + "epoch": 0.13, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.2324, + "num_input_tokens_seen": 4833935360, + "step": 2305 + }, + { + "epoch": 0.13, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.2368, + "num_input_tokens_seen": 4836032512, + "step": 2306 + }, + { + "epoch": 0.13, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2558, + "num_input_tokens_seen": 4838129664, + "step": 2307 + }, + { + "epoch": 0.13, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.2291, + "num_input_tokens_seen": 4840226816, + "step": 2308 + }, + { + "epoch": 0.13, + "grad_norm": 0.93359375, + "learning_rate": 2e-05, + "loss": 2.2295, + "num_input_tokens_seen": 4842323968, + "step": 2309 + }, + { + "epoch": 0.13, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.2345, + "num_input_tokens_seen": 4844421120, + "step": 2310 + }, + { + "epoch": 0.13, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2057, + "num_input_tokens_seen": 4846518272, + "step": 2311 + }, + { + "epoch": 0.13, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2473, + "num_input_tokens_seen": 4848615424, + "step": 2312 + }, + { + "epoch": 0.13, + "grad_norm": 0.7265625, + "learning_rate": 2e-05, + "loss": 2.2094, + "num_input_tokens_seen": 4850712576, + "step": 2313 + }, + { + "epoch": 0.13, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2132, + "num_input_tokens_seen": 4852809728, + "step": 2314 + }, + { + "epoch": 0.13, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.193, + "num_input_tokens_seen": 4854906880, + "step": 2315 + }, + { + "epoch": 0.13, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2043, + "num_input_tokens_seen": 4857004032, + "step": 2316 + }, + { + "epoch": 0.13, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2047, + "num_input_tokens_seen": 4859101184, + "step": 2317 + }, + { + "epoch": 0.13, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2007, + "num_input_tokens_seen": 4861198336, + "step": 2318 + }, + { + "epoch": 0.13, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.2062, + "num_input_tokens_seen": 4863295488, + "step": 2319 + }, + { + "epoch": 0.13, + "grad_norm": 0.953125, + "learning_rate": 2e-05, + "loss": 2.1987, + "num_input_tokens_seen": 4865392640, + "step": 2320 + }, + { + "epoch": 0.13, + "grad_norm": 0.83984375, + "learning_rate": 2e-05, + "loss": 2.2125, + "num_input_tokens_seen": 4867489792, + "step": 2321 + }, + { + "epoch": 0.13, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.246, + "num_input_tokens_seen": 4869586944, + "step": 2322 + }, + { + "epoch": 0.13, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2184, + "num_input_tokens_seen": 4871684096, + "step": 2323 + }, + { + "epoch": 0.13, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.2249, + "num_input_tokens_seen": 4873781248, + "step": 2324 + }, + { + "epoch": 0.13, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.235, + "num_input_tokens_seen": 4875878400, + "step": 2325 + }, + { + "epoch": 0.13, + "grad_norm": 0.77734375, + "learning_rate": 2e-05, + "loss": 2.2251, + "num_input_tokens_seen": 4877975552, + "step": 2326 + }, + { + "epoch": 0.13, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.2355, + "num_input_tokens_seen": 4880072704, + "step": 2327 + }, + { + "epoch": 0.13, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.2144, + "num_input_tokens_seen": 4882169856, + "step": 2328 + }, + { + "epoch": 0.13, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2044, + "num_input_tokens_seen": 4884267008, + "step": 2329 + }, + { + "epoch": 0.13, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2153, + "num_input_tokens_seen": 4886364160, + "step": 2330 + }, + { + "epoch": 0.13, + "grad_norm": 0.96875, + "learning_rate": 2e-05, + "loss": 2.1933, + "num_input_tokens_seen": 4888461312, + "step": 2331 + }, + { + "epoch": 0.13, + "grad_norm": 0.796875, + "learning_rate": 2e-05, + "loss": 2.2241, + "num_input_tokens_seen": 4890558464, + "step": 2332 + }, + { + "epoch": 0.13, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2318, + "num_input_tokens_seen": 4892655616, + "step": 2333 + }, + { + "epoch": 0.13, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.2072, + "num_input_tokens_seen": 4894752768, + "step": 2334 + }, + { + "epoch": 0.13, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.2245, + "num_input_tokens_seen": 4896849920, + "step": 2335 + }, + { + "epoch": 0.13, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.2125, + "num_input_tokens_seen": 4898947072, + "step": 2336 + }, + { + "epoch": 0.13, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2228, + "num_input_tokens_seen": 4901044224, + "step": 2337 + }, + { + "epoch": 0.13, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.2031, + "num_input_tokens_seen": 4903141376, + "step": 2338 + }, + { + "epoch": 0.13, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.1687, + "num_input_tokens_seen": 4905238528, + "step": 2339 + }, + { + "epoch": 0.13, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2423, + "num_input_tokens_seen": 4907335680, + "step": 2340 + }, + { + "epoch": 0.13, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.2175, + "num_input_tokens_seen": 4909432832, + "step": 2341 + }, + { + "epoch": 0.13, + "grad_norm": 1.25, + "learning_rate": 2e-05, + "loss": 2.2102, + "num_input_tokens_seen": 4911529984, + "step": 2342 + }, + { + "epoch": 0.13, + "grad_norm": 1.0390625, + "learning_rate": 2e-05, + "loss": 2.2022, + "num_input_tokens_seen": 4913627136, + "step": 2343 + }, + { + "epoch": 0.14, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.1779, + "num_input_tokens_seen": 4915724288, + "step": 2344 + }, + { + "epoch": 0.14, + "grad_norm": 1.3203125, + "learning_rate": 2e-05, + "loss": 2.193, + "num_input_tokens_seen": 4917821440, + "step": 2345 + }, + { + "epoch": 0.14, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.2176, + "num_input_tokens_seen": 4919918592, + "step": 2346 + }, + { + "epoch": 0.14, + "grad_norm": 0.9921875, + "learning_rate": 2e-05, + "loss": 2.2466, + "num_input_tokens_seen": 4922015744, + "step": 2347 + }, + { + "epoch": 0.14, + "grad_norm": 1.1171875, + "learning_rate": 2e-05, + "loss": 2.2113, + "num_input_tokens_seen": 4924112896, + "step": 2348 + }, + { + "epoch": 0.14, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2185, + "num_input_tokens_seen": 4926210048, + "step": 2349 + }, + { + "epoch": 0.14, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.177, + "num_input_tokens_seen": 4928307200, + "step": 2350 + }, + { + "epoch": 0.14, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2186, + "num_input_tokens_seen": 4930404352, + "step": 2351 + }, + { + "epoch": 0.14, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.1916, + "num_input_tokens_seen": 4932501504, + "step": 2352 + }, + { + "epoch": 0.14, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.1838, + "num_input_tokens_seen": 4934598656, + "step": 2353 + }, + { + "epoch": 0.14, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2081, + "num_input_tokens_seen": 4936695808, + "step": 2354 + }, + { + "epoch": 0.14, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2101, + "num_input_tokens_seen": 4938792960, + "step": 2355 + }, + { + "epoch": 0.14, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2087, + "num_input_tokens_seen": 4940890112, + "step": 2356 + }, + { + "epoch": 0.14, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2218, + "num_input_tokens_seen": 4942987264, + "step": 2357 + }, + { + "epoch": 0.14, + "grad_norm": 1.09375, + "learning_rate": 2e-05, + "loss": 2.224, + "num_input_tokens_seen": 4945084416, + "step": 2358 + }, + { + "epoch": 0.14, + "grad_norm": 0.75390625, + "learning_rate": 2e-05, + "loss": 2.2067, + "num_input_tokens_seen": 4947181568, + "step": 2359 + }, + { + "epoch": 0.14, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.1973, + "num_input_tokens_seen": 4949278720, + "step": 2360 + }, + { + "epoch": 0.14, + "grad_norm": 1.1875, + "learning_rate": 2e-05, + "loss": 2.2084, + "num_input_tokens_seen": 4951375872, + "step": 2361 + }, + { + "epoch": 0.14, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2111, + "num_input_tokens_seen": 4953473024, + "step": 2362 + }, + { + "epoch": 0.14, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.1762, + "num_input_tokens_seen": 4955570176, + "step": 2363 + }, + { + "epoch": 0.14, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.2157, + "num_input_tokens_seen": 4957667328, + "step": 2364 + }, + { + "epoch": 0.14, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2088, + "num_input_tokens_seen": 4959764480, + "step": 2365 + }, + { + "epoch": 0.14, + "grad_norm": 1.234375, + "learning_rate": 2e-05, + "loss": 2.2402, + "num_input_tokens_seen": 4961861632, + "step": 2366 + }, + { + "epoch": 0.14, + "grad_norm": 1.3671875, + "learning_rate": 2e-05, + "loss": 2.211, + "num_input_tokens_seen": 4963958784, + "step": 2367 + }, + { + "epoch": 0.14, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.2208, + "num_input_tokens_seen": 4966055936, + "step": 2368 + }, + { + "epoch": 0.14, + "grad_norm": 1.328125, + "learning_rate": 2e-05, + "loss": 2.1928, + "num_input_tokens_seen": 4968153088, + "step": 2369 + }, + { + "epoch": 0.14, + "grad_norm": 1.3359375, + "learning_rate": 2e-05, + "loss": 2.22, + "num_input_tokens_seen": 4970250240, + "step": 2370 + }, + { + "epoch": 0.14, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.2084, + "num_input_tokens_seen": 4972347392, + "step": 2371 + }, + { + "epoch": 0.14, + "grad_norm": 0.87890625, + "learning_rate": 2e-05, + "loss": 2.1896, + "num_input_tokens_seen": 4974444544, + "step": 2372 + }, + { + "epoch": 0.14, + "grad_norm": 1.1640625, + "learning_rate": 2e-05, + "loss": 2.2113, + "num_input_tokens_seen": 4976541696, + "step": 2373 + }, + { + "epoch": 0.14, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.1771, + "num_input_tokens_seen": 4978638848, + "step": 2374 + }, + { + "epoch": 0.14, + "grad_norm": 1.046875, + "learning_rate": 2e-05, + "loss": 2.2459, + "num_input_tokens_seen": 4980736000, + "step": 2375 + }, + { + "epoch": 0.14, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2276, + "num_input_tokens_seen": 4982833152, + "step": 2376 + }, + { + "epoch": 0.14, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2095, + "num_input_tokens_seen": 4984930304, + "step": 2377 + }, + { + "epoch": 0.14, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2181, + "num_input_tokens_seen": 4987027456, + "step": 2378 + }, + { + "epoch": 0.14, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.2384, + "num_input_tokens_seen": 4989124608, + "step": 2379 + }, + { + "epoch": 0.14, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.2352, + "num_input_tokens_seen": 4991221760, + "step": 2380 + }, + { + "epoch": 0.14, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.206, + "num_input_tokens_seen": 4993318912, + "step": 2381 + }, + { + "epoch": 0.14, + "grad_norm": 0.921875, + "learning_rate": 2e-05, + "loss": 2.2215, + "num_input_tokens_seen": 4995416064, + "step": 2382 + }, + { + "epoch": 0.14, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.234, + "num_input_tokens_seen": 4997513216, + "step": 2383 + }, + { + "epoch": 0.14, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2282, + "num_input_tokens_seen": 4999610368, + "step": 2384 + }, + { + "epoch": 0.14, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.1974, + "num_input_tokens_seen": 5001707520, + "step": 2385 + }, + { + "epoch": 0.14, + "grad_norm": 0.76953125, + "learning_rate": 2e-05, + "loss": 2.2135, + "num_input_tokens_seen": 5003804672, + "step": 2386 + }, + { + "epoch": 0.14, + "grad_norm": 0.9453125, + "learning_rate": 2e-05, + "loss": 2.2315, + "num_input_tokens_seen": 5005901824, + "step": 2387 + }, + { + "epoch": 0.14, + "grad_norm": 0.9296875, + "learning_rate": 2e-05, + "loss": 2.2172, + "num_input_tokens_seen": 5007998976, + "step": 2388 + }, + { + "epoch": 0.14, + "eval_loss": 2.2845656871795654, + "eval_runtime": 1677.6813, + "eval_samples_per_second": 2.35, + "eval_steps_per_second": 0.588, + "num_input_tokens_seen": 5007998976, + "step": 2388 + }, + { + "epoch": 0.14, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.2172, + "num_input_tokens_seen": 5010096128, + "step": 2389 + }, + { + "epoch": 0.14, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.2471, + "num_input_tokens_seen": 5012193280, + "step": 2390 + }, + { + "epoch": 0.14, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.2383, + "num_input_tokens_seen": 5014290432, + "step": 2391 + }, + { + "epoch": 0.15, + "grad_norm": 0.859375, + "learning_rate": 2e-05, + "loss": 2.1998, + "num_input_tokens_seen": 5016387584, + "step": 2392 + }, + { + "epoch": 0.15, + "grad_norm": 1.4921875, + "learning_rate": 2e-05, + "loss": 2.2113, + "num_input_tokens_seen": 5018484736, + "step": 2393 + }, + { + "epoch": 0.15, + "grad_norm": 1.4765625, + "learning_rate": 2e-05, + "loss": 2.1996, + "num_input_tokens_seen": 5020581888, + "step": 2394 + }, + { + "epoch": 0.15, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2106, + "num_input_tokens_seen": 5022679040, + "step": 2395 + }, + { + "epoch": 0.15, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.2066, + "num_input_tokens_seen": 5024776192, + "step": 2396 + }, + { + "epoch": 0.15, + "grad_norm": 1.1875, + "learning_rate": 2e-05, + "loss": 2.2016, + "num_input_tokens_seen": 5026873344, + "step": 2397 + }, + { + "epoch": 0.15, + "grad_norm": 0.96484375, + "learning_rate": 2e-05, + "loss": 2.2149, + "num_input_tokens_seen": 5028970496, + "step": 2398 + }, + { + "epoch": 0.15, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.2258, + "num_input_tokens_seen": 5031067648, + "step": 2399 + }, + { + "epoch": 0.15, + "grad_norm": 0.98828125, + "learning_rate": 2e-05, + "loss": 2.2216, + "num_input_tokens_seen": 5033164800, + "step": 2400 + }, + { + "epoch": 0.15, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.2041, + "num_input_tokens_seen": 5035261952, + "step": 2401 + }, + { + "epoch": 0.15, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2054, + "num_input_tokens_seen": 5037359104, + "step": 2402 + }, + { + "epoch": 0.15, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.2385, + "num_input_tokens_seen": 5039456256, + "step": 2403 + }, + { + "epoch": 0.15, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.1899, + "num_input_tokens_seen": 5041553408, + "step": 2404 + }, + { + "epoch": 0.15, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.207, + "num_input_tokens_seen": 5043650560, + "step": 2405 + }, + { + "epoch": 0.15, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.2031, + "num_input_tokens_seen": 5045747712, + "step": 2406 + }, + { + "epoch": 0.15, + "grad_norm": 0.91015625, + "learning_rate": 2e-05, + "loss": 2.1876, + "num_input_tokens_seen": 5047844864, + "step": 2407 + }, + { + "epoch": 0.15, + "grad_norm": 0.84375, + "learning_rate": 2e-05, + "loss": 2.2022, + "num_input_tokens_seen": 5049942016, + "step": 2408 + }, + { + "epoch": 0.15, + "grad_norm": 1.0703125, + "learning_rate": 2e-05, + "loss": 2.175, + "num_input_tokens_seen": 5052039168, + "step": 2409 + }, + { + "epoch": 0.15, + "grad_norm": 0.8984375, + "learning_rate": 2e-05, + "loss": 2.2121, + "num_input_tokens_seen": 5054136320, + "step": 2410 + }, + { + "epoch": 0.15, + "grad_norm": 0.85546875, + "learning_rate": 2e-05, + "loss": 2.1748, + "num_input_tokens_seen": 5056233472, + "step": 2411 + }, + { + "epoch": 0.15, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.2166, + "num_input_tokens_seen": 5058330624, + "step": 2412 + }, + { + "epoch": 0.15, + "grad_norm": 1.015625, + "learning_rate": 2e-05, + "loss": 2.198, + "num_input_tokens_seen": 5060427776, + "step": 2413 + }, + { + "epoch": 0.15, + "grad_norm": 0.9140625, + "learning_rate": 2e-05, + "loss": 2.2371, + "num_input_tokens_seen": 5062524928, + "step": 2414 + }, + { + "epoch": 0.15, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.195, + "num_input_tokens_seen": 5064622080, + "step": 2415 + }, + { + "epoch": 0.15, + "grad_norm": 0.9609375, + "learning_rate": 2e-05, + "loss": 2.2151, + "num_input_tokens_seen": 5066719232, + "step": 2416 + }, + { + "epoch": 0.15, + "grad_norm": 0.80859375, + "learning_rate": 2e-05, + "loss": 2.1957, + "num_input_tokens_seen": 5068816384, + "step": 2417 + }, + { + "epoch": 0.15, + "grad_norm": 0.875, + "learning_rate": 2e-05, + "loss": 2.2213, + "num_input_tokens_seen": 5070913536, + "step": 2418 + }, + { + "epoch": 0.15, + "grad_norm": 0.78515625, + "learning_rate": 2e-05, + "loss": 2.2185, + "num_input_tokens_seen": 5073010688, + "step": 2419 + }, + { + "epoch": 0.15, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.2109, + "num_input_tokens_seen": 5075107840, + "step": 2420 + }, + { + "epoch": 0.15, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2051, + "num_input_tokens_seen": 5077204992, + "step": 2421 + }, + { + "epoch": 0.15, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2415, + "num_input_tokens_seen": 5079302144, + "step": 2422 + }, + { + "epoch": 0.15, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2256, + "num_input_tokens_seen": 5081399296, + "step": 2423 + }, + { + "epoch": 0.15, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2285, + "num_input_tokens_seen": 5083496448, + "step": 2424 + }, + { + "epoch": 0.15, + "grad_norm": 0.71484375, + "learning_rate": 2e-05, + "loss": 2.1879, + "num_input_tokens_seen": 5085593600, + "step": 2425 + }, + { + "epoch": 0.15, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.2043, + "num_input_tokens_seen": 5087690752, + "step": 2426 + }, + { + "epoch": 0.15, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2538, + "num_input_tokens_seen": 5089787904, + "step": 2427 + }, + { + "epoch": 0.15, + "grad_norm": 0.7578125, + "learning_rate": 2e-05, + "loss": 2.2484, + "num_input_tokens_seen": 5091885056, + "step": 2428 + }, + { + "epoch": 0.15, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2404, + "num_input_tokens_seen": 5093982208, + "step": 2429 + }, + { + "epoch": 0.15, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.1924, + "num_input_tokens_seen": 5096079360, + "step": 2430 + }, + { + "epoch": 0.15, + "grad_norm": 0.92578125, + "learning_rate": 2e-05, + "loss": 2.2381, + "num_input_tokens_seen": 5098176512, + "step": 2431 + }, + { + "epoch": 0.15, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.2188, + "num_input_tokens_seen": 5100273664, + "step": 2432 + }, + { + "epoch": 0.15, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.2242, + "num_input_tokens_seen": 5102370816, + "step": 2433 + }, + { + "epoch": 0.15, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2287, + "num_input_tokens_seen": 5104467968, + "step": 2434 + }, + { + "epoch": 0.15, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2295, + "num_input_tokens_seen": 5106565120, + "step": 2435 + }, + { + "epoch": 0.15, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.2007, + "num_input_tokens_seen": 5108662272, + "step": 2436 + }, + { + "epoch": 0.15, + "grad_norm": 0.8515625, + "learning_rate": 2e-05, + "loss": 2.1813, + "num_input_tokens_seen": 5110759424, + "step": 2437 + }, + { + "epoch": 0.15, + "grad_norm": 0.72265625, + "learning_rate": 2e-05, + "loss": 2.2076, + "num_input_tokens_seen": 5112856576, + "step": 2438 + }, + { + "epoch": 0.15, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2065, + "num_input_tokens_seen": 5114953728, + "step": 2439 + }, + { + "epoch": 0.16, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2148, + "num_input_tokens_seen": 5117050880, + "step": 2440 + }, + { + "epoch": 0.16, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2454, + "num_input_tokens_seen": 5119148032, + "step": 2441 + }, + { + "epoch": 0.16, + "grad_norm": 0.8046875, + "learning_rate": 2e-05, + "loss": 2.2369, + "num_input_tokens_seen": 5121245184, + "step": 2442 + }, + { + "epoch": 0.16, + "grad_norm": 0.7734375, + "learning_rate": 2e-05, + "loss": 2.2235, + "num_input_tokens_seen": 5123342336, + "step": 2443 + }, + { + "epoch": 0.16, + "grad_norm": 0.73828125, + "learning_rate": 2e-05, + "loss": 2.208, + "num_input_tokens_seen": 5125439488, + "step": 2444 + }, + { + "epoch": 0.16, + "grad_norm": 0.8828125, + "learning_rate": 2e-05, + "loss": 2.2429, + "num_input_tokens_seen": 5127536640, + "step": 2445 + }, + { + "epoch": 0.16, + "grad_norm": 0.91796875, + "learning_rate": 2e-05, + "loss": 2.2352, + "num_input_tokens_seen": 5129633792, + "step": 2446 + }, + { + "epoch": 0.16, + "grad_norm": 0.7890625, + "learning_rate": 2e-05, + "loss": 2.223, + "num_input_tokens_seen": 5131730944, + "step": 2447 + }, + { + "epoch": 0.16, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.2403, + "num_input_tokens_seen": 5133828096, + "step": 2448 + }, + { + "epoch": 0.16, + "grad_norm": 1.40625, + "learning_rate": 2e-05, + "loss": 2.2296, + "num_input_tokens_seen": 5135925248, + "step": 2449 + }, + { + "epoch": 0.16, + "grad_norm": 15.4375, + "learning_rate": 2e-05, + "loss": 2.228, + "num_input_tokens_seen": 5138022400, + "step": 2450 + }, + { + "epoch": 0.16, + "grad_norm": 1.59375, + "learning_rate": 2e-05, + "loss": 2.2053, + "num_input_tokens_seen": 5140119552, + "step": 2451 + }, + { + "epoch": 0.16, + "grad_norm": 1.1328125, + "learning_rate": 2e-05, + "loss": 2.1757, + "num_input_tokens_seen": 5142216704, + "step": 2452 + }, + { + "epoch": 0.16, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.2169, + "num_input_tokens_seen": 5144313856, + "step": 2453 + }, + { + "epoch": 0.16, + "grad_norm": 1.3515625, + "learning_rate": 2e-05, + "loss": 2.1949, + "num_input_tokens_seen": 5146411008, + "step": 2454 + }, + { + "epoch": 0.16, + "grad_norm": 1.140625, + "learning_rate": 2e-05, + "loss": 2.2216, + "num_input_tokens_seen": 5148508160, + "step": 2455 + }, + { + "epoch": 0.16, + "grad_norm": 1.0234375, + "learning_rate": 2e-05, + "loss": 2.1917, + "num_input_tokens_seen": 5150605312, + "step": 2456 + }, + { + "epoch": 0.16, + "grad_norm": 1.375, + "learning_rate": 2e-05, + "loss": 2.2072, + "num_input_tokens_seen": 5152702464, + "step": 2457 + }, + { + "epoch": 0.16, + "grad_norm": 1.1015625, + "learning_rate": 2e-05, + "loss": 2.219, + "num_input_tokens_seen": 5154799616, + "step": 2458 + }, + { + "epoch": 0.16, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.211, + "num_input_tokens_seen": 5156896768, + "step": 2459 + }, + { + "epoch": 0.16, + "grad_norm": 1.03125, + "learning_rate": 2e-05, + "loss": 2.2342, + "num_input_tokens_seen": 5158993920, + "step": 2460 + }, + { + "epoch": 0.16, + "grad_norm": 0.8203125, + "learning_rate": 2e-05, + "loss": 2.214, + "num_input_tokens_seen": 5161091072, + "step": 2461 + }, + { + "epoch": 0.16, + "grad_norm": 0.8671875, + "learning_rate": 2e-05, + "loss": 2.2081, + "num_input_tokens_seen": 5163188224, + "step": 2462 + }, + { + "epoch": 0.16, + "grad_norm": 1.0859375, + "learning_rate": 2e-05, + "loss": 2.2518, + "num_input_tokens_seen": 5165285376, + "step": 2463 + }, + { + "epoch": 0.16, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2446, + "num_input_tokens_seen": 5167382528, + "step": 2464 + }, + { + "epoch": 0.16, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2265, + "num_input_tokens_seen": 5169479680, + "step": 2465 + }, + { + "epoch": 0.16, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.2186, + "num_input_tokens_seen": 5171576832, + "step": 2466 + }, + { + "epoch": 0.16, + "grad_norm": 0.98046875, + "learning_rate": 2e-05, + "loss": 2.268, + "num_input_tokens_seen": 5173673984, + "step": 2467 + }, + { + "epoch": 0.16, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.2073, + "num_input_tokens_seen": 5175771136, + "step": 2468 + }, + { + "epoch": 0.16, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2059, + "num_input_tokens_seen": 5177868288, + "step": 2469 + }, + { + "epoch": 0.16, + "grad_norm": 0.828125, + "learning_rate": 2e-05, + "loss": 2.2206, + "num_input_tokens_seen": 5179965440, + "step": 2470 + }, + { + "epoch": 0.16, + "grad_norm": 1.0625, + "learning_rate": 2e-05, + "loss": 2.2297, + "num_input_tokens_seen": 5182062592, + "step": 2471 + }, + { + "epoch": 0.16, + "grad_norm": 0.890625, + "learning_rate": 2e-05, + "loss": 2.1802, + "num_input_tokens_seen": 5184159744, + "step": 2472 + }, + { + "epoch": 0.16, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.2438, + "num_input_tokens_seen": 5186256896, + "step": 2473 + }, + { + "epoch": 0.16, + "grad_norm": 0.81640625, + "learning_rate": 2e-05, + "loss": 2.2078, + "num_input_tokens_seen": 5188354048, + "step": 2474 + }, + { + "epoch": 0.16, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.2347, + "num_input_tokens_seen": 5190451200, + "step": 2475 + }, + { + "epoch": 0.16, + "grad_norm": 0.9375, + "learning_rate": 2e-05, + "loss": 2.1934, + "num_input_tokens_seen": 5192548352, + "step": 2476 + }, + { + "epoch": 0.16, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.1833, + "num_input_tokens_seen": 5194645504, + "step": 2477 + }, + { + "epoch": 0.16, + "grad_norm": 0.8125, + "learning_rate": 2e-05, + "loss": 2.2239, + "num_input_tokens_seen": 5196742656, + "step": 2478 + }, + { + "epoch": 0.16, + "grad_norm": 1.1171875, + "learning_rate": 2e-05, + "loss": 2.2279, + "num_input_tokens_seen": 5198839808, + "step": 2479 + }, + { + "epoch": 0.16, + "grad_norm": 1.15625, + "learning_rate": 2e-05, + "loss": 2.2398, + "num_input_tokens_seen": 5200936960, + "step": 2480 + }, + { + "epoch": 0.16, + "grad_norm": 0.90234375, + "learning_rate": 2e-05, + "loss": 2.227, + "num_input_tokens_seen": 5203034112, + "step": 2481 + }, + { + "epoch": 0.16, + "grad_norm": 1.2578125, + "learning_rate": 2e-05, + "loss": 2.2006, + "num_input_tokens_seen": 5205131264, + "step": 2482 + }, + { + "epoch": 0.16, + "grad_norm": 1.75, + "learning_rate": 2e-05, + "loss": 2.2074, + "num_input_tokens_seen": 5207228416, + "step": 2483 + }, + { + "epoch": 0.16, + "grad_norm": 1.4609375, + "learning_rate": 2e-05, + "loss": 2.2153, + "num_input_tokens_seen": 5209325568, + "step": 2484 + }, + { + "epoch": 0.16, + "grad_norm": 0.90625, + "learning_rate": 2e-05, + "loss": 2.2281, + "num_input_tokens_seen": 5211422720, + "step": 2485 + }, + { + "epoch": 0.16, + "grad_norm": 1.2734375, + "learning_rate": 2e-05, + "loss": 2.1909, + "num_input_tokens_seen": 5213519872, + "step": 2486 + }, + { + "epoch": 0.17, + "grad_norm": 1.640625, + "learning_rate": 2e-05, + "loss": 2.2138, + "num_input_tokens_seen": 5215617024, + "step": 2487 + }, + { + "epoch": 0.17, + "grad_norm": 1.296875, + "learning_rate": 2e-05, + "loss": 2.1726, + "num_input_tokens_seen": 5217714176, + "step": 2488 + }, + { + "epoch": 0.17, + "grad_norm": 0.84765625, + "learning_rate": 2e-05, + "loss": 2.179, + "num_input_tokens_seen": 5219811328, + "step": 2489 + }, + { + "epoch": 0.17, + "grad_norm": 1.171875, + "learning_rate": 2e-05, + "loss": 2.2159, + "num_input_tokens_seen": 5221908480, + "step": 2490 + }, + { + "epoch": 0.17, + "grad_norm": 1.21875, + "learning_rate": 2e-05, + "loss": 2.2382, + "num_input_tokens_seen": 5224005632, + "step": 2491 + }, + { + "epoch": 0.17, + "grad_norm": 1.1484375, + "learning_rate": 2e-05, + "loss": 2.1965, + "num_input_tokens_seen": 5226102784, + "step": 2492 + }, + { + "epoch": 0.17, + "grad_norm": 0.82421875, + "learning_rate": 2e-05, + "loss": 2.1674, + "num_input_tokens_seen": 5228199936, + "step": 2493 + }, + { + "epoch": 0.17, + "grad_norm": 1.0078125, + "learning_rate": 2e-05, + "loss": 2.203, + "num_input_tokens_seen": 5230297088, + "step": 2494 + }, + { + "epoch": 0.17, + "grad_norm": 1.0546875, + "learning_rate": 2e-05, + "loss": 2.1765, + "num_input_tokens_seen": 5232394240, + "step": 2495 + }, + { + "epoch": 0.17, + "grad_norm": 0.86328125, + "learning_rate": 2e-05, + "loss": 2.1648, + "num_input_tokens_seen": 5234491392, + "step": 2496 + }, + { + "epoch": 0.17, + "grad_norm": 0.89453125, + "learning_rate": 2e-05, + "loss": 2.1952, + "num_input_tokens_seen": 5236588544, + "step": 2497 + }, + { + "epoch": 0.17, + "grad_norm": 1.1796875, + "learning_rate": 2e-05, + "loss": 2.2105, + "num_input_tokens_seen": 5238685696, + "step": 2498 + }, + { + "epoch": 0.17, + "grad_norm": 0.88671875, + "learning_rate": 2e-05, + "loss": 2.1676, + "num_input_tokens_seen": 5240782848, + "step": 2499 + }, + { + "epoch": 0.17, + "grad_norm": 0.99609375, + "learning_rate": 2e-05, + "loss": 2.1949, + "num_input_tokens_seen": 5242880000, + "step": 2500 + }, + { + "epoch": 0.17, + "grad_norm": 0.9765625, + "learning_rate": 2e-05, + "loss": 2.2532, + "num_input_tokens_seen": 5244977152, + "step": 2501 + }, + { + "epoch": 0.17, + "grad_norm": 0.79296875, + "learning_rate": 2e-05, + "loss": 2.2425, + "num_input_tokens_seen": 5247074304, + "step": 2502 + }, + { + "epoch": 0.17, + "grad_norm": 0.7421875, + "learning_rate": 2e-05, + "loss": 2.2774, + "num_input_tokens_seen": 5249171456, + "step": 2503 + }, + { + "epoch": 0.17, + "grad_norm": 0.71875, + "learning_rate": 2e-05, + "loss": 2.2143, + "num_input_tokens_seen": 5251268608, + "step": 2504 + }, + { + "epoch": 0.17, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.2455, + "num_input_tokens_seen": 5253365760, + "step": 2505 + }, + { + "epoch": 0.17, + "grad_norm": 0.87109375, + "learning_rate": 2e-05, + "loss": 2.2435, + "num_input_tokens_seen": 5255462912, + "step": 2506 + }, + { + "epoch": 0.17, + "grad_norm": 0.6953125, + "learning_rate": 2e-05, + "loss": 2.2258, + "num_input_tokens_seen": 5257560064, + "step": 2507 + }, + { + "epoch": 0.17, + "grad_norm": 0.80078125, + "learning_rate": 2e-05, + "loss": 2.2156, + "num_input_tokens_seen": 5259657216, + "step": 2508 + }, + { + "epoch": 0.17, + "grad_norm": 0.83203125, + "learning_rate": 2e-05, + "loss": 2.2501, + "num_input_tokens_seen": 5261754368, + "step": 2509 + }, + { + "epoch": 0.17, + "grad_norm": 0.75, + "learning_rate": 2e-05, + "loss": 2.2627, + "num_input_tokens_seen": 5263851520, + "step": 2510 + }, + { + "epoch": 0.17, + "grad_norm": 0.6015625, + "learning_rate": 2e-05, + "loss": 2.2641, + "num_input_tokens_seen": 5265948672, + "step": 2511 + }, + { + "epoch": 0.17, + "grad_norm": 0.70703125, + "learning_rate": 2e-05, + "loss": 2.2227, + "num_input_tokens_seen": 5268045824, + "step": 2512 + }, + { + "epoch": 0.17, + "grad_norm": 0.78125, + "learning_rate": 2e-05, + "loss": 2.1824, + "num_input_tokens_seen": 5270142976, + "step": 2513 + }, + { + "epoch": 0.17, + "grad_norm": 0.6015625, + "learning_rate": 2e-05, + "loss": 2.2242, + "num_input_tokens_seen": 5272240128, + "step": 2514 + }, + { + "epoch": 0.17, + "grad_norm": 0.62890625, + "learning_rate": 2e-05, + "loss": 2.2262, + "num_input_tokens_seen": 5274337280, + "step": 2515 + }, + { + "epoch": 0.17, + "grad_norm": 0.625, + "learning_rate": 2e-05, + "loss": 2.2326, + "num_input_tokens_seen": 5276434432, + "step": 2516 + }, + { + "epoch": 0.17, + "grad_norm": 0.58203125, + "learning_rate": 2e-05, + "loss": 2.2287, + "num_input_tokens_seen": 5278531584, + "step": 2517 + }, + { + "epoch": 0.17, + "grad_norm": 0.61328125, + "learning_rate": 2e-05, + "loss": 2.2359, + "num_input_tokens_seen": 5280628736, + "step": 2518 + }, + { + "epoch": 0.17, + "grad_norm": 0.62890625, + "learning_rate": 2e-05, + "loss": 2.2626, + "num_input_tokens_seen": 5282725888, + "step": 2519 + }, + { + "epoch": 0.17, + "grad_norm": 0.55859375, + "learning_rate": 2e-05, + "loss": 2.2574, + "num_input_tokens_seen": 5284823040, + "step": 2520 + }, + { + "epoch": 0.17, + "grad_norm": 0.58203125, + "learning_rate": 2e-05, + "loss": 2.2636, + "num_input_tokens_seen": 5286920192, + "step": 2521 + }, + { + "epoch": 0.17, + "grad_norm": 0.609375, + "learning_rate": 2e-05, + "loss": 2.2267, + "num_input_tokens_seen": 5289017344, + "step": 2522 + }, + { + "epoch": 0.17, + "grad_norm": 0.5703125, + "learning_rate": 2e-05, + "loss": 2.2373, + "num_input_tokens_seen": 5291114496, + "step": 2523 + }, + { + "epoch": 0.17, + "grad_norm": 0.6015625, + "learning_rate": 2e-05, + "loss": 2.2023, + "num_input_tokens_seen": 5293211648, + "step": 2524 + }, + { + "epoch": 0.17, + "grad_norm": 0.61328125, + "learning_rate": 2e-05, + "loss": 2.2242, + "num_input_tokens_seen": 5295308800, + "step": 2525 + }, + { + "epoch": 0.17, + "grad_norm": 0.6328125, + "learning_rate": 2e-05, + "loss": 2.2436, + "num_input_tokens_seen": 5297405952, + "step": 2526 + }, + { + "epoch": 0.17, + "grad_norm": 0.61328125, + "learning_rate": 2e-05, + "loss": 2.211, + "num_input_tokens_seen": 5299503104, + "step": 2527 + }, + { + "epoch": 0.17, + "grad_norm": 0.62109375, + "learning_rate": 2e-05, + "loss": 2.2402, + "num_input_tokens_seen": 5301600256, + "step": 2528 + }, + { + "epoch": 0.17, + "grad_norm": 0.58984375, + "learning_rate": 2e-05, + "loss": 2.2375, + "num_input_tokens_seen": 5303697408, + "step": 2529 + }, + { + "epoch": 0.17, + "grad_norm": 0.625, + "learning_rate": 2e-05, + "loss": 2.2399, + "num_input_tokens_seen": 5305794560, + "step": 2530 + }, + { + "epoch": 0.17, + "grad_norm": 0.62890625, + "learning_rate": 2e-05, + "loss": 2.2409, + "num_input_tokens_seen": 5307891712, + "step": 2531 + }, + { + "epoch": 0.17, + "grad_norm": 0.59375, + "learning_rate": 2e-05, + "loss": 2.2632, + "num_input_tokens_seen": 5309988864, + "step": 2532 + }, + { + "epoch": 0.17, + "grad_norm": 0.66796875, + "learning_rate": 2e-05, + "loss": 2.2392, + "num_input_tokens_seen": 5312086016, + "step": 2533 + }, + { + "epoch": 0.17, + "grad_norm": 0.58984375, + "learning_rate": 2e-05, + "loss": 2.2223, + "num_input_tokens_seen": 5314183168, + "step": 2534 + }, + { + "epoch": 0.18, + "grad_norm": 0.58984375, + "learning_rate": 2e-05, + "loss": 2.2643, + "num_input_tokens_seen": 5316280320, + "step": 2535 + }, + { + "epoch": 0.18, + "grad_norm": 0.5546875, + "learning_rate": 2e-05, + "loss": 2.266, + "num_input_tokens_seen": 5318377472, + "step": 2536 + }, + { + "epoch": 0.18, + "grad_norm": 0.5859375, + "learning_rate": 2e-05, + "loss": 2.2564, + "num_input_tokens_seen": 5320474624, + "step": 2537 + }, + { + "epoch": 0.18, + "grad_norm": 0.54296875, + "learning_rate": 2e-05, + "loss": 2.2537, + "num_input_tokens_seen": 5322571776, + "step": 2538 + }, + { + "epoch": 0.18, + "grad_norm": 0.5625, + "learning_rate": 2e-05, + "loss": 2.2835, + "num_input_tokens_seen": 5324668928, + "step": 2539 + }, + { + "epoch": 0.18, + "grad_norm": 0.55859375, + "learning_rate": 2e-05, + "loss": 2.228, + "num_input_tokens_seen": 5326766080, + "step": 2540 + }, + { + "epoch": 0.18, + "grad_norm": 0.5390625, + "learning_rate": 2e-05, + "loss": 2.2302, + "num_input_tokens_seen": 5328863232, + "step": 2541 + }, + { + "epoch": 0.18, + "grad_norm": 0.5625, + "learning_rate": 2e-05, + "loss": 2.2323, + "num_input_tokens_seen": 5330960384, + "step": 2542 + }, + { + "epoch": 0.18, + "grad_norm": 0.57421875, + "learning_rate": 2e-05, + "loss": 2.1935, + "num_input_tokens_seen": 5333057536, + "step": 2543 + }, + { + "epoch": 0.18, + "grad_norm": 0.5546875, + "learning_rate": 2e-05, + "loss": 2.2391, + "num_input_tokens_seen": 5335154688, + "step": 2544 + }, + { + "epoch": 0.18, + "grad_norm": 0.56640625, + "learning_rate": 2e-05, + "loss": 2.2353, + "num_input_tokens_seen": 5337251840, + "step": 2545 + }, + { + "epoch": 0.18, + "grad_norm": 0.5234375, + "learning_rate": 2e-05, + "loss": 2.2606, + "num_input_tokens_seen": 5339348992, + "step": 2546 + }, + { + "epoch": 0.18, + "grad_norm": 0.54296875, + "learning_rate": 2e-05, + "loss": 2.2437, + "num_input_tokens_seen": 5341446144, + "step": 2547 + }, + { + "epoch": 0.18, + "grad_norm": 0.578125, + "learning_rate": 2e-05, + "loss": 2.2223, + "num_input_tokens_seen": 5343543296, + "step": 2548 + }, + { + "epoch": 0.18, + "grad_norm": 0.578125, + "learning_rate": 2e-05, + "loss": 2.234, + "num_input_tokens_seen": 5345640448, + "step": 2549 + }, + { + "epoch": 0.18, + "grad_norm": 0.57421875, + "learning_rate": 2e-05, + "loss": 2.2671, + "num_input_tokens_seen": 5347737600, + "step": 2550 + }, + { + "epoch": 0.18, + "grad_norm": 0.5546875, + "learning_rate": 2e-05, + "loss": 2.2482, + "num_input_tokens_seen": 5349834752, + "step": 2551 + }, + { + "epoch": 0.18, + "grad_norm": 0.61328125, + "learning_rate": 2e-05, + "loss": 2.2244, + "num_input_tokens_seen": 5351931904, + "step": 2552 + }, + { + "epoch": 0.18, + "grad_norm": 0.5703125, + "learning_rate": 2e-05, + "loss": 2.2537, + "num_input_tokens_seen": 5354029056, + "step": 2553 + }, + { + "epoch": 0.18, + "grad_norm": 0.62109375, + "learning_rate": 2e-05, + "loss": 2.2495, + "num_input_tokens_seen": 5356126208, + "step": 2554 + }, + { + "epoch": 0.18, + "grad_norm": 0.671875, + "learning_rate": 2e-05, + "loss": 2.2813, + "num_input_tokens_seen": 5358223360, + "step": 2555 + }, + { + "epoch": 0.18, + "grad_norm": 0.54296875, + "learning_rate": 2e-05, + "loss": 2.2157, + "num_input_tokens_seen": 5360320512, + "step": 2556 + }, + { + "epoch": 0.18, + "grad_norm": 0.57421875, + "learning_rate": 2e-05, + "loss": 2.2131, + "num_input_tokens_seen": 5362417664, + "step": 2557 + }, + { + "epoch": 0.18, + "grad_norm": 0.58984375, + "learning_rate": 2e-05, + "loss": 2.275, + "num_input_tokens_seen": 5364514816, + "step": 2558 + }, + { + "epoch": 0.18, + "grad_norm": 0.578125, + "learning_rate": 2e-05, + "loss": 2.2604, + "num_input_tokens_seen": 5366611968, + "step": 2559 + }, + { + "epoch": 0.18, + "grad_norm": 0.55859375, + "learning_rate": 2e-05, + "loss": 2.2455, + "num_input_tokens_seen": 5368709120, + "step": 2560 + }, + { + "epoch": 0.18, + "grad_norm": 0.5625, + "learning_rate": 2e-05, + "loss": 2.2506, + "num_input_tokens_seen": 5370806272, + "step": 2561 + }, + { + "epoch": 0.18, + "grad_norm": 0.578125, + "learning_rate": 2e-05, + "loss": 2.2468, + "num_input_tokens_seen": 5372903424, + "step": 2562 + }, + { + "epoch": 0.18, + "grad_norm": 0.5234375, + "learning_rate": 2e-05, + "loss": 2.2476, + "num_input_tokens_seen": 5375000576, + "step": 2563 + }, + { + "epoch": 0.18, + "grad_norm": 0.56640625, + "learning_rate": 2e-05, + "loss": 2.2391, + "num_input_tokens_seen": 5377097728, + "step": 2564 + }, + { + "epoch": 0.18, + "grad_norm": 0.57421875, + "learning_rate": 2e-05, + "loss": 2.2651, + "num_input_tokens_seen": 5379194880, + "step": 2565 + }, + { + "epoch": 0.18, + "grad_norm": 0.5390625, + "learning_rate": 2e-05, + "loss": 2.2662, + "num_input_tokens_seen": 5381292032, + "step": 2566 + }, + { + "epoch": 0.18, + "grad_norm": 0.53515625, + "learning_rate": 2e-05, + "loss": 2.2551, + "num_input_tokens_seen": 5383389184, + "step": 2567 + }, + { + "epoch": 0.18, + "grad_norm": 0.51953125, + "learning_rate": 2e-05, + "loss": 2.2431, + "num_input_tokens_seen": 5385486336, + "step": 2568 + }, + { + "epoch": 0.18, + "grad_norm": 0.5546875, + "learning_rate": 2e-05, + "loss": 2.269, + "num_input_tokens_seen": 5387583488, + "step": 2569 + }, + { + "epoch": 0.18, + "grad_norm": 0.54296875, + "learning_rate": 2e-05, + "loss": 2.244, + "num_input_tokens_seen": 5389680640, + "step": 2570 + }, + { + "epoch": 0.18, + "grad_norm": 0.5859375, + "learning_rate": 2e-05, + "loss": 2.2554, + "num_input_tokens_seen": 5391777792, + "step": 2571 + }, + { + "epoch": 0.18, + "grad_norm": 0.53125, + "learning_rate": 2e-05, + "loss": 2.228, + "num_input_tokens_seen": 5393874944, + "step": 2572 + }, + { + "epoch": 0.18, + "grad_norm": 0.53515625, + "learning_rate": 2e-05, + "loss": 2.2689, + "num_input_tokens_seen": 5395972096, + "step": 2573 + }, + { + "epoch": 0.18, + "grad_norm": 0.57421875, + "learning_rate": 2e-05, + "loss": 2.2274, + "num_input_tokens_seen": 5398069248, + "step": 2574 + }, + { + "epoch": 0.18, + "grad_norm": 0.5625, + "learning_rate": 2e-05, + "loss": 2.2118, + "num_input_tokens_seen": 5400166400, + "step": 2575 + }, + { + "epoch": 0.18, + "grad_norm": 0.6015625, + "learning_rate": 2e-05, + "loss": 2.2246, + "num_input_tokens_seen": 5402263552, + "step": 2576 + }, + { + "epoch": 0.18, + "grad_norm": 0.5625, + "learning_rate": 2e-05, + "loss": 2.231, + "num_input_tokens_seen": 5404360704, + "step": 2577 + }, + { + "epoch": 0.18, + "grad_norm": 0.5625, + "learning_rate": 2e-05, + "loss": 2.2491, + "num_input_tokens_seen": 5406457856, + "step": 2578 + }, + { + "epoch": 0.18, + "grad_norm": 0.6171875, + "learning_rate": 2e-05, + "loss": 2.2323, + "num_input_tokens_seen": 5408555008, + "step": 2579 + }, + { + "epoch": 0.18, + "grad_norm": 0.5859375, + "learning_rate": 2e-05, + "loss": 2.2529, + "num_input_tokens_seen": 5410652160, + "step": 2580 + }, + { + "epoch": 0.18, + "grad_norm": 0.67578125, + "learning_rate": 2e-05, + "loss": 2.2248, + "num_input_tokens_seen": 5412749312, + "step": 2581 + }, + { + "epoch": 0.18, + "grad_norm": 0.6015625, + "learning_rate": 2e-05, + "loss": 2.2077, + "num_input_tokens_seen": 5414846464, + "step": 2582 + }, + { + "epoch": 0.19, + "grad_norm": 0.61328125, + "learning_rate": 2e-05, + "loss": 2.2312, + "num_input_tokens_seen": 5416943616, + "step": 2583 + }, + { + "epoch": 0.19, + "grad_norm": 0.62890625, + "learning_rate": 2e-05, + "loss": 2.2491, + "num_input_tokens_seen": 5419040768, + "step": 2584 + }, + { + "epoch": 0.19, + "grad_norm": 0.69921875, + "learning_rate": 2e-05, + "loss": 2.2337, + "num_input_tokens_seen": 5421137920, + "step": 2585 + }, + { + "epoch": 0.19, + "grad_norm": 0.56640625, + "learning_rate": 2e-05, + "loss": 2.2125, + "num_input_tokens_seen": 5423235072, + "step": 2586 + }, + { + "epoch": 0.19, + "grad_norm": 0.64453125, + "learning_rate": 2e-05, + "loss": 2.2438, + "num_input_tokens_seen": 5425332224, + "step": 2587 + }, + { + "epoch": 0.19, + "eval_loss": 2.27114200592041, + "eval_runtime": 1924.8277, + "eval_samples_per_second": 2.048, + "eval_steps_per_second": 0.512, + "num_input_tokens_seen": 5425332224, + "step": 2587 + }, + { + "epoch": 0.19, + "grad_norm": 0.58984375, + "learning_rate": 2e-05, + "loss": 2.2681, + "num_input_tokens_seen": 5427429376, + "step": 2588 + }, + { + "epoch": 0.19, + "grad_norm": 0.55859375, + "learning_rate": 2e-05, + "loss": 2.2245, + "num_input_tokens_seen": 5429526528, + "step": 2589 + }, + { + "epoch": 0.19, + "grad_norm": 0.60546875, + "learning_rate": 2e-05, + "loss": 2.2476, + "num_input_tokens_seen": 5431623680, + "step": 2590 + }, + { + "epoch": 0.19, + "grad_norm": 0.53125, + "learning_rate": 2e-05, + "loss": 2.2004, + "num_input_tokens_seen": 5433720832, + "step": 2591 + }, + { + "epoch": 0.19, + "grad_norm": 0.6171875, + "learning_rate": 2e-05, + "loss": 2.2343, + "num_input_tokens_seen": 5435817984, + "step": 2592 + }, + { + "epoch": 0.19, + "grad_norm": 0.57421875, + "learning_rate": 2e-05, + "loss": 2.2351, + "num_input_tokens_seen": 5437915136, + "step": 2593 + }, + { + "epoch": 0.19, + "grad_norm": 0.58984375, + "learning_rate": 2e-05, + "loss": 2.2344, + "num_input_tokens_seen": 5440012288, + "step": 2594 + }, + { + "epoch": 0.19, + "grad_norm": 0.546875, + "learning_rate": 2e-05, + "loss": 2.2213, + "num_input_tokens_seen": 5442109440, + "step": 2595 + }, + { + "epoch": 0.19, + "grad_norm": 0.55859375, + "learning_rate": 2e-05, + "loss": 2.2306, + "num_input_tokens_seen": 5444206592, + "step": 2596 + }, + { + "epoch": 0.19, + "grad_norm": 0.58203125, + "learning_rate": 2e-05, + "loss": 2.2363, + "num_input_tokens_seen": 5446303744, + "step": 2597 + }, + { + "epoch": 0.19, + "grad_norm": 0.578125, + "learning_rate": 2e-05, + "loss": 2.2797, + "num_input_tokens_seen": 5448400896, + "step": 2598 + }, + { + "epoch": 0.19, + "grad_norm": 0.57421875, + "learning_rate": 2e-05, + "loss": 2.2352, + "num_input_tokens_seen": 5450498048, + "step": 2599 + }, + { + "epoch": 0.19, + "grad_norm": 0.54296875, + "learning_rate": 2e-05, + "loss": 2.2149, + "num_input_tokens_seen": 5452595200, + "step": 2600 } ], "logging_steps": 1, "max_steps": 4768, - "num_input_tokens_seen": 3565158400, + "num_input_tokens_seen": 5452595200, "num_train_epochs": 9223372036854775807, "save_steps": 100, - "total_flos": 1.5210377742479524e+20, + "total_flos": 2.3262930664968684e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null