diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,2913 @@ +{ + "best_metric": 0.8931613819214387, + "best_model_checkpoint": "bge-small-hotpotwa-matryoshka-fine-tuned-50/checkpoint-500", + "epoch": 26.924694993689524, + "eval_steps": 50, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.33655868742111905, + "grad_norm": 1.7359095811843872, + "learning_rate": 1.3513513513513515e-06, + "loss": 19.5758, + "step": 50 + }, + { + "epoch": 0.33655868742111905, + "eval_dim_128_cosine_accuracy": 0.9551585423568386, + "eval_dim_128_dot_accuracy": 0.08980123047799338, + "eval_dim_128_euclidean_accuracy": 0.9530288689067676, + "eval_dim_128_manhattan_accuracy": 0.9527922385234264, + "eval_dim_128_max_accuracy": 0.9551585423568386, + "eval_dim_256_cosine_accuracy": 0.966280170373876, + "eval_dim_256_dot_accuracy": 0.042711784193090394, + "eval_dim_256_euclidean_accuracy": 0.9659252247988642, + "eval_dim_256_manhattan_accuracy": 0.9634406057737813, + "eval_dim_256_max_accuracy": 0.966280170373876, + "eval_dim_384_cosine_accuracy": 0.9667534311405585, + "eval_dim_384_dot_accuracy": 0.03324656885944155, + "eval_dim_384_euclidean_accuracy": 0.9667534311405585, + "eval_dim_384_manhattan_accuracy": 0.9669900615238997, + "eval_dim_384_max_accuracy": 0.9669900615238997, + "eval_dim_64_cosine_accuracy": 0.9358731661145291, + "eval_dim_64_dot_accuracy": 0.1320397539044013, + "eval_dim_64_euclidean_accuracy": 0.9345716990061524, + "eval_dim_64_manhattan_accuracy": 0.9269995267392334, + "eval_dim_64_max_accuracy": 0.9358731661145291, + "eval_loss": 19.393272399902344, + "eval_runtime": 104.7788, + "eval_samples_per_second": 80.665, + "eval_sequential_score": 0.9358731661145291, + "eval_steps_per_second": 2.529, + "step": 50 + }, + { + "epoch": 0.6731173748422381, + "grad_norm": 1.976278305053711, + "learning_rate": 2.702702702702703e-06, + "loss": 19.4573, + "step": 100 + }, + { + "epoch": 0.6731173748422381, + "eval_dim_128_cosine_accuracy": 0.9570515854235684, + "eval_dim_128_dot_accuracy": 0.06625650733554188, + "eval_dim_128_euclidean_accuracy": 0.9589446284902982, + "eval_dim_128_manhattan_accuracy": 0.9557501183151916, + "eval_dim_128_max_accuracy": 0.9589446284902982, + "eval_dim_256_cosine_accuracy": 0.9646237576904875, + "eval_dim_256_dot_accuracy": 0.04046379555134879, + "eval_dim_256_euclidean_accuracy": 0.9650970184571699, + "eval_dim_256_manhattan_accuracy": 0.9632039753904401, + "eval_dim_256_max_accuracy": 0.9650970184571699, + "eval_dim_384_cosine_accuracy": 0.9653336488405111, + "eval_dim_384_dot_accuracy": 0.03466635115948888, + "eval_dim_384_euclidean_accuracy": 0.9653336488405111, + "eval_dim_384_manhattan_accuracy": 0.9646237576904875, + "eval_dim_384_max_accuracy": 0.9653336488405111, + "eval_dim_64_cosine_accuracy": 0.9449834358731661, + "eval_dim_64_dot_accuracy": 0.08932796971131093, + "eval_dim_64_euclidean_accuracy": 0.9461665877898722, + "eval_dim_64_manhattan_accuracy": 0.9420255560814008, + "eval_dim_64_max_accuracy": 0.9461665877898722, + "eval_loss": 19.097097396850586, + "eval_runtime": 103.9699, + "eval_samples_per_second": 81.293, + "eval_sequential_score": 0.9449834358731661, + "eval_steps_per_second": 2.549, + "step": 100 + }, + { + "epoch": 1.0096760622633572, + "grad_norm": 2.1209616661071777, + "learning_rate": 4.0540540540540545e-06, + "loss": 19.1409, + "step": 150 + }, + { + "epoch": 1.0096760622633572, + "eval_dim_128_cosine_accuracy": 0.9384761003312825, + "eval_dim_128_dot_accuracy": 0.06897775674396593, + "eval_dim_128_euclidean_accuracy": 0.9421438712730714, + "eval_dim_128_manhattan_accuracy": 0.939540937056318, + "eval_dim_128_max_accuracy": 0.9421438712730714, + "eval_dim_256_cosine_accuracy": 0.9434453383814482, + "eval_dim_256_dot_accuracy": 0.05797444391859915, + "eval_dim_256_euclidean_accuracy": 0.9436819687647894, + "eval_dim_256_manhattan_accuracy": 0.9423805016564126, + "eval_dim_256_max_accuracy": 0.9436819687647894, + "eval_dim_384_cosine_accuracy": 0.9473497397065783, + "eval_dim_384_dot_accuracy": 0.05265026029342167, + "eval_dim_384_euclidean_accuracy": 0.9473497397065783, + "eval_dim_384_manhattan_accuracy": 0.9458116422148604, + "eval_dim_384_max_accuracy": 0.9473497397065783, + "eval_dim_64_cosine_accuracy": 0.9306672976810223, + "eval_dim_64_dot_accuracy": 0.07749645054424988, + "eval_dim_64_euclidean_accuracy": 0.9332702318977757, + "eval_dim_64_manhattan_accuracy": 0.9320870799810695, + "eval_dim_64_max_accuracy": 0.9332702318977757, + "eval_loss": 18.4069766998291, + "eval_runtime": 103.2125, + "eval_samples_per_second": 81.889, + "eval_sequential_score": 0.9306672976810223, + "eval_steps_per_second": 2.568, + "step": 150 + }, + { + "epoch": 1.3462347496844762, + "grad_norm": 1.658170461654663, + "learning_rate": 5.405405405405406e-06, + "loss": 18.6431, + "step": 200 + }, + { + "epoch": 1.3462347496844762, + "eval_dim_128_cosine_accuracy": 0.9125650733554188, + "eval_dim_128_dot_accuracy": 0.08826313298627544, + "eval_dim_128_euclidean_accuracy": 0.9139848556554662, + "eval_dim_128_manhattan_accuracy": 0.9145764316138192, + "eval_dim_128_max_accuracy": 0.9145764316138192, + "eval_dim_256_cosine_accuracy": 0.9163511594888784, + "eval_dim_256_dot_accuracy": 0.08613345953620445, + "eval_dim_256_euclidean_accuracy": 0.9163511594888784, + "eval_dim_256_manhattan_accuracy": 0.9151680075721723, + "eval_dim_256_max_accuracy": 0.9163511594888784, + "eval_dim_384_cosine_accuracy": 0.9183625177472787, + "eval_dim_384_dot_accuracy": 0.08163748225272124, + "eval_dim_384_euclidean_accuracy": 0.9183625177472787, + "eval_dim_384_manhattan_accuracy": 0.9184808329389493, + "eval_dim_384_max_accuracy": 0.9184808329389493, + "eval_dim_64_cosine_accuracy": 0.9093705631803124, + "eval_dim_64_dot_accuracy": 0.09477046852815901, + "eval_dim_64_euclidean_accuracy": 0.9126833885470894, + "eval_dim_64_manhattan_accuracy": 0.9113819214387128, + "eval_dim_64_max_accuracy": 0.9126833885470894, + "eval_loss": 17.32919692993164, + "eval_runtime": 102.8811, + "eval_samples_per_second": 82.153, + "eval_sequential_score": 0.9093705631803124, + "eval_steps_per_second": 2.576, + "step": 200 + }, + { + "epoch": 1.6827934371055953, + "grad_norm": 1.5389924049377441, + "learning_rate": 6.7567567567567575e-06, + "loss": 18.2288, + "step": 250 + }, + { + "epoch": 1.6827934371055953, + "eval_dim_128_cosine_accuracy": 0.9062943681968765, + "eval_dim_128_dot_accuracy": 0.09311405584477046, + "eval_dim_128_euclidean_accuracy": 0.9062943681968765, + "eval_dim_128_manhattan_accuracy": 0.9062943681968765, + "eval_dim_128_max_accuracy": 0.9062943681968765, + "eval_dim_256_cosine_accuracy": 0.9071225745385707, + "eval_dim_256_dot_accuracy": 0.09335068622811168, + "eval_dim_256_euclidean_accuracy": 0.907950780880265, + "eval_dim_256_manhattan_accuracy": 0.9093705631803124, + "eval_dim_256_max_accuracy": 0.9093705631803124, + "eval_dim_384_cosine_accuracy": 0.9099621391386654, + "eval_dim_384_dot_accuracy": 0.0900378608613346, + "eval_dim_384_euclidean_accuracy": 0.9099621391386654, + "eval_dim_384_manhattan_accuracy": 0.9087789872219593, + "eval_dim_384_max_accuracy": 0.9099621391386654, + "eval_dim_64_cosine_accuracy": 0.9022716516800757, + "eval_dim_64_dot_accuracy": 0.09962139138665405, + "eval_dim_64_euclidean_accuracy": 0.9046379555134879, + "eval_dim_64_manhattan_accuracy": 0.9040463795551349, + "eval_dim_64_max_accuracy": 0.9046379555134879, + "eval_loss": 16.875099182128906, + "eval_runtime": 104.7249, + "eval_samples_per_second": 80.707, + "eval_sequential_score": 0.9022716516800757, + "eval_steps_per_second": 2.53, + "step": 250 + }, + { + "epoch": 2.0193521245267143, + "grad_norm": 1.4371246099472046, + "learning_rate": 8.108108108108109e-06, + "loss": 18.0425, + "step": 300 + }, + { + "epoch": 2.0193521245267143, + "eval_dim_128_cosine_accuracy": 0.9020350212967345, + "eval_dim_128_dot_accuracy": 0.09772834831992427, + "eval_dim_128_euclidean_accuracy": 0.9035731187884525, + "eval_dim_128_manhattan_accuracy": 0.9044013251301467, + "eval_dim_128_max_accuracy": 0.9044013251301467, + "eval_dim_256_cosine_accuracy": 0.9032181732134406, + "eval_dim_256_dot_accuracy": 0.09690014197823, + "eval_dim_256_euclidean_accuracy": 0.90309985802177, + "eval_dim_256_manhattan_accuracy": 0.9042830099384761, + "eval_dim_256_max_accuracy": 0.9042830099384761, + "eval_dim_384_cosine_accuracy": 0.9045196403218173, + "eval_dim_384_dot_accuracy": 0.09548035967818268, + "eval_dim_384_euclidean_accuracy": 0.9045196403218173, + "eval_dim_384_manhattan_accuracy": 0.9049929010884997, + "eval_dim_384_max_accuracy": 0.9049929010884997, + "eval_dim_64_cosine_accuracy": 0.8989588263132986, + "eval_dim_64_dot_accuracy": 0.10234264079507809, + "eval_dim_64_euclidean_accuracy": 0.9016800757217227, + "eval_dim_64_manhattan_accuracy": 0.9016800757217227, + "eval_dim_64_max_accuracy": 0.9016800757217227, + "eval_loss": 16.69808578491211, + "eval_runtime": 103.4615, + "eval_samples_per_second": 81.692, + "eval_sequential_score": 0.8989588263132986, + "eval_steps_per_second": 2.561, + "step": 300 + }, + { + "epoch": 2.3559108119478336, + "grad_norm": 1.386720895767212, + "learning_rate": 9.45945945945946e-06, + "loss": 17.9458, + "step": 350 + }, + { + "epoch": 2.3559108119478336, + "eval_dim_128_cosine_accuracy": 0.9036914339801231, + "eval_dim_128_dot_accuracy": 0.09761003312825367, + "eval_dim_128_euclidean_accuracy": 0.9034548035967819, + "eval_dim_128_manhattan_accuracy": 0.9016800757217227, + "eval_dim_128_max_accuracy": 0.9036914339801231, + "eval_dim_256_cosine_accuracy": 0.9013251301467108, + "eval_dim_256_dot_accuracy": 0.09855655466161856, + "eval_dim_256_euclidean_accuracy": 0.9015617605300521, + "eval_dim_256_manhattan_accuracy": 0.9022716516800757, + "eval_dim_256_max_accuracy": 0.9022716516800757, + "eval_dim_384_cosine_accuracy": 0.9021533364884051, + "eval_dim_384_dot_accuracy": 0.09784666351159489, + "eval_dim_384_euclidean_accuracy": 0.9021533364884051, + "eval_dim_384_manhattan_accuracy": 0.9039280643634643, + "eval_dim_384_max_accuracy": 0.9039280643634643, + "eval_dim_64_cosine_accuracy": 0.8983672503549456, + "eval_dim_64_dot_accuracy": 0.10352579271178419, + "eval_dim_64_euclidean_accuracy": 0.8995504022716517, + "eval_dim_64_manhattan_accuracy": 0.8981306199716044, + "eval_dim_64_max_accuracy": 0.8995504022716517, + "eval_loss": 16.615509033203125, + "eval_runtime": 103.1308, + "eval_samples_per_second": 81.954, + "eval_sequential_score": 0.8983672503549456, + "eval_steps_per_second": 2.57, + "step": 350 + }, + { + "epoch": 2.6924694993689524, + "grad_norm": 1.4882862567901611, + "learning_rate": 1.0810810810810812e-05, + "loss": 17.8525, + "step": 400 + }, + { + "epoch": 2.6924694993689524, + "eval_dim_128_cosine_accuracy": 0.8977756743965926, + "eval_dim_128_dot_accuracy": 0.10269758637008992, + "eval_dim_128_euclidean_accuracy": 0.9006152389966872, + "eval_dim_128_manhattan_accuracy": 0.900378608613346, + "eval_dim_128_max_accuracy": 0.9006152389966872, + "eval_dim_256_cosine_accuracy": 0.8970657832465688, + "eval_dim_256_dot_accuracy": 0.10269758637008992, + "eval_dim_256_euclidean_accuracy": 0.8980123047799338, + "eval_dim_256_manhattan_accuracy": 0.8971840984382394, + "eval_dim_256_max_accuracy": 0.8980123047799338, + "eval_dim_384_cosine_accuracy": 0.8974207288215806, + "eval_dim_384_dot_accuracy": 0.1025792711784193, + "eval_dim_384_euclidean_accuracy": 0.8974207288215806, + "eval_dim_384_manhattan_accuracy": 0.898248935163275, + "eval_dim_384_max_accuracy": 0.898248935163275, + "eval_dim_64_cosine_accuracy": 0.8948177946048272, + "eval_dim_64_dot_accuracy": 0.10636535731187885, + "eval_dim_64_euclidean_accuracy": 0.8969474680548982, + "eval_dim_64_manhattan_accuracy": 0.8948177946048272, + "eval_dim_64_max_accuracy": 0.8969474680548982, + "eval_loss": 16.553625106811523, + "eval_runtime": 103.3808, + "eval_samples_per_second": 81.756, + "eval_sequential_score": 0.8948177946048272, + "eval_steps_per_second": 2.563, + "step": 400 + }, + { + "epoch": 3.0290281867900717, + "grad_norm": 1.5986053943634033, + "learning_rate": 1.2162162162162164e-05, + "loss": 17.7529, + "step": 450 + }, + { + "epoch": 3.0290281867900717, + "eval_dim_128_cosine_accuracy": 0.8980123047799338, + "eval_dim_128_dot_accuracy": 0.10340747752011359, + "eval_dim_128_euclidean_accuracy": 0.8997870326549929, + "eval_dim_128_manhattan_accuracy": 0.8996687174633223, + "eval_dim_128_max_accuracy": 0.8997870326549929, + "eval_dim_256_cosine_accuracy": 0.8956460009465216, + "eval_dim_256_dot_accuracy": 0.10399905347846664, + "eval_dim_256_euclidean_accuracy": 0.8970657832465688, + "eval_dim_256_manhattan_accuracy": 0.8960009465215334, + "eval_dim_256_max_accuracy": 0.8970657832465688, + "eval_dim_384_cosine_accuracy": 0.8952910553715097, + "eval_dim_384_dot_accuracy": 0.1047089446284903, + "eval_dim_384_euclidean_accuracy": 0.8952910553715097, + "eval_dim_384_manhattan_accuracy": 0.8971840984382394, + "eval_dim_384_max_accuracy": 0.8971840984382394, + "eval_dim_64_cosine_accuracy": 0.8950544249881685, + "eval_dim_64_dot_accuracy": 0.10541883577851396, + "eval_dim_64_euclidean_accuracy": 0.8969474680548982, + "eval_dim_64_manhattan_accuracy": 0.8948177946048272, + "eval_dim_64_max_accuracy": 0.8969474680548982, + "eval_loss": 16.51355743408203, + "eval_runtime": 104.654, + "eval_samples_per_second": 80.761, + "eval_sequential_score": 0.8950544249881685, + "eval_steps_per_second": 2.532, + "step": 450 + }, + { + "epoch": 3.3655868742111905, + "grad_norm": 1.8756661415100098, + "learning_rate": 1.3513513513513515e-05, + "loss": 17.6709, + "step": 500 + }, + { + "epoch": 3.3655868742111905, + "eval_dim_128_cosine_accuracy": 0.8931613819214387, + "eval_dim_128_dot_accuracy": 0.10766682442025556, + "eval_dim_128_euclidean_accuracy": 0.8944628490298154, + "eval_dim_128_manhattan_accuracy": 0.8942262186464742, + "eval_dim_128_max_accuracy": 0.8944628490298154, + "eval_dim_256_cosine_accuracy": 0.8913866540463795, + "eval_dim_256_dot_accuracy": 0.10896829152863227, + "eval_dim_256_euclidean_accuracy": 0.8937529578797918, + "eval_dim_256_manhattan_accuracy": 0.8937529578797918, + "eval_dim_256_max_accuracy": 0.8937529578797918, + "eval_dim_384_cosine_accuracy": 0.8928064363464269, + "eval_dim_384_dot_accuracy": 0.10719356365357312, + "eval_dim_384_euclidean_accuracy": 0.8928064363464269, + "eval_dim_384_manhattan_accuracy": 0.8932796971131093, + "eval_dim_384_max_accuracy": 0.8932796971131093, + "eval_dim_64_cosine_accuracy": 0.8906767628963559, + "eval_dim_64_dot_accuracy": 0.11121628017037388, + "eval_dim_64_euclidean_accuracy": 0.8911500236630383, + "eval_dim_64_manhattan_accuracy": 0.8893752957879791, + "eval_dim_64_max_accuracy": 0.8911500236630383, + "eval_loss": 16.4824161529541, + "eval_runtime": 103.2754, + "eval_samples_per_second": 81.839, + "eval_sequential_score": 0.8906767628963559, + "eval_steps_per_second": 2.566, + "step": 500 + }, + { + "epoch": 3.70214556163231, + "grad_norm": 2.3590304851531982, + "learning_rate": 1.4864864864864865e-05, + "loss": 17.5348, + "step": 550 + }, + { + "epoch": 3.70214556163231, + "eval_dim_128_cosine_accuracy": 0.8862991008045433, + "eval_dim_128_dot_accuracy": 0.11500236630383341, + "eval_dim_128_euclidean_accuracy": 0.8864174159962139, + "eval_dim_128_manhattan_accuracy": 0.8858258400378609, + "eval_dim_128_max_accuracy": 0.8864174159962139, + "eval_dim_256_cosine_accuracy": 0.8858258400378609, + "eval_dim_256_dot_accuracy": 0.11358258400378608, + "eval_dim_256_euclidean_accuracy": 0.8867723615712257, + "eval_dim_256_manhattan_accuracy": 0.8858258400378609, + "eval_dim_256_max_accuracy": 0.8867723615712257, + "eval_dim_384_cosine_accuracy": 0.8859441552295315, + "eval_dim_384_dot_accuracy": 0.11405584477046853, + "eval_dim_384_euclidean_accuracy": 0.8859441552295315, + "eval_dim_384_manhattan_accuracy": 0.88760056791292, + "eval_dim_384_max_accuracy": 0.88760056791292, + "eval_dim_64_cosine_accuracy": 0.884879318504496, + "eval_dim_64_dot_accuracy": 0.11985328916232844, + "eval_dim_64_euclidean_accuracy": 0.8845243729294842, + "eval_dim_64_manhattan_accuracy": 0.8828679602460956, + "eval_dim_64_max_accuracy": 0.884879318504496, + "eval_loss": 16.463218688964844, + "eval_runtime": 103.2788, + "eval_samples_per_second": 81.837, + "eval_sequential_score": 0.884879318504496, + "eval_steps_per_second": 2.566, + "step": 550 + }, + { + "epoch": 4.038704249053429, + "grad_norm": 2.6120336055755615, + "learning_rate": 1.6216216216216218e-05, + "loss": 17.4198, + "step": 600 + }, + { + "epoch": 4.038704249053429, + "eval_dim_128_cosine_accuracy": 0.8852342640795078, + "eval_dim_128_dot_accuracy": 0.11748698532891623, + "eval_dim_128_euclidean_accuracy": 0.8846426881211548, + "eval_dim_128_manhattan_accuracy": 0.8859441552295315, + "eval_dim_128_max_accuracy": 0.8859441552295315, + "eval_dim_256_cosine_accuracy": 0.8861807856128727, + "eval_dim_256_dot_accuracy": 0.1137008991954567, + "eval_dim_256_euclidean_accuracy": 0.8871273071462376, + "eval_dim_256_manhattan_accuracy": 0.8866540463795551, + "eval_dim_256_max_accuracy": 0.8871273071462376, + "eval_dim_384_cosine_accuracy": 0.8859441552295315, + "eval_dim_384_dot_accuracy": 0.11405584477046853, + "eval_dim_384_euclidean_accuracy": 0.8859441552295315, + "eval_dim_384_manhattan_accuracy": 0.8847610033128254, + "eval_dim_384_max_accuracy": 0.8859441552295315, + "eval_dim_64_cosine_accuracy": 0.8839327969711311, + "eval_dim_64_dot_accuracy": 0.12103644107903455, + "eval_dim_64_euclidean_accuracy": 0.8861807856128727, + "eval_dim_64_manhattan_accuracy": 0.8857075248461902, + "eval_dim_64_max_accuracy": 0.8861807856128727, + "eval_loss": 16.46009063720703, + "eval_runtime": 104.1113, + "eval_samples_per_second": 81.182, + "eval_sequential_score": 0.8839327969711311, + "eval_steps_per_second": 2.545, + "step": 600 + }, + { + "epoch": 4.375262936474548, + "grad_norm": 2.63383412361145, + "learning_rate": 1.756756756756757e-05, + "loss": 17.3673, + "step": 650 + }, + { + "epoch": 4.375262936474548, + "eval_dim_128_cosine_accuracy": 0.8853525792711784, + "eval_dim_128_dot_accuracy": 0.1160672030288689, + "eval_dim_128_euclidean_accuracy": 0.8867723615712257, + "eval_dim_128_manhattan_accuracy": 0.8855892096545196, + "eval_dim_128_max_accuracy": 0.8867723615712257, + "eval_dim_256_cosine_accuracy": 0.8864174159962139, + "eval_dim_256_dot_accuracy": 0.11417415996213914, + "eval_dim_256_euclidean_accuracy": 0.8871273071462376, + "eval_dim_256_manhattan_accuracy": 0.8862991008045433, + "eval_dim_256_max_accuracy": 0.8871273071462376, + "eval_dim_384_cosine_accuracy": 0.8865357311878845, + "eval_dim_384_dot_accuracy": 0.11346426881211548, + "eval_dim_384_euclidean_accuracy": 0.8865357311878845, + "eval_dim_384_manhattan_accuracy": 0.8861807856128727, + "eval_dim_384_max_accuracy": 0.8865357311878845, + "eval_dim_64_cosine_accuracy": 0.8841694273544723, + "eval_dim_64_dot_accuracy": 0.12091812588736393, + "eval_dim_64_euclidean_accuracy": 0.883341221012778, + "eval_dim_64_manhattan_accuracy": 0.8828679602460956, + "eval_dim_64_max_accuracy": 0.8841694273544723, + "eval_loss": 16.440513610839844, + "eval_runtime": 102.5958, + "eval_samples_per_second": 82.382, + "eval_sequential_score": 0.8841694273544723, + "eval_steps_per_second": 2.583, + "step": 650 + }, + { + "epoch": 4.711821623895667, + "grad_norm": 3.044569730758667, + "learning_rate": 1.891891891891892e-05, + "loss": 17.2603, + "step": 700 + }, + { + "epoch": 4.711821623895667, + "eval_dim_128_cosine_accuracy": 0.8834595362044486, + "eval_dim_128_dot_accuracy": 0.11772361571225745, + "eval_dim_128_euclidean_accuracy": 0.8835778513961192, + "eval_dim_128_manhattan_accuracy": 0.8840511121628017, + "eval_dim_128_max_accuracy": 0.8840511121628017, + "eval_dim_256_cosine_accuracy": 0.8838144817794605, + "eval_dim_256_dot_accuracy": 0.11571225745385708, + "eval_dim_256_euclidean_accuracy": 0.8838144817794605, + "eval_dim_256_manhattan_accuracy": 0.8839327969711311, + "eval_dim_256_max_accuracy": 0.8839327969711311, + "eval_dim_384_cosine_accuracy": 0.8838144817794605, + "eval_dim_384_dot_accuracy": 0.11618551822053952, + "eval_dim_384_euclidean_accuracy": 0.8838144817794605, + "eval_dim_384_manhattan_accuracy": 0.8847610033128254, + "eval_dim_384_max_accuracy": 0.8847610033128254, + "eval_dim_64_cosine_accuracy": 0.8807382867960246, + "eval_dim_64_dot_accuracy": 0.12328442972077615, + "eval_dim_64_euclidean_accuracy": 0.8814481779460482, + "eval_dim_64_manhattan_accuracy": 0.8810932323710364, + "eval_dim_64_max_accuracy": 0.8814481779460482, + "eval_loss": 16.435609817504883, + "eval_runtime": 103.6437, + "eval_samples_per_second": 81.549, + "eval_sequential_score": 0.8807382867960246, + "eval_steps_per_second": 2.557, + "step": 700 + }, + { + "epoch": 5.0483803113167856, + "grad_norm": 3.3264880180358887, + "learning_rate": 1.9999888744757143e-05, + "loss": 17.1807, + "step": 750 + }, + { + "epoch": 5.0483803113167856, + "eval_dim_128_cosine_accuracy": 0.8849976336961666, + "eval_dim_128_dot_accuracy": 0.11654046379555134, + "eval_dim_128_euclidean_accuracy": 0.884879318504496, + "eval_dim_128_manhattan_accuracy": 0.8838144817794605, + "eval_dim_128_max_accuracy": 0.8849976336961666, + "eval_dim_256_cosine_accuracy": 0.8864174159962139, + "eval_dim_256_dot_accuracy": 0.11417415996213914, + "eval_dim_256_euclidean_accuracy": 0.8852342640795078, + "eval_dim_256_manhattan_accuracy": 0.8857075248461902, + "eval_dim_256_max_accuracy": 0.8864174159962139, + "eval_dim_384_cosine_accuracy": 0.8859441552295315, + "eval_dim_384_dot_accuracy": 0.11405584477046853, + "eval_dim_384_euclidean_accuracy": 0.8859441552295315, + "eval_dim_384_manhattan_accuracy": 0.8855892096545196, + "eval_dim_384_max_accuracy": 0.8859441552295315, + "eval_dim_64_cosine_accuracy": 0.8838144817794605, + "eval_dim_64_dot_accuracy": 0.12079981069569333, + "eval_dim_64_euclidean_accuracy": 0.8844060577378136, + "eval_dim_64_manhattan_accuracy": 0.8834595362044486, + "eval_dim_64_max_accuracy": 0.8844060577378136, + "eval_loss": 16.444347381591797, + "eval_runtime": 103.5226, + "eval_samples_per_second": 81.644, + "eval_sequential_score": 0.8838144817794605, + "eval_steps_per_second": 2.56, + "step": 750 + }, + { + "epoch": 5.384938998737905, + "grad_norm": 2.7032034397125244, + "learning_rate": 1.999599507118322e-05, + "loss": 17.1629, + "step": 800 + }, + { + "epoch": 5.384938998737905, + "eval_dim_128_cosine_accuracy": 0.8847610033128254, + "eval_dim_128_dot_accuracy": 0.11701372456223379, + "eval_dim_128_euclidean_accuracy": 0.8859441552295315, + "eval_dim_128_manhattan_accuracy": 0.884879318504496, + "eval_dim_128_max_accuracy": 0.8859441552295315, + "eval_dim_256_cosine_accuracy": 0.8861807856128727, + "eval_dim_256_dot_accuracy": 0.11417415996213914, + "eval_dim_256_euclidean_accuracy": 0.8859441552295315, + "eval_dim_256_manhattan_accuracy": 0.8853525792711784, + "eval_dim_256_max_accuracy": 0.8861807856128727, + "eval_dim_384_cosine_accuracy": 0.8866540463795551, + "eval_dim_384_dot_accuracy": 0.11334595362044486, + "eval_dim_384_euclidean_accuracy": 0.8866540463795551, + "eval_dim_384_manhattan_accuracy": 0.8862991008045433, + "eval_dim_384_max_accuracy": 0.8866540463795551, + "eval_dim_64_cosine_accuracy": 0.8841694273544723, + "eval_dim_64_dot_accuracy": 0.11831519167061051, + "eval_dim_64_euclidean_accuracy": 0.8841694273544723, + "eval_dim_64_manhattan_accuracy": 0.8839327969711311, + "eval_dim_64_max_accuracy": 0.8841694273544723, + "eval_loss": 16.420166015625, + "eval_runtime": 103.5297, + "eval_samples_per_second": 81.638, + "eval_sequential_score": 0.8841694273544723, + "eval_steps_per_second": 2.56, + "step": 800 + }, + { + "epoch": 5.721497686159024, + "grad_norm": 3.8163998126983643, + "learning_rate": 1.9986541110764565e-05, + "loss": 17.0747, + "step": 850 + }, + { + "epoch": 5.721497686159024, + "eval_dim_128_cosine_accuracy": 0.8853525792711784, + "eval_dim_128_dot_accuracy": 0.11618551822053952, + "eval_dim_128_euclidean_accuracy": 0.8835778513961192, + "eval_dim_128_manhattan_accuracy": 0.8845243729294842, + "eval_dim_128_max_accuracy": 0.8853525792711784, + "eval_dim_256_cosine_accuracy": 0.8874822527212494, + "eval_dim_256_dot_accuracy": 0.11358258400378608, + "eval_dim_256_euclidean_accuracy": 0.8864174159962139, + "eval_dim_256_manhattan_accuracy": 0.8862991008045433, + "eval_dim_256_max_accuracy": 0.8874822527212494, + "eval_dim_384_cosine_accuracy": 0.8868906767628963, + "eval_dim_384_dot_accuracy": 0.11310932323710364, + "eval_dim_384_euclidean_accuracy": 0.8868906767628963, + "eval_dim_384_manhattan_accuracy": 0.8862991008045433, + "eval_dim_384_max_accuracy": 0.8868906767628963, + "eval_dim_64_cosine_accuracy": 0.8836961665877898, + "eval_dim_64_dot_accuracy": 0.11867013724562234, + "eval_dim_64_euclidean_accuracy": 0.8831045906294368, + "eval_dim_64_manhattan_accuracy": 0.8832229058211074, + "eval_dim_64_max_accuracy": 0.8836961665877898, + "eval_loss": 16.416208267211914, + "eval_runtime": 103.4694, + "eval_samples_per_second": 81.686, + "eval_sequential_score": 0.8836961665877898, + "eval_steps_per_second": 2.561, + "step": 850 + }, + { + "epoch": 6.058056373580143, + "grad_norm": 3.9848620891571045, + "learning_rate": 1.9971532122280466e-05, + "loss": 17.0161, + "step": 900 + }, + { + "epoch": 6.058056373580143, + "eval_dim_128_cosine_accuracy": 0.8852342640795078, + "eval_dim_128_dot_accuracy": 0.11618551822053952, + "eval_dim_128_euclidean_accuracy": 0.8852342640795078, + "eval_dim_128_manhattan_accuracy": 0.8846426881211548, + "eval_dim_128_max_accuracy": 0.8852342640795078, + "eval_dim_256_cosine_accuracy": 0.8862991008045433, + "eval_dim_256_dot_accuracy": 0.11417415996213914, + "eval_dim_256_euclidean_accuracy": 0.8858258400378609, + "eval_dim_256_manhattan_accuracy": 0.8853525792711784, + "eval_dim_256_max_accuracy": 0.8862991008045433, + "eval_dim_384_cosine_accuracy": 0.8855892096545196, + "eval_dim_384_dot_accuracy": 0.11441079034548036, + "eval_dim_384_euclidean_accuracy": 0.8855892096545196, + "eval_dim_384_manhattan_accuracy": 0.885470894462849, + "eval_dim_384_max_accuracy": 0.8855892096545196, + "eval_dim_64_cosine_accuracy": 0.8855892096545196, + "eval_dim_64_dot_accuracy": 0.11831519167061051, + "eval_dim_64_euclidean_accuracy": 0.885470894462849, + "eval_dim_64_manhattan_accuracy": 0.8834595362044486, + "eval_dim_64_max_accuracy": 0.8855892096545196, + "eval_loss": 16.419212341308594, + "eval_runtime": 104.3001, + "eval_samples_per_second": 81.035, + "eval_sequential_score": 0.8855892096545196, + "eval_steps_per_second": 2.541, + "step": 900 + }, + { + "epoch": 6.394615061001262, + "grad_norm": 4.083323001861572, + "learning_rate": 1.995097645450266e-05, + "loss": 17.0146, + "step": 950 + }, + { + "epoch": 6.394615061001262, + "eval_dim_128_cosine_accuracy": 0.884879318504496, + "eval_dim_128_dot_accuracy": 0.1171320397539044, + "eval_dim_128_euclidean_accuracy": 0.8861807856128727, + "eval_dim_128_manhattan_accuracy": 0.8853525792711784, + "eval_dim_128_max_accuracy": 0.8861807856128727, + "eval_dim_256_cosine_accuracy": 0.8853525792711784, + "eval_dim_256_dot_accuracy": 0.11464742072882159, + "eval_dim_256_euclidean_accuracy": 0.885470894462849, + "eval_dim_256_manhattan_accuracy": 0.8858258400378609, + "eval_dim_256_max_accuracy": 0.8858258400378609, + "eval_dim_384_cosine_accuracy": 0.8855892096545196, + "eval_dim_384_dot_accuracy": 0.11441079034548036, + "eval_dim_384_euclidean_accuracy": 0.8855892096545196, + "eval_dim_384_manhattan_accuracy": 0.8864174159962139, + "eval_dim_384_max_accuracy": 0.8864174159962139, + "eval_dim_64_cosine_accuracy": 0.8844060577378136, + "eval_dim_64_dot_accuracy": 0.11796024609559867, + "eval_dim_64_euclidean_accuracy": 0.8852342640795078, + "eval_dim_64_manhattan_accuracy": 0.8844060577378136, + "eval_dim_64_max_accuracy": 0.8852342640795078, + "eval_loss": 16.403297424316406, + "eval_runtime": 102.2875, + "eval_samples_per_second": 82.63, + "eval_sequential_score": 0.8844060577378136, + "eval_steps_per_second": 2.591, + "step": 950 + }, + { + "epoch": 6.731173748422381, + "grad_norm": 3.874021291732788, + "learning_rate": 1.992488554155135e-05, + "loss": 16.9393, + "step": 1000 + }, + { + "epoch": 6.731173748422381, + "eval_dim_128_cosine_accuracy": 0.8828679602460956, + "eval_dim_128_dot_accuracy": 0.11784193090392807, + "eval_dim_128_euclidean_accuracy": 0.8846426881211548, + "eval_dim_128_manhattan_accuracy": 0.8841694273544723, + "eval_dim_128_max_accuracy": 0.8846426881211548, + "eval_dim_256_cosine_accuracy": 0.8839327969711311, + "eval_dim_256_dot_accuracy": 0.1171320397539044, + "eval_dim_256_euclidean_accuracy": 0.8840511121628017, + "eval_dim_256_manhattan_accuracy": 0.8852342640795078, + "eval_dim_256_max_accuracy": 0.8852342640795078, + "eval_dim_384_cosine_accuracy": 0.8847610033128254, + "eval_dim_384_dot_accuracy": 0.11523899668717463, + "eval_dim_384_euclidean_accuracy": 0.8847610033128254, + "eval_dim_384_manhattan_accuracy": 0.8852342640795078, + "eval_dim_384_max_accuracy": 0.8852342640795078, + "eval_dim_64_cosine_accuracy": 0.8834595362044486, + "eval_dim_64_dot_accuracy": 0.11831519167061051, + "eval_dim_64_euclidean_accuracy": 0.8835778513961192, + "eval_dim_64_manhattan_accuracy": 0.8820397539044014, + "eval_dim_64_max_accuracy": 0.8835778513961192, + "eval_loss": 16.40532684326172, + "eval_runtime": 104.0121, + "eval_samples_per_second": 81.26, + "eval_sequential_score": 0.8834595362044486, + "eval_steps_per_second": 2.548, + "step": 1000 + }, + { + "epoch": 7.0677324358435, + "grad_norm": 4.689154148101807, + "learning_rate": 1.9893273896534936e-05, + "loss": 16.899, + "step": 1050 + }, + { + "epoch": 7.0677324358435, + "eval_dim_128_cosine_accuracy": 0.8826313298627544, + "eval_dim_128_dot_accuracy": 0.11867013724562234, + "eval_dim_128_euclidean_accuracy": 0.8823946994794132, + "eval_dim_128_manhattan_accuracy": 0.882158069096072, + "eval_dim_128_max_accuracy": 0.8826313298627544, + "eval_dim_256_cosine_accuracy": 0.8828679602460956, + "eval_dim_256_dot_accuracy": 0.11725035494557501, + "eval_dim_256_euclidean_accuracy": 0.8831045906294368, + "eval_dim_256_manhattan_accuracy": 0.8834595362044486, + "eval_dim_256_max_accuracy": 0.8834595362044486, + "eval_dim_384_cosine_accuracy": 0.883341221012778, + "eval_dim_384_dot_accuracy": 0.11665877898722196, + "eval_dim_384_euclidean_accuracy": 0.883341221012778, + "eval_dim_384_manhattan_accuracy": 0.8839327969711311, + "eval_dim_384_max_accuracy": 0.8839327969711311, + "eval_dim_64_cosine_accuracy": 0.88180312352106, + "eval_dim_64_dot_accuracy": 0.11890676762896356, + "eval_dim_64_euclidean_accuracy": 0.8828679602460956, + "eval_dim_64_manhattan_accuracy": 0.8820397539044014, + "eval_dim_64_max_accuracy": 0.8828679602460956, + "eval_loss": 16.416202545166016, + "eval_runtime": 104.6249, + "eval_samples_per_second": 80.784, + "eval_sequential_score": 0.88180312352106, + "eval_steps_per_second": 2.533, + "step": 1050 + }, + { + "epoch": 7.40429112326462, + "grad_norm": 3.6406683921813965, + "learning_rate": 1.9856159103477085e-05, + "loss": 16.9112, + "step": 1100 + }, + { + "epoch": 7.40429112326462, + "eval_dim_128_cosine_accuracy": 0.8828679602460956, + "eval_dim_128_dot_accuracy": 0.11878845243729295, + "eval_dim_128_euclidean_accuracy": 0.8828679602460956, + "eval_dim_128_manhattan_accuracy": 0.8828679602460956, + "eval_dim_128_max_accuracy": 0.8828679602460956, + "eval_dim_256_cosine_accuracy": 0.8834595362044486, + "eval_dim_256_dot_accuracy": 0.11618551822053952, + "eval_dim_256_euclidean_accuracy": 0.8826313298627544, + "eval_dim_256_manhattan_accuracy": 0.8840511121628017, + "eval_dim_256_max_accuracy": 0.8840511121628017, + "eval_dim_384_cosine_accuracy": 0.883341221012778, + "eval_dim_384_dot_accuracy": 0.11665877898722196, + "eval_dim_384_euclidean_accuracy": 0.883341221012778, + "eval_dim_384_manhattan_accuracy": 0.884287742546143, + "eval_dim_384_max_accuracy": 0.884287742546143, + "eval_dim_64_cosine_accuracy": 0.8820397539044014, + "eval_dim_64_dot_accuracy": 0.11914339801230478, + "eval_dim_64_euclidean_accuracy": 0.8831045906294368, + "eval_dim_64_manhattan_accuracy": 0.8826313298627544, + "eval_dim_64_max_accuracy": 0.8831045906294368, + "eval_loss": 16.405092239379883, + "eval_runtime": 101.4605, + "eval_samples_per_second": 83.303, + "eval_sequential_score": 0.8820397539044014, + "eval_steps_per_second": 2.612, + "step": 1100 + }, + { + "epoch": 7.740849810685738, + "grad_norm": 4.141761302947998, + "learning_rate": 1.9813561807535597e-05, + "loss": 16.8508, + "step": 1150 + }, + { + "epoch": 7.740849810685738, + "eval_dim_128_cosine_accuracy": 0.882158069096072, + "eval_dim_128_dot_accuracy": 0.11878845243729295, + "eval_dim_128_euclidean_accuracy": 0.8825130146710838, + "eval_dim_128_manhattan_accuracy": 0.8838144817794605, + "eval_dim_128_max_accuracy": 0.8838144817794605, + "eval_dim_256_cosine_accuracy": 0.8825130146710838, + "eval_dim_256_dot_accuracy": 0.11748698532891623, + "eval_dim_256_euclidean_accuracy": 0.8831045906294368, + "eval_dim_256_manhattan_accuracy": 0.8835778513961192, + "eval_dim_256_max_accuracy": 0.8835778513961192, + "eval_dim_384_cosine_accuracy": 0.8829862754377662, + "eval_dim_384_dot_accuracy": 0.11701372456223379, + "eval_dim_384_euclidean_accuracy": 0.8829862754377662, + "eval_dim_384_manhattan_accuracy": 0.883341221012778, + "eval_dim_384_max_accuracy": 0.883341221012778, + "eval_dim_64_cosine_accuracy": 0.8820397539044014, + "eval_dim_64_dot_accuracy": 0.12115475627070516, + "eval_dim_64_euclidean_accuracy": 0.882158069096072, + "eval_dim_64_manhattan_accuracy": 0.882749645054425, + "eval_dim_64_max_accuracy": 0.882749645054425, + "eval_loss": 16.40436363220215, + "eval_runtime": 102.9818, + "eval_samples_per_second": 82.073, + "eval_sequential_score": 0.8820397539044014, + "eval_steps_per_second": 2.573, + "step": 1150 + }, + { + "epoch": 8.077408498106857, + "grad_norm": 3.7137351036071777, + "learning_rate": 1.9765505703518494e-05, + "loss": 16.8104, + "step": 1200 + }, + { + "epoch": 8.077408498106857, + "eval_dim_128_cosine_accuracy": 0.8815664931377188, + "eval_dim_128_dot_accuracy": 0.119380028395646, + "eval_dim_128_euclidean_accuracy": 0.8813298627543776, + "eval_dim_128_manhattan_accuracy": 0.8820397539044014, + "eval_dim_128_max_accuracy": 0.8820397539044014, + "eval_dim_256_cosine_accuracy": 0.8815664931377188, + "eval_dim_256_dot_accuracy": 0.11796024609559867, + "eval_dim_256_euclidean_accuracy": 0.8808566019876952, + "eval_dim_256_manhattan_accuracy": 0.8807382867960246, + "eval_dim_256_max_accuracy": 0.8815664931377188, + "eval_dim_384_cosine_accuracy": 0.8814481779460482, + "eval_dim_384_dot_accuracy": 0.11855182205395173, + "eval_dim_384_euclidean_accuracy": 0.8814481779460482, + "eval_dim_384_manhattan_accuracy": 0.880619971604354, + "eval_dim_384_max_accuracy": 0.8814481779460482, + "eval_dim_64_cosine_accuracy": 0.8816848083293894, + "eval_dim_64_dot_accuracy": 0.12174633222905822, + "eval_dim_64_euclidean_accuracy": 0.880619971604354, + "eval_dim_64_manhattan_accuracy": 0.8809749171793658, + "eval_dim_64_max_accuracy": 0.8816848083293894, + "eval_loss": 16.40627670288086, + "eval_runtime": 104.9051, + "eval_samples_per_second": 80.568, + "eval_sequential_score": 0.8816848083293894, + "eval_steps_per_second": 2.526, + "step": 1200 + }, + { + "epoch": 8.413967185527977, + "grad_norm": 3.3535964488983154, + "learning_rate": 1.9712017522703764e-05, + "loss": 16.8212, + "step": 1250 + }, + { + "epoch": 8.413967185527977, + "eval_dim_128_cosine_accuracy": 0.8834595362044486, + "eval_dim_128_dot_accuracy": 0.11796024609559867, + "eval_dim_128_euclidean_accuracy": 0.882749645054425, + "eval_dim_128_manhattan_accuracy": 0.8825130146710838, + "eval_dim_128_max_accuracy": 0.8834595362044486, + "eval_dim_256_cosine_accuracy": 0.882158069096072, + "eval_dim_256_dot_accuracy": 0.11748698532891623, + "eval_dim_256_euclidean_accuracy": 0.8823946994794132, + "eval_dim_256_manhattan_accuracy": 0.8819214387127308, + "eval_dim_256_max_accuracy": 0.8823946994794132, + "eval_dim_384_cosine_accuracy": 0.882158069096072, + "eval_dim_384_dot_accuracy": 0.11784193090392807, + "eval_dim_384_euclidean_accuracy": 0.882158069096072, + "eval_dim_384_manhattan_accuracy": 0.882749645054425, + "eval_dim_384_max_accuracy": 0.882749645054425, + "eval_dim_64_cosine_accuracy": 0.8820397539044014, + "eval_dim_64_dot_accuracy": 0.12091812588736393, + "eval_dim_64_euclidean_accuracy": 0.8819214387127308, + "eval_dim_64_manhattan_accuracy": 0.8815664931377188, + "eval_dim_64_max_accuracy": 0.8820397539044014, + "eval_loss": 16.40399169921875, + "eval_runtime": 103.0829, + "eval_samples_per_second": 81.992, + "eval_sequential_score": 0.8820397539044014, + "eval_steps_per_second": 2.571, + "step": 1250 + }, + { + "epoch": 8.750525872949096, + "grad_norm": 4.203086853027344, + "learning_rate": 1.9653127017970035e-05, + "loss": 16.7743, + "step": 1300 + }, + { + "epoch": 8.750525872949096, + "eval_dim_128_cosine_accuracy": 0.882158069096072, + "eval_dim_128_dot_accuracy": 0.12020823473734027, + "eval_dim_128_euclidean_accuracy": 0.8815664931377188, + "eval_dim_128_manhattan_accuracy": 0.8814481779460482, + "eval_dim_128_max_accuracy": 0.882158069096072, + "eval_dim_256_cosine_accuracy": 0.8823946994794132, + "eval_dim_256_dot_accuracy": 0.11878845243729295, + "eval_dim_256_euclidean_accuracy": 0.8819214387127308, + "eval_dim_256_manhattan_accuracy": 0.8816848083293894, + "eval_dim_256_max_accuracy": 0.8823946994794132, + "eval_dim_384_cosine_accuracy": 0.8816848083293894, + "eval_dim_384_dot_accuracy": 0.11831519167061051, + "eval_dim_384_euclidean_accuracy": 0.8816848083293894, + "eval_dim_384_manhattan_accuracy": 0.882158069096072, + "eval_dim_384_max_accuracy": 0.882158069096072, + "eval_dim_64_cosine_accuracy": 0.8809749171793658, + "eval_dim_64_dot_accuracy": 0.121509701845717, + "eval_dim_64_euclidean_accuracy": 0.8807382867960246, + "eval_dim_64_manhattan_accuracy": 0.881211547562707, + "eval_dim_64_max_accuracy": 0.881211547562707, + "eval_loss": 16.39342498779297, + "eval_runtime": 102.6649, + "eval_samples_per_second": 82.326, + "eval_sequential_score": 0.8809749171793658, + "eval_steps_per_second": 2.581, + "step": 1300 + }, + { + "epoch": 9.087084560370215, + "grad_norm": 3.313908576965332, + "learning_rate": 1.9588866947246498e-05, + "loss": 16.7383, + "step": 1350 + }, + { + "epoch": 9.087084560370215, + "eval_dim_128_cosine_accuracy": 0.8809749171793658, + "eval_dim_128_dot_accuracy": 0.12068149550402271, + "eval_dim_128_euclidean_accuracy": 0.8808566019876952, + "eval_dim_128_manhattan_accuracy": 0.8814481779460482, + "eval_dim_128_max_accuracy": 0.8814481779460482, + "eval_dim_256_cosine_accuracy": 0.8820397539044014, + "eval_dim_256_dot_accuracy": 0.11831519167061051, + "eval_dim_256_euclidean_accuracy": 0.8810932323710364, + "eval_dim_256_manhattan_accuracy": 0.881211547562707, + "eval_dim_256_max_accuracy": 0.8820397539044014, + "eval_dim_384_cosine_accuracy": 0.8807382867960246, + "eval_dim_384_dot_accuracy": 0.11926171320397538, + "eval_dim_384_euclidean_accuracy": 0.8807382867960246, + "eval_dim_384_manhattan_accuracy": 0.8803833412210128, + "eval_dim_384_max_accuracy": 0.8807382867960246, + "eval_dim_64_cosine_accuracy": 0.880028395646001, + "eval_dim_64_dot_accuracy": 0.12245622337908188, + "eval_dim_64_euclidean_accuracy": 0.8807382867960246, + "eval_dim_64_manhattan_accuracy": 0.8816848083293894, + "eval_dim_64_max_accuracy": 0.8816848083293894, + "eval_loss": 16.39626121520996, + "eval_runtime": 105.1167, + "eval_samples_per_second": 80.406, + "eval_sequential_score": 0.880028395646001, + "eval_steps_per_second": 2.521, + "step": 1350 + }, + { + "epoch": 9.423643247791334, + "grad_norm": 6.617325305938721, + "learning_rate": 1.9519273055291266e-05, + "loss": 16.743, + "step": 1400 + }, + { + "epoch": 9.423643247791334, + "eval_dim_128_cosine_accuracy": 0.8819214387127308, + "eval_dim_128_dot_accuracy": 0.119380028395646, + "eval_dim_128_euclidean_accuracy": 0.8826313298627544, + "eval_dim_128_manhattan_accuracy": 0.8815664931377188, + "eval_dim_128_max_accuracy": 0.8826313298627544, + "eval_dim_256_cosine_accuracy": 0.882158069096072, + "eval_dim_256_dot_accuracy": 0.11784193090392807, + "eval_dim_256_euclidean_accuracy": 0.882158069096072, + "eval_dim_256_manhattan_accuracy": 0.8816848083293894, + "eval_dim_256_max_accuracy": 0.882158069096072, + "eval_dim_384_cosine_accuracy": 0.8819214387127308, + "eval_dim_384_dot_accuracy": 0.11807856128726929, + "eval_dim_384_euclidean_accuracy": 0.8819214387127308, + "eval_dim_384_manhattan_accuracy": 0.8826313298627544, + "eval_dim_384_max_accuracy": 0.8826313298627544, + "eval_dim_64_cosine_accuracy": 0.8797917652626597, + "eval_dim_64_dot_accuracy": 0.12091812588736393, + "eval_dim_64_euclidean_accuracy": 0.8810932323710364, + "eval_dim_64_manhattan_accuracy": 0.8807382867960246, + "eval_dim_64_max_accuracy": 0.8810932323710364, + "eval_loss": 16.406700134277344, + "eval_runtime": 101.1577, + "eval_samples_per_second": 83.553, + "eval_sequential_score": 0.8797917652626597, + "eval_steps_per_second": 2.62, + "step": 1400 + }, + { + "epoch": 9.760201935212454, + "grad_norm": 4.450948715209961, + "learning_rate": 1.944438405380829e-05, + "loss": 16.7047, + "step": 1450 + }, + { + "epoch": 9.760201935212454, + "eval_dim_128_cosine_accuracy": 0.8803833412210128, + "eval_dim_128_dot_accuracy": 0.12056318031235211, + "eval_dim_128_euclidean_accuracy": 0.8810932323710364, + "eval_dim_128_manhattan_accuracy": 0.8802650260293422, + "eval_dim_128_max_accuracy": 0.8810932323710364, + "eval_dim_256_cosine_accuracy": 0.8809749171793658, + "eval_dim_256_dot_accuracy": 0.11914339801230478, + "eval_dim_256_euclidean_accuracy": 0.8813298627543776, + "eval_dim_256_manhattan_accuracy": 0.881211547562707, + "eval_dim_256_max_accuracy": 0.8813298627543776, + "eval_dim_384_cosine_accuracy": 0.8809749171793658, + "eval_dim_384_dot_accuracy": 0.11902508282063418, + "eval_dim_384_euclidean_accuracy": 0.8809749171793658, + "eval_dim_384_manhattan_accuracy": 0.8820397539044014, + "eval_dim_384_max_accuracy": 0.8820397539044014, + "eval_dim_64_cosine_accuracy": 0.8796734500709891, + "eval_dim_64_dot_accuracy": 0.12245622337908188, + "eval_dim_64_euclidean_accuracy": 0.880028395646001, + "eval_dim_64_manhattan_accuracy": 0.8803833412210128, + "eval_dim_64_max_accuracy": 0.8803833412210128, + "eval_loss": 16.39591407775879, + "eval_runtime": 102.018, + "eval_samples_per_second": 82.848, + "eval_sequential_score": 0.8796734500709891, + "eval_steps_per_second": 2.598, + "step": 1450 + }, + { + "epoch": 10.096760622633571, + "grad_norm": 6.13853120803833, + "learning_rate": 1.9364241599913923e-05, + "loss": 16.6782, + "step": 1500 + }, + { + "epoch": 10.096760622633571, + "eval_dim_128_cosine_accuracy": 0.8788452437292948, + "eval_dim_128_dot_accuracy": 0.1228111689540937, + "eval_dim_128_euclidean_accuracy": 0.8796734500709891, + "eval_dim_128_manhattan_accuracy": 0.879081874112636, + "eval_dim_128_max_accuracy": 0.8796734500709891, + "eval_dim_256_cosine_accuracy": 0.879081874112636, + "eval_dim_256_dot_accuracy": 0.12091812588736393, + "eval_dim_256_euclidean_accuracy": 0.8797917652626597, + "eval_dim_256_manhattan_accuracy": 0.8803833412210128, + "eval_dim_256_max_accuracy": 0.8803833412210128, + "eval_dim_384_cosine_accuracy": 0.8795551348793185, + "eval_dim_384_dot_accuracy": 0.12044486512068149, + "eval_dim_384_euclidean_accuracy": 0.8795551348793185, + "eval_dim_384_manhattan_accuracy": 0.8799100804543304, + "eval_dim_384_max_accuracy": 0.8799100804543304, + "eval_dim_64_cosine_accuracy": 0.8783719829626124, + "eval_dim_64_dot_accuracy": 0.12363937529578797, + "eval_dim_64_euclidean_accuracy": 0.8795551348793185, + "eval_dim_64_manhattan_accuracy": 0.8781353525792712, + "eval_dim_64_max_accuracy": 0.8795551348793185, + "eval_loss": 16.398588180541992, + "eval_runtime": 103.6429, + "eval_samples_per_second": 81.549, + "eval_sequential_score": 0.8783719829626124, + "eval_steps_per_second": 2.557, + "step": 1500 + }, + { + "epoch": 10.43331931005469, + "grad_norm": 4.757913112640381, + "learning_rate": 1.9278890272965097e-05, + "loss": 16.6708, + "step": 1550 + }, + { + "epoch": 10.43331931005469, + "eval_dim_128_cosine_accuracy": 0.8794368196876479, + "eval_dim_128_dot_accuracy": 0.121509701845717, + "eval_dim_128_euclidean_accuracy": 0.8795551348793185, + "eval_dim_128_manhattan_accuracy": 0.879081874112636, + "eval_dim_128_max_accuracy": 0.8795551348793185, + "eval_dim_256_cosine_accuracy": 0.8792001893043067, + "eval_dim_256_dot_accuracy": 0.11961665877898722, + "eval_dim_256_euclidean_accuracy": 0.8794368196876479, + "eval_dim_256_manhattan_accuracy": 0.8795551348793185, + "eval_dim_256_max_accuracy": 0.8795551348793185, + "eval_dim_384_cosine_accuracy": 0.8796734500709891, + "eval_dim_384_dot_accuracy": 0.12032654992901089, + "eval_dim_384_euclidean_accuracy": 0.8796734500709891, + "eval_dim_384_manhattan_accuracy": 0.8809749171793658, + "eval_dim_384_max_accuracy": 0.8809749171793658, + "eval_dim_64_cosine_accuracy": 0.879081874112636, + "eval_dim_64_dot_accuracy": 0.12245622337908188, + "eval_dim_64_euclidean_accuracy": 0.8796734500709891, + "eval_dim_64_manhattan_accuracy": 0.8802650260293422, + "eval_dim_64_max_accuracy": 0.8802650260293422, + "eval_loss": 16.401565551757812, + "eval_runtime": 103.0896, + "eval_samples_per_second": 81.987, + "eval_sequential_score": 0.879081874112636, + "eval_steps_per_second": 2.571, + "step": 1550 + }, + { + "epoch": 10.76987799747581, + "grad_norm": 5.452834129333496, + "learning_rate": 1.9188377549761962e-05, + "loss": 16.6485, + "step": 1600 + }, + { + "epoch": 10.76987799747581, + "eval_dim_128_cosine_accuracy": 0.8789635589209654, + "eval_dim_128_dot_accuracy": 0.1216280170373876, + "eval_dim_128_euclidean_accuracy": 0.8789635589209654, + "eval_dim_128_manhattan_accuracy": 0.8781353525792712, + "eval_dim_128_max_accuracy": 0.8789635589209654, + "eval_dim_256_cosine_accuracy": 0.8801467108376716, + "eval_dim_256_dot_accuracy": 0.11985328916232844, + "eval_dim_256_euclidean_accuracy": 0.8796734500709891, + "eval_dim_256_manhattan_accuracy": 0.8794368196876479, + "eval_dim_256_max_accuracy": 0.8801467108376716, + "eval_dim_384_cosine_accuracy": 0.879081874112636, + "eval_dim_384_dot_accuracy": 0.12091812588736393, + "eval_dim_384_euclidean_accuracy": 0.879081874112636, + "eval_dim_384_manhattan_accuracy": 0.8794368196876479, + "eval_dim_384_max_accuracy": 0.8794368196876479, + "eval_dim_64_cosine_accuracy": 0.8781353525792712, + "eval_dim_64_dot_accuracy": 0.12304779933743493, + "eval_dim_64_euclidean_accuracy": 0.8783719829626124, + "eval_dim_64_manhattan_accuracy": 0.879081874112636, + "eval_dim_64_max_accuracy": 0.879081874112636, + "eval_loss": 16.396345138549805, + "eval_runtime": 103.471, + "eval_samples_per_second": 81.685, + "eval_sequential_score": 0.8781353525792712, + "eval_steps_per_second": 2.561, + "step": 1600 + }, + { + "epoch": 11.106436684896929, + "grad_norm": 3.5591487884521484, + "learning_rate": 1.9092753778138885e-05, + "loss": 16.6205, + "step": 1650 + }, + { + "epoch": 11.106436684896929, + "eval_dim_128_cosine_accuracy": 0.8778987221959299, + "eval_dim_128_dot_accuracy": 0.12316611452910553, + "eval_dim_128_euclidean_accuracy": 0.8781353525792712, + "eval_dim_128_manhattan_accuracy": 0.8780170373876006, + "eval_dim_128_max_accuracy": 0.8781353525792712, + "eval_dim_256_cosine_accuracy": 0.8787269285376242, + "eval_dim_256_dot_accuracy": 0.121509701845717, + "eval_dim_256_euclidean_accuracy": 0.8787269285376242, + "eval_dim_256_manhattan_accuracy": 0.8793185044959773, + "eval_dim_256_max_accuracy": 0.8793185044959773, + "eval_dim_384_cosine_accuracy": 0.8793185044959773, + "eval_dim_384_dot_accuracy": 0.12068149550402271, + "eval_dim_384_euclidean_accuracy": 0.8793185044959773, + "eval_dim_384_manhattan_accuracy": 0.8801467108376716, + "eval_dim_384_max_accuracy": 0.8801467108376716, + "eval_dim_64_cosine_accuracy": 0.8770705158542357, + "eval_dim_64_dot_accuracy": 0.12541410317084714, + "eval_dim_64_euclidean_accuracy": 0.8771888310459063, + "eval_dim_64_manhattan_accuracy": 0.8776620918125887, + "eval_dim_64_max_accuracy": 0.8776620918125887, + "eval_loss": 16.401174545288086, + "eval_runtime": 102.9169, + "eval_samples_per_second": 82.124, + "eval_sequential_score": 0.8770705158542357, + "eval_steps_per_second": 2.575, + "step": 1650 + }, + { + "epoch": 11.442995372318048, + "grad_norm": 3.712305784225464, + "learning_rate": 1.8992072148958368e-05, + "loss": 16.6095, + "step": 1700 + }, + { + "epoch": 11.442995372318048, + "eval_dim_128_cosine_accuracy": 0.8786086133459536, + "eval_dim_128_dot_accuracy": 0.12233790818741126, + "eval_dim_128_euclidean_accuracy": 0.878490298154283, + "eval_dim_128_manhattan_accuracy": 0.8786086133459536, + "eval_dim_128_max_accuracy": 0.8786086133459536, + "eval_dim_256_cosine_accuracy": 0.8789635589209654, + "eval_dim_256_dot_accuracy": 0.1216280170373876, + "eval_dim_256_euclidean_accuracy": 0.879081874112636, + "eval_dim_256_manhattan_accuracy": 0.8780170373876006, + "eval_dim_256_max_accuracy": 0.879081874112636, + "eval_dim_384_cosine_accuracy": 0.8794368196876479, + "eval_dim_384_dot_accuracy": 0.12056318031235211, + "eval_dim_384_euclidean_accuracy": 0.8794368196876479, + "eval_dim_384_manhattan_accuracy": 0.879081874112636, + "eval_dim_384_max_accuracy": 0.8794368196876479, + "eval_dim_64_cosine_accuracy": 0.879081874112636, + "eval_dim_64_dot_accuracy": 0.12541410317084714, + "eval_dim_64_euclidean_accuracy": 0.8777804070042593, + "eval_dim_64_manhattan_accuracy": 0.8788452437292948, + "eval_dim_64_max_accuracy": 0.879081874112636, + "eval_loss": 16.413122177124023, + "eval_runtime": 103.5898, + "eval_samples_per_second": 81.591, + "eval_sequential_score": 0.879081874112636, + "eval_steps_per_second": 2.558, + "step": 1700 + }, + { + "epoch": 11.779554059739167, + "grad_norm": 4.9205145835876465, + "learning_rate": 1.888638866652356e-05, + "loss": 16.5891, + "step": 1750 + }, + { + "epoch": 11.779554059739167, + "eval_dim_128_cosine_accuracy": 0.8807382867960246, + "eval_dim_128_dot_accuracy": 0.1194983435873166, + "eval_dim_128_euclidean_accuracy": 0.8805016564126834, + "eval_dim_128_manhattan_accuracy": 0.8792001893043067, + "eval_dim_128_max_accuracy": 0.8807382867960246, + "eval_dim_256_cosine_accuracy": 0.8805016564126834, + "eval_dim_256_dot_accuracy": 0.11902508282063418, + "eval_dim_256_euclidean_accuracy": 0.8797917652626597, + "eval_dim_256_manhattan_accuracy": 0.8795551348793185, + "eval_dim_256_max_accuracy": 0.8805016564126834, + "eval_dim_384_cosine_accuracy": 0.8809749171793658, + "eval_dim_384_dot_accuracy": 0.11902508282063418, + "eval_dim_384_euclidean_accuracy": 0.8809749171793658, + "eval_dim_384_manhattan_accuracy": 0.880028395646001, + "eval_dim_384_max_accuracy": 0.8809749171793658, + "eval_dim_64_cosine_accuracy": 0.8801467108376716, + "eval_dim_64_dot_accuracy": 0.12292948414576432, + "eval_dim_64_euclidean_accuracy": 0.879081874112636, + "eval_dim_64_manhattan_accuracy": 0.879081874112636, + "eval_dim_64_max_accuracy": 0.8801467108376716, + "eval_loss": 16.40700340270996, + "eval_runtime": 103.5887, + "eval_samples_per_second": 81.592, + "eval_sequential_score": 0.8801467108376716, + "eval_steps_per_second": 2.558, + "step": 1750 + }, + { + "epoch": 12.116112747160287, + "grad_norm": 4.849546909332275, + "learning_rate": 1.8775762117425777e-05, + "loss": 16.5619, + "step": 1800 + }, + { + "epoch": 12.116112747160287, + "eval_dim_128_cosine_accuracy": 0.8794368196876479, + "eval_dim_128_dot_accuracy": 0.121509701845717, + "eval_dim_128_euclidean_accuracy": 0.8792001893043067, + "eval_dim_128_manhattan_accuracy": 0.8789635589209654, + "eval_dim_128_max_accuracy": 0.8794368196876479, + "eval_dim_256_cosine_accuracy": 0.880028395646001, + "eval_dim_256_dot_accuracy": 0.11973497397065783, + "eval_dim_256_euclidean_accuracy": 0.8799100804543304, + "eval_dim_256_manhattan_accuracy": 0.8792001893043067, + "eval_dim_256_max_accuracy": 0.880028395646001, + "eval_dim_384_cosine_accuracy": 0.8796734500709891, + "eval_dim_384_dot_accuracy": 0.12032654992901089, + "eval_dim_384_euclidean_accuracy": 0.8796734500709891, + "eval_dim_384_manhattan_accuracy": 0.8796734500709891, + "eval_dim_384_max_accuracy": 0.8796734500709891, + "eval_dim_64_cosine_accuracy": 0.8780170373876006, + "eval_dim_64_dot_accuracy": 0.12470421202082348, + "eval_dim_64_euclidean_accuracy": 0.8797917652626597, + "eval_dim_64_manhattan_accuracy": 0.8786086133459536, + "eval_dim_64_max_accuracy": 0.8797917652626597, + "eval_loss": 16.396265029907227, + "eval_runtime": 102.3506, + "eval_samples_per_second": 82.579, + "eval_sequential_score": 0.8780170373876006, + "eval_steps_per_second": 2.589, + "step": 1800 + }, + { + "epoch": 12.452671434581404, + "grad_norm": 4.944924831390381, + "learning_rate": 1.866025403784439e-05, + "loss": 16.5467, + "step": 1850 + }, + { + "epoch": 12.452671434581404, + "eval_dim_128_cosine_accuracy": 0.8795551348793185, + "eval_dim_128_dot_accuracy": 0.12316611452910553, + "eval_dim_128_euclidean_accuracy": 0.8787269285376242, + "eval_dim_128_manhattan_accuracy": 0.8794368196876479, + "eval_dim_128_max_accuracy": 0.8795551348793185, + "eval_dim_256_cosine_accuracy": 0.880619971604354, + "eval_dim_256_dot_accuracy": 0.12068149550402271, + "eval_dim_256_euclidean_accuracy": 0.8794368196876479, + "eval_dim_256_manhattan_accuracy": 0.8801467108376716, + "eval_dim_256_max_accuracy": 0.880619971604354, + "eval_dim_384_cosine_accuracy": 0.8803833412210128, + "eval_dim_384_dot_accuracy": 0.11961665877898722, + "eval_dim_384_euclidean_accuracy": 0.8803833412210128, + "eval_dim_384_manhattan_accuracy": 0.8807382867960246, + "eval_dim_384_max_accuracy": 0.8807382867960246, + "eval_dim_64_cosine_accuracy": 0.8789635589209654, + "eval_dim_64_dot_accuracy": 0.12470421202082348, + "eval_dim_64_euclidean_accuracy": 0.8781353525792712, + "eval_dim_64_manhattan_accuracy": 0.8796734500709891, + "eval_dim_64_max_accuracy": 0.8796734500709891, + "eval_loss": 16.399133682250977, + "eval_runtime": 104.1432, + "eval_samples_per_second": 81.157, + "eval_sequential_score": 0.8789635589209654, + "eval_steps_per_second": 2.545, + "step": 1850 + }, + { + "epoch": 12.789230122002524, + "grad_norm": 6.032313346862793, + "learning_rate": 1.853992867931721e-05, + "loss": 16.5398, + "step": 1900 + }, + { + "epoch": 12.789230122002524, + "eval_dim_128_cosine_accuracy": 0.8792001893043067, + "eval_dim_128_dot_accuracy": 0.12139138665404638, + "eval_dim_128_euclidean_accuracy": 0.8797917652626597, + "eval_dim_128_manhattan_accuracy": 0.8787269285376242, + "eval_dim_128_max_accuracy": 0.8797917652626597, + "eval_dim_256_cosine_accuracy": 0.8797917652626597, + "eval_dim_256_dot_accuracy": 0.11973497397065783, + "eval_dim_256_euclidean_accuracy": 0.8797917652626597, + "eval_dim_256_manhattan_accuracy": 0.8792001893043067, + "eval_dim_256_max_accuracy": 0.8797917652626597, + "eval_dim_384_cosine_accuracy": 0.8801467108376716, + "eval_dim_384_dot_accuracy": 0.11985328916232844, + "eval_dim_384_euclidean_accuracy": 0.8801467108376716, + "eval_dim_384_manhattan_accuracy": 0.8805016564126834, + "eval_dim_384_max_accuracy": 0.8805016564126834, + "eval_dim_64_cosine_accuracy": 0.8788452437292948, + "eval_dim_64_dot_accuracy": 0.12423095125414103, + "eval_dim_64_euclidean_accuracy": 0.8793185044959773, + "eval_dim_64_manhattan_accuracy": 0.8776620918125887, + "eval_dim_64_max_accuracy": 0.8793185044959773, + "eval_loss": 16.397045135498047, + "eval_runtime": 103.5361, + "eval_samples_per_second": 81.633, + "eval_sequential_score": 0.8788452437292948, + "eval_steps_per_second": 2.559, + "step": 1900 + }, + { + "epoch": 13.125788809423643, + "grad_norm": 4.27797269821167, + "learning_rate": 1.8414852973000503e-05, + "loss": 16.5047, + "step": 1950 + }, + { + "epoch": 13.125788809423643, + "eval_dim_128_cosine_accuracy": 0.8795551348793185, + "eval_dim_128_dot_accuracy": 0.1216280170373876, + "eval_dim_128_euclidean_accuracy": 0.8796734500709891, + "eval_dim_128_manhattan_accuracy": 0.8797917652626597, + "eval_dim_128_max_accuracy": 0.8797917652626597, + "eval_dim_256_cosine_accuracy": 0.8803833412210128, + "eval_dim_256_dot_accuracy": 0.12068149550402271, + "eval_dim_256_euclidean_accuracy": 0.8797917652626597, + "eval_dim_256_manhattan_accuracy": 0.8797917652626597, + "eval_dim_256_max_accuracy": 0.8803833412210128, + "eval_dim_384_cosine_accuracy": 0.8803833412210128, + "eval_dim_384_dot_accuracy": 0.11961665877898722, + "eval_dim_384_euclidean_accuracy": 0.8803833412210128, + "eval_dim_384_manhattan_accuracy": 0.8805016564126834, + "eval_dim_384_max_accuracy": 0.8805016564126834, + "eval_dim_64_cosine_accuracy": 0.8788452437292948, + "eval_dim_64_dot_accuracy": 0.12588736393752958, + "eval_dim_64_euclidean_accuracy": 0.8793185044959773, + "eval_dim_64_manhattan_accuracy": 0.8802650260293422, + "eval_dim_64_max_accuracy": 0.8802650260293422, + "eval_loss": 16.396381378173828, + "eval_runtime": 102.672, + "eval_samples_per_second": 82.32, + "eval_sequential_score": 0.8788452437292948, + "eval_steps_per_second": 2.581, + "step": 1950 + }, + { + "epoch": 13.462347496844762, + "grad_norm": 4.051229953765869, + "learning_rate": 1.8285096492438424e-05, + "loss": 16.4985, + "step": 2000 + }, + { + "epoch": 13.462347496844762, + "eval_dim_128_cosine_accuracy": 0.8793185044959773, + "eval_dim_128_dot_accuracy": 0.12127307146237577, + "eval_dim_128_euclidean_accuracy": 0.8803833412210128, + "eval_dim_128_manhattan_accuracy": 0.8796734500709891, + "eval_dim_128_max_accuracy": 0.8803833412210128, + "eval_dim_256_cosine_accuracy": 0.8797917652626597, + "eval_dim_256_dot_accuracy": 0.12020823473734027, + "eval_dim_256_euclidean_accuracy": 0.8796734500709891, + "eval_dim_256_manhattan_accuracy": 0.8797917652626597, + "eval_dim_256_max_accuracy": 0.8797917652626597, + "eval_dim_384_cosine_accuracy": 0.8807382867960246, + "eval_dim_384_dot_accuracy": 0.11926171320397538, + "eval_dim_384_euclidean_accuracy": 0.8807382867960246, + "eval_dim_384_manhattan_accuracy": 0.8810932323710364, + "eval_dim_384_max_accuracy": 0.8810932323710364, + "eval_dim_64_cosine_accuracy": 0.8789635589209654, + "eval_dim_64_dot_accuracy": 0.12316611452910553, + "eval_dim_64_euclidean_accuracy": 0.8787269285376242, + "eval_dim_64_manhattan_accuracy": 0.879081874112636, + "eval_dim_64_max_accuracy": 0.879081874112636, + "eval_loss": 16.4024600982666, + "eval_runtime": 104.2185, + "eval_samples_per_second": 81.099, + "eval_sequential_score": 0.8789635589209654, + "eval_steps_per_second": 2.543, + "step": 2000 + }, + { + "epoch": 13.798906184265881, + "grad_norm": 4.3837666511535645, + "learning_rate": 1.8150731414862623e-05, + "loss": 16.4852, + "step": 2050 + }, + { + "epoch": 13.798906184265881, + "eval_dim_128_cosine_accuracy": 0.8801467108376716, + "eval_dim_128_dot_accuracy": 0.12032654992901089, + "eval_dim_128_euclidean_accuracy": 0.8805016564126834, + "eval_dim_128_manhattan_accuracy": 0.8797917652626597, + "eval_dim_128_max_accuracy": 0.8805016564126834, + "eval_dim_256_cosine_accuracy": 0.8809749171793658, + "eval_dim_256_dot_accuracy": 0.119380028395646, + "eval_dim_256_euclidean_accuracy": 0.8814481779460482, + "eval_dim_256_manhattan_accuracy": 0.8807382867960246, + "eval_dim_256_max_accuracy": 0.8814481779460482, + "eval_dim_384_cosine_accuracy": 0.880028395646001, + "eval_dim_384_dot_accuracy": 0.11997160435399905, + "eval_dim_384_euclidean_accuracy": 0.880028395646001, + "eval_dim_384_manhattan_accuracy": 0.8794368196876479, + "eval_dim_384_max_accuracy": 0.880028395646001, + "eval_dim_64_cosine_accuracy": 0.8793185044959773, + "eval_dim_64_dot_accuracy": 0.12352106010411737, + "eval_dim_64_euclidean_accuracy": 0.8801467108376716, + "eval_dim_64_manhattan_accuracy": 0.8796734500709891, + "eval_dim_64_max_accuracy": 0.8801467108376716, + "eval_loss": 16.410737991333008, + "eval_runtime": 102.5333, + "eval_samples_per_second": 82.432, + "eval_sequential_score": 0.8793185044959773, + "eval_steps_per_second": 2.585, + "step": 2050 + }, + { + "epoch": 14.135464871687, + "grad_norm": 4.87747859954834, + "learning_rate": 1.8011832481043577e-05, + "loss": 16.4526, + "step": 2100 + }, + { + "epoch": 14.135464871687, + "eval_dim_128_cosine_accuracy": 0.8796734500709891, + "eval_dim_128_dot_accuracy": 0.12103644107903455, + "eval_dim_128_euclidean_accuracy": 0.8796734500709891, + "eval_dim_128_manhattan_accuracy": 0.8794368196876479, + "eval_dim_128_max_accuracy": 0.8796734500709891, + "eval_dim_256_cosine_accuracy": 0.8801467108376716, + "eval_dim_256_dot_accuracy": 0.12068149550402271, + "eval_dim_256_euclidean_accuracy": 0.8805016564126834, + "eval_dim_256_manhattan_accuracy": 0.8797917652626597, + "eval_dim_256_max_accuracy": 0.8805016564126834, + "eval_dim_384_cosine_accuracy": 0.8808566019876952, + "eval_dim_384_dot_accuracy": 0.11914339801230478, + "eval_dim_384_euclidean_accuracy": 0.8808566019876952, + "eval_dim_384_manhattan_accuracy": 0.8810932323710364, + "eval_dim_384_max_accuracy": 0.8810932323710364, + "eval_dim_64_cosine_accuracy": 0.8778987221959299, + "eval_dim_64_dot_accuracy": 0.12470421202082348, + "eval_dim_64_euclidean_accuracy": 0.8781353525792712, + "eval_dim_64_manhattan_accuracy": 0.879081874112636, + "eval_dim_64_max_accuracy": 0.879081874112636, + "eval_loss": 16.392879486083984, + "eval_runtime": 103.5589, + "eval_samples_per_second": 81.615, + "eval_sequential_score": 0.8778987221959299, + "eval_steps_per_second": 2.559, + "step": 2100 + }, + { + "epoch": 14.47202355910812, + "grad_norm": 6.463150978088379, + "learning_rate": 1.78684769537159e-05, + "loss": 16.4343, + "step": 2150 + }, + { + "epoch": 14.47202355910812, + "eval_dim_128_cosine_accuracy": 0.8788452437292948, + "eval_dim_128_dot_accuracy": 0.12221959299574066, + "eval_dim_128_euclidean_accuracy": 0.878490298154283, + "eval_dim_128_manhattan_accuracy": 0.8797917652626597, + "eval_dim_128_max_accuracy": 0.8797917652626597, + "eval_dim_256_cosine_accuracy": 0.879081874112636, + "eval_dim_256_dot_accuracy": 0.121509701845717, + "eval_dim_256_euclidean_accuracy": 0.8786086133459536, + "eval_dim_256_manhattan_accuracy": 0.8797917652626597, + "eval_dim_256_max_accuracy": 0.8797917652626597, + "eval_dim_384_cosine_accuracy": 0.8796734500709891, + "eval_dim_384_dot_accuracy": 0.12032654992901089, + "eval_dim_384_euclidean_accuracy": 0.8796734500709891, + "eval_dim_384_manhattan_accuracy": 0.880619971604354, + "eval_dim_384_max_accuracy": 0.880619971604354, + "eval_dim_64_cosine_accuracy": 0.8774254614292475, + "eval_dim_64_dot_accuracy": 0.1250591575958353, + "eval_dim_64_euclidean_accuracy": 0.8774254614292475, + "eval_dim_64_manhattan_accuracy": 0.8780170373876006, + "eval_dim_64_max_accuracy": 0.8780170373876006, + "eval_loss": 16.40749740600586, + "eval_runtime": 102.9532, + "eval_samples_per_second": 82.096, + "eval_sequential_score": 0.8774254614292475, + "eval_steps_per_second": 2.574, + "step": 2150 + }, + { + "epoch": 14.80858224652924, + "grad_norm": 4.839356422424316, + "learning_rate": 1.7720744574600865e-05, + "loss": 16.4244, + "step": 2200 + }, + { + "epoch": 14.80858224652924, + "eval_dim_128_cosine_accuracy": 0.8803833412210128, + "eval_dim_128_dot_accuracy": 0.11973497397065783, + "eval_dim_128_euclidean_accuracy": 0.880619971604354, + "eval_dim_128_manhattan_accuracy": 0.8809749171793658, + "eval_dim_128_max_accuracy": 0.8809749171793658, + "eval_dim_256_cosine_accuracy": 0.8819214387127308, + "eval_dim_256_dot_accuracy": 0.119380028395646, + "eval_dim_256_euclidean_accuracy": 0.8815664931377188, + "eval_dim_256_manhattan_accuracy": 0.8814481779460482, + "eval_dim_256_max_accuracy": 0.8819214387127308, + "eval_dim_384_cosine_accuracy": 0.8820397539044014, + "eval_dim_384_dot_accuracy": 0.11796024609559867, + "eval_dim_384_euclidean_accuracy": 0.8820397539044014, + "eval_dim_384_manhattan_accuracy": 0.882158069096072, + "eval_dim_384_max_accuracy": 0.882158069096072, + "eval_dim_64_cosine_accuracy": 0.8808566019876952, + "eval_dim_64_dot_accuracy": 0.12221959299574066, + "eval_dim_64_euclidean_accuracy": 0.880619971604354, + "eval_dim_64_manhattan_accuracy": 0.8786086133459536, + "eval_dim_64_max_accuracy": 0.8808566019876952, + "eval_loss": 16.402673721313477, + "eval_runtime": 103.4179, + "eval_samples_per_second": 81.727, + "eval_sequential_score": 0.8808566019876952, + "eval_steps_per_second": 2.562, + "step": 2200 + }, + { + "epoch": 15.145140933950358, + "grad_norm": 5.812349796295166, + "learning_rate": 1.756871752004992e-05, + "loss": 16.3947, + "step": 2250 + }, + { + "epoch": 15.145140933950358, + "eval_dim_128_cosine_accuracy": 0.879081874112636, + "eval_dim_128_dot_accuracy": 0.12316611452910553, + "eval_dim_128_euclidean_accuracy": 0.8786086133459536, + "eval_dim_128_manhattan_accuracy": 0.8809749171793658, + "eval_dim_128_max_accuracy": 0.8809749171793658, + "eval_dim_256_cosine_accuracy": 0.8792001893043067, + "eval_dim_256_dot_accuracy": 0.12139138665404638, + "eval_dim_256_euclidean_accuracy": 0.8801467108376716, + "eval_dim_256_manhattan_accuracy": 0.8813298627543776, + "eval_dim_256_max_accuracy": 0.8813298627543776, + "eval_dim_384_cosine_accuracy": 0.8802650260293422, + "eval_dim_384_dot_accuracy": 0.11973497397065783, + "eval_dim_384_euclidean_accuracy": 0.8802650260293422, + "eval_dim_384_manhattan_accuracy": 0.8808566019876952, + "eval_dim_384_max_accuracy": 0.8808566019876952, + "eval_dim_64_cosine_accuracy": 0.8773071462375769, + "eval_dim_64_dot_accuracy": 0.12695220066256507, + "eval_dim_64_euclidean_accuracy": 0.8768338854708945, + "eval_dim_64_manhattan_accuracy": 0.8787269285376242, + "eval_dim_64_max_accuracy": 0.8787269285376242, + "eval_loss": 16.4101619720459, + "eval_runtime": 105.1832, + "eval_samples_per_second": 80.355, + "eval_sequential_score": 0.8773071462375769, + "eval_steps_per_second": 2.519, + "step": 2250 + }, + { + "epoch": 15.481699621371476, + "grad_norm": 4.386394023895264, + "learning_rate": 1.7412480355334006e-05, + "loss": 16.3827, + "step": 2300 + }, + { + "epoch": 15.481699621371476, + "eval_dim_128_cosine_accuracy": 0.8803833412210128, + "eval_dim_128_dot_accuracy": 0.12245622337908188, + "eval_dim_128_euclidean_accuracy": 0.880619971604354, + "eval_dim_128_manhattan_accuracy": 0.88180312352106, + "eval_dim_128_max_accuracy": 0.88180312352106, + "eval_dim_256_cosine_accuracy": 0.8813298627543776, + "eval_dim_256_dot_accuracy": 0.12079981069569333, + "eval_dim_256_euclidean_accuracy": 0.8809749171793658, + "eval_dim_256_manhattan_accuracy": 0.8819214387127308, + "eval_dim_256_max_accuracy": 0.8819214387127308, + "eval_dim_384_cosine_accuracy": 0.8813298627543776, + "eval_dim_384_dot_accuracy": 0.11867013724562234, + "eval_dim_384_euclidean_accuracy": 0.8813298627543776, + "eval_dim_384_manhattan_accuracy": 0.8809749171793658, + "eval_dim_384_max_accuracy": 0.8813298627543776, + "eval_dim_64_cosine_accuracy": 0.8781353525792712, + "eval_dim_64_dot_accuracy": 0.1260056791292002, + "eval_dim_64_euclidean_accuracy": 0.8789635589209654, + "eval_dim_64_manhattan_accuracy": 0.8814481779460482, + "eval_dim_64_max_accuracy": 0.8814481779460482, + "eval_loss": 16.404207229614258, + "eval_runtime": 101.3893, + "eval_samples_per_second": 83.362, + "eval_sequential_score": 0.8781353525792712, + "eval_steps_per_second": 2.614, + "step": 2300 + }, + { + "epoch": 15.818258308792595, + "grad_norm": 4.8762359619140625, + "learning_rate": 1.7252119987603976e-05, + "loss": 16.3719, + "step": 2350 + }, + { + "epoch": 15.818258308792595, + "eval_dim_128_cosine_accuracy": 0.8801467108376716, + "eval_dim_128_dot_accuracy": 0.12032654992901089, + "eval_dim_128_euclidean_accuracy": 0.8802650260293422, + "eval_dim_128_manhattan_accuracy": 0.8802650260293422, + "eval_dim_128_max_accuracy": 0.8802650260293422, + "eval_dim_256_cosine_accuracy": 0.88180312352106, + "eval_dim_256_dot_accuracy": 0.11878845243729295, + "eval_dim_256_euclidean_accuracy": 0.8820397539044014, + "eval_dim_256_manhattan_accuracy": 0.881211547562707, + "eval_dim_256_max_accuracy": 0.8820397539044014, + "eval_dim_384_cosine_accuracy": 0.8820397539044014, + "eval_dim_384_dot_accuracy": 0.11796024609559867, + "eval_dim_384_euclidean_accuracy": 0.8820397539044014, + "eval_dim_384_manhattan_accuracy": 0.8808566019876952, + "eval_dim_384_max_accuracy": 0.8820397539044014, + "eval_dim_64_cosine_accuracy": 0.879081874112636, + "eval_dim_64_dot_accuracy": 0.12458589682915286, + "eval_dim_64_euclidean_accuracy": 0.8805016564126834, + "eval_dim_64_manhattan_accuracy": 0.8797917652626597, + "eval_dim_64_max_accuracy": 0.8805016564126834, + "eval_loss": 16.40033721923828, + "eval_runtime": 104.1264, + "eval_samples_per_second": 81.171, + "eval_sequential_score": 0.879081874112636, + "eval_steps_per_second": 2.545, + "step": 2350 + }, + { + "epoch": 16.154816996213714, + "grad_norm": 5.414395809173584, + "learning_rate": 1.7087725617548385e-05, + "loss": 16.3403, + "step": 2400 + }, + { + "epoch": 16.154816996213714, + "eval_dim_128_cosine_accuracy": 0.8781353525792712, + "eval_dim_128_dot_accuracy": 0.12328442972077615, + "eval_dim_128_euclidean_accuracy": 0.8778987221959299, + "eval_dim_128_manhattan_accuracy": 0.8787269285376242, + "eval_dim_128_max_accuracy": 0.8787269285376242, + "eval_dim_256_cosine_accuracy": 0.879081874112636, + "eval_dim_256_dot_accuracy": 0.12210127780407004, + "eval_dim_256_euclidean_accuracy": 0.8788452437292948, + "eval_dim_256_manhattan_accuracy": 0.879081874112636, + "eval_dim_256_max_accuracy": 0.879081874112636, + "eval_dim_384_cosine_accuracy": 0.8799100804543304, + "eval_dim_384_dot_accuracy": 0.12008991954566967, + "eval_dim_384_euclidean_accuracy": 0.8799100804543304, + "eval_dim_384_manhattan_accuracy": 0.8794368196876479, + "eval_dim_384_max_accuracy": 0.8799100804543304, + "eval_dim_64_cosine_accuracy": 0.8767155702792239, + "eval_dim_64_dot_accuracy": 0.12766209181258872, + "eval_dim_64_euclidean_accuracy": 0.8765972550875533, + "eval_dim_64_manhattan_accuracy": 0.8769522006625651, + "eval_dim_64_max_accuracy": 0.8769522006625651, + "eval_loss": 16.413236618041992, + "eval_runtime": 105.1626, + "eval_samples_per_second": 80.371, + "eval_sequential_score": 0.8767155702792239, + "eval_steps_per_second": 2.52, + "step": 2400 + }, + { + "epoch": 16.491375683634836, + "grad_norm": 4.138753414154053, + "learning_rate": 1.6919388689775463e-05, + "loss": 16.3357, + "step": 2450 + }, + { + "epoch": 16.491375683634836, + "eval_dim_128_cosine_accuracy": 0.8803833412210128, + "eval_dim_128_dot_accuracy": 0.1216280170373876, + "eval_dim_128_euclidean_accuracy": 0.879081874112636, + "eval_dim_128_manhattan_accuracy": 0.8802650260293422, + "eval_dim_128_max_accuracy": 0.8803833412210128, + "eval_dim_256_cosine_accuracy": 0.8808566019876952, + "eval_dim_256_dot_accuracy": 0.121509701845717, + "eval_dim_256_euclidean_accuracy": 0.8802650260293422, + "eval_dim_256_manhattan_accuracy": 0.8807382867960246, + "eval_dim_256_max_accuracy": 0.8808566019876952, + "eval_dim_384_cosine_accuracy": 0.8807382867960246, + "eval_dim_384_dot_accuracy": 0.11926171320397538, + "eval_dim_384_euclidean_accuracy": 0.8807382867960246, + "eval_dim_384_manhattan_accuracy": 0.8801467108376716, + "eval_dim_384_max_accuracy": 0.8807382867960246, + "eval_dim_64_cosine_accuracy": 0.8792001893043067, + "eval_dim_64_dot_accuracy": 0.12647893989588263, + "eval_dim_64_euclidean_accuracy": 0.8786086133459536, + "eval_dim_64_manhattan_accuracy": 0.8807382867960246, + "eval_dim_64_max_accuracy": 0.8807382867960246, + "eval_loss": 16.414878845214844, + "eval_runtime": 100.6398, + "eval_samples_per_second": 83.983, + "eval_sequential_score": 0.8792001893043067, + "eval_steps_per_second": 2.633, + "step": 2450 + }, + { + "epoch": 16.827934371055953, + "grad_norm": 4.080146312713623, + "learning_rate": 1.6747202841946928e-05, + "loss": 16.3203, + "step": 2500 + }, + { + "epoch": 16.827934371055953, + "eval_dim_128_cosine_accuracy": 0.8803833412210128, + "eval_dim_128_dot_accuracy": 0.12186464742072882, + "eval_dim_128_euclidean_accuracy": 0.8814481779460482, + "eval_dim_128_manhattan_accuracy": 0.880619971604354, + "eval_dim_128_max_accuracy": 0.8814481779460482, + "eval_dim_256_cosine_accuracy": 0.8814481779460482, + "eval_dim_256_dot_accuracy": 0.12044486512068149, + "eval_dim_256_euclidean_accuracy": 0.8820397539044014, + "eval_dim_256_manhattan_accuracy": 0.8825130146710838, + "eval_dim_256_max_accuracy": 0.8825130146710838, + "eval_dim_384_cosine_accuracy": 0.8815664931377188, + "eval_dim_384_dot_accuracy": 0.11843350686228112, + "eval_dim_384_euclidean_accuracy": 0.8815664931377188, + "eval_dim_384_manhattan_accuracy": 0.8815664931377188, + "eval_dim_384_max_accuracy": 0.8815664931377188, + "eval_dim_64_cosine_accuracy": 0.879081874112636, + "eval_dim_64_dot_accuracy": 0.12565073355418835, + "eval_dim_64_euclidean_accuracy": 0.880028395646001, + "eval_dim_64_manhattan_accuracy": 0.8810932323710364, + "eval_dim_64_max_accuracy": 0.8810932323710364, + "eval_loss": 16.408126831054688, + "eval_runtime": 103.4973, + "eval_samples_per_second": 81.664, + "eval_sequential_score": 0.879081874112636, + "eval_steps_per_second": 2.56, + "step": 2500 + }, + { + "epoch": 17.16449305847707, + "grad_norm": 5.26322078704834, + "learning_rate": 1.6571263852691887e-05, + "loss": 16.2986, + "step": 2550 + }, + { + "epoch": 17.16449305847707, + "eval_dim_128_cosine_accuracy": 0.8797917652626597, + "eval_dim_128_dot_accuracy": 0.12304779933743493, + "eval_dim_128_euclidean_accuracy": 0.8801467108376716, + "eval_dim_128_manhattan_accuracy": 0.8778987221959299, + "eval_dim_128_max_accuracy": 0.8801467108376716, + "eval_dim_256_cosine_accuracy": 0.880028395646001, + "eval_dim_256_dot_accuracy": 0.12068149550402271, + "eval_dim_256_euclidean_accuracy": 0.880619971604354, + "eval_dim_256_manhattan_accuracy": 0.8805016564126834, + "eval_dim_256_max_accuracy": 0.880619971604354, + "eval_dim_384_cosine_accuracy": 0.8820397539044014, + "eval_dim_384_dot_accuracy": 0.11796024609559867, + "eval_dim_384_euclidean_accuracy": 0.8820397539044014, + "eval_dim_384_manhattan_accuracy": 0.8808566019876952, + "eval_dim_384_max_accuracy": 0.8820397539044014, + "eval_dim_64_cosine_accuracy": 0.879081874112636, + "eval_dim_64_dot_accuracy": 0.12707051585423568, + "eval_dim_64_euclidean_accuracy": 0.8781353525792712, + "eval_dim_64_manhattan_accuracy": 0.8778987221959299, + "eval_dim_64_max_accuracy": 0.879081874112636, + "eval_loss": 16.413921356201172, + "eval_runtime": 103.9357, + "eval_samples_per_second": 81.32, + "eval_sequential_score": 0.879081874112636, + "eval_steps_per_second": 2.55, + "step": 2550 + }, + { + "epoch": 17.50105174589819, + "grad_norm": 9.353097915649414, + "learning_rate": 1.639166958832985e-05, + "loss": 16.2923, + "step": 2600 + }, + { + "epoch": 17.50105174589819, + "eval_dim_128_cosine_accuracy": 0.8786086133459536, + "eval_dim_128_dot_accuracy": 0.12352106010411737, + "eval_dim_128_euclidean_accuracy": 0.8783719829626124, + "eval_dim_128_manhattan_accuracy": 0.8807382867960246, + "eval_dim_128_max_accuracy": 0.8807382867960246, + "eval_dim_256_cosine_accuracy": 0.8792001893043067, + "eval_dim_256_dot_accuracy": 0.12103644107903455, + "eval_dim_256_euclidean_accuracy": 0.8796734500709891, + "eval_dim_256_manhattan_accuracy": 0.8810932323710364, + "eval_dim_256_max_accuracy": 0.8810932323710364, + "eval_dim_384_cosine_accuracy": 0.8799100804543304, + "eval_dim_384_dot_accuracy": 0.12008991954566967, + "eval_dim_384_euclidean_accuracy": 0.8799100804543304, + "eval_dim_384_manhattan_accuracy": 0.880028395646001, + "eval_dim_384_max_accuracy": 0.880028395646001, + "eval_dim_64_cosine_accuracy": 0.8768338854708945, + "eval_dim_64_dot_accuracy": 0.12754377662091812, + "eval_dim_64_euclidean_accuracy": 0.8762423095125415, + "eval_dim_64_manhattan_accuracy": 0.8789635589209654, + "eval_dim_64_max_accuracy": 0.8789635589209654, + "eval_loss": 16.406217575073242, + "eval_runtime": 101.8719, + "eval_samples_per_second": 82.967, + "eval_sequential_score": 0.8768338854708945, + "eval_steps_per_second": 2.601, + "step": 2600 + }, + { + "epoch": 17.83761043331931, + "grad_norm": 5.8258891105651855, + "learning_rate": 1.6208519948432438e-05, + "loss": 16.2649, + "step": 2650 + }, + { + "epoch": 17.83761043331931, + "eval_dim_128_cosine_accuracy": 0.880028395646001, + "eval_dim_128_dot_accuracy": 0.12186464742072882, + "eval_dim_128_euclidean_accuracy": 0.8803833412210128, + "eval_dim_128_manhattan_accuracy": 0.8799100804543304, + "eval_dim_128_max_accuracy": 0.8803833412210128, + "eval_dim_256_cosine_accuracy": 0.8807382867960246, + "eval_dim_256_dot_accuracy": 0.12210127780407004, + "eval_dim_256_euclidean_accuracy": 0.8814481779460482, + "eval_dim_256_manhattan_accuracy": 0.8810932323710364, + "eval_dim_256_max_accuracy": 0.8814481779460482, + "eval_dim_384_cosine_accuracy": 0.8814481779460482, + "eval_dim_384_dot_accuracy": 0.11855182205395173, + "eval_dim_384_euclidean_accuracy": 0.8814481779460482, + "eval_dim_384_manhattan_accuracy": 0.8814481779460482, + "eval_dim_384_max_accuracy": 0.8814481779460482, + "eval_dim_64_cosine_accuracy": 0.8787269285376242, + "eval_dim_64_dot_accuracy": 0.1283719829626124, + "eval_dim_64_euclidean_accuracy": 0.8788452437292948, + "eval_dim_64_manhattan_accuracy": 0.8799100804543304, + "eval_dim_64_max_accuracy": 0.8799100804543304, + "eval_loss": 16.410572052001953, + "eval_runtime": 101.9269, + "eval_samples_per_second": 82.922, + "eval_sequential_score": 0.8787269285376242, + "eval_steps_per_second": 2.6, + "step": 2650 + }, + { + "epoch": 18.17416912074043, + "grad_norm": 4.463468074798584, + "learning_rate": 1.6021916810254096e-05, + "loss": 16.2505, + "step": 2700 + }, + { + "epoch": 18.17416912074043, + "eval_dim_128_cosine_accuracy": 0.8786086133459536, + "eval_dim_128_dot_accuracy": 0.12411263606247042, + "eval_dim_128_euclidean_accuracy": 0.8780170373876006, + "eval_dim_128_manhattan_accuracy": 0.8792001893043067, + "eval_dim_128_max_accuracy": 0.8792001893043067, + "eval_dim_256_cosine_accuracy": 0.8793185044959773, + "eval_dim_256_dot_accuracy": 0.12210127780407004, + "eval_dim_256_euclidean_accuracy": 0.8793185044959773, + "eval_dim_256_manhattan_accuracy": 0.8805016564126834, + "eval_dim_256_max_accuracy": 0.8805016564126834, + "eval_dim_384_cosine_accuracy": 0.8802650260293422, + "eval_dim_384_dot_accuracy": 0.11973497397065783, + "eval_dim_384_euclidean_accuracy": 0.8802650260293422, + "eval_dim_384_manhattan_accuracy": 0.8813298627543776, + "eval_dim_384_max_accuracy": 0.8813298627543776, + "eval_dim_64_cosine_accuracy": 0.8770705158542357, + "eval_dim_64_dot_accuracy": 0.13014671083767157, + "eval_dim_64_euclidean_accuracy": 0.8761239943208708, + "eval_dim_64_manhattan_accuracy": 0.8787269285376242, + "eval_dim_64_max_accuracy": 0.8787269285376242, + "eval_loss": 16.418752670288086, + "eval_runtime": 106.398, + "eval_samples_per_second": 79.438, + "eval_sequential_score": 0.8770705158542357, + "eval_steps_per_second": 2.491, + "step": 2700 + }, + { + "epoch": 18.510727808161548, + "grad_norm": 5.066239833831787, + "learning_rate": 1.5831963972062734e-05, + "loss": 16.226, + "step": 2750 + }, + { + "epoch": 18.510727808161548, + "eval_dim_128_cosine_accuracy": 0.8770705158542357, + "eval_dim_128_dot_accuracy": 0.12446758163748226, + "eval_dim_128_euclidean_accuracy": 0.8771888310459063, + "eval_dim_128_manhattan_accuracy": 0.8778987221959299, + "eval_dim_128_max_accuracy": 0.8778987221959299, + "eval_dim_256_cosine_accuracy": 0.8781353525792712, + "eval_dim_256_dot_accuracy": 0.12304779933743493, + "eval_dim_256_euclidean_accuracy": 0.8788452437292948, + "eval_dim_256_manhattan_accuracy": 0.8799100804543304, + "eval_dim_256_max_accuracy": 0.8799100804543304, + "eval_dim_384_cosine_accuracy": 0.8780170373876006, + "eval_dim_384_dot_accuracy": 0.12198296261239944, + "eval_dim_384_euclidean_accuracy": 0.8780170373876006, + "eval_dim_384_manhattan_accuracy": 0.8770705158542357, + "eval_dim_384_max_accuracy": 0.8780170373876006, + "eval_dim_64_cosine_accuracy": 0.8765972550875533, + "eval_dim_64_dot_accuracy": 0.12884524372929484, + "eval_dim_64_euclidean_accuracy": 0.8765972550875533, + "eval_dim_64_manhattan_accuracy": 0.8778987221959299, + "eval_dim_64_max_accuracy": 0.8778987221959299, + "eval_loss": 16.4149112701416, + "eval_runtime": 101.2915, + "eval_samples_per_second": 83.442, + "eval_sequential_score": 0.8765972550875533, + "eval_steps_per_second": 2.616, + "step": 2750 + }, + { + "epoch": 18.84728649558267, + "grad_norm": 4.982476234436035, + "learning_rate": 1.5638767095401778e-05, + "loss": 16.2106, + "step": 2800 + }, + { + "epoch": 18.84728649558267, + "eval_dim_128_cosine_accuracy": 0.8780170373876006, + "eval_dim_128_dot_accuracy": 0.12529578797917654, + "eval_dim_128_euclidean_accuracy": 0.878490298154283, + "eval_dim_128_manhattan_accuracy": 0.8794368196876479, + "eval_dim_128_max_accuracy": 0.8794368196876479, + "eval_dim_256_cosine_accuracy": 0.8799100804543304, + "eval_dim_256_dot_accuracy": 0.1226928537624231, + "eval_dim_256_euclidean_accuracy": 0.8797917652626597, + "eval_dim_256_manhattan_accuracy": 0.8801467108376716, + "eval_dim_256_max_accuracy": 0.8801467108376716, + "eval_dim_384_cosine_accuracy": 0.879081874112636, + "eval_dim_384_dot_accuracy": 0.12091812588736393, + "eval_dim_384_euclidean_accuracy": 0.879081874112636, + "eval_dim_384_manhattan_accuracy": 0.8794368196876479, + "eval_dim_384_max_accuracy": 0.8794368196876479, + "eval_dim_64_cosine_accuracy": 0.8767155702792239, + "eval_dim_64_dot_accuracy": 0.13002839564600094, + "eval_dim_64_euclidean_accuracy": 0.8768338854708945, + "eval_dim_64_manhattan_accuracy": 0.8778987221959299, + "eval_dim_64_max_accuracy": 0.8778987221959299, + "eval_loss": 16.423009872436523, + "eval_runtime": 103.6087, + "eval_samples_per_second": 81.576, + "eval_sequential_score": 0.8767155702792239, + "eval_steps_per_second": 2.558, + "step": 2800 + }, + { + "epoch": 19.183845183003786, + "grad_norm": 6.176373481750488, + "learning_rate": 1.5442433646315792e-05, + "loss": 16.2052, + "step": 2850 + }, + { + "epoch": 19.183845183003786, + "eval_dim_128_cosine_accuracy": 0.8769522006625651, + "eval_dim_128_dot_accuracy": 0.12576904874585898, + "eval_dim_128_euclidean_accuracy": 0.8758873639375295, + "eval_dim_128_manhattan_accuracy": 0.8793185044959773, + "eval_dim_128_max_accuracy": 0.8793185044959773, + "eval_dim_256_cosine_accuracy": 0.8776620918125887, + "eval_dim_256_dot_accuracy": 0.12328442972077615, + "eval_dim_256_euclidean_accuracy": 0.8778987221959299, + "eval_dim_256_manhattan_accuracy": 0.8796734500709891, + "eval_dim_256_max_accuracy": 0.8796734500709891, + "eval_dim_384_cosine_accuracy": 0.878490298154283, + "eval_dim_384_dot_accuracy": 0.121509701845717, + "eval_dim_384_euclidean_accuracy": 0.878490298154283, + "eval_dim_384_manhattan_accuracy": 0.8814481779460482, + "eval_dim_384_max_accuracy": 0.8814481779460482, + "eval_dim_64_cosine_accuracy": 0.8744675816374823, + "eval_dim_64_dot_accuracy": 0.13369616658778988, + "eval_dim_64_euclidean_accuracy": 0.8742309512541411, + "eval_dim_64_manhattan_accuracy": 0.8781353525792712, + "eval_dim_64_max_accuracy": 0.8781353525792712, + "eval_loss": 16.435117721557617, + "eval_runtime": 104.4101, + "eval_samples_per_second": 80.95, + "eval_sequential_score": 0.8744675816374823, + "eval_steps_per_second": 2.538, + "step": 2850 + }, + { + "epoch": 19.520403870424904, + "grad_norm": 7.323819160461426, + "learning_rate": 1.5243072835572319e-05, + "loss": 16.186, + "step": 2900 + }, + { + "epoch": 19.520403870424904, + "eval_dim_128_cosine_accuracy": 0.8776620918125887, + "eval_dim_128_dot_accuracy": 0.12363937529578797, + "eval_dim_128_euclidean_accuracy": 0.8776620918125887, + "eval_dim_128_manhattan_accuracy": 0.876360624704212, + "eval_dim_128_max_accuracy": 0.8776620918125887, + "eval_dim_256_cosine_accuracy": 0.8793185044959773, + "eval_dim_256_dot_accuracy": 0.12198296261239944, + "eval_dim_256_euclidean_accuracy": 0.8789635589209654, + "eval_dim_256_manhattan_accuracy": 0.8777804070042593, + "eval_dim_256_max_accuracy": 0.8793185044959773, + "eval_dim_384_cosine_accuracy": 0.8792001893043067, + "eval_dim_384_dot_accuracy": 0.12079981069569333, + "eval_dim_384_euclidean_accuracy": 0.8792001893043067, + "eval_dim_384_manhattan_accuracy": 0.8789635589209654, + "eval_dim_384_max_accuracy": 0.8792001893043067, + "eval_dim_64_cosine_accuracy": 0.8762423095125415, + "eval_dim_64_dot_accuracy": 0.13097491717936582, + "eval_dim_64_euclidean_accuracy": 0.8748225272124941, + "eval_dim_64_manhattan_accuracy": 0.8782536677709418, + "eval_dim_64_max_accuracy": 0.8782536677709418, + "eval_loss": 16.433080673217773, + "eval_runtime": 101.0285, + "eval_samples_per_second": 83.66, + "eval_sequential_score": 0.8762423095125415, + "eval_steps_per_second": 2.623, + "step": 2900 + }, + { + "epoch": 19.856962557846025, + "grad_norm": 6.637113571166992, + "learning_rate": 1.5040795557913246e-05, + "loss": 16.1496, + "step": 2950 + }, + { + "epoch": 19.856962557846025, + "eval_dim_128_cosine_accuracy": 0.8774254614292475, + "eval_dim_128_dot_accuracy": 0.12529578797917654, + "eval_dim_128_euclidean_accuracy": 0.8770705158542357, + "eval_dim_128_manhattan_accuracy": 0.8782536677709418, + "eval_dim_128_max_accuracy": 0.8782536677709418, + "eval_dim_256_cosine_accuracy": 0.8781353525792712, + "eval_dim_256_dot_accuracy": 0.12375769048745859, + "eval_dim_256_euclidean_accuracy": 0.8783719829626124, + "eval_dim_256_manhattan_accuracy": 0.879081874112636, + "eval_dim_256_max_accuracy": 0.879081874112636, + "eval_dim_384_cosine_accuracy": 0.8780170373876006, + "eval_dim_384_dot_accuracy": 0.12198296261239944, + "eval_dim_384_euclidean_accuracy": 0.8780170373876006, + "eval_dim_384_manhattan_accuracy": 0.8786086133459536, + "eval_dim_384_max_accuracy": 0.8786086133459536, + "eval_dim_64_cosine_accuracy": 0.8770705158542357, + "eval_dim_64_dot_accuracy": 0.13357785139611927, + "eval_dim_64_euclidean_accuracy": 0.8756507335541883, + "eval_dim_64_manhattan_accuracy": 0.8775437766209181, + "eval_dim_64_max_accuracy": 0.8775437766209181, + "eval_loss": 16.437721252441406, + "eval_runtime": 103.9645, + "eval_samples_per_second": 81.297, + "eval_sequential_score": 0.8770705158542357, + "eval_steps_per_second": 2.549, + "step": 2950 + }, + { + "epoch": 20.193521245267142, + "grad_norm": 4.9336957931518555, + "learning_rate": 1.4835714330369445e-05, + "loss": 16.151, + "step": 3000 + }, + { + "epoch": 20.193521245267142, + "eval_dim_128_cosine_accuracy": 0.8765972550875533, + "eval_dim_128_dot_accuracy": 0.1261239943208708, + "eval_dim_128_euclidean_accuracy": 0.8761239943208708, + "eval_dim_128_manhattan_accuracy": 0.8797917652626597, + "eval_dim_128_max_accuracy": 0.8797917652626597, + "eval_dim_256_cosine_accuracy": 0.8780170373876006, + "eval_dim_256_dot_accuracy": 0.12245622337908188, + "eval_dim_256_euclidean_accuracy": 0.8771888310459063, + "eval_dim_256_manhattan_accuracy": 0.8801467108376716, + "eval_dim_256_max_accuracy": 0.8801467108376716, + "eval_dim_384_cosine_accuracy": 0.8780170373876006, + "eval_dim_384_dot_accuracy": 0.12198296261239944, + "eval_dim_384_euclidean_accuracy": 0.8780170373876006, + "eval_dim_384_manhattan_accuracy": 0.8819214387127308, + "eval_dim_384_max_accuracy": 0.8819214387127308, + "eval_dim_64_cosine_accuracy": 0.8750591575958353, + "eval_dim_64_dot_accuracy": 0.1361807856128727, + "eval_dim_64_euclidean_accuracy": 0.8730477993374349, + "eval_dim_64_manhattan_accuracy": 0.878490298154283, + "eval_dim_64_max_accuracy": 0.878490298154283, + "eval_loss": 16.44074821472168, + "eval_runtime": 101.9564, + "eval_samples_per_second": 82.898, + "eval_sequential_score": 0.8750591575958353, + "eval_steps_per_second": 2.599, + "step": 3000 + }, + { + "epoch": 20.530079932688263, + "grad_norm": 5.225156784057617, + "learning_rate": 1.4627943229672992e-05, + "loss": 16.1081, + "step": 3050 + }, + { + "epoch": 20.530079932688263, + "eval_dim_128_cosine_accuracy": 0.8758873639375295, + "eval_dim_128_dot_accuracy": 0.1261239943208708, + "eval_dim_128_euclidean_accuracy": 0.8758873639375295, + "eval_dim_128_manhattan_accuracy": 0.8781353525792712, + "eval_dim_128_max_accuracy": 0.8781353525792712, + "eval_dim_256_cosine_accuracy": 0.8775437766209181, + "eval_dim_256_dot_accuracy": 0.12245622337908188, + "eval_dim_256_euclidean_accuracy": 0.8778987221959299, + "eval_dim_256_manhattan_accuracy": 0.8776620918125887, + "eval_dim_256_max_accuracy": 0.8778987221959299, + "eval_dim_384_cosine_accuracy": 0.8774254614292475, + "eval_dim_384_dot_accuracy": 0.12257453857075248, + "eval_dim_384_euclidean_accuracy": 0.8774254614292475, + "eval_dim_384_manhattan_accuracy": 0.8788452437292948, + "eval_dim_384_max_accuracy": 0.8788452437292948, + "eval_dim_64_cosine_accuracy": 0.8749408424041647, + "eval_dim_64_dot_accuracy": 0.13712730714623758, + "eval_dim_64_euclidean_accuracy": 0.8743492664458117, + "eval_dim_64_manhattan_accuracy": 0.8765972550875533, + "eval_dim_64_max_accuracy": 0.8765972550875533, + "eval_loss": 16.442630767822266, + "eval_runtime": 104.3455, + "eval_samples_per_second": 81.0, + "eval_sequential_score": 0.8749408424041647, + "eval_steps_per_second": 2.54, + "step": 3050 + }, + { + "epoch": 20.86663862010938, + "grad_norm": 4.5568132400512695, + "learning_rate": 1.4417597828801833e-05, + "loss": 16.0864, + "step": 3100 + }, + { + "epoch": 20.86663862010938, + "eval_dim_128_cosine_accuracy": 0.8774254614292475, + "eval_dim_128_dot_accuracy": 0.12659725508755323, + "eval_dim_128_euclidean_accuracy": 0.8765972550875533, + "eval_dim_128_manhattan_accuracy": 0.879081874112636, + "eval_dim_128_max_accuracy": 0.879081874112636, + "eval_dim_256_cosine_accuracy": 0.8781353525792712, + "eval_dim_256_dot_accuracy": 0.12292948414576432, + "eval_dim_256_euclidean_accuracy": 0.8780170373876006, + "eval_dim_256_manhattan_accuracy": 0.880619971604354, + "eval_dim_256_max_accuracy": 0.880619971604354, + "eval_dim_384_cosine_accuracy": 0.8787269285376242, + "eval_dim_384_dot_accuracy": 0.12127307146237577, + "eval_dim_384_euclidean_accuracy": 0.8787269285376242, + "eval_dim_384_manhattan_accuracy": 0.8793185044959773, + "eval_dim_384_max_accuracy": 0.8793185044959773, + "eval_dim_64_cosine_accuracy": 0.8745858968291529, + "eval_dim_64_dot_accuracy": 0.13724562233790819, + "eval_dim_64_euclidean_accuracy": 0.8744675816374823, + "eval_dim_64_manhattan_accuracy": 0.8780170373876006, + "eval_dim_64_max_accuracy": 0.8780170373876006, + "eval_loss": 16.441152572631836, + "eval_runtime": 103.8678, + "eval_samples_per_second": 81.373, + "eval_sequential_score": 0.8745858968291529, + "eval_steps_per_second": 2.551, + "step": 3100 + }, + { + "epoch": 21.203197307530502, + "grad_norm": 6.664557933807373, + "learning_rate": 1.4204795132692146e-05, + "loss": 16.0934, + "step": 3150 + }, + { + "epoch": 21.203197307530502, + "eval_dim_128_cosine_accuracy": 0.8768338854708945, + "eval_dim_128_dot_accuracy": 0.12789872219592996, + "eval_dim_128_euclidean_accuracy": 0.8758873639375295, + "eval_dim_128_manhattan_accuracy": 0.8803833412210128, + "eval_dim_128_max_accuracy": 0.8803833412210128, + "eval_dim_256_cosine_accuracy": 0.8782536677709418, + "eval_dim_256_dot_accuracy": 0.12411263606247042, + "eval_dim_256_euclidean_accuracy": 0.8777804070042593, + "eval_dim_256_manhattan_accuracy": 0.88180312352106, + "eval_dim_256_max_accuracy": 0.88180312352106, + "eval_dim_384_cosine_accuracy": 0.8794368196876479, + "eval_dim_384_dot_accuracy": 0.12056318031235211, + "eval_dim_384_euclidean_accuracy": 0.8794368196876479, + "eval_dim_384_manhattan_accuracy": 0.881211547562707, + "eval_dim_384_max_accuracy": 0.881211547562707, + "eval_dim_64_cosine_accuracy": 0.8745858968291529, + "eval_dim_64_dot_accuracy": 0.14008518693800284, + "eval_dim_64_euclidean_accuracy": 0.8729294841457643, + "eval_dim_64_manhattan_accuracy": 0.8795551348793185, + "eval_dim_64_max_accuracy": 0.8795551348793185, + "eval_loss": 16.4547176361084, + "eval_runtime": 105.011, + "eval_samples_per_second": 80.487, + "eval_sequential_score": 0.8745858968291529, + "eval_steps_per_second": 2.524, + "step": 3150 + }, + { + "epoch": 21.53975599495162, + "grad_norm": 6.669680118560791, + "learning_rate": 1.3989653513154165e-05, + "loss": 16.0382, + "step": 3200 + }, + { + "epoch": 21.53975599495162, + "eval_dim_128_cosine_accuracy": 0.8742309512541411, + "eval_dim_128_dot_accuracy": 0.1283719829626124, + "eval_dim_128_euclidean_accuracy": 0.8738760056791292, + "eval_dim_128_manhattan_accuracy": 0.8748225272124941, + "eval_dim_128_max_accuracy": 0.8748225272124941, + "eval_dim_256_cosine_accuracy": 0.8751774727875059, + "eval_dim_256_dot_accuracy": 0.12446758163748226, + "eval_dim_256_euclidean_accuracy": 0.8754141031708471, + "eval_dim_256_manhattan_accuracy": 0.8765972550875533, + "eval_dim_256_max_accuracy": 0.8765972550875533, + "eval_dim_384_cosine_accuracy": 0.8765972550875533, + "eval_dim_384_dot_accuracy": 0.12340274491244675, + "eval_dim_384_euclidean_accuracy": 0.8765972550875533, + "eval_dim_384_manhattan_accuracy": 0.8761239943208708, + "eval_dim_384_max_accuracy": 0.8765972550875533, + "eval_dim_64_cosine_accuracy": 0.8723379081874113, + "eval_dim_64_dot_accuracy": 0.14020350212967345, + "eval_dim_64_euclidean_accuracy": 0.8703265499290109, + "eval_dim_64_manhattan_accuracy": 0.8754141031708471, + "eval_dim_64_max_accuracy": 0.8754141031708471, + "eval_loss": 16.458948135375977, + "eval_runtime": 101.007, + "eval_samples_per_second": 83.677, + "eval_sequential_score": 0.8723379081874113, + "eval_steps_per_second": 2.624, + "step": 3200 + }, + { + "epoch": 21.87631468237274, + "grad_norm": 5.666304588317871, + "learning_rate": 1.37722926430277e-05, + "loss": 16.0279, + "step": 3250 + }, + { + "epoch": 21.87631468237274, + "eval_dim_128_cosine_accuracy": 0.8751774727875059, + "eval_dim_128_dot_accuracy": 0.12979176526265973, + "eval_dim_128_euclidean_accuracy": 0.8743492664458117, + "eval_dim_128_manhattan_accuracy": 0.8771888310459063, + "eval_dim_128_max_accuracy": 0.8771888310459063, + "eval_dim_256_cosine_accuracy": 0.8765972550875533, + "eval_dim_256_dot_accuracy": 0.1260056791292002, + "eval_dim_256_euclidean_accuracy": 0.8761239943208708, + "eval_dim_256_manhattan_accuracy": 0.8796734500709891, + "eval_dim_256_max_accuracy": 0.8796734500709891, + "eval_dim_384_cosine_accuracy": 0.8773071462375769, + "eval_dim_384_dot_accuracy": 0.1226928537624231, + "eval_dim_384_euclidean_accuracy": 0.8773071462375769, + "eval_dim_384_manhattan_accuracy": 0.8777804070042593, + "eval_dim_384_max_accuracy": 0.8777804070042593, + "eval_dim_64_cosine_accuracy": 0.8728111689540937, + "eval_dim_64_dot_accuracy": 0.14221486038807382, + "eval_dim_64_euclidean_accuracy": 0.8732844297207761, + "eval_dim_64_manhattan_accuracy": 0.8776620918125887, + "eval_dim_64_max_accuracy": 0.8776620918125887, + "eval_loss": 16.46676254272461, + "eval_runtime": 102.9103, + "eval_samples_per_second": 82.13, + "eval_sequential_score": 0.8728111689540937, + "eval_steps_per_second": 2.575, + "step": 3250 + }, + { + "epoch": 22.212873369793858, + "grad_norm": 6.600480556488037, + "learning_rate": 1.3552833429613939e-05, + "loss": 16.0327, + "step": 3300 + }, + { + "epoch": 22.212873369793858, + "eval_dim_128_cosine_accuracy": 0.8742309512541411, + "eval_dim_128_dot_accuracy": 0.13002839564600094, + "eval_dim_128_euclidean_accuracy": 0.8742309512541411, + "eval_dim_128_manhattan_accuracy": 0.880028395646001, + "eval_dim_128_max_accuracy": 0.880028395646001, + "eval_dim_256_cosine_accuracy": 0.8768338854708945, + "eval_dim_256_dot_accuracy": 0.12363937529578797, + "eval_dim_256_euclidean_accuracy": 0.8764789398958827, + "eval_dim_256_manhattan_accuracy": 0.8814481779460482, + "eval_dim_256_max_accuracy": 0.8814481779460482, + "eval_dim_384_cosine_accuracy": 0.8773071462375769, + "eval_dim_384_dot_accuracy": 0.1226928537624231, + "eval_dim_384_euclidean_accuracy": 0.8773071462375769, + "eval_dim_384_manhattan_accuracy": 0.8807382867960246, + "eval_dim_384_max_accuracy": 0.8807382867960246, + "eval_dim_64_cosine_accuracy": 0.8726928537624231, + "eval_dim_64_dot_accuracy": 0.1432796971131093, + "eval_dim_64_euclidean_accuracy": 0.869971604353999, + "eval_dim_64_manhattan_accuracy": 0.8795551348793185, + "eval_dim_64_max_accuracy": 0.8795551348793185, + "eval_loss": 16.47365379333496, + "eval_runtime": 104.4255, + "eval_samples_per_second": 80.938, + "eval_sequential_score": 0.8726928537624231, + "eval_steps_per_second": 2.538, + "step": 3300 + }, + { + "epoch": 22.549432057214975, + "grad_norm": 7.925108432769775, + "learning_rate": 1.3331397947420578e-05, + "loss": 15.979, + "step": 3350 + }, + { + "epoch": 22.549432057214975, + "eval_dim_128_cosine_accuracy": 0.8739943208707998, + "eval_dim_128_dot_accuracy": 0.1293185044959773, + "eval_dim_128_euclidean_accuracy": 0.8732844297207761, + "eval_dim_128_manhattan_accuracy": 0.8782536677709418, + "eval_dim_128_max_accuracy": 0.8782536677709418, + "eval_dim_256_cosine_accuracy": 0.8770705158542357, + "eval_dim_256_dot_accuracy": 0.12328442972077615, + "eval_dim_256_euclidean_accuracy": 0.8773071462375769, + "eval_dim_256_manhattan_accuracy": 0.8793185044959773, + "eval_dim_256_max_accuracy": 0.8793185044959773, + "eval_dim_384_cosine_accuracy": 0.8770705158542357, + "eval_dim_384_dot_accuracy": 0.12292948414576432, + "eval_dim_384_euclidean_accuracy": 0.8770705158542357, + "eval_dim_384_manhattan_accuracy": 0.8778987221959299, + "eval_dim_384_max_accuracy": 0.8778987221959299, + "eval_dim_64_cosine_accuracy": 0.8722195929957407, + "eval_dim_64_dot_accuracy": 0.14162328442972077, + "eval_dim_64_euclidean_accuracy": 0.8700899195456696, + "eval_dim_64_manhattan_accuracy": 0.8767155702792239, + "eval_dim_64_max_accuracy": 0.8767155702792239, + "eval_loss": 16.468605041503906, + "eval_runtime": 101.8518, + "eval_samples_per_second": 82.983, + "eval_sequential_score": 0.8722195929957407, + "eval_steps_per_second": 2.602, + "step": 3350 + }, + { + "epoch": 22.885990744636096, + "grad_norm": 6.396854877471924, + "learning_rate": 1.3108109370257714e-05, + "loss": 15.9622, + "step": 3400 + }, + { + "epoch": 22.885990744636096, + "eval_dim_128_cosine_accuracy": 0.8743492664458117, + "eval_dim_128_dot_accuracy": 0.13002839564600094, + "eval_dim_128_euclidean_accuracy": 0.873639375295788, + "eval_dim_128_manhattan_accuracy": 0.8786086133459536, + "eval_dim_128_max_accuracy": 0.8786086133459536, + "eval_dim_256_cosine_accuracy": 0.8760056791292002, + "eval_dim_256_dot_accuracy": 0.12434926644581164, + "eval_dim_256_euclidean_accuracy": 0.8757690487458589, + "eval_dim_256_manhattan_accuracy": 0.8805016564126834, + "eval_dim_256_max_accuracy": 0.8805016564126834, + "eval_dim_384_cosine_accuracy": 0.8764789398958827, + "eval_dim_384_dot_accuracy": 0.12352106010411737, + "eval_dim_384_euclidean_accuracy": 0.8764789398958827, + "eval_dim_384_manhattan_accuracy": 0.8807382867960246, + "eval_dim_384_max_accuracy": 0.8807382867960246, + "eval_dim_64_cosine_accuracy": 0.8721012778040701, + "eval_dim_64_dot_accuracy": 0.14351632749645055, + "eval_dim_64_euclidean_accuracy": 0.8703265499290109, + "eval_dim_64_manhattan_accuracy": 0.8781353525792712, + "eval_dim_64_max_accuracy": 0.8781353525792712, + "eval_loss": 16.473587036132812, + "eval_runtime": 103.4538, + "eval_samples_per_second": 81.698, + "eval_sequential_score": 0.8721012778040701, + "eval_steps_per_second": 2.562, + "step": 3400 + }, + { + "epoch": 23.222549432057214, + "grad_norm": 4.757622241973877, + "learning_rate": 1.288309190272222e-05, + "loss": 15.9881, + "step": 3450 + }, + { + "epoch": 23.222549432057214, + "eval_dim_128_cosine_accuracy": 0.8743492664458117, + "eval_dim_128_dot_accuracy": 0.13097491717936582, + "eval_dim_128_euclidean_accuracy": 0.8737576904874585, + "eval_dim_128_manhattan_accuracy": 0.879081874112636, + "eval_dim_128_max_accuracy": 0.879081874112636, + "eval_dim_256_cosine_accuracy": 0.8756507335541883, + "eval_dim_256_dot_accuracy": 0.12588736393752958, + "eval_dim_256_euclidean_accuracy": 0.8747042120208235, + "eval_dim_256_manhattan_accuracy": 0.8795551348793185, + "eval_dim_256_max_accuracy": 0.8795551348793185, + "eval_dim_384_cosine_accuracy": 0.8755324183625177, + "eval_dim_384_dot_accuracy": 0.12446758163748226, + "eval_dim_384_euclidean_accuracy": 0.8755324183625177, + "eval_dim_384_manhattan_accuracy": 0.879081874112636, + "eval_dim_384_max_accuracy": 0.879081874112636, + "eval_dim_64_cosine_accuracy": 0.8723379081874113, + "eval_dim_64_dot_accuracy": 0.14375295787979175, + "eval_dim_64_euclidean_accuracy": 0.8700899195456696, + "eval_dim_64_manhattan_accuracy": 0.8788452437292948, + "eval_dim_64_max_accuracy": 0.8788452437292948, + "eval_loss": 16.48019790649414, + "eval_runtime": 104.0826, + "eval_samples_per_second": 81.205, + "eval_sequential_score": 0.8723379081874113, + "eval_steps_per_second": 2.546, + "step": 3450 + }, + { + "epoch": 23.559108119478335, + "grad_norm": 5.279081344604492, + "learning_rate": 1.2656470711108763e-05, + "loss": 15.9482, + "step": 3500 + }, + { + "epoch": 23.559108119478335, + "eval_dim_128_cosine_accuracy": 0.8724562233790819, + "eval_dim_128_dot_accuracy": 0.13073828679602462, + "eval_dim_128_euclidean_accuracy": 0.8728111689540937, + "eval_dim_128_manhattan_accuracy": 0.8783719829626124, + "eval_dim_128_max_accuracy": 0.8783719829626124, + "eval_dim_256_cosine_accuracy": 0.8761239943208708, + "eval_dim_256_dot_accuracy": 0.1250591575958353, + "eval_dim_256_euclidean_accuracy": 0.8761239943208708, + "eval_dim_256_manhattan_accuracy": 0.8797917652626597, + "eval_dim_256_max_accuracy": 0.8797917652626597, + "eval_dim_384_cosine_accuracy": 0.8761239943208708, + "eval_dim_384_dot_accuracy": 0.1238760056791292, + "eval_dim_384_euclidean_accuracy": 0.8761239943208708, + "eval_dim_384_manhattan_accuracy": 0.8770705158542357, + "eval_dim_384_max_accuracy": 0.8770705158542357, + "eval_dim_64_cosine_accuracy": 0.8710364410790346, + "eval_dim_64_dot_accuracy": 0.143989588263133, + "eval_dim_64_euclidean_accuracy": 0.867841930903928, + "eval_dim_64_manhattan_accuracy": 0.8764789398958827, + "eval_dim_64_max_accuracy": 0.8764789398958827, + "eval_loss": 16.482074737548828, + "eval_runtime": 102.3602, + "eval_samples_per_second": 82.571, + "eval_sequential_score": 0.8710364410790346, + "eval_steps_per_second": 2.589, + "step": 3500 + }, + { + "epoch": 23.895666806899452, + "grad_norm": 5.999639511108398, + "learning_rate": 1.2428371853785872e-05, + "loss": 15.9228, + "step": 3550 + }, + { + "epoch": 23.895666806899452, + "eval_dim_128_cosine_accuracy": 0.8725745385707525, + "eval_dim_128_dot_accuracy": 0.13310459062943683, + "eval_dim_128_euclidean_accuracy": 0.8719829626123994, + "eval_dim_128_manhattan_accuracy": 0.878490298154283, + "eval_dim_128_max_accuracy": 0.878490298154283, + "eval_dim_256_cosine_accuracy": 0.8748225272124941, + "eval_dim_256_dot_accuracy": 0.12671557027922387, + "eval_dim_256_euclidean_accuracy": 0.8743492664458117, + "eval_dim_256_manhattan_accuracy": 0.879081874112636, + "eval_dim_256_max_accuracy": 0.879081874112636, + "eval_dim_384_cosine_accuracy": 0.8750591575958353, + "eval_dim_384_dot_accuracy": 0.1249408424041647, + "eval_dim_384_euclidean_accuracy": 0.8750591575958353, + "eval_dim_384_manhattan_accuracy": 0.8781353525792712, + "eval_dim_384_max_accuracy": 0.8781353525792712, + "eval_dim_64_cosine_accuracy": 0.870918125887364, + "eval_dim_64_dot_accuracy": 0.14469947941315664, + "eval_dim_64_euclidean_accuracy": 0.8691433980123048, + "eval_dim_64_manhattan_accuracy": 0.8776620918125887, + "eval_dim_64_max_accuracy": 0.8776620918125887, + "eval_loss": 16.499635696411133, + "eval_runtime": 103.3405, + "eval_samples_per_second": 81.788, + "eval_sequential_score": 0.870918125887364, + "eval_steps_per_second": 2.564, + "step": 3550 + }, + { + "epoch": 24.232225494320573, + "grad_norm": 6.511181354522705, + "learning_rate": 1.2198922211075779e-05, + "loss": 15.9418, + "step": 3600 + }, + { + "epoch": 24.232225494320573, + "eval_dim_128_cosine_accuracy": 0.870918125887364, + "eval_dim_128_dot_accuracy": 0.1353525792711784, + "eval_dim_128_euclidean_accuracy": 0.8703265499290109, + "eval_dim_128_manhattan_accuracy": 0.8783719829626124, + "eval_dim_128_max_accuracy": 0.8783719829626124, + "eval_dim_256_cosine_accuracy": 0.8728111689540937, + "eval_dim_256_dot_accuracy": 0.12884524372929484, + "eval_dim_256_euclidean_accuracy": 0.8721012778040701, + "eval_dim_256_manhattan_accuracy": 0.8794368196876479, + "eval_dim_256_max_accuracy": 0.8794368196876479, + "eval_dim_384_cosine_accuracy": 0.8734027449124467, + "eval_dim_384_dot_accuracy": 0.12659725508755323, + "eval_dim_384_euclidean_accuracy": 0.8734027449124467, + "eval_dim_384_manhattan_accuracy": 0.8794368196876479, + "eval_dim_384_max_accuracy": 0.8794368196876479, + "eval_dim_64_cosine_accuracy": 0.8698532891623284, + "eval_dim_64_dot_accuracy": 0.14564600094652153, + "eval_dim_64_euclidean_accuracy": 0.8680785612872692, + "eval_dim_64_manhattan_accuracy": 0.8770705158542357, + "eval_dim_64_max_accuracy": 0.8770705158542357, + "eval_loss": 16.497343063354492, + "eval_runtime": 104.6868, + "eval_samples_per_second": 80.736, + "eval_sequential_score": 0.8698532891623284, + "eval_steps_per_second": 2.531, + "step": 3600 + }, + { + "epoch": 24.56878418174169, + "grad_norm": 5.682207107543945, + "learning_rate": 1.1968249414677055e-05, + "loss": 15.896, + "step": 3650 + }, + { + "epoch": 24.56878418174169, + "eval_dim_128_cosine_accuracy": 0.8696166587789872, + "eval_dim_128_dot_accuracy": 0.13487931850449597, + "eval_dim_128_euclidean_accuracy": 0.8685518220539518, + "eval_dim_128_manhattan_accuracy": 0.8764789398958827, + "eval_dim_128_max_accuracy": 0.8764789398958827, + "eval_dim_256_cosine_accuracy": 0.8716280170373876, + "eval_dim_256_dot_accuracy": 0.12896355892096545, + "eval_dim_256_euclidean_accuracy": 0.871509701845717, + "eval_dim_256_manhattan_accuracy": 0.8777804070042593, + "eval_dim_256_max_accuracy": 0.8777804070042593, + "eval_dim_384_cosine_accuracy": 0.8726928537624231, + "eval_dim_384_dot_accuracy": 0.1273071462375769, + "eval_dim_384_euclidean_accuracy": 0.8726928537624231, + "eval_dim_384_manhattan_accuracy": 0.8773071462375769, + "eval_dim_384_max_accuracy": 0.8773071462375769, + "eval_dim_64_cosine_accuracy": 0.8685518220539518, + "eval_dim_64_dot_accuracy": 0.14694746805489825, + "eval_dim_64_euclidean_accuracy": 0.8659488878371983, + "eval_dim_64_manhattan_accuracy": 0.8760056791292002, + "eval_dim_64_max_accuracy": 0.8760056791292002, + "eval_loss": 16.498498916625977, + "eval_runtime": 102.6029, + "eval_samples_per_second": 82.376, + "eval_sequential_score": 0.8685518220539518, + "eval_steps_per_second": 2.583, + "step": 3650 + }, + { + "epoch": 24.90534286916281, + "grad_norm": 5.5915117263793945, + "learning_rate": 1.1736481776669307e-05, + "loss": 15.8788, + "step": 3700 + }, + { + "epoch": 24.90534286916281, + "eval_dim_128_cosine_accuracy": 0.8691433980123048, + "eval_dim_128_dot_accuracy": 0.1361807856128727, + "eval_dim_128_euclidean_accuracy": 0.8697349739706578, + "eval_dim_128_manhattan_accuracy": 0.8747042120208235, + "eval_dim_128_max_accuracy": 0.8747042120208235, + "eval_dim_256_cosine_accuracy": 0.871509701845717, + "eval_dim_256_dot_accuracy": 0.13073828679602462, + "eval_dim_256_euclidean_accuracy": 0.8704448651206815, + "eval_dim_256_manhattan_accuracy": 0.8770705158542357, + "eval_dim_256_max_accuracy": 0.8770705158542357, + "eval_dim_384_cosine_accuracy": 0.8717463322290582, + "eval_dim_384_dot_accuracy": 0.1282536677709418, + "eval_dim_384_euclidean_accuracy": 0.8717463322290582, + "eval_dim_384_manhattan_accuracy": 0.8758873639375295, + "eval_dim_384_max_accuracy": 0.8758873639375295, + "eval_dim_64_cosine_accuracy": 0.8661855182205395, + "eval_dim_64_dot_accuracy": 0.14824893516327498, + "eval_dim_64_euclidean_accuracy": 0.8667770941788926, + "eval_dim_64_manhattan_accuracy": 0.8744675816374823, + "eval_dim_64_max_accuracy": 0.8744675816374823, + "eval_loss": 16.517175674438477, + "eval_runtime": 103.5179, + "eval_samples_per_second": 81.648, + "eval_sequential_score": 0.8661855182205395, + "eval_steps_per_second": 2.56, + "step": 3700 + }, + { + "epoch": 25.24190155658393, + "grad_norm": 5.408066749572754, + "learning_rate": 1.150374821813937e-05, + "loss": 15.9147, + "step": 3750 + }, + { + "epoch": 25.24190155658393, + "eval_dim_128_cosine_accuracy": 0.8677236157122574, + "eval_dim_128_dot_accuracy": 0.13724562233790819, + "eval_dim_128_euclidean_accuracy": 0.8673686701372456, + "eval_dim_128_manhattan_accuracy": 0.8768338854708945, + "eval_dim_128_max_accuracy": 0.8768338854708945, + "eval_dim_256_cosine_accuracy": 0.8705631803123521, + "eval_dim_256_dot_accuracy": 0.13144817794604827, + "eval_dim_256_euclidean_accuracy": 0.869971604353999, + "eval_dim_256_manhattan_accuracy": 0.8782536677709418, + "eval_dim_256_max_accuracy": 0.8782536677709418, + "eval_dim_384_cosine_accuracy": 0.8711547562707052, + "eval_dim_384_dot_accuracy": 0.12884524372929484, + "eval_dim_384_euclidean_accuracy": 0.8711547562707052, + "eval_dim_384_manhattan_accuracy": 0.8782536677709418, + "eval_dim_384_max_accuracy": 0.8782536677709418, + "eval_dim_64_cosine_accuracy": 0.8661855182205395, + "eval_dim_64_dot_accuracy": 0.14955040227165167, + "eval_dim_64_euclidean_accuracy": 0.866658778987222, + "eval_dim_64_manhattan_accuracy": 0.8768338854708945, + "eval_dim_64_max_accuracy": 0.8768338854708945, + "eval_loss": 16.506189346313477, + "eval_runtime": 103.9114, + "eval_samples_per_second": 81.339, + "eval_sequential_score": 0.8661855182205395, + "eval_steps_per_second": 2.55, + "step": 3750 + }, + { + "epoch": 25.578460244005047, + "grad_norm": 6.964442253112793, + "learning_rate": 1.1270178197468788e-05, + "loss": 15.857, + "step": 3800 + }, + { + "epoch": 25.578460244005047, + "eval_dim_128_cosine_accuracy": 0.8683151916706106, + "eval_dim_128_dot_accuracy": 0.13712730714623758, + "eval_dim_128_euclidean_accuracy": 0.86819687647894, + "eval_dim_128_manhattan_accuracy": 0.8739943208707998, + "eval_dim_128_max_accuracy": 0.8739943208707998, + "eval_dim_256_cosine_accuracy": 0.8717463322290582, + "eval_dim_256_dot_accuracy": 0.13073828679602462, + "eval_dim_256_euclidean_accuracy": 0.871509701845717, + "eval_dim_256_manhattan_accuracy": 0.8761239943208708, + "eval_dim_256_max_accuracy": 0.8761239943208708, + "eval_dim_384_cosine_accuracy": 0.8731661145291055, + "eval_dim_384_dot_accuracy": 0.12683388547089447, + "eval_dim_384_euclidean_accuracy": 0.8731661145291055, + "eval_dim_384_manhattan_accuracy": 0.8755324183625177, + "eval_dim_384_max_accuracy": 0.8755324183625177, + "eval_dim_64_cosine_accuracy": 0.8663038334122102, + "eval_dim_64_dot_accuracy": 0.1499053478466635, + "eval_dim_64_euclidean_accuracy": 0.865120681495504, + "eval_dim_64_manhattan_accuracy": 0.8748225272124941, + "eval_dim_64_max_accuracy": 0.8748225272124941, + "eval_loss": 16.505783081054688, + "eval_runtime": 102.7207, + "eval_samples_per_second": 82.281, + "eval_sequential_score": 0.8663038334122102, + "eval_steps_per_second": 2.58, + "step": 3800 + }, + { + "epoch": 25.915018931426168, + "grad_norm": 17.978727340698242, + "learning_rate": 1.1035901638322392e-05, + "loss": 15.8291, + "step": 3850 + }, + { + "epoch": 25.915018931426168, + "eval_dim_128_cosine_accuracy": 0.8673686701372456, + "eval_dim_128_dot_accuracy": 0.13771888310459063, + "eval_dim_128_euclidean_accuracy": 0.8665404637955514, + "eval_dim_128_manhattan_accuracy": 0.8745858968291529, + "eval_dim_128_max_accuracy": 0.8745858968291529, + "eval_dim_256_cosine_accuracy": 0.8702082347373403, + "eval_dim_256_dot_accuracy": 0.1320397539044013, + "eval_dim_256_euclidean_accuracy": 0.868788452437293, + "eval_dim_256_manhattan_accuracy": 0.8756507335541883, + "eval_dim_256_max_accuracy": 0.8756507335541883, + "eval_dim_384_cosine_accuracy": 0.8705631803123521, + "eval_dim_384_dot_accuracy": 0.1294368196876479, + "eval_dim_384_euclidean_accuracy": 0.8705631803123521, + "eval_dim_384_manhattan_accuracy": 0.8762423095125415, + "eval_dim_384_max_accuracy": 0.8762423095125415, + "eval_dim_64_cosine_accuracy": 0.8644107903454804, + "eval_dim_64_dot_accuracy": 0.15309985802177, + "eval_dim_64_euclidean_accuracy": 0.8640558447704685, + "eval_dim_64_manhattan_accuracy": 0.8731661145291055, + "eval_dim_64_max_accuracy": 0.8731661145291055, + "eval_loss": 16.520679473876953, + "eval_runtime": 104.1552, + "eval_samples_per_second": 81.148, + "eval_sequential_score": 0.8644107903454804, + "eval_steps_per_second": 2.544, + "step": 3850 + }, + { + "epoch": 26.251577618847286, + "grad_norm": 7.759204387664795, + "learning_rate": 1.080104885737807e-05, + "loss": 15.8802, + "step": 3900 + }, + { + "epoch": 26.251577618847286, + "eval_dim_128_cosine_accuracy": 0.867841930903928, + "eval_dim_128_dot_accuracy": 0.13913866540463796, + "eval_dim_128_euclidean_accuracy": 0.86819687647894, + "eval_dim_128_manhattan_accuracy": 0.8750591575958353, + "eval_dim_128_max_accuracy": 0.8750591575958353, + "eval_dim_256_cosine_accuracy": 0.8697349739706578, + "eval_dim_256_dot_accuracy": 0.1318031235210601, + "eval_dim_256_euclidean_accuracy": 0.8697349739706578, + "eval_dim_256_manhattan_accuracy": 0.8764789398958827, + "eval_dim_256_max_accuracy": 0.8764789398958827, + "eval_dim_384_cosine_accuracy": 0.8713913866540464, + "eval_dim_384_dot_accuracy": 0.1286086133459536, + "eval_dim_384_euclidean_accuracy": 0.8713913866540464, + "eval_dim_384_manhattan_accuracy": 0.8762423095125415, + "eval_dim_384_max_accuracy": 0.8762423095125415, + "eval_dim_64_cosine_accuracy": 0.8664221486038808, + "eval_dim_64_dot_accuracy": 0.15061523899668716, + "eval_dim_64_euclidean_accuracy": 0.8655939422621864, + "eval_dim_64_manhattan_accuracy": 0.8737576904874585, + "eval_dim_64_max_accuracy": 0.8737576904874585, + "eval_loss": 16.52326011657715, + "eval_runtime": 103.0214, + "eval_samples_per_second": 82.041, + "eval_sequential_score": 0.8664221486038808, + "eval_steps_per_second": 2.572, + "step": 3900 + }, + { + "epoch": 26.588136306268407, + "grad_norm": 6.951057434082031, + "learning_rate": 1.0565750491837925e-05, + "loss": 15.846, + "step": 3950 + }, + { + "epoch": 26.588136306268407, + "eval_dim_128_cosine_accuracy": 0.8685518220539518, + "eval_dim_128_dot_accuracy": 0.13689067676289635, + "eval_dim_128_euclidean_accuracy": 0.867841930903928, + "eval_dim_128_manhattan_accuracy": 0.8729294841457643, + "eval_dim_128_max_accuracy": 0.8729294841457643, + "eval_dim_256_cosine_accuracy": 0.8712730714623758, + "eval_dim_256_dot_accuracy": 0.13097491717936582, + "eval_dim_256_euclidean_accuracy": 0.8704448651206815, + "eval_dim_256_manhattan_accuracy": 0.8765972550875533, + "eval_dim_256_max_accuracy": 0.8765972550875533, + "eval_dim_384_cosine_accuracy": 0.8717463322290582, + "eval_dim_384_dot_accuracy": 0.1282536677709418, + "eval_dim_384_euclidean_accuracy": 0.8717463322290582, + "eval_dim_384_manhattan_accuracy": 0.8741126360624705, + "eval_dim_384_max_accuracy": 0.8741126360624705, + "eval_dim_64_cosine_accuracy": 0.8654756270705158, + "eval_dim_64_dot_accuracy": 0.1499053478466635, + "eval_dim_64_euclidean_accuracy": 0.865120681495504, + "eval_dim_64_manhattan_accuracy": 0.8729294841457643, + "eval_dim_64_max_accuracy": 0.8729294841457643, + "eval_loss": 16.517038345336914, + "eval_runtime": 103.0824, + "eval_samples_per_second": 81.993, + "eval_sequential_score": 0.8654756270705158, + "eval_steps_per_second": 2.571, + "step": 3950 + }, + { + "epoch": 26.924694993689524, + "grad_norm": 7.398913860321045, + "learning_rate": 1.0330137426761136e-05, + "loss": 15.8012, + "step": 4000 + }, + { + "epoch": 26.924694993689524, + "eval_dim_128_cosine_accuracy": 0.8663038334122102, + "eval_dim_128_dot_accuracy": 0.1386654046379555, + "eval_dim_128_euclidean_accuracy": 0.865712257453857, + "eval_dim_128_manhattan_accuracy": 0.8742309512541411, + "eval_dim_128_max_accuracy": 0.8742309512541411, + "eval_dim_256_cosine_accuracy": 0.86819687647894, + "eval_dim_256_dot_accuracy": 0.1319214387127307, + "eval_dim_256_euclidean_accuracy": 0.8680785612872692, + "eval_dim_256_manhattan_accuracy": 0.8762423095125415, + "eval_dim_256_max_accuracy": 0.8762423095125415, + "eval_dim_384_cosine_accuracy": 0.8698532891623284, + "eval_dim_384_dot_accuracy": 0.13014671083767157, + "eval_dim_384_euclidean_accuracy": 0.8698532891623284, + "eval_dim_384_manhattan_accuracy": 0.8737576904874585, + "eval_dim_384_max_accuracy": 0.8737576904874585, + "eval_dim_64_cosine_accuracy": 0.8634642688121155, + "eval_dim_64_dot_accuracy": 0.1508518693800284, + "eval_dim_64_euclidean_accuracy": 0.8637008991954567, + "eval_dim_64_manhattan_accuracy": 0.8728111689540937, + "eval_dim_64_max_accuracy": 0.8728111689540937, + "eval_loss": 16.53356170654297, + "eval_runtime": 104.1737, + "eval_samples_per_second": 81.134, + "eval_sequential_score": 0.8634642688121155, + "eval_steps_per_second": 2.544, + "step": 4000 + } + ], + "logging_steps": 50, + "max_steps": 7400, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}