{"output_dir": "/data2/assaf/mamba/outputs/models", "cache_dir": "/data2/hf_cache", "activate_logging": true, "wandb_dir": "/data2/assaf/wandb/mamba", "run_name_addon": "retrieve doc id, train full dataset, append query to each noise doc + query/doc flag, 11 train noise docs, inj policy=random_loc, batch_size=64, deci_layer=12, Lbase=2000", "record_debug_params": false, "recover_step": null, "eval_mode": false, "mamba_arch": "deci", "model_type": "mamba-130m", "use_finetuned_model": false, "load_cp": null, "clip_grad": true, "clip_grad_max_norm": 1, "seed": 123, "lr_sched_type": "const", "sampling_temperature": 1.2, "save_steps": 50, "eval_steps": 10, "grad_flow_steps": 100, "max_step": 200000, "epochs": 10, "model_device": "cuda:0", "dataset": "squad_retrieve", "train_set_size": 6144, "eval_set_size": 20, "eval_samples_to_log": 10, "log_eval_predictions_steps": 10, "eval_max_len": 10, "max_train_input_len": 20000, "enable_eos_token": false, "scrolls_evaluator_path": "/data1/assaf/datasets/scrolls/evaluator/dataset_evaluator.py", "niah_train_set_size": 6144, "niah_context_len_train": 2000, "niah_needle_depths_eval": [0, 0.25, 0.5, 0.75, 1], "niah_context_lens_eval": [128000, 144000, 176000], "ppl_test_context_len_train": 2000, "ppl_test_pred_len": 100, "multidoc_num_noise_docs_train": 11, "multidoc_num_noise_docs_eval": [0, 5, 10, 20, 40, 80, 120, 160, 200], "multidoc_noise_injection_policy": "random_loc", "activate_decimation": true, "decimation_type": "max_p", "decimation_k": 6, "min_decimating_layer": 12, "max_decimating_layer": 20, "decimating_layers": [12], "decimation_min_seq_len": 20, "decimation_max_p_L_base": 2000, "decimation_freeze_encoder": false, "find_deci_layer": false, "lr": 0.0001, "weight_decay": 0.1, "grad_accum_steps": 64, "activate_profiling": false, "deci_num_chunks": 1}