{ "d_model": 128, "num_layers": 2, "T_local": 3, "cluster_size": 8, "seq_len": 256, "batch_size": 96, "learning_rate": 4.76e-4, "weight_decay": 0.0541, "dropout": 0.30 }