Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/eval_results.json +1 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/config.json +27 -0
- gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/eval_results.json +1 -0
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.025,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "0"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 217.6, "l1_loss": 12697.6, "l0": 9219.10849609375, "frac_variance_explained": -1.03125, "cossim": 0.0041290283203125, "l2_ratio": 1.1546875, "relative_reconstruction_bias": 242.8, "loss_original": 2.440642213821411, "loss_reconstructed": 19.563122940063476, "loss_zero": 12.452932643890382, "frac_recovered": -0.710543018579483, "frac_alive": 1.0, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.025,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "154"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 128.35, "l1_loss": 3496.0, "l0": 3833.566796875, "frac_variance_explained": -0.017578125, "cossim": 0.4515625, "l2_ratio": 0.462890625, "relative_reconstruction_bias": 1.023046875, "loss_original": 2.440642213821411, "loss_reconstructed": 8.181695604324341, "loss_zero": 12.452932643890382, "frac_recovered": 0.4267542868852615, "frac_alive": 0.9962565302848816, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.025,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "1544"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 69.55, "l1_loss": 493.4, "l0": 255.4000045776367, "frac_variance_explained": 0.64609375, "cossim": 0.8765625, "l2_ratio": 0.824609375, "relative_reconstruction_bias": 0.947265625, "loss_original": 2.440642213821411, "loss_reconstructed": 3.072607707977295, "loss_zero": 12.452932643890382, "frac_recovered": 0.9368688404560089, "frac_alive": 0.2028537392616272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.025,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "15440"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 45.65, "l1_loss": 719.6, "l0": 673.3541748046875, "frac_variance_explained": 0.8828125, "cossim": 0.948046875, "l2_ratio": 0.91171875, "relative_reconstruction_bias": 0.974609375, "loss_original": 2.440642213821411, "loss_reconstructed": 2.5144663572311403, "loss_zero": 12.452932643890382, "frac_recovered": 0.992665809392929, "frac_alive": 0.6722548007965088, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.025,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "48"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 174.1, "l1_loss": 11296.0, "l0": 8454.97939453125, "frac_variance_explained": -0.75546875, "cossim": 0.10068359375, "l2_ratio": 0.75859375, "relative_reconstruction_bias": 7.478125, "loss_original": 2.440642213821411, "loss_reconstructed": 10.471371078491211, "loss_zero": 12.452932643890382, "frac_recovered": 0.1980000004172325, "frac_alive": 0.9999457597732544, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.025,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "488"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 89.5, "l1_loss": 508.4, "l0": 306.70000915527345, "frac_variance_explained": 0.43046875, "cossim": 0.784375, "l2_ratio": 0.713671875, "relative_reconstruction_bias": 0.91953125, "loss_original": 2.440642213821411, "loss_reconstructed": 5.207838249206543, "loss_zero": 12.452932643890382, "frac_recovered": 0.7236662685871125, "frac_alive": 0.7305229902267456, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.025,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "4882"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 54.025, "l1_loss": 653.2, "l0": 478.7708465576172, "frac_variance_explained": 0.78828125, "cossim": 0.926953125, "l2_ratio": 0.87734375, "relative_reconstruction_bias": 0.951953125, "loss_original": 2.440642213821411, "loss_reconstructed": 2.5821482658386232, "loss_zero": 12.452932643890382, "frac_recovered": 0.98592569231987, "frac_alive": 0.2722981870174408, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.035,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "0"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 230.8, "l1_loss": 13510.4, "l0": 9219.1044921875, "frac_variance_explained": -1.053125, "cossim": 0.0073699951171875, "l2_ratio": 1.15546875, "relative_reconstruction_bias": 181.7, "loss_original": 2.440642213821411, "loss_reconstructed": 19.563122940063476, "loss_zero": 12.452932643890382, "frac_recovered": -0.710543018579483, "frac_alive": 1.0, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.035,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "154"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 138.4, "l1_loss": 3404.8, "l0": 3756.1584228515626, "frac_variance_explained": -0.034765625, "cossim": 0.35234375, "l2_ratio": 0.4197265625, "relative_reconstruction_bias": 1.165625, "loss_original": 2.440642213821411, "loss_reconstructed": 9.118078804016113, "loss_zero": 12.452932643890382, "frac_recovered": 0.33320634365081786, "frac_alive": 0.997178852558136, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.035,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "1544"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 80.25, "l1_loss": 283.6, "l0": 86.57500228881835, "frac_variance_explained": 0.50859375, "cossim": 0.832421875, "l2_ratio": 0.780859375, "relative_reconstruction_bias": 0.944140625, "loss_original": 2.440642213821411, "loss_reconstructed": 4.01392765045166, "loss_zero": 12.452932643890382, "frac_recovered": 0.8428499519824981, "frac_alive": 0.138671875, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.035,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "15440"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 58.025, "l1_loss": 424.0, "l0": 212.88750610351562, "frac_variance_explained": 0.785546875, "cossim": 0.91328125, "l2_ratio": 0.867578125, "relative_reconstruction_bias": 0.959765625, "loss_original": 2.440642213821411, "loss_reconstructed": 2.6162596464157106, "loss_zero": 12.452932643890382, "frac_recovered": 0.9825248777866363, "frac_alive": 0.3571506142616272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.035,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "48"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 188.9, "l1_loss": 12384.0, "l0": 8454.1712890625, "frac_variance_explained": -0.6765625, "cossim": 0.08720703125, "l2_ratio": 0.75625, "relative_reconstruction_bias": 8.128125, "loss_original": 2.440642213821411, "loss_reconstructed": 10.604339790344238, "loss_zero": 12.452932643890382, "frac_recovered": 0.18471183478832245, "frac_alive": 0.9999457597732544, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.035,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "488"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 98.15, "l1_loss": 381.2, "l0": 213.7291748046875, "frac_variance_explained": 0.3546875, "cossim": 0.727734375, "l2_ratio": 0.65, "relative_reconstruction_bias": 0.910546875, "loss_original": 2.440642213821411, "loss_reconstructed": 6.74614634513855, "loss_zero": 12.452932643890382, "frac_recovered": 0.5700634002685547, "frac_alive": 0.7088758945465088, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.035,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "4882"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 68.95, "l1_loss": 354.8, "l0": 141.5416732788086, "frac_variance_explained": 0.626953125, "cossim": 0.87421875, "l2_ratio": 0.82109375, "relative_reconstruction_bias": 0.94296875, "loss_original": 2.440642213821411, "loss_reconstructed": 2.8861867427825927, "loss_zero": 12.452932643890382, "frac_recovered": 0.9555670261383057, "frac_alive": 0.1347113698720932, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.04,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "0"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 227.9, "l1_loss": 13337.6, "l0": 9220.1462890625, "frac_variance_explained": -1.034375, "cossim": 0.00837249755859375, "l2_ratio": 1.15546875, "relative_reconstruction_bias": 130.1, "loss_original": 2.440642213821411, "loss_reconstructed": 19.563122940063476, "loss_zero": 12.452932643890382, "frac_recovered": -0.710543018579483, "frac_alive": 1.0, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.04,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "154"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 141.2, "l1_loss": 3417.6, "l0": 3691.47509765625, "frac_variance_explained": -0.040234375, "cossim": 0.3244140625, "l2_ratio": 0.4064453125, "relative_reconstruction_bias": 1.22109375, "loss_original": 2.440642213821411, "loss_reconstructed": 9.383785438537597, "loss_zero": 12.452932643890382, "frac_recovered": 0.3066366076469421, "frac_alive": 0.99658203125, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.04,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "1544"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 82.7, "l1_loss": 241.6, "l0": 60.62500152587891, "frac_variance_explained": 0.5109375, "cossim": 0.8140625, "l2_ratio": 0.765625, "relative_reconstruction_bias": 0.950390625, "loss_original": 2.440642213821411, "loss_reconstructed": 4.482995939254761, "loss_zero": 12.452932643890382, "frac_recovered": 0.7959964573383331, "frac_alive": 0.1371527761220932, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.04,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "15440"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 63.65, "l1_loss": 358.4, "l0": 140.45833892822264, "frac_variance_explained": 0.72890625, "cossim": 0.897265625, "l2_ratio": 0.844921875, "relative_reconstruction_bias": 0.951953125, "loss_original": 2.440642213821411, "loss_reconstructed": 2.716301202774048, "loss_zero": 12.452932643890382, "frac_recovered": 0.9725344896316528, "frac_alive": 0.241970494389534, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.04,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "48"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 176.7, "l1_loss": 11347.2, "l0": 8398.85029296875, "frac_variance_explained": -0.7203125, "cossim": 0.086279296875, "l2_ratio": 0.7546875, "relative_reconstruction_bias": 8.26875, "loss_original": 2.440642213821411, "loss_reconstructed": 10.636037349700928, "loss_zero": 12.452932643890382, "frac_recovered": 0.18154401183128357, "frac_alive": 0.9999457597732544, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.04,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "488"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 101.1, "l1_loss": 326.8, "l0": 210.6291702270508, "frac_variance_explained": 0.2640625, "cossim": 0.70234375, "l2_ratio": 0.614453125, "relative_reconstruction_bias": 0.890625, "loss_original": 2.440642213821411, "loss_reconstructed": 7.406615495681763, "loss_zero": 12.452932643890382, "frac_recovered": 0.5041129201650619, "frac_alive": 0.7194553017616272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.04,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "4882"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 72.2, "l1_loss": 301.6, "l0": 86.50000228881837, "frac_variance_explained": 0.73359375, "cossim": 0.8578125, "l2_ratio": 0.798828125, "relative_reconstruction_bias": 0.959375, "loss_original": 2.440642213821411, "loss_reconstructed": 3.290687155723572, "loss_zero": 12.452932643890382, "frac_recovered": 0.9150846123695373, "frac_alive": 0.0876193568110466, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.05,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "0"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 224.6, "l1_loss": 13132.8, "l0": 9213.05458984375, "frac_variance_explained": -1.03046875, "cossim": 0.007816314697265625, "l2_ratio": 1.15546875, "relative_reconstruction_bias": 195.5, "loss_original": 2.440642213821411, "loss_reconstructed": 19.563122940063476, "loss_zero": 12.452932643890382, "frac_recovered": -0.710543018579483, "frac_alive": 1.0, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.05,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "154"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 140.9, "l1_loss": 3142.4, "l0": 3520.612548828125, "frac_variance_explained": -0.069140625, "cossim": 0.291015625, "l2_ratio": 0.3875, "relative_reconstruction_bias": 1.308203125, "loss_original": 2.440642213821411, "loss_reconstructed": 9.744282913208007, "loss_zero": 12.452932643890382, "frac_recovered": 0.2705884039402008, "frac_alive": 0.9943576455116272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.05,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "1544"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 91.6, "l1_loss": 197.6, "l0": 37.650001525878906, "frac_variance_explained": 0.503515625, "cossim": 0.78046875, "l2_ratio": 0.72265625, "relative_reconstruction_bias": 0.947265625, "loss_original": 2.440642213821411, "loss_reconstructed": 5.352246809005737, "loss_zero": 12.452932643890382, "frac_recovered": 0.709167218208313, "frac_alive": 0.1135525181889534, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/config.json
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"trainer": {
|
3 |
+
"dict_class": "AutoEncoder",
|
4 |
+
"trainer_class": "StandardTrainer",
|
5 |
+
"activation_dim": 2304,
|
6 |
+
"dict_size": 18432,
|
7 |
+
"lr": 0.0003,
|
8 |
+
"l1_penalty": 0.05,
|
9 |
+
"warmup_steps": 1000,
|
10 |
+
"resample_steps": null,
|
11 |
+
"device": "cuda:0",
|
12 |
+
"layer": 11,
|
13 |
+
"lm_name": "google/gemma-2-2b",
|
14 |
+
"wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
|
15 |
+
"submodule_name": "resid_post_layer_11",
|
16 |
+
"steps": "15440"
|
17 |
+
},
|
18 |
+
"buffer": {
|
19 |
+
"d_submodule": 2304,
|
20 |
+
"io": "out",
|
21 |
+
"n_ctxs": 2000,
|
22 |
+
"ctx_len": 128,
|
23 |
+
"refresh_batch_size": 24,
|
24 |
+
"out_batch_size": 4096,
|
25 |
+
"device": "cuda:0"
|
26 |
+
}
|
27 |
+
}
|
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/eval_results.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"l2_loss": 72.05, "l1_loss": 252.3, "l0": 64.75000114440918, "frac_variance_explained": 0.631640625, "cossim": 0.861328125, "l2_ratio": 0.807421875, "relative_reconstruction_bias": 0.9453125, "loss_original": 2.440642213821411, "loss_reconstructed": 3.132000136375427, "loss_zero": 12.452932643890382, "frac_recovered": 0.9310269713401794, "frac_alive": 0.099500872194767, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
|