canrager commited on
Commit
2756bc6
1 Parent(s): 3715107

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/config.json +27 -0
  2. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/eval_results.json +1 -0
  3. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/config.json +27 -0
  4. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/eval_results.json +1 -0
  5. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/config.json +27 -0
  6. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/eval_results.json +1 -0
  7. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/config.json +27 -0
  8. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/eval_results.json +1 -0
  9. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/config.json +27 -0
  10. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/eval_results.json +1 -0
  11. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/config.json +27 -0
  12. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/eval_results.json +1 -0
  13. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/config.json +27 -0
  14. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/eval_results.json +1 -0
  15. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/config.json +27 -0
  16. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/eval_results.json +1 -0
  17. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/config.json +27 -0
  18. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/eval_results.json +1 -0
  19. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/config.json +27 -0
  20. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/eval_results.json +1 -0
  21. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/config.json +27 -0
  22. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/eval_results.json +1 -0
  23. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/config.json +27 -0
  24. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/eval_results.json +1 -0
  25. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/config.json +27 -0
  26. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/eval_results.json +1 -0
  27. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/config.json +27 -0
  28. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/eval_results.json +1 -0
  29. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/config.json +27 -0
  30. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/eval_results.json +1 -0
  31. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/config.json +27 -0
  32. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/eval_results.json +1 -0
  33. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/config.json +27 -0
  34. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/eval_results.json +1 -0
  35. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/config.json +27 -0
  36. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/eval_results.json +1 -0
  37. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/config.json +27 -0
  38. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/eval_results.json +1 -0
  39. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/config.json +27 -0
  40. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/eval_results.json +1 -0
  41. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/config.json +27 -0
  42. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/eval_results.json +1 -0
  43. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/config.json +27 -0
  44. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/eval_results.json +1 -0
  45. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/config.json +27 -0
  46. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/eval_results.json +1 -0
  47. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/config.json +27 -0
  48. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/eval_results.json +1 -0
  49. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/config.json +27 -0
  50. gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/eval_results.json +1 -0
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.025,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "0"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 217.6, "l1_loss": 12697.6, "l0": 9219.10849609375, "frac_variance_explained": -1.03125, "cossim": 0.0041290283203125, "l2_ratio": 1.1546875, "relative_reconstruction_bias": 242.8, "loss_original": 2.440642213821411, "loss_reconstructed": 19.563122940063476, "loss_zero": 12.452932643890382, "frac_recovered": -0.710543018579483, "frac_alive": 1.0, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.025,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "154"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_154/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 128.35, "l1_loss": 3496.0, "l0": 3833.566796875, "frac_variance_explained": -0.017578125, "cossim": 0.4515625, "l2_ratio": 0.462890625, "relative_reconstruction_bias": 1.023046875, "loss_original": 2.440642213821411, "loss_reconstructed": 8.181695604324341, "loss_zero": 12.452932643890382, "frac_recovered": 0.4267542868852615, "frac_alive": 0.9962565302848816, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.025,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "1544"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_1544/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 69.55, "l1_loss": 493.4, "l0": 255.4000045776367, "frac_variance_explained": 0.64609375, "cossim": 0.8765625, "l2_ratio": 0.824609375, "relative_reconstruction_bias": 0.947265625, "loss_original": 2.440642213821411, "loss_reconstructed": 3.072607707977295, "loss_zero": 12.452932643890382, "frac_recovered": 0.9368688404560089, "frac_alive": 0.2028537392616272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.025,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "15440"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_15440/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 45.65, "l1_loss": 719.6, "l0": 673.3541748046875, "frac_variance_explained": 0.8828125, "cossim": 0.948046875, "l2_ratio": 0.91171875, "relative_reconstruction_bias": 0.974609375, "loss_original": 2.440642213821411, "loss_reconstructed": 2.5144663572311403, "loss_zero": 12.452932643890382, "frac_recovered": 0.992665809392929, "frac_alive": 0.6722548007965088, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.025,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "48"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_48/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 174.1, "l1_loss": 11296.0, "l0": 8454.97939453125, "frac_variance_explained": -0.75546875, "cossim": 0.10068359375, "l2_ratio": 0.75859375, "relative_reconstruction_bias": 7.478125, "loss_original": 2.440642213821411, "loss_reconstructed": 10.471371078491211, "loss_zero": 12.452932643890382, "frac_recovered": 0.1980000004172325, "frac_alive": 0.9999457597732544, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.025,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "488"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_488/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 89.5, "l1_loss": 508.4, "l0": 306.70000915527345, "frac_variance_explained": 0.43046875, "cossim": 0.784375, "l2_ratio": 0.713671875, "relative_reconstruction_bias": 0.91953125, "loss_original": 2.440642213821411, "loss_reconstructed": 5.207838249206543, "loss_zero": 12.452932643890382, "frac_recovered": 0.7236662685871125, "frac_alive": 0.7305229902267456, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.025,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "4882"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_0_step_4882/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 54.025, "l1_loss": 653.2, "l0": 478.7708465576172, "frac_variance_explained": 0.78828125, "cossim": 0.926953125, "l2_ratio": 0.87734375, "relative_reconstruction_bias": 0.951953125, "loss_original": 2.440642213821411, "loss_reconstructed": 2.5821482658386232, "loss_zero": 12.452932643890382, "frac_recovered": 0.98592569231987, "frac_alive": 0.2722981870174408, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.035,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "0"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 230.8, "l1_loss": 13510.4, "l0": 9219.1044921875, "frac_variance_explained": -1.053125, "cossim": 0.0073699951171875, "l2_ratio": 1.15546875, "relative_reconstruction_bias": 181.7, "loss_original": 2.440642213821411, "loss_reconstructed": 19.563122940063476, "loss_zero": 12.452932643890382, "frac_recovered": -0.710543018579483, "frac_alive": 1.0, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.035,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "154"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_154/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 138.4, "l1_loss": 3404.8, "l0": 3756.1584228515626, "frac_variance_explained": -0.034765625, "cossim": 0.35234375, "l2_ratio": 0.4197265625, "relative_reconstruction_bias": 1.165625, "loss_original": 2.440642213821411, "loss_reconstructed": 9.118078804016113, "loss_zero": 12.452932643890382, "frac_recovered": 0.33320634365081786, "frac_alive": 0.997178852558136, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.035,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "1544"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_1544/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 80.25, "l1_loss": 283.6, "l0": 86.57500228881835, "frac_variance_explained": 0.50859375, "cossim": 0.832421875, "l2_ratio": 0.780859375, "relative_reconstruction_bias": 0.944140625, "loss_original": 2.440642213821411, "loss_reconstructed": 4.01392765045166, "loss_zero": 12.452932643890382, "frac_recovered": 0.8428499519824981, "frac_alive": 0.138671875, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.035,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "15440"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_15440/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 58.025, "l1_loss": 424.0, "l0": 212.88750610351562, "frac_variance_explained": 0.785546875, "cossim": 0.91328125, "l2_ratio": 0.867578125, "relative_reconstruction_bias": 0.959765625, "loss_original": 2.440642213821411, "loss_reconstructed": 2.6162596464157106, "loss_zero": 12.452932643890382, "frac_recovered": 0.9825248777866363, "frac_alive": 0.3571506142616272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.035,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "48"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_48/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 188.9, "l1_loss": 12384.0, "l0": 8454.1712890625, "frac_variance_explained": -0.6765625, "cossim": 0.08720703125, "l2_ratio": 0.75625, "relative_reconstruction_bias": 8.128125, "loss_original": 2.440642213821411, "loss_reconstructed": 10.604339790344238, "loss_zero": 12.452932643890382, "frac_recovered": 0.18471183478832245, "frac_alive": 0.9999457597732544, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.035,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "488"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_488/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 98.15, "l1_loss": 381.2, "l0": 213.7291748046875, "frac_variance_explained": 0.3546875, "cossim": 0.727734375, "l2_ratio": 0.65, "relative_reconstruction_bias": 0.910546875, "loss_original": 2.440642213821411, "loss_reconstructed": 6.74614634513855, "loss_zero": 12.452932643890382, "frac_recovered": 0.5700634002685547, "frac_alive": 0.7088758945465088, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.035,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "4882"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_1_step_4882/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 68.95, "l1_loss": 354.8, "l0": 141.5416732788086, "frac_variance_explained": 0.626953125, "cossim": 0.87421875, "l2_ratio": 0.82109375, "relative_reconstruction_bias": 0.94296875, "loss_original": 2.440642213821411, "loss_reconstructed": 2.8861867427825927, "loss_zero": 12.452932643890382, "frac_recovered": 0.9555670261383057, "frac_alive": 0.1347113698720932, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.04,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "0"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 227.9, "l1_loss": 13337.6, "l0": 9220.1462890625, "frac_variance_explained": -1.034375, "cossim": 0.00837249755859375, "l2_ratio": 1.15546875, "relative_reconstruction_bias": 130.1, "loss_original": 2.440642213821411, "loss_reconstructed": 19.563122940063476, "loss_zero": 12.452932643890382, "frac_recovered": -0.710543018579483, "frac_alive": 1.0, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.04,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "154"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_154/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 141.2, "l1_loss": 3417.6, "l0": 3691.47509765625, "frac_variance_explained": -0.040234375, "cossim": 0.3244140625, "l2_ratio": 0.4064453125, "relative_reconstruction_bias": 1.22109375, "loss_original": 2.440642213821411, "loss_reconstructed": 9.383785438537597, "loss_zero": 12.452932643890382, "frac_recovered": 0.3066366076469421, "frac_alive": 0.99658203125, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.04,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "1544"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_1544/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 82.7, "l1_loss": 241.6, "l0": 60.62500152587891, "frac_variance_explained": 0.5109375, "cossim": 0.8140625, "l2_ratio": 0.765625, "relative_reconstruction_bias": 0.950390625, "loss_original": 2.440642213821411, "loss_reconstructed": 4.482995939254761, "loss_zero": 12.452932643890382, "frac_recovered": 0.7959964573383331, "frac_alive": 0.1371527761220932, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.04,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "15440"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_15440/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 63.65, "l1_loss": 358.4, "l0": 140.45833892822264, "frac_variance_explained": 0.72890625, "cossim": 0.897265625, "l2_ratio": 0.844921875, "relative_reconstruction_bias": 0.951953125, "loss_original": 2.440642213821411, "loss_reconstructed": 2.716301202774048, "loss_zero": 12.452932643890382, "frac_recovered": 0.9725344896316528, "frac_alive": 0.241970494389534, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.04,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "48"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_48/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 176.7, "l1_loss": 11347.2, "l0": 8398.85029296875, "frac_variance_explained": -0.7203125, "cossim": 0.086279296875, "l2_ratio": 0.7546875, "relative_reconstruction_bias": 8.26875, "loss_original": 2.440642213821411, "loss_reconstructed": 10.636037349700928, "loss_zero": 12.452932643890382, "frac_recovered": 0.18154401183128357, "frac_alive": 0.9999457597732544, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.04,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "488"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_488/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 101.1, "l1_loss": 326.8, "l0": 210.6291702270508, "frac_variance_explained": 0.2640625, "cossim": 0.70234375, "l2_ratio": 0.614453125, "relative_reconstruction_bias": 0.890625, "loss_original": 2.440642213821411, "loss_reconstructed": 7.406615495681763, "loss_zero": 12.452932643890382, "frac_recovered": 0.5041129201650619, "frac_alive": 0.7194553017616272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.04,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "4882"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_2_step_4882/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 72.2, "l1_loss": 301.6, "l0": 86.50000228881837, "frac_variance_explained": 0.73359375, "cossim": 0.8578125, "l2_ratio": 0.798828125, "relative_reconstruction_bias": 0.959375, "loss_original": 2.440642213821411, "loss_reconstructed": 3.290687155723572, "loss_zero": 12.452932643890382, "frac_recovered": 0.9150846123695373, "frac_alive": 0.0876193568110466, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.05,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "0"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_0/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 224.6, "l1_loss": 13132.8, "l0": 9213.05458984375, "frac_variance_explained": -1.03046875, "cossim": 0.007816314697265625, "l2_ratio": 1.15546875, "relative_reconstruction_bias": 195.5, "loss_original": 2.440642213821411, "loss_reconstructed": 19.563122940063476, "loss_zero": 12.452932643890382, "frac_recovered": -0.710543018579483, "frac_alive": 1.0, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.05,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "154"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_154/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 140.9, "l1_loss": 3142.4, "l0": 3520.612548828125, "frac_variance_explained": -0.069140625, "cossim": 0.291015625, "l2_ratio": 0.3875, "relative_reconstruction_bias": 1.308203125, "loss_original": 2.440642213821411, "loss_reconstructed": 9.744282913208007, "loss_zero": 12.452932643890382, "frac_recovered": 0.2705884039402008, "frac_alive": 0.9943576455116272, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.05,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "1544"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_1544/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 91.6, "l1_loss": 197.6, "l0": 37.650001525878906, "frac_variance_explained": 0.503515625, "cossim": 0.78046875, "l2_ratio": 0.72265625, "relative_reconstruction_bias": 0.947265625, "loss_original": 2.440642213821411, "loss_reconstructed": 5.352246809005737, "loss_zero": 12.452932643890382, "frac_recovered": 0.709167218208313, "frac_alive": 0.1135525181889534, "hyperparameters": {"n_inputs": 250, "context_length": 128}}
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "trainer": {
3
+ "dict_class": "AutoEncoder",
4
+ "trainer_class": "StandardTrainer",
5
+ "activation_dim": 2304,
6
+ "dict_size": 18432,
7
+ "lr": 0.0003,
8
+ "l1_penalty": 0.05,
9
+ "warmup_steps": 1000,
10
+ "resample_steps": null,
11
+ "device": "cuda:0",
12
+ "layer": 11,
13
+ "lm_name": "google/gemma-2-2b",
14
+ "wandb_name": "StandardTrainer-google/gemma-2-2b-resid_post_layer_11",
15
+ "submodule_name": "resid_post_layer_11",
16
+ "steps": "15440"
17
+ },
18
+ "buffer": {
19
+ "d_submodule": 2304,
20
+ "io": "out",
21
+ "n_ctxs": 2000,
22
+ "ctx_len": 128,
23
+ "refresh_batch_size": 24,
24
+ "out_batch_size": 4096,
25
+ "device": "cuda:0"
26
+ }
27
+ }
gemma-2-2b_sweep_standard_ctx128_ef8_0824/resid_post_layer_11_checkpoints/trainer_3_step_15440/eval_results.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"l2_loss": 72.05, "l1_loss": 252.3, "l0": 64.75000114440918, "frac_variance_explained": 0.631640625, "cossim": 0.861328125, "l2_ratio": 0.807421875, "relative_reconstruction_bias": 0.9453125, "loss_original": 2.440642213821411, "loss_reconstructed": 3.132000136375427, "loss_zero": 12.452932643890382, "frac_recovered": 0.9310269713401794, "frac_alive": 0.099500872194767, "hyperparameters": {"n_inputs": 250, "context_length": 128}}