Add files using upload-large-folder tool
Browse files- .gitattributes +6 -0
- old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +268 -0
- old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +268 -0
- old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +268 -0
- old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +268 -0
- old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +268 -0
- old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +268 -0
- old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +3 -0
- old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +3 -0
- old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +3 -0
- old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +3 -0
- old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +3 -0
- old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +3 -0
- old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +0 -0
- old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +0 -0
- old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +0 -0
- old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +0 -0
- old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +0 -0
- old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +0 -0
- old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +323 -0
- old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +323 -0
- old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +323 -0
- old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +323 -0
- old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +323 -0
- old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +323 -0
- old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +670 -0
- old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +670 -0
- old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +670 -0
- old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +670 -0
- old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +670 -0
- old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +670 -0
- old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +414 -0
- old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +414 -0
- old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +414 -0
- old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +414 -0
- old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +414 -0
- old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +414 -0
- old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +74 -0
- old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +74 -0
- old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +74 -0
- old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +74 -0
- old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +74 -0
- old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json +74 -0
.gitattributes
CHANGED
@@ -38,3 +38,9 @@ random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_re
|
|
38 |
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
39 |
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
40 |
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
39 |
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
40 |
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
41 |
+
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
42 |
+
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
43 |
+
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
44 |
+
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
45 |
+
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
46 |
+
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "431e48c6-55b1-4fe2-a188-fdf282380ea2",
|
17 |
+
"datetime_epoch_millis": 1738783485930,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.09751356079257899,
|
21 |
+
"mean_full_absorption_score": 0.004661694890776431,
|
22 |
+
"mean_num_split_features": 1.0384615384615385,
|
23 |
+
"std_dev_absorption_fraction_score": 0.10468056684076392,
|
24 |
+
"std_dev_full_absorption_score": 0.00923750174821687,
|
25 |
+
"std_dev_num_split_features": 0.19611613513818404
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.10952248859777117,
|
32 |
+
"full_absorption_rate": 0.0007757951900698216,
|
33 |
+
"num_full_absorption": 2,
|
34 |
+
"num_probe_true_positives": 2578,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.037033530481398055,
|
40 |
+
"full_absorption_rate": 0.0,
|
41 |
+
"num_full_absorption": 0,
|
42 |
+
"num_probe_true_positives": 1664,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.32412380384249956,
|
48 |
+
"full_absorption_rate": 0.012851684612712747,
|
49 |
+
"num_full_absorption": 37,
|
50 |
+
"num_probe_true_positives": 2879,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.2652438933350776,
|
56 |
+
"full_absorption_rate": 0.015439429928741092,
|
57 |
+
"num_full_absorption": 26,
|
58 |
+
"num_probe_true_positives": 1684,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.029511883739283127,
|
64 |
+
"full_absorption_rate": 0.005829015544041451,
|
65 |
+
"num_full_absorption": 9,
|
66 |
+
"num_probe_true_positives": 1544,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.03661148893601744,
|
72 |
+
"full_absorption_rate": 0.0008598452278589854,
|
73 |
+
"num_full_absorption": 1,
|
74 |
+
"num_probe_true_positives": 1163,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.07005964729193298,
|
80 |
+
"full_absorption_rate": 0.0027223230490018148,
|
81 |
+
"num_full_absorption": 3,
|
82 |
+
"num_probe_true_positives": 1102,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.025913905709330304,
|
88 |
+
"full_absorption_rate": 0.0019821605550049554,
|
89 |
+
"num_full_absorption": 2,
|
90 |
+
"num_probe_true_positives": 1009,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.16489142397701123,
|
96 |
+
"full_absorption_rate": 0.005441354292623942,
|
97 |
+
"num_full_absorption": 9,
|
98 |
+
"num_probe_true_positives": 1654,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.017172634110049358,
|
104 |
+
"full_absorption_rate": 0.0,
|
105 |
+
"num_full_absorption": 0,
|
106 |
+
"num_probe_true_positives": 406,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.016921317745917687,
|
112 |
+
"full_absorption_rate": 0.0,
|
113 |
+
"num_full_absorption": 0,
|
114 |
+
"num_probe_true_positives": 665,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.043985977343370744,
|
120 |
+
"full_absorption_rate": 0.0,
|
121 |
+
"num_full_absorption": 0,
|
122 |
+
"num_probe_true_positives": 1195,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.02707755806490339,
|
128 |
+
"full_absorption_rate": 0.0011025358324145535,
|
129 |
+
"num_full_absorption": 2,
|
130 |
+
"num_probe_true_positives": 1814,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.04764539422889375,
|
136 |
+
"full_absorption_rate": 0.003778337531486146,
|
137 |
+
"num_full_absorption": 3,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.11199046697502817,
|
144 |
+
"full_absorption_rate": 0.002857142857142857,
|
145 |
+
"num_full_absorption": 3,
|
146 |
+
"num_probe_true_positives": 1050,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.2957251949612795,
|
152 |
+
"full_absorption_rate": 0.002926421404682274,
|
153 |
+
"num_full_absorption": 7,
|
154 |
+
"num_probe_true_positives": 2392,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.00767947180411824,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 174,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.06558605221479297,
|
168 |
+
"full_absorption_rate": 0.0006016847172081829,
|
169 |
+
"num_full_absorption": 1,
|
170 |
+
"num_probe_true_positives": 1662,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.24032658294094394,
|
176 |
+
"full_absorption_rate": 0.015951790145338533,
|
177 |
+
"num_full_absorption": 45,
|
178 |
+
"num_probe_true_positives": 2821,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.04682942316731627,
|
184 |
+
"full_absorption_rate": 0.0,
|
185 |
+
"num_full_absorption": 0,
|
186 |
+
"num_probe_true_positives": 1677,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.33437220839551257,
|
192 |
+
"full_absorption_rate": 0.043824701195219126,
|
193 |
+
"num_full_absorption": 33,
|
194 |
+
"num_probe_true_positives": 753,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.017881854389667527,
|
200 |
+
"full_absorption_rate": 0.0011876484560570072,
|
201 |
+
"num_full_absorption": 1,
|
202 |
+
"num_probe_true_positives": 842,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.03906620984144505,
|
208 |
+
"full_absorption_rate": 0.0030721966205837174,
|
209 |
+
"num_full_absorption": 2,
|
210 |
+
"num_probe_true_positives": 651,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.11287436426111006,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 107,
|
219 |
+
"num_split_features": 2
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.030323895956477616,
|
224 |
+
"full_absorption_rate": 0.0,
|
225 |
+
"num_full_absorption": 0,
|
226 |
+
"num_probe_true_positives": 193,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.016981908295905283,
|
232 |
+
"full_absorption_rate": 0.0,
|
233 |
+
"num_full_absorption": 0,
|
234 |
+
"num_probe_true_positives": 239,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "2f7686cc-05f0-4e61-bd5d-1e2c4da21721",
|
17 |
+
"datetime_epoch_millis": 1738785879962,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.12074450732711571,
|
21 |
+
"mean_full_absorption_score": 0.013239901484226383,
|
22 |
+
"mean_num_split_features": 1.1153846153846154,
|
23 |
+
"std_dev_absorption_fraction_score": 0.1336485927173351,
|
24 |
+
"std_dev_full_absorption_score": 0.026396743147366758,
|
25 |
+
"std_dev_num_split_features": 0.3258125936084211
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.1828675228579153,
|
32 |
+
"full_absorption_rate": 0.004266873545384018,
|
33 |
+
"num_full_absorption": 11,
|
34 |
+
"num_probe_true_positives": 2578,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.06300828457462988,
|
40 |
+
"full_absorption_rate": 0.003605769230769231,
|
41 |
+
"num_full_absorption": 6,
|
42 |
+
"num_probe_true_positives": 1664,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.37049847095340266,
|
48 |
+
"full_absorption_rate": 0.04307051059395624,
|
49 |
+
"num_full_absorption": 124,
|
50 |
+
"num_probe_true_positives": 2879,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.4320868702385805,
|
56 |
+
"full_absorption_rate": 0.083729216152019,
|
57 |
+
"num_full_absorption": 141,
|
58 |
+
"num_probe_true_positives": 1684,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.04056307305862558,
|
64 |
+
"full_absorption_rate": 0.007772020725388601,
|
65 |
+
"num_full_absorption": 12,
|
66 |
+
"num_probe_true_positives": 1544,
|
67 |
+
"num_split_features": 2
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.0644915910448518,
|
72 |
+
"full_absorption_rate": 0.0025795356835769563,
|
73 |
+
"num_full_absorption": 3,
|
74 |
+
"num_probe_true_positives": 1163,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.06433309126945641,
|
80 |
+
"full_absorption_rate": 0.0027223230490018148,
|
81 |
+
"num_full_absorption": 3,
|
82 |
+
"num_probe_true_positives": 1102,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.04143727370568341,
|
88 |
+
"full_absorption_rate": 0.0019821605550049554,
|
89 |
+
"num_full_absorption": 2,
|
90 |
+
"num_probe_true_positives": 1009,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.18541141915011472,
|
96 |
+
"full_absorption_rate": 0.009673518742442563,
|
97 |
+
"num_full_absorption": 16,
|
98 |
+
"num_probe_true_positives": 1654,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0066235494743313895,
|
104 |
+
"full_absorption_rate": 0.0,
|
105 |
+
"num_full_absorption": 0,
|
106 |
+
"num_probe_true_positives": 406,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.010681486258684182,
|
112 |
+
"full_absorption_rate": 0.0,
|
113 |
+
"num_full_absorption": 0,
|
114 |
+
"num_probe_true_positives": 665,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.09145893836903148,
|
120 |
+
"full_absorption_rate": 0.00502092050209205,
|
121 |
+
"num_full_absorption": 6,
|
122 |
+
"num_probe_true_positives": 1195,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.06668737220718245,
|
128 |
+
"full_absorption_rate": 0.004410143329658214,
|
129 |
+
"num_full_absorption": 8,
|
130 |
+
"num_probe_true_positives": 1814,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.0602452762340166,
|
136 |
+
"full_absorption_rate": 0.003778337531486146,
|
137 |
+
"num_full_absorption": 3,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.12143825786919218,
|
144 |
+
"full_absorption_rate": 0.009523809523809525,
|
145 |
+
"num_full_absorption": 10,
|
146 |
+
"num_probe_true_positives": 1050,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.37822836353215283,
|
152 |
+
"full_absorption_rate": 0.022993311036789296,
|
153 |
+
"num_full_absorption": 55,
|
154 |
+
"num_probe_true_positives": 2392,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.006096479671690135,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 174,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.09546917920176193,
|
168 |
+
"full_absorption_rate": 0.007821901323706379,
|
169 |
+
"num_full_absorption": 13,
|
170 |
+
"num_probe_true_positives": 1662,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.44667699863373395,
|
176 |
+
"full_absorption_rate": 0.10953562566465792,
|
177 |
+
"num_full_absorption": 309,
|
178 |
+
"num_probe_true_positives": 2821,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.09178269157296917,
|
184 |
+
"full_absorption_rate": 0.0011926058437686344,
|
185 |
+
"num_full_absorption": 2,
|
186 |
+
"num_probe_true_positives": 1677,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.08318646522525573,
|
192 |
+
"full_absorption_rate": 0.0013280212483399733,
|
193 |
+
"num_full_absorption": 1,
|
194 |
+
"num_probe_true_positives": 753,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.01805468943015878,
|
200 |
+
"full_absorption_rate": 0.0011876484560570072,
|
201 |
+
"num_full_absorption": 1,
|
202 |
+
"num_probe_true_positives": 842,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.046187567611688574,
|
208 |
+
"full_absorption_rate": 0.007680491551459293,
|
209 |
+
"num_full_absorption": 5,
|
210 |
+
"num_probe_true_positives": 651,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.1272381587024434,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 107,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.030348097021386293,
|
224 |
+
"full_absorption_rate": 0.010362694300518135,
|
225 |
+
"num_full_absorption": 2,
|
226 |
+
"num_probe_true_positives": 193,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.014256022636069204,
|
232 |
+
"full_absorption_rate": 0.0,
|
233 |
+
"num_full_absorption": 0,
|
234 |
+
"num_probe_true_positives": 239,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "8ebb674d-9f15-4d5c-8f47-48448dd21b35",
|
17 |
+
"datetime_epoch_millis": 1738793140191,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.19926936403983087,
|
21 |
+
"mean_full_absorption_score": 0.05967727904676934,
|
22 |
+
"mean_num_split_features": 1.0769230769230769,
|
23 |
+
"std_dev_absorption_fraction_score": 0.200193825605343,
|
24 |
+
"std_dev_full_absorption_score": 0.10500077608288533,
|
25 |
+
"std_dev_num_split_features": 0.271746488194703
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.37783256022605965,
|
32 |
+
"full_absorption_rate": 0.024437548487199378,
|
33 |
+
"num_full_absorption": 63,
|
34 |
+
"num_probe_true_positives": 2578,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.15587574948959265,
|
40 |
+
"full_absorption_rate": 0.007211538461538462,
|
41 |
+
"num_full_absorption": 12,
|
42 |
+
"num_probe_true_positives": 1664,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.5719603969783236,
|
48 |
+
"full_absorption_rate": 0.2580757207363668,
|
49 |
+
"num_full_absorption": 743,
|
50 |
+
"num_probe_true_positives": 2879,
|
51 |
+
"num_split_features": 2
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.5621089686749688,
|
56 |
+
"full_absorption_rate": 0.2672209026128266,
|
57 |
+
"num_full_absorption": 450,
|
58 |
+
"num_probe_true_positives": 1684,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.16834370820991082,
|
64 |
+
"full_absorption_rate": 0.05699481865284974,
|
65 |
+
"num_full_absorption": 88,
|
66 |
+
"num_probe_true_positives": 1544,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.11667156140480404,
|
72 |
+
"full_absorption_rate": 0.018056749785038694,
|
73 |
+
"num_full_absorption": 21,
|
74 |
+
"num_probe_true_positives": 1163,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.06481540727629757,
|
80 |
+
"full_absorption_rate": 0.0054446460980036296,
|
81 |
+
"num_full_absorption": 6,
|
82 |
+
"num_probe_true_positives": 1102,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.06261331028463099,
|
88 |
+
"full_absorption_rate": 0.0009910802775024777,
|
89 |
+
"num_full_absorption": 1,
|
90 |
+
"num_probe_true_positives": 1009,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.24185250920050555,
|
96 |
+
"full_absorption_rate": 0.039298669891172915,
|
97 |
+
"num_full_absorption": 65,
|
98 |
+
"num_probe_true_positives": 1654,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.016041299343090145,
|
104 |
+
"full_absorption_rate": 0.0024630541871921183,
|
105 |
+
"num_full_absorption": 1,
|
106 |
+
"num_probe_true_positives": 406,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.012157821213304845,
|
112 |
+
"full_absorption_rate": 0.0,
|
113 |
+
"num_full_absorption": 0,
|
114 |
+
"num_probe_true_positives": 665,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.13470793723735,
|
120 |
+
"full_absorption_rate": 0.015899581589958158,
|
121 |
+
"num_full_absorption": 19,
|
122 |
+
"num_probe_true_positives": 1195,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.20217104889211931,
|
128 |
+
"full_absorption_rate": 0.017089305402425578,
|
129 |
+
"num_full_absorption": 31,
|
130 |
+
"num_probe_true_positives": 1814,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.06165938260438851,
|
136 |
+
"full_absorption_rate": 0.007556675062972292,
|
137 |
+
"num_full_absorption": 6,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.17666919527397562,
|
144 |
+
"full_absorption_rate": 0.05142857142857143,
|
145 |
+
"num_full_absorption": 54,
|
146 |
+
"num_probe_true_positives": 1050,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.602060568057986,
|
152 |
+
"full_absorption_rate": 0.20108695652173914,
|
153 |
+
"num_full_absorption": 481,
|
154 |
+
"num_probe_true_positives": 2392,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.00665463288277255,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 174,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.2618176100631538,
|
168 |
+
"full_absorption_rate": 0.05174488567990373,
|
169 |
+
"num_full_absorption": 86,
|
170 |
+
"num_probe_true_positives": 1662,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.6729389684863054,
|
176 |
+
"full_absorption_rate": 0.4143920595533499,
|
177 |
+
"num_full_absorption": 1169,
|
178 |
+
"num_probe_true_positives": 2821,
|
179 |
+
"num_split_features": 1
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.212218166643232,
|
184 |
+
"full_absorption_rate": 0.015503875968992248,
|
185 |
+
"num_full_absorption": 26,
|
186 |
+
"num_probe_true_positives": 1677,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.2805675145009456,
|
192 |
+
"full_absorption_rate": 0.07702523240371846,
|
193 |
+
"num_full_absorption": 58,
|
194 |
+
"num_probe_true_positives": 753,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.02793276716008858,
|
200 |
+
"full_absorption_rate": 0.004750593824228029,
|
201 |
+
"num_full_absorption": 4,
|
202 |
+
"num_probe_true_positives": 842,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.0646989645046496,
|
208 |
+
"full_absorption_rate": 0.010752688172043012,
|
209 |
+
"num_full_absorption": 7,
|
210 |
+
"num_probe_true_positives": 651,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.08669041060559739,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 107,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.030923398913955746,
|
224 |
+
"full_absorption_rate": 0.0,
|
225 |
+
"num_full_absorption": 0,
|
226 |
+
"num_probe_true_positives": 193,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.009019606907593795,
|
232 |
+
"full_absorption_rate": 0.0041841004184100415,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 239,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "8ac82cb6-4d80-4788-b030-d3ff9ecabc15",
|
17 |
+
"datetime_epoch_millis": 1738790694222,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.27380462363873964,
|
21 |
+
"mean_full_absorption_score": 0.11174761587082688,
|
22 |
+
"mean_num_split_features": 1.3846153846153846,
|
23 |
+
"std_dev_absorption_fraction_score": 0.2085371119401205,
|
24 |
+
"std_dev_full_absorption_score": 0.10395305649996002,
|
25 |
+
"std_dev_num_split_features": 0.8978607053178383
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.6334103628494663,
|
32 |
+
"full_absorption_rate": 0.28316524437548485,
|
33 |
+
"num_full_absorption": 730,
|
34 |
+
"num_probe_true_positives": 2578,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.5612972872403076,
|
40 |
+
"full_absorption_rate": 0.19831730769230768,
|
41 |
+
"num_full_absorption": 330,
|
42 |
+
"num_probe_true_positives": 1664,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.4456334749181992,
|
48 |
+
"full_absorption_rate": 0.13928447377561654,
|
49 |
+
"num_full_absorption": 401,
|
50 |
+
"num_probe_true_positives": 2879,
|
51 |
+
"num_split_features": 5
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.4663763563664617,
|
56 |
+
"full_absorption_rate": 0.1680522565320665,
|
57 |
+
"num_full_absorption": 283,
|
58 |
+
"num_probe_true_positives": 1684,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.28617098631064275,
|
64 |
+
"full_absorption_rate": 0.1794041450777202,
|
65 |
+
"num_full_absorption": 277,
|
66 |
+
"num_probe_true_positives": 1544,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.31944391642843717,
|
72 |
+
"full_absorption_rate": 0.13929492691315562,
|
73 |
+
"num_full_absorption": 162,
|
74 |
+
"num_probe_true_positives": 1163,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.326763907694398,
|
80 |
+
"full_absorption_rate": 0.1705989110707804,
|
81 |
+
"num_full_absorption": 188,
|
82 |
+
"num_probe_true_positives": 1102,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.11808222242991506,
|
88 |
+
"full_absorption_rate": 0.018830525272547076,
|
89 |
+
"num_full_absorption": 19,
|
90 |
+
"num_probe_true_positives": 1009,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.33518338041432005,
|
96 |
+
"full_absorption_rate": 0.1505441354292624,
|
97 |
+
"num_full_absorption": 249,
|
98 |
+
"num_probe_true_positives": 1654,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.03867125135736307,
|
104 |
+
"full_absorption_rate": 0.0024630541871921183,
|
105 |
+
"num_full_absorption": 1,
|
106 |
+
"num_probe_true_positives": 406,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.02708629091329992,
|
112 |
+
"full_absorption_rate": 0.009022556390977444,
|
113 |
+
"num_full_absorption": 6,
|
114 |
+
"num_probe_true_positives": 665,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.27172567963607736,
|
120 |
+
"full_absorption_rate": 0.11799163179916318,
|
121 |
+
"num_full_absorption": 141,
|
122 |
+
"num_probe_true_positives": 1195,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.500443622143782,
|
128 |
+
"full_absorption_rate": 0.19570011025358325,
|
129 |
+
"num_full_absorption": 355,
|
130 |
+
"num_probe_true_positives": 1814,
|
131 |
+
"num_split_features": 2
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.13894620345252578,
|
136 |
+
"full_absorption_rate": 0.03904282115869018,
|
137 |
+
"num_full_absorption": 31,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.20516155603410746,
|
144 |
+
"full_absorption_rate": 0.11619047619047619,
|
145 |
+
"num_full_absorption": 122,
|
146 |
+
"num_probe_true_positives": 1050,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6877451457818394,
|
152 |
+
"full_absorption_rate": 0.3520066889632107,
|
153 |
+
"num_full_absorption": 842,
|
154 |
+
"num_probe_true_positives": 2392,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.017662373425384282,
|
160 |
+
"full_absorption_rate": 0.0,
|
161 |
+
"num_full_absorption": 0,
|
162 |
+
"num_probe_true_positives": 174,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.31924702002139765,
|
168 |
+
"full_absorption_rate": 0.07220216606498195,
|
169 |
+
"num_full_absorption": 120,
|
170 |
+
"num_probe_true_positives": 1662,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.5966041750062926,
|
176 |
+
"full_absorption_rate": 0.3342786246012052,
|
177 |
+
"num_full_absorption": 943,
|
178 |
+
"num_probe_true_positives": 2821,
|
179 |
+
"num_split_features": 3
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.33112850242087766,
|
184 |
+
"full_absorption_rate": 0.11150864639236732,
|
185 |
+
"num_full_absorption": 187,
|
186 |
+
"num_probe_true_positives": 1677,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.1736379175397297,
|
192 |
+
"full_absorption_rate": 0.0451527224435591,
|
193 |
+
"num_full_absorption": 34,
|
194 |
+
"num_probe_true_positives": 753,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.06615323067748713,
|
200 |
+
"full_absorption_rate": 0.020190023752969122,
|
201 |
+
"num_full_absorption": 17,
|
202 |
+
"num_probe_true_positives": 842,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.11299054531700574,
|
208 |
+
"full_absorption_rate": 0.027649769585253458,
|
209 |
+
"num_full_absorption": 18,
|
210 |
+
"num_probe_true_positives": 651,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.09675091301510899,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 107,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.04066091772139128,
|
224 |
+
"full_absorption_rate": 0.010362694300518135,
|
225 |
+
"num_full_absorption": 2,
|
226 |
+
"num_probe_true_positives": 193,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0019429754914134399,
|
232 |
+
"full_absorption_rate": 0.0041841004184100415,
|
233 |
+
"num_full_absorption": 1,
|
234 |
+
"num_probe_true_positives": 239,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "a6c157c2-8266-4539-a364-d4f3d4b95e63",
|
17 |
+
"datetime_epoch_millis": 1738795756067,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.2970629494295943,
|
21 |
+
"mean_full_absorption_score": 0.16060743819078988,
|
22 |
+
"mean_num_split_features": 1.6923076923076923,
|
23 |
+
"std_dev_absorption_fraction_score": 0.21833356962674993,
|
24 |
+
"std_dev_full_absorption_score": 0.13683720697531063,
|
25 |
+
"std_dev_num_split_features": 1.2575923272422036
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.4413904461749635,
|
32 |
+
"full_absorption_rate": 0.16446858029480219,
|
33 |
+
"num_full_absorption": 424,
|
34 |
+
"num_probe_true_positives": 2578,
|
35 |
+
"num_split_features": 5
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.450685431712029,
|
40 |
+
"full_absorption_rate": 0.18088942307692307,
|
41 |
+
"num_full_absorption": 301,
|
42 |
+
"num_probe_true_positives": 1664,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.5821904564134256,
|
48 |
+
"full_absorption_rate": 0.30670371656825285,
|
49 |
+
"num_full_absorption": 883,
|
50 |
+
"num_probe_true_positives": 2879,
|
51 |
+
"num_split_features": 5
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.5742337169635329,
|
56 |
+
"full_absorption_rate": 0.32185273159144895,
|
57 |
+
"num_full_absorption": 542,
|
58 |
+
"num_probe_true_positives": 1684,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.18981298174844446,
|
64 |
+
"full_absorption_rate": 0.13471502590673576,
|
65 |
+
"num_full_absorption": 208,
|
66 |
+
"num_probe_true_positives": 1544,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.4303705672075229,
|
72 |
+
"full_absorption_rate": 0.24333619948409285,
|
73 |
+
"num_full_absorption": 283,
|
74 |
+
"num_probe_true_positives": 1163,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.39300790034337457,
|
80 |
+
"full_absorption_rate": 0.26406533575317603,
|
81 |
+
"num_full_absorption": 291,
|
82 |
+
"num_probe_true_positives": 1102,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.1473633705426197,
|
88 |
+
"full_absorption_rate": 0.04162537165510406,
|
89 |
+
"num_full_absorption": 42,
|
90 |
+
"num_probe_true_positives": 1009,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.2615839241768494,
|
96 |
+
"full_absorption_rate": 0.14691656590084642,
|
97 |
+
"num_full_absorption": 243,
|
98 |
+
"num_probe_true_positives": 1654,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.05117406667896947,
|
104 |
+
"full_absorption_rate": 0.007389162561576354,
|
105 |
+
"num_full_absorption": 3,
|
106 |
+
"num_probe_true_positives": 406,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.03448794297537904,
|
112 |
+
"full_absorption_rate": 0.01804511278195489,
|
113 |
+
"num_full_absorption": 12,
|
114 |
+
"num_probe_true_positives": 665,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.528743346032827,
|
120 |
+
"full_absorption_rate": 0.37489539748953976,
|
121 |
+
"num_full_absorption": 448,
|
122 |
+
"num_probe_true_positives": 1195,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.38150848462448217,
|
128 |
+
"full_absorption_rate": 0.1703417861080485,
|
129 |
+
"num_full_absorption": 309,
|
130 |
+
"num_probe_true_positives": 1814,
|
131 |
+
"num_split_features": 3
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.21705599444161572,
|
136 |
+
"full_absorption_rate": 0.0818639798488665,
|
137 |
+
"num_full_absorption": 65,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.19684352710791495,
|
144 |
+
"full_absorption_rate": 0.14476190476190476,
|
145 |
+
"num_full_absorption": 152,
|
146 |
+
"num_probe_true_positives": 1050,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.756351767767991,
|
152 |
+
"full_absorption_rate": 0.4744983277591973,
|
153 |
+
"num_full_absorption": 1135,
|
154 |
+
"num_probe_true_positives": 2392,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.022738719106524842,
|
160 |
+
"full_absorption_rate": 0.005747126436781609,
|
161 |
+
"num_full_absorption": 1,
|
162 |
+
"num_probe_true_positives": 174,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.5716800098523727,
|
168 |
+
"full_absorption_rate": 0.32912154031287605,
|
169 |
+
"num_full_absorption": 547,
|
170 |
+
"num_probe_true_positives": 1662,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.5491532507334174,
|
176 |
+
"full_absorption_rate": 0.3548387096774194,
|
177 |
+
"num_full_absorption": 1001,
|
178 |
+
"num_probe_true_positives": 2821,
|
179 |
+
"num_split_features": 4
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.4160762059426235,
|
184 |
+
"full_absorption_rate": 0.2081097197376267,
|
185 |
+
"num_full_absorption": 349,
|
186 |
+
"num_probe_true_positives": 1677,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.17104284092706945,
|
192 |
+
"full_absorption_rate": 0.07171314741035857,
|
193 |
+
"num_full_absorption": 54,
|
194 |
+
"num_probe_true_positives": 753,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.08154865944312527,
|
200 |
+
"full_absorption_rate": 0.032066508313539195,
|
201 |
+
"num_full_absorption": 27,
|
202 |
+
"num_probe_true_positives": 842,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.14250459566400003,
|
208 |
+
"full_absorption_rate": 0.05837173579109063,
|
209 |
+
"num_full_absorption": 38,
|
210 |
+
"num_probe_true_positives": 651,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.06885068971289977,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 107,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.061906340273470535,
|
224 |
+
"full_absorption_rate": 0.031088082901554404,
|
225 |
+
"num_full_absorption": 6,
|
226 |
+
"num_probe_true_positives": 193,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0013314486020077486,
|
232 |
+
"full_absorption_rate": 0.008368200836820083,
|
233 |
+
"num_full_absorption": 2,
|
234 |
+
"num_probe_true_positives": 239,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
old_relu_eval_results/absorption/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "59329e73-810f-472d-8ab5-7d0b59ee8032",
|
17 |
+
"datetime_epoch_millis": 1738788312573,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.27616667840053044,
|
21 |
+
"mean_full_absorption_score": 0.18311710219845018,
|
22 |
+
"mean_num_split_features": 2.0,
|
23 |
+
"std_dev_absorption_fraction_score": 0.17955401555761985,
|
24 |
+
"std_dev_full_absorption_score": 0.12841765476462225,
|
25 |
+
"std_dev_num_split_features": 1.6
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.4135742482669448,
|
32 |
+
"full_absorption_rate": 0.21644685802948022,
|
33 |
+
"num_full_absorption": 558,
|
34 |
+
"num_probe_true_positives": 2578,
|
35 |
+
"num_split_features": 4
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.4015526296583575,
|
40 |
+
"full_absorption_rate": 0.20072115384615385,
|
41 |
+
"num_full_absorption": 334,
|
42 |
+
"num_probe_true_positives": 1664,
|
43 |
+
"num_split_features": 3
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.4404130651991934,
|
48 |
+
"full_absorption_rate": 0.2549496352900313,
|
49 |
+
"num_full_absorption": 734,
|
50 |
+
"num_probe_true_positives": 2879,
|
51 |
+
"num_split_features": 6
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.4588083161124332,
|
56 |
+
"full_absorption_rate": 0.28028503562945367,
|
57 |
+
"num_full_absorption": 472,
|
58 |
+
"num_probe_true_positives": 1684,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.1447774581665048,
|
64 |
+
"full_absorption_rate": 0.13147668393782383,
|
65 |
+
"num_full_absorption": 203,
|
66 |
+
"num_probe_true_positives": 1544,
|
67 |
+
"num_split_features": 3
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.3873865408321871,
|
72 |
+
"full_absorption_rate": 0.2656921754084265,
|
73 |
+
"num_full_absorption": 309,
|
74 |
+
"num_probe_true_positives": 1163,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.2375981200227858,
|
80 |
+
"full_absorption_rate": 0.16696914700544466,
|
81 |
+
"num_full_absorption": 184,
|
82 |
+
"num_probe_true_positives": 1102,
|
83 |
+
"num_split_features": 2
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.18660967928208133,
|
88 |
+
"full_absorption_rate": 0.0802775024777007,
|
89 |
+
"num_full_absorption": 81,
|
90 |
+
"num_probe_true_positives": 1009,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.3631443822738456,
|
96 |
+
"full_absorption_rate": 0.29081015719467956,
|
97 |
+
"num_full_absorption": 481,
|
98 |
+
"num_probe_true_positives": 1654,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.07115954525673361,
|
104 |
+
"full_absorption_rate": 0.017241379310344827,
|
105 |
+
"num_full_absorption": 7,
|
106 |
+
"num_probe_true_positives": 406,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.04879204937766494,
|
112 |
+
"full_absorption_rate": 0.031578947368421054,
|
113 |
+
"num_full_absorption": 21,
|
114 |
+
"num_probe_true_positives": 665,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.4822787469830486,
|
120 |
+
"full_absorption_rate": 0.401673640167364,
|
121 |
+
"num_full_absorption": 480,
|
122 |
+
"num_probe_true_positives": 1195,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.38377290557583654,
|
128 |
+
"full_absorption_rate": 0.2552370452039691,
|
129 |
+
"num_full_absorption": 463,
|
130 |
+
"num_probe_true_positives": 1814,
|
131 |
+
"num_split_features": 3
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.17978290229285845,
|
136 |
+
"full_absorption_rate": 0.10327455919395466,
|
137 |
+
"num_full_absorption": 82,
|
138 |
+
"num_probe_true_positives": 794,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.19812942766943445,
|
144 |
+
"full_absorption_rate": 0.17047619047619048,
|
145 |
+
"num_full_absorption": 179,
|
146 |
+
"num_probe_true_positives": 1050,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.6056222996145454,
|
152 |
+
"full_absorption_rate": 0.3448996655518395,
|
153 |
+
"num_full_absorption": 825,
|
154 |
+
"num_probe_true_positives": 2392,
|
155 |
+
"num_split_features": 5
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.03644851552466968,
|
160 |
+
"full_absorption_rate": 0.022988505747126436,
|
161 |
+
"num_full_absorption": 4,
|
162 |
+
"num_probe_true_positives": 174,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.5460252773593768,
|
168 |
+
"full_absorption_rate": 0.3802647412755716,
|
169 |
+
"num_full_absorption": 632,
|
170 |
+
"num_probe_true_positives": 1662,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.43503717196696245,
|
176 |
+
"full_absorption_rate": 0.2977667493796526,
|
177 |
+
"num_full_absorption": 840,
|
178 |
+
"num_probe_true_positives": 2821,
|
179 |
+
"num_split_features": 6
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.37168244953794066,
|
184 |
+
"full_absorption_rate": 0.24388789505068575,
|
185 |
+
"num_full_absorption": 409,
|
186 |
+
"num_probe_true_positives": 1677,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.4015811028782945,
|
192 |
+
"full_absorption_rate": 0.3930942895086321,
|
193 |
+
"num_full_absorption": 296,
|
194 |
+
"num_probe_true_positives": 753,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.09352294964084032,
|
200 |
+
"full_absorption_rate": 0.060570071258907364,
|
201 |
+
"num_full_absorption": 51,
|
202 |
+
"num_probe_true_positives": 842,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.16147162387821953,
|
208 |
+
"full_absorption_rate": 0.08294930875576037,
|
209 |
+
"num_full_absorption": 54,
|
210 |
+
"num_probe_true_positives": 651,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.08565750731693882,
|
216 |
+
"full_absorption_rate": 0.018691588785046728,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 107,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.04526030996735048,
|
224 |
+
"full_absorption_rate": 0.03626943005181347,
|
225 |
+
"num_full_absorption": 7,
|
226 |
+
"num_probe_true_positives": 193,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.00024441375874335945,
|
232 |
+
"full_absorption_rate": 0.012552301255230125,
|
233 |
+
"num_full_absorption": 3,
|
234 |
+
"num_probe_true_positives": 239,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "standard",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fade7ed161c5db9c7c7ac163c599bd3859bb24ecc70c0a1c824a46d9e37fda6d
|
3 |
+
size 27824414
|
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5adce21b197509292cdd2f4da8895a5762bf93e1f0b2fbcd3ba2daf6d93f8e9d
|
3 |
+
size 27583837
|
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2e3c88b391388eccb6fa9d88e5f912d3862401c3acef072ed8a20527e0ac372
|
3 |
+
size 27435249
|
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2f9d90013affb818d37b90201097e3c443522b80e5e3832fe9715633decb95ec
|
3 |
+
size 27194850
|
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8acd6328873671a729f89408de8ca580ec5b368493dabd9363953487b74019b8
|
3 |
+
size 27102881
|
old_relu_eval_results/autointerp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4a72d03c8276dce0adb0b65cdecd992edbff8cc4f97a61d17ad63b7e936c53cb
|
3 |
+
size 26823026
|
old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
old_relu_eval_results/core/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "f84c237b-b63d-44d6-9184-83af39eb778e",
|
73 |
+
"datetime_epoch_millis": 1738800834845,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.23234423354969463,
|
77 |
+
"scr_metric_threshold_2": 0.09820290124961623,
|
78 |
+
"scr_dir2_threshold_2": 0.10051744442943668,
|
79 |
+
"scr_dir1_threshold_5": 0.22836236745436214,
|
80 |
+
"scr_metric_threshold_5": 0.1612618897051168,
|
81 |
+
"scr_dir2_threshold_5": 0.16833831095315674,
|
82 |
+
"scr_dir1_threshold_10": 0.23399130656918807,
|
83 |
+
"scr_metric_threshold_10": 0.21393385636206694,
|
84 |
+
"scr_dir2_threshold_10": 0.2210026206525063,
|
85 |
+
"scr_dir1_threshold_20": 0.24236966949927263,
|
86 |
+
"scr_metric_threshold_20": 0.26432320579946816,
|
87 |
+
"scr_dir2_threshold_20": 0.26846685069449183,
|
88 |
+
"scr_dir1_threshold_50": 0.21470674119765001,
|
89 |
+
"scr_metric_threshold_50": 0.3265154510614142,
|
90 |
+
"scr_dir2_threshold_50": 0.3172547508429693,
|
91 |
+
"scr_dir1_threshold_100": 0.20032932591386368,
|
92 |
+
"scr_metric_threshold_100": 0.38983340792408966,
|
93 |
+
"scr_dir2_threshold_100": 0.38098404313994894,
|
94 |
+
"scr_dir1_threshold_500": 0.037709607495273495,
|
95 |
+
"scr_metric_threshold_500": 0.3428859758427782,
|
96 |
+
"scr_dir2_threshold_500": 0.34688913659267395
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.4687504074535596,
|
103 |
+
"scr_metric_threshold_2": 0.012315258641572036,
|
104 |
+
"scr_dir2_threshold_2": 0.012315258641572036,
|
105 |
+
"scr_dir1_threshold_5": 0.5156247962732202,
|
106 |
+
"scr_metric_threshold_5": 0.022167494916722333,
|
107 |
+
"scr_dir2_threshold_5": 0.022167494916722333,
|
108 |
+
"scr_dir1_threshold_10": 0.578124912688523,
|
109 |
+
"scr_metric_threshold_10": 0.03201958438240929,
|
110 |
+
"scr_dir2_threshold_10": 0.03201958438240929,
|
111 |
+
"scr_dir1_threshold_20": 0.4843752037267798,
|
112 |
+
"scr_metric_threshold_20": 0.06403931557428191,
|
113 |
+
"scr_dir2_threshold_20": 0.06403931557428191,
|
114 |
+
"scr_dir1_threshold_50": 0.5,
|
115 |
+
"scr_metric_threshold_50": 0.08128076575816078,
|
116 |
+
"scr_dir2_threshold_50": 0.08128076575816078,
|
117 |
+
"scr_dir1_threshold_100": 0.43749988358469727,
|
118 |
+
"scr_metric_threshold_100": 0.10098509149899802,
|
119 |
+
"scr_dir2_threshold_100": 0.10098509149899802,
|
120 |
+
"scr_dir1_threshold_500": 0.3593749708961743,
|
121 |
+
"scr_metric_threshold_500": 0.16748757624916502,
|
122 |
+
"scr_dir2_threshold_500": 0.16748757624916502
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.4205609298770807,
|
127 |
+
"scr_metric_threshold_2": 0.1239194296368033,
|
128 |
+
"scr_dir2_threshold_2": 0.1239194296368033,
|
129 |
+
"scr_dir1_threshold_5": 0.35514009320607276,
|
130 |
+
"scr_metric_threshold_5": 0.1556196529738595,
|
131 |
+
"scr_dir2_threshold_5": 0.1556196529738595,
|
132 |
+
"scr_dir1_threshold_10": 0.30841116289226267,
|
133 |
+
"scr_metric_threshold_10": 0.23342947946078924,
|
134 |
+
"scr_dir2_threshold_10": 0.23342947946078924,
|
135 |
+
"scr_dir1_threshold_20": 0.2616822325784525,
|
136 |
+
"scr_metric_threshold_20": 0.2881845043727822,
|
137 |
+
"scr_dir2_threshold_20": 0.2881845043727822,
|
138 |
+
"scr_dir1_threshold_50": 0.17757004660303638,
|
139 |
+
"scr_metric_threshold_50": 0.37463975262183136,
|
140 |
+
"scr_dir2_threshold_50": 0.37463975262183136,
|
141 |
+
"scr_dir1_threshold_100": 0.08411218597541614,
|
142 |
+
"scr_metric_threshold_100": 0.4783861879377377,
|
143 |
+
"scr_dir2_threshold_100": 0.4783861879377377,
|
144 |
+
"scr_dir1_threshold_500": -0.6168223257845253,
|
145 |
+
"scr_metric_threshold_500": 0.18155626180283607,
|
146 |
+
"scr_dir2_threshold_500": 0.18155626180283607
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5156247962732202,
|
151 |
+
"scr_metric_threshold_2": 0.03544318161412681,
|
152 |
+
"scr_dir2_threshold_2": 0.03544318161412681,
|
153 |
+
"scr_dir1_threshold_5": 0.4843752037267798,
|
154 |
+
"scr_metric_threshold_5": 0.05063300880745302,
|
155 |
+
"scr_dir2_threshold_5": 0.05063300880745302,
|
156 |
+
"scr_dir1_threshold_10": 0.4843752037267798,
|
157 |
+
"scr_metric_threshold_10": 0.09620264128525105,
|
158 |
+
"scr_dir2_threshold_10": 0.09620264128525105,
|
159 |
+
"scr_dir1_threshold_20": 0.5156247962732202,
|
160 |
+
"scr_metric_threshold_20": 0.12911405928564068,
|
161 |
+
"scr_dir2_threshold_20": 0.12911405928564068,
|
162 |
+
"scr_dir1_threshold_50": 0.39062549476503666,
|
163 |
+
"scr_metric_threshold_50": 0.1797469171952743,
|
164 |
+
"scr_dir2_threshold_50": 0.1797469171952743,
|
165 |
+
"scr_dir1_threshold_100": 0.32812537834973393,
|
166 |
+
"scr_metric_threshold_100": 0.23797476415048074,
|
167 |
+
"scr_dir2_threshold_100": 0.23797476415048074,
|
168 |
+
"scr_dir1_threshold_500": -0.12499930150818353,
|
169 |
+
"scr_metric_threshold_500": 0.07848105047818764,
|
170 |
+
"scr_dir2_threshold_500": 0.07848105047818764
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.34645683334217237,
|
175 |
+
"scr_metric_threshold_2": 0.06528194057151832,
|
176 |
+
"scr_dir2_threshold_2": 0.06528194057151832,
|
177 |
+
"scr_dir1_threshold_5": 0.26771677194471133,
|
178 |
+
"scr_metric_threshold_5": 0.13946602022454502,
|
179 |
+
"scr_dir2_threshold_5": 0.13946602022454502,
|
180 |
+
"scr_dir1_threshold_10": 0.21259844736981667,
|
181 |
+
"scr_metric_threshold_10": 0.1869436826330466,
|
182 |
+
"scr_dir2_threshold_10": 0.1869436826330466,
|
183 |
+
"scr_dir1_threshold_20": 0.18110232894527498,
|
184 |
+
"scr_metric_threshold_20": 0.2670624037614951,
|
185 |
+
"scr_dir2_threshold_20": 0.2670624037614951,
|
186 |
+
"scr_dir1_threshold_50": -0.07086614912327223,
|
187 |
+
"scr_metric_threshold_50": 0.3412463065461462,
|
188 |
+
"scr_dir2_threshold_50": 0.3412463065461462,
|
189 |
+
"scr_dir1_threshold_100": -0.14960621052073325,
|
190 |
+
"scr_metric_threshold_100": 0.42433234841230566,
|
191 |
+
"scr_dir2_threshold_100": 0.42433234841230566,
|
192 |
+
"scr_dir1_threshold_500": -0.4724408377125527,
|
193 |
+
"scr_metric_threshold_500": 0.08011872112844853,
|
194 |
+
"scr_dir2_threshold_500": 0.08011872112844853
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.021739172687599125,
|
199 |
+
"scr_metric_threshold_2": 0.17647042323973455,
|
200 |
+
"scr_dir2_threshold_2": 0.17647042323973455,
|
201 |
+
"scr_dir1_threshold_5": 0.010869424374669583,
|
202 |
+
"scr_metric_threshold_5": 0.38039217519558505,
|
203 |
+
"scr_dir2_threshold_5": 0.38039217519558505,
|
204 |
+
"scr_dir1_threshold_10": -0.021739172687599125,
|
205 |
+
"scr_metric_threshold_10": 0.5058823941900663,
|
206 |
+
"scr_dir2_threshold_10": 0.5058823941900663,
|
207 |
+
"scr_dir1_threshold_20": -0.005434712187334791,
|
208 |
+
"scr_metric_threshold_20": 0.5647058686033112,
|
209 |
+
"scr_dir2_threshold_20": 0.5647058686033112,
|
210 |
+
"scr_dir1_threshold_50": -0.07608694243746696,
|
211 |
+
"scr_metric_threshold_50": 0.6470586860331121,
|
212 |
+
"scr_dir2_threshold_50": 0.6470586860331121,
|
213 |
+
"scr_dir1_threshold_100": -0.059782805875462586,
|
214 |
+
"scr_metric_threshold_100": 0.6941176058099336,
|
215 |
+
"scr_dir2_threshold_100": 0.6941176058099336,
|
216 |
+
"scr_dir1_threshold_500": 0.010869424374669583,
|
217 |
+
"scr_metric_threshold_500": 0.7411765255867552,
|
218 |
+
"scr_dir2_threshold_500": 0.7411765255867552
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.015463717977866399,
|
223 |
+
"scr_metric_threshold_2": 0.08467752208135552,
|
224 |
+
"scr_dir2_threshold_2": 0.08467752208135552,
|
225 |
+
"scr_dir1_threshold_5": 0.08247408426990457,
|
226 |
+
"scr_metric_threshold_5": 0.12096785435318151,
|
227 |
+
"scr_dir2_threshold_5": 0.12096785435318151,
|
228 |
+
"scr_dir1_threshold_10": 0.1340204253436484,
|
229 |
+
"scr_metric_threshold_10": 0.15322574055158325,
|
230 |
+
"scr_dir2_threshold_10": 0.15322574055158325,
|
231 |
+
"scr_dir1_threshold_20": 0.2061853642949754,
|
232 |
+
"scr_metric_threshold_20": 0.20161293036111277,
|
233 |
+
"scr_dir2_threshold_20": 0.20161293036111277,
|
234 |
+
"scr_dir1_threshold_50": 0.24742256005014163,
|
235 |
+
"scr_metric_threshold_50": 0.2661291834404855,
|
236 |
+
"scr_dir2_threshold_50": 0.2661291834404855,
|
237 |
+
"scr_dir1_threshold_100": 0.2886597558053079,
|
238 |
+
"scr_metric_threshold_100": 0.3508064651805564,
|
239 |
+
"scr_dir2_threshold_100": 0.3508064651805564,
|
240 |
+
"scr_dir1_threshold_500": 0.2886597558053079,
|
241 |
+
"scr_metric_threshold_500": 0.5564516012738088,
|
242 |
+
"scr_dir2_threshold_500": 0.5564516012738088
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.03153139365860286,
|
247 |
+
"scr_metric_threshold_2": 0.2488888370843636,
|
248 |
+
"scr_dir2_threshold_2": 0.2488888370843636,
|
249 |
+
"scr_dir1_threshold_5": 0.07207194871208326,
|
250 |
+
"scr_metric_threshold_5": 0.38222229404213226,
|
251 |
+
"scr_dir2_threshold_5": 0.38222229404213226,
|
252 |
+
"scr_dir1_threshold_10": 0.09459444946519599,
|
253 |
+
"scr_metric_threshold_10": 0.42222230463851246,
|
254 |
+
"scr_dir2_threshold_10": 0.42222230463851246,
|
255 |
+
"scr_dir1_threshold_20": 0.16666666666666666,
|
256 |
+
"scr_metric_threshold_20": 0.4711110887409752,
|
257 |
+
"scr_dir2_threshold_20": 0.4711110887409752,
|
258 |
+
"scr_dir1_threshold_50": 0.35585583408644605,
|
259 |
+
"scr_metric_threshold_50": 0.5288889112590248,
|
260 |
+
"scr_dir2_threshold_50": 0.5288889112590248,
|
261 |
+
"scr_dir1_threshold_100": 0.45045055204102946,
|
262 |
+
"scr_metric_threshold_100": 0.6088889324517851,
|
263 |
+
"scr_dir2_threshold_100": 0.6088889324517851,
|
264 |
+
"scr_dir1_threshold_500": 0.6081080573128186,
|
265 |
+
"scr_metric_threshold_500": 0.6888889536445455,
|
266 |
+
"scr_dir2_threshold_500": 0.6888889536445455
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.03862661712745569,
|
271 |
+
"scr_metric_threshold_2": 0.03862661712745569,
|
272 |
+
"scr_dir2_threshold_2": 0.05714296256601923,
|
273 |
+
"scr_dir1_threshold_5": 0.03862661712745569,
|
274 |
+
"scr_metric_threshold_5": 0.03862661712745569,
|
275 |
+
"scr_dir2_threshold_5": 0.09523798711177515,
|
276 |
+
"scr_dir1_threshold_10": 0.08154502375487709,
|
277 |
+
"scr_metric_threshold_10": 0.08154502375487709,
|
278 |
+
"scr_dir2_threshold_10": 0.13809513807839202,
|
279 |
+
"scr_dir1_threshold_20": 0.12875547569614632,
|
280 |
+
"scr_metric_threshold_20": 0.12875547569614632,
|
281 |
+
"scr_dir2_threshold_20": 0.1619046348563358,
|
282 |
+
"scr_dir1_threshold_50": 0.19313308563727843,
|
283 |
+
"scr_metric_threshold_50": 0.19313308563727843,
|
284 |
+
"scr_dir2_threshold_50": 0.11904748388971893,
|
285 |
+
"scr_dir1_threshold_100": 0.22317586795092056,
|
286 |
+
"scr_metric_threshold_100": 0.22317586795092056,
|
287 |
+
"scr_dir2_threshold_100": 0.1523809496777944,
|
288 |
+
"scr_dir1_threshold_500": 0.2489271165784791,
|
289 |
+
"scr_metric_threshold_500": 0.2489271165784791,
|
290 |
+
"scr_dir2_threshold_500": 0.280952402577645
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "standard",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "e55f5870-c524-4a98-bbbd-3745c4e6ba5f",
|
73 |
+
"datetime_epoch_millis": 1738801279714,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.23710567889490614,
|
77 |
+
"scr_metric_threshold_2": 0.09984043514036153,
|
78 |
+
"scr_dir2_threshold_2": 0.10096448219391552,
|
79 |
+
"scr_dir1_threshold_5": 0.23277434004406478,
|
80 |
+
"scr_metric_threshold_5": 0.1533444054899716,
|
81 |
+
"scr_dir2_threshold_5": 0.1564892090403169,
|
82 |
+
"scr_dir1_threshold_10": 0.25301255397775474,
|
83 |
+
"scr_metric_threshold_10": 0.21059082421971573,
|
84 |
+
"scr_dir2_threshold_10": 0.2181373055614877,
|
85 |
+
"scr_dir1_threshold_20": 0.24499730767158928,
|
86 |
+
"scr_metric_threshold_20": 0.2566161420240508,
|
87 |
+
"scr_dir2_threshold_20": 0.2605247568722085,
|
88 |
+
"scr_dir1_threshold_50": 0.22983255987764495,
|
89 |
+
"scr_metric_threshold_50": 0.32999437667348414,
|
90 |
+
"scr_dir2_threshold_50": 0.3275827600927625,
|
91 |
+
"scr_dir1_threshold_100": 0.1800306346905109,
|
92 |
+
"scr_metric_threshold_100": 0.38054020119475096,
|
93 |
+
"scr_dir2_threshold_100": 0.37907635825688035,
|
94 |
+
"scr_dir1_threshold_500": 0.0590895362527501,
|
95 |
+
"scr_metric_threshold_500": 0.3194485925466525,
|
96 |
+
"scr_dir2_threshold_500": 0.33138143776464996
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.4531256111803394,
|
103 |
+
"scr_metric_threshold_2": 0.017241303374415515,
|
104 |
+
"scr_dir2_threshold_2": 0.017241303374415515,
|
105 |
+
"scr_dir1_threshold_5": 0.4687504074535596,
|
106 |
+
"scr_metric_threshold_5": 0.02709353964956581,
|
107 |
+
"scr_dir2_threshold_5": 0.02709353964956581,
|
108 |
+
"scr_dir1_threshold_10": 0.5,
|
109 |
+
"scr_metric_threshold_10": 0.039408798291137845,
|
110 |
+
"scr_dir2_threshold_10": 0.039408798291137845,
|
111 |
+
"scr_dir1_threshold_20": 0.5312505238688624,
|
112 |
+
"scr_metric_threshold_20": 0.0467980121998664,
|
113 |
+
"scr_dir2_threshold_20": 0.0467980121998664,
|
114 |
+
"scr_dir1_threshold_50": 0.4843752037267798,
|
115 |
+
"scr_metric_threshold_50": 0.08374378812458251,
|
116 |
+
"scr_dir2_threshold_50": 0.08374378812458251,
|
117 |
+
"scr_dir1_threshold_100": 0.3125005820765137,
|
118 |
+
"scr_metric_threshold_100": 0.09113300203331107,
|
119 |
+
"scr_dir2_threshold_100": 0.09113300203331107,
|
120 |
+
"scr_dir1_threshold_500": 0.15624982537704588,
|
121 |
+
"scr_metric_threshold_500": 0.16009850914989981,
|
122 |
+
"scr_dir2_threshold_500": 0.16009850914989981
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.411214698172087,
|
127 |
+
"scr_metric_threshold_2": 0.1268012368908431,
|
128 |
+
"scr_dir2_threshold_2": 0.1268012368908431,
|
129 |
+
"scr_dir1_threshold_5": 0.3271025121966708,
|
130 |
+
"scr_metric_threshold_5": 0.19308366259030446,
|
131 |
+
"scr_dir2_threshold_5": 0.19308366259030446,
|
132 |
+
"scr_dir1_threshold_10": 0.25233655792624843,
|
133 |
+
"scr_metric_threshold_10": 0.2881845043727822,
|
134 |
+
"scr_dir2_threshold_10": 0.2881845043727822,
|
135 |
+
"scr_dir1_threshold_20": 0.1588786972986282,
|
136 |
+
"scr_metric_threshold_20": 0.311239305947719,
|
137 |
+
"scr_dir2_threshold_20": 0.311239305947719,
|
138 |
+
"scr_dir1_threshold_50": 0.19626139590744457,
|
139 |
+
"scr_metric_threshold_50": 0.4034583404761569,
|
140 |
+
"scr_dir2_threshold_50": 0.4034583404761569,
|
141 |
+
"scr_dir1_threshold_100": 0.056074604966014206,
|
142 |
+
"scr_metric_threshold_100": 0.4755043806836979,
|
143 |
+
"scr_dir2_threshold_100": 0.4755043806836979,
|
144 |
+
"scr_dir1_threshold_500": 0.09345786062762024,
|
145 |
+
"scr_metric_threshold_500": 0.1556196529738595,
|
146 |
+
"scr_dir2_threshold_500": 0.1556196529738595
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5312505238688624,
|
151 |
+
"scr_metric_threshold_2": 0.030379805284471813,
|
152 |
+
"scr_dir2_threshold_2": 0.030379805284471813,
|
153 |
+
"scr_dir1_threshold_5": 0.5156247962732202,
|
154 |
+
"scr_metric_threshold_5": 0.058227846955206435,
|
155 |
+
"scr_dir2_threshold_5": 0.058227846955206435,
|
156 |
+
"scr_dir1_threshold_10": 0.5156247962732202,
|
157 |
+
"scr_metric_threshold_10": 0.09873425400116885,
|
158 |
+
"scr_dir2_threshold_10": 0.09873425400116885,
|
159 |
+
"scr_dir1_threshold_20": 0.4843752037267798,
|
160 |
+
"scr_metric_threshold_20": 0.12911405928564068,
|
161 |
+
"scr_dir2_threshold_20": 0.12911405928564068,
|
162 |
+
"scr_dir1_threshold_50": 0.39062549476503666,
|
163 |
+
"scr_metric_threshold_50": 0.1848101426271099,
|
164 |
+
"scr_dir2_threshold_50": 0.1848101426271099,
|
165 |
+
"scr_dir1_threshold_100": 0.17187555297268803,
|
166 |
+
"scr_metric_threshold_100": 0.23797476415048074,
|
167 |
+
"scr_dir2_threshold_100": 0.23797476415048074,
|
168 |
+
"scr_dir1_threshold_500": -0.5468743888196606,
|
169 |
+
"scr_metric_threshold_500": 0.09873425400116885,
|
170 |
+
"scr_dir2_threshold_500": 0.09873425400116885
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.33858292106798354,
|
175 |
+
"scr_metric_threshold_2": 0.09198818094766784,
|
176 |
+
"scr_dir2_threshold_2": 0.09198818094766784,
|
177 |
+
"scr_dir1_threshold_5": 0.259842390342736,
|
178 |
+
"scr_metric_threshold_5": 0.15430280078147524,
|
179 |
+
"scr_dir2_threshold_5": 0.15430280078147524,
|
180 |
+
"scr_dir1_threshold_10": 0.2834645964930889,
|
181 |
+
"scr_metric_threshold_10": 0.2136499230091961,
|
182 |
+
"scr_dir2_threshold_10": 0.2136499230091961,
|
183 |
+
"scr_dir1_threshold_20": 0.16535450439689736,
|
184 |
+
"scr_metric_threshold_20": 0.249258302466854,
|
185 |
+
"scr_dir2_threshold_20": 0.249258302466854,
|
186 |
+
"scr_dir1_threshold_50": 0.03149611842454171,
|
187 |
+
"scr_metric_threshold_50": 0.3531157663653655,
|
188 |
+
"scr_dir2_threshold_50": 0.3531157663653655,
|
189 |
+
"scr_dir1_threshold_100": -0.03149611842454171,
|
190 |
+
"scr_metric_threshold_100": 0.4154303861991729,
|
191 |
+
"scr_dir2_threshold_100": 0.4154303861991729,
|
192 |
+
"scr_dir1_threshold_500": -0.10236179822002743,
|
193 |
+
"scr_metric_threshold_500": 0.09198818094766784,
|
194 |
+
"scr_dir2_threshold_500": 0.09198818094766784
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.03260859706226871,
|
199 |
+
"scr_metric_threshold_2": 0.18431369340772616,
|
200 |
+
"scr_dir2_threshold_2": 0.18431369340772616,
|
201 |
+
"scr_dir1_threshold_5": 0.0380433092496035,
|
202 |
+
"scr_metric_threshold_5": 0.25490195620110373,
|
203 |
+
"scr_dir2_threshold_5": 0.25490195620110373,
|
204 |
+
"scr_dir1_threshold_10": 0.07065223025013216,
|
205 |
+
"scr_metric_threshold_10": 0.4274508612286972,
|
206 |
+
"scr_dir2_threshold_10": 0.4274508612286972,
|
207 |
+
"scr_dir1_threshold_20": 0.1195652878126652,
|
208 |
+
"scr_metric_threshold_20": 0.5294117372066224,
|
209 |
+
"scr_dir2_threshold_20": 0.5294117372066224,
|
210 |
+
"scr_dir1_threshold_50": 0.016304136562004374,
|
211 |
+
"scr_metric_threshold_50": 0.623529343016556,
|
212 |
+
"scr_dir2_threshold_50": 0.623529343016556,
|
213 |
+
"scr_dir1_threshold_100": 0.010869424374669583,
|
214 |
+
"scr_metric_threshold_100": 0.6784312992176598,
|
215 |
+
"scr_dir2_threshold_100": 0.6784312992176598,
|
216 |
+
"scr_dir1_threshold_500": 0.11413025168707046,
|
217 |
+
"scr_metric_threshold_500": 0.6980391240220748,
|
218 |
+
"scr_dir2_threshold_500": 0.6980391240220748
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.04639146117402712,
|
223 |
+
"scr_metric_threshold_2": 0.060483807005948444,
|
224 |
+
"scr_dir2_threshold_2": 0.060483807005948444,
|
225 |
+
"scr_dir1_threshold_5": 0.0979381094881989,
|
226 |
+
"scr_metric_threshold_5": 0.14112912335516434,
|
227 |
+
"scr_dir2_threshold_5": 0.14112912335516434,
|
228 |
+
"scr_dir1_threshold_10": 0.18041219375810347,
|
229 |
+
"scr_metric_threshold_10": 0.15322574055158325,
|
230 |
+
"scr_dir2_threshold_10": 0.15322574055158325,
|
231 |
+
"scr_dir1_threshold_20": 0.22680396217255852,
|
232 |
+
"scr_metric_threshold_20": 0.21774199363095595,
|
233 |
+
"scr_dir2_threshold_20": 0.21774199363095595,
|
234 |
+
"scr_dir1_threshold_50": 0.23711341473156403,
|
235 |
+
"scr_metric_threshold_50": 0.2862904524424683,
|
236 |
+
"scr_dir2_threshold_50": 0.2862904524424683,
|
237 |
+
"scr_dir1_threshold_100": 0.26804115792772476,
|
238 |
+
"scr_metric_threshold_100": 0.3306451961785736,
|
239 |
+
"scr_dir2_threshold_100": 0.3306451961785736,
|
240 |
+
"scr_dir1_threshold_500": 0.32989664432004623,
|
241 |
+
"scr_metric_threshold_500": 0.5040322057321397,
|
242 |
+
"scr_dir2_threshold_500": 0.5040322057321397
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.045045001506225466,
|
247 |
+
"scr_metric_threshold_2": 0.2488888370843636,
|
248 |
+
"scr_dir2_threshold_2": 0.2488888370843636,
|
249 |
+
"scr_dir1_threshold_5": 0.09909916440732847,
|
250 |
+
"scr_metric_threshold_5": 0.3422222834457521,
|
251 |
+
"scr_dir2_threshold_5": 0.3422222834457521,
|
252 |
+
"scr_dir1_threshold_10": 0.148648612366299,
|
253 |
+
"scr_metric_threshold_10": 0.39111106754821484,
|
254 |
+
"scr_dir2_threshold_10": 0.39111106754821484,
|
255 |
+
"scr_dir1_threshold_20": 0.1621622202139216,
|
256 |
+
"scr_metric_threshold_20": 0.45777766357234706,
|
257 |
+
"scr_dir2_threshold_20": 0.45777766357234706,
|
258 |
+
"scr_dir1_threshold_50": 0.3108108325802206,
|
259 |
+
"scr_metric_threshold_50": 0.5333332980120661,
|
260 |
+
"scr_dir2_threshold_50": 0.5333332980120661,
|
261 |
+
"scr_dir1_threshold_100": 0.4549549984937745,
|
262 |
+
"scr_metric_threshold_100": 0.6177777059578677,
|
263 |
+
"scr_dir2_threshold_100": 0.6177777059578677,
|
264 |
+
"scr_dir1_threshold_500": 0.25225222322637253,
|
265 |
+
"scr_metric_threshold_500": 0.671111141722876,
|
266 |
+
"scr_dir2_threshold_500": 0.671111141722876
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.03862661712745569,
|
271 |
+
"scr_metric_threshold_2": 0.03862661712745569,
|
272 |
+
"scr_dir2_threshold_2": 0.04761899355588758,
|
273 |
+
"scr_dir1_threshold_5": 0.05579403094120067,
|
274 |
+
"scr_metric_threshold_5": 0.05579403094120067,
|
275 |
+
"scr_dir2_threshold_5": 0.08095245934396302,
|
276 |
+
"scr_dir1_threshold_10": 0.07296144475494565,
|
277 |
+
"scr_metric_threshold_10": 0.07296144475494565,
|
278 |
+
"scr_dir2_threshold_10": 0.1333332954891213,
|
279 |
+
"scr_dir1_threshold_20": 0.11158806188240133,
|
280 |
+
"scr_metric_threshold_20": 0.11158806188240133,
|
281 |
+
"scr_dir2_threshold_20": 0.14285698066766273,
|
282 |
+
"scr_dir1_threshold_50": 0.17167388232356773,
|
283 |
+
"scr_metric_threshold_50": 0.17167388232356773,
|
284 |
+
"scr_dir2_threshold_50": 0.1523809496777944,
|
285 |
+
"scr_dir1_threshold_100": 0.19742487513724416,
|
286 |
+
"scr_metric_threshold_100": 0.19742487513724416,
|
287 |
+
"scr_dir2_threshold_100": 0.18571413163427958,
|
288 |
+
"scr_dir1_threshold_500": 0.17596567182353345,
|
289 |
+
"scr_metric_threshold_500": 0.17596567182353345,
|
290 |
+
"scr_dir2_threshold_500": 0.27142843356751334
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "standard",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "3c85fb5d-0428-4028-827e-87b37ec72a2d",
|
73 |
+
"datetime_epoch_millis": 1738802658127,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.2469231071310974,
|
77 |
+
"scr_metric_threshold_2": 0.10476438206495488,
|
78 |
+
"scr_dir2_threshold_2": 0.10439652483056149,
|
79 |
+
"scr_dir1_threshold_5": 0.266262331969709,
|
80 |
+
"scr_metric_threshold_5": 0.16042571052625748,
|
81 |
+
"scr_dir2_threshold_5": 0.16547375379885432,
|
82 |
+
"scr_dir1_threshold_10": 0.2695043161578262,
|
83 |
+
"scr_metric_threshold_10": 0.20632414219873058,
|
84 |
+
"scr_dir2_threshold_10": 0.21190100220122254,
|
85 |
+
"scr_dir1_threshold_20": 0.2804753590978531,
|
86 |
+
"scr_metric_threshold_20": 0.2545814385824276,
|
87 |
+
"scr_dir2_threshold_20": 0.2506855391924104,
|
88 |
+
"scr_dir1_threshold_50": 0.22659001552042826,
|
89 |
+
"scr_metric_threshold_50": 0.3220650150830612,
|
90 |
+
"scr_dir2_threshold_50": 0.3238788383620121,
|
91 |
+
"scr_dir1_threshold_100": 0.11656540795529736,
|
92 |
+
"scr_metric_threshold_100": 0.35549666827962545,
|
93 |
+
"scr_dir2_threshold_100": 0.35367262506496205,
|
94 |
+
"scr_dir1_threshold_500": -0.15991300862854543,
|
95 |
+
"scr_metric_threshold_500": 0.3376438530231709,
|
96 |
+
"scr_dir2_threshold_500": 0.3442118974127407
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.42187508731147705,
|
103 |
+
"scr_metric_threshold_2": 0.0073890670992652185,
|
104 |
+
"scr_dir2_threshold_2": 0.0073890670992652185,
|
105 |
+
"scr_dir1_threshold_5": 0.42187508731147705,
|
106 |
+
"scr_metric_threshold_5": 0.02709353964956581,
|
107 |
+
"scr_dir2_threshold_5": 0.02709353964956581,
|
108 |
+
"scr_dir1_threshold_10": 0.4531256111803394,
|
109 |
+
"scr_metric_threshold_10": 0.03448275355829437,
|
110 |
+
"scr_dir2_threshold_10": 0.03448275355829437,
|
111 |
+
"scr_dir1_threshold_20": 0.4062502910382569,
|
112 |
+
"scr_metric_threshold_20": 0.03694577592471611,
|
113 |
+
"scr_dir2_threshold_20": 0.03694577592471611,
|
114 |
+
"scr_dir1_threshold_50": 0.4062502910382569,
|
115 |
+
"scr_metric_threshold_50": 0.07881774339173903,
|
116 |
+
"scr_dir2_threshold_50": 0.07881774339173903,
|
117 |
+
"scr_dir1_threshold_100": 0.4062502910382569,
|
118 |
+
"scr_metric_threshold_100": 0.10591128304130484,
|
119 |
+
"scr_dir2_threshold_100": 0.10591128304130484,
|
120 |
+
"scr_dir1_threshold_500": 0.0937506402841651,
|
121 |
+
"scr_metric_threshold_500": 0.17241376779147183,
|
122 |
+
"scr_dir2_threshold_500": 0.17241376779147183
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.3738319995632706,
|
127 |
+
"scr_metric_threshold_2": 0.1325648513989227,
|
128 |
+
"scr_dir2_threshold_2": 0.1325648513989227,
|
129 |
+
"scr_dir1_threshold_5": 0.34579441855386867,
|
130 |
+
"scr_metric_threshold_5": 0.19884727709838407,
|
131 |
+
"scr_dir2_threshold_5": 0.19884727709838407,
|
132 |
+
"scr_dir1_threshold_10": 0.2242989769168465,
|
133 |
+
"scr_metric_threshold_10": 0.2708934890772342,
|
134 |
+
"scr_dir2_threshold_10": 0.2708934890772342,
|
135 |
+
"scr_dir1_threshold_20": 0.2149533022646424,
|
136 |
+
"scr_metric_threshold_20": 0.2997119051602506,
|
137 |
+
"scr_dir2_threshold_20": 0.2997119051602506,
|
138 |
+
"scr_dir1_threshold_50": 0.2616822325784525,
|
139 |
+
"scr_metric_threshold_50": 0.40057636145080794,
|
140 |
+
"scr_dir2_threshold_50": 0.40057636145080794,
|
141 |
+
"scr_dir1_threshold_100": -0.35514009320607276,
|
142 |
+
"scr_metric_threshold_100": 0.4495677718547213,
|
143 |
+
"scr_dir2_threshold_100": 0.4495677718547213,
|
144 |
+
"scr_dir1_threshold_500": -1.065420836671008,
|
145 |
+
"scr_metric_threshold_500": 0.24783868750229743,
|
146 |
+
"scr_dir2_threshold_500": 0.24783868750229743
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.4062502910382569,
|
151 |
+
"scr_metric_threshold_2": 0.0126583653752278,
|
152 |
+
"scr_dir2_threshold_2": 0.0126583653752278,
|
153 |
+
"scr_dir1_threshold_5": 0.5312505238688624,
|
154 |
+
"scr_metric_threshold_5": 0.030379805284471813,
|
155 |
+
"scr_dir2_threshold_5": 0.030379805284471813,
|
156 |
+
"scr_dir1_threshold_10": 0.5,
|
157 |
+
"scr_metric_threshold_10": 0.04810139609153522,
|
158 |
+
"scr_dir2_threshold_10": 0.04810139609153522,
|
159 |
+
"scr_dir1_threshold_20": 0.5468753201420825,
|
160 |
+
"scr_metric_threshold_20": 0.11139246847857727,
|
161 |
+
"scr_dir2_threshold_20": 0.11139246847857727,
|
162 |
+
"scr_dir1_threshold_50": 0.2968748544808716,
|
163 |
+
"scr_metric_threshold_50": 0.1620253263882109,
|
164 |
+
"scr_dir2_threshold_50": 0.1620253263882109,
|
165 |
+
"scr_dir1_threshold_100": 0.1250002328306055,
|
166 |
+
"scr_metric_threshold_100": 0.2050633461500911,
|
167 |
+
"scr_dir2_threshold_100": 0.2050633461500911,
|
168 |
+
"scr_dir1_threshold_500": -0.9531246798579175,
|
169 |
+
"scr_metric_threshold_500": 0.14683549919488467,
|
170 |
+
"scr_dir2_threshold_500": 0.14683549919488467
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.30708680264344185,
|
175 |
+
"scr_metric_threshold_2": 0.10089032002917621,
|
176 |
+
"scr_dir2_threshold_2": 0.10089032002917621,
|
177 |
+
"scr_dir1_threshold_5": 0.2362206535201696,
|
178 |
+
"scr_metric_threshold_5": 0.14243334096225593,
|
179 |
+
"scr_dir2_threshold_5": 0.14243334096225593,
|
180 |
+
"scr_dir1_threshold_10": 0.2362206535201696,
|
181 |
+
"scr_metric_threshold_10": 0.2017804631899768,
|
182 |
+
"scr_dir2_threshold_10": 0.2017804631899768,
|
183 |
+
"scr_dir1_threshold_20": 0.17322841667108618,
|
184 |
+
"scr_metric_threshold_20": 0.26112776228607326,
|
185 |
+
"scr_dir2_threshold_20": 0.26112776228607326,
|
186 |
+
"scr_dir1_threshold_50": -0.10236179822002743,
|
187 |
+
"scr_metric_threshold_50": 0.3560830871030764,
|
188 |
+
"scr_dir2_threshold_50": 0.3560830871030764,
|
189 |
+
"scr_dir1_threshold_100": -0.05511785524710812,
|
190 |
+
"scr_metric_threshold_100": 0.412463065461462,
|
191 |
+
"scr_dir2_threshold_100": 0.412463065461462,
|
192 |
+
"scr_dir1_threshold_500": -0.259842390342736,
|
193 |
+
"scr_metric_threshold_500": 0.1899110033707575,
|
194 |
+
"scr_dir2_threshold_500": 0.1899110033707575
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.048913057562533044,
|
199 |
+
"scr_metric_threshold_2": 0.1647058686033112,
|
200 |
+
"scr_dir2_threshold_2": 0.1647058686033112,
|
201 |
+
"scr_dir1_threshold_5": 0.08695636681213655,
|
202 |
+
"scr_metric_threshold_5": 0.3490195620110374,
|
203 |
+
"scr_dir2_threshold_5": 0.3490195620110374,
|
204 |
+
"scr_dir1_threshold_10": 0.08695636681213655,
|
205 |
+
"scr_metric_threshold_10": 0.443137167820971,
|
206 |
+
"scr_dir2_threshold_10": 0.443137167820971,
|
207 |
+
"scr_dir1_threshold_20": 0.10326082731240087,
|
208 |
+
"scr_metric_threshold_20": 0.5098039124022075,
|
209 |
+
"scr_dir2_threshold_20": 0.5098039124022075,
|
210 |
+
"scr_dir1_threshold_50": -0.005434712187334791,
|
211 |
+
"scr_metric_threshold_50": 0.6156863065922739,
|
212 |
+
"scr_dir2_threshold_50": 0.6156863065922739,
|
213 |
+
"scr_dir1_threshold_100": -0.0869566907503965,
|
214 |
+
"scr_metric_threshold_100": 0.6274508612286972,
|
215 |
+
"scr_dir2_threshold_100": 0.6274508612286972,
|
216 |
+
"scr_dir1_threshold_500": 0.0,
|
217 |
+
"scr_metric_threshold_500": 0.6901960875977925,
|
218 |
+
"scr_dir2_threshold_500": 0.6901960875977925
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.15463902322123155,
|
223 |
+
"scr_metric_threshold_2": 0.08870972781349516,
|
224 |
+
"scr_dir2_threshold_2": 0.08870972781349516,
|
225 |
+
"scr_dir1_threshold_5": 0.1958762189763978,
|
226 |
+
"scr_metric_threshold_5": 0.1290322658174608,
|
227 |
+
"scr_dir2_threshold_5": 0.1290322658174608,
|
228 |
+
"scr_dir1_threshold_10": 0.23711341473156403,
|
229 |
+
"scr_metric_threshold_10": 0.1491935348194436,
|
230 |
+
"scr_dir2_threshold_10": 0.1491935348194436,
|
231 |
+
"scr_dir1_threshold_20": 0.2886597558053079,
|
232 |
+
"scr_metric_threshold_20": 0.20967734182539205,
|
233 |
+
"scr_dir2_threshold_20": 0.20967734182539205,
|
234 |
+
"scr_dir1_threshold_50": 0.36082469475663487,
|
235 |
+
"scr_metric_threshold_50": 0.25403232590278196,
|
236 |
+
"scr_dir2_threshold_50": 0.25403232590278196,
|
237 |
+
"scr_dir1_threshold_100": 0.38659786529350676,
|
238 |
+
"scr_metric_threshold_100": 0.282258006369044,
|
239 |
+
"scr_dir2_threshold_100": 0.282258006369044,
|
240 |
+
"scr_dir1_threshold_500": 0.37113384007521244,
|
241 |
+
"scr_metric_threshold_500": 0.43951619299405154,
|
242 |
+
"scr_dir2_threshold_500": 0.43951619299405154
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.20270277526740202,
|
247 |
+
"scr_metric_threshold_2": 0.27111103575907436,
|
248 |
+
"scr_dir2_threshold_2": 0.27111103575907436,
|
249 |
+
"scr_dir1_threshold_5": 0.24774777677362747,
|
250 |
+
"scr_metric_threshold_5": 0.3422222834457521,
|
251 |
+
"scr_dir2_threshold_5": 0.3422222834457521,
|
252 |
+
"scr_dir1_threshold_10": 0.3153152790329657,
|
253 |
+
"scr_metric_threshold_10": 0.4000001059638017,
|
254 |
+
"scr_dir2_threshold_10": 0.4000001059638017,
|
255 |
+
"scr_dir1_threshold_20": 0.3603602805391911,
|
256 |
+
"scr_metric_threshold_20": 0.45777766357234706,
|
257 |
+
"scr_dir2_threshold_20": 0.45777766357234706,
|
258 |
+
"scr_dir1_threshold_50": 0.4189188898930392,
|
259 |
+
"scr_metric_threshold_50": 0.5333332980120661,
|
260 |
+
"scr_dir2_threshold_50": 0.5333332980120661,
|
261 |
+
"scr_dir1_threshold_100": 0.297297224732598,
|
262 |
+
"scr_metric_threshold_100": 0.5466667231806942,
|
263 |
+
"scr_dir2_threshold_100": 0.5466667231806942,
|
264 |
+
"scr_dir1_threshold_500": 0.3153152790329657,
|
265 |
+
"scr_metric_threshold_500": 0.595555507283157,
|
266 |
+
"scr_dir2_threshold_500": 0.595555507283157
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.060085820441166386,
|
271 |
+
"scr_metric_threshold_2": 0.060085820441166386,
|
272 |
+
"scr_dir2_threshold_2": 0.05714296256601923,
|
273 |
+
"scr_dir1_threshold_5": 0.0643776099411321,
|
274 |
+
"scr_metric_threshold_5": 0.0643776099411321,
|
275 |
+
"scr_dir2_threshold_5": 0.10476195612190681,
|
276 |
+
"scr_dir1_threshold_10": 0.10300422706858779,
|
277 |
+
"scr_metric_threshold_10": 0.10300422706858779,
|
278 |
+
"scr_dir2_threshold_10": 0.14761910708852366,
|
279 |
+
"scr_dir1_threshold_20": 0.15021467900985702,
|
280 |
+
"scr_metric_threshold_20": 0.15021467900985702,
|
281 |
+
"scr_dir2_threshold_20": 0.11904748388971893,
|
282 |
+
"scr_dir1_threshold_50": 0.17596567182353345,
|
283 |
+
"scr_metric_threshold_50": 0.17596567182353345,
|
284 |
+
"scr_dir2_threshold_50": 0.19047625805514054,
|
285 |
+
"scr_dir1_threshold_100": 0.21459228895098914,
|
286 |
+
"scr_metric_threshold_100": 0.21459228895098914,
|
287 |
+
"scr_dir2_threshold_100": 0.19999994323368195,
|
288 |
+
"scr_dir1_threshold_500": 0.21888407845095484,
|
289 |
+
"scr_metric_threshold_500": 0.21888407845095484,
|
290 |
+
"scr_dir2_threshold_500": 0.27142843356751334
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "standard",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "941954f3-cc39-43ab-951e-ef7dac65494f",
|
73 |
+
"datetime_epoch_millis": 1738802202746,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.1998397667116698,
|
77 |
+
"scr_metric_threshold_2": 0.09773468518068165,
|
78 |
+
"scr_dir2_threshold_2": 0.1036206457814039,
|
79 |
+
"scr_dir1_threshold_5": 0.19389433803907274,
|
80 |
+
"scr_metric_threshold_5": 0.1494009315204827,
|
81 |
+
"scr_dir2_threshold_5": 0.15724885650288437,
|
82 |
+
"scr_dir1_threshold_10": 0.20252891396157224,
|
83 |
+
"scr_metric_threshold_10": 0.20638626724773887,
|
84 |
+
"scr_dir2_threshold_10": 0.21464552766444467,
|
85 |
+
"scr_dir1_threshold_20": 0.1869289133935145,
|
86 |
+
"scr_metric_threshold_20": 0.2550003527011395,
|
87 |
+
"scr_dir2_threshold_20": 0.2671248172217763,
|
88 |
+
"scr_dir1_threshold_50": 0.1218689486768515,
|
89 |
+
"scr_metric_threshold_50": 0.29936929010341096,
|
90 |
+
"scr_dir2_threshold_50": 0.3102521588192187,
|
91 |
+
"scr_dir1_threshold_100": 0.0633538494619269,
|
92 |
+
"scr_metric_threshold_100": 0.2931972361072011,
|
93 |
+
"scr_dir2_threshold_100": 0.31354517528118225,
|
94 |
+
"scr_dir1_threshold_500": -0.1510882278136979,
|
95 |
+
"scr_metric_threshold_500": 0.270963096956086,
|
96 |
+
"scr_dir2_threshold_500": 0.290943143416725
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.3749997671693945,
|
103 |
+
"scr_metric_threshold_2": 0.012315258641572036,
|
104 |
+
"scr_dir2_threshold_2": 0.012315258641572036,
|
105 |
+
"scr_dir1_threshold_5": 0.39062549476503666,
|
106 |
+
"scr_metric_threshold_5": 0.02709353964956581,
|
107 |
+
"scr_dir2_threshold_5": 0.02709353964956581,
|
108 |
+
"scr_dir1_threshold_10": 0.4062502910382569,
|
109 |
+
"scr_metric_threshold_10": 0.03694577592471611,
|
110 |
+
"scr_dir2_threshold_10": 0.03694577592471611,
|
111 |
+
"scr_dir1_threshold_20": 0.4062502910382569,
|
112 |
+
"scr_metric_threshold_20": 0.051724056932709886,
|
113 |
+
"scr_dir2_threshold_20": 0.051724056932709886,
|
114 |
+
"scr_dir1_threshold_50": 0.3593749708961743,
|
115 |
+
"scr_metric_threshold_50": 0.07635457421585395,
|
116 |
+
"scr_dir2_threshold_50": 0.07635457421585395,
|
117 |
+
"scr_dir1_threshold_100": 0.3125005820765137,
|
118 |
+
"scr_metric_threshold_100": 0.10837430540772658,
|
119 |
+
"scr_dir2_threshold_100": 0.10837430540772658,
|
120 |
+
"scr_dir1_threshold_500": -0.10937450523496334,
|
121 |
+
"scr_metric_threshold_500": 0.06157629320786018,
|
122 |
+
"scr_dir2_threshold_500": 0.06157629320786018
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.35514009320607276,
|
127 |
+
"scr_metric_threshold_2": 0.16714705376132788,
|
128 |
+
"scr_dir2_threshold_2": 0.16714705376132788,
|
129 |
+
"scr_dir1_threshold_5": 0.2429903262212547,
|
130 |
+
"scr_metric_threshold_5": 0.22190207867332085,
|
131 |
+
"scr_dir2_threshold_5": 0.22190207867332085,
|
132 |
+
"scr_dir1_threshold_10": 0.2056076276124383,
|
133 |
+
"scr_metric_threshold_10": 0.3054755196683302,
|
134 |
+
"scr_dir2_threshold_10": 0.3054755196683302,
|
135 |
+
"scr_dir1_threshold_20": 0.14018679094143036,
|
136 |
+
"scr_metric_threshold_20": 0.36599433085971195,
|
137 |
+
"scr_dir2_threshold_20": 0.36599433085971195,
|
138 |
+
"scr_dir1_threshold_50": 0.04672893031381012,
|
139 |
+
"scr_metric_threshold_50": 0.4092219549842365,
|
140 |
+
"scr_dir2_threshold_50": 0.4092219549842365,
|
141 |
+
"scr_dir1_threshold_100": -0.35514009320607276,
|
142 |
+
"scr_metric_threshold_100": 0.20749287063181265,
|
143 |
+
"scr_dir2_threshold_100": 0.20749287063181265,
|
144 |
+
"scr_dir1_threshold_500": -0.8317756280491677,
|
145 |
+
"scr_metric_threshold_500": 0.18443806905687588,
|
146 |
+
"scr_dir2_threshold_500": 0.18443806905687588
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.42187508731147705,
|
151 |
+
"scr_metric_threshold_2": 0.017721590807063405,
|
152 |
+
"scr_dir2_threshold_2": 0.017721590807063405,
|
153 |
+
"scr_dir1_threshold_5": 0.43749988358469727,
|
154 |
+
"scr_metric_threshold_5": 0.030379805284471813,
|
155 |
+
"scr_dir2_threshold_5": 0.030379805284471813,
|
156 |
+
"scr_dir1_threshold_10": 0.3749997671693945,
|
157 |
+
"scr_metric_threshold_10": 0.06582283600077923,
|
158 |
+
"scr_dir2_threshold_10": 0.06582283600077923,
|
159 |
+
"scr_dir1_threshold_20": 0.42187508731147705,
|
160 |
+
"scr_metric_threshold_20": 0.11392408119449507,
|
161 |
+
"scr_dir2_threshold_20": 0.11392408119449507,
|
162 |
+
"scr_dir1_threshold_50": 0.18750034924590825,
|
163 |
+
"scr_metric_threshold_50": 0.1544304882404575,
|
164 |
+
"scr_dir2_threshold_50": 0.1544304882404575,
|
165 |
+
"scr_dir1_threshold_100": 0.2968748544808716,
|
166 |
+
"scr_metric_threshold_100": 0.18227852991119212,
|
167 |
+
"scr_dir2_threshold_100": 0.18227852991119212,
|
168 |
+
"scr_dir1_threshold_500": -0.5312495925464404,
|
169 |
+
"scr_metric_threshold_500": 0.13417728471747628,
|
170 |
+
"scr_dir2_threshold_500": 0.13417728471747628
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.2440945657943584,
|
175 |
+
"scr_metric_threshold_2": 0.12166174206152826,
|
176 |
+
"scr_dir2_threshold_2": 0.12166174206152826,
|
177 |
+
"scr_dir1_threshold_5": 0.1968506228214391,
|
178 |
+
"scr_metric_threshold_5": 0.18100904115762476,
|
179 |
+
"scr_dir2_threshold_5": 0.18100904115762476,
|
180 |
+
"scr_dir1_threshold_10": 0.2519684780685472,
|
181 |
+
"scr_metric_threshold_10": 0.23145402430383724,
|
182 |
+
"scr_dir2_threshold_10": 0.23145402430383724,
|
183 |
+
"scr_dir1_threshold_20": 0.007874381601975306,
|
184 |
+
"scr_metric_threshold_20": 0.2640950830237842,
|
185 |
+
"scr_dir2_threshold_20": 0.2640950830237842,
|
186 |
+
"scr_dir1_threshold_50": -0.13385791664456914,
|
187 |
+
"scr_metric_threshold_50": 0.35014844562765457,
|
188 |
+
"scr_dir2_threshold_50": 0.35014844562765457,
|
189 |
+
"scr_dir1_threshold_100": -0.05511785524710812,
|
190 |
+
"scr_metric_threshold_100": 0.41839770693688383,
|
191 |
+
"scr_dir2_threshold_100": 0.41839770693688383,
|
192 |
+
"scr_dir1_threshold_500": -0.259842390342736,
|
193 |
+
"scr_metric_threshold_500": 0.17210690207611637,
|
194 |
+
"scr_dir2_threshold_500": 0.17210690207611637
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.03260859706226871,
|
199 |
+
"scr_metric_threshold_2": 0.13725477363090466,
|
200 |
+
"scr_dir2_threshold_2": 0.13725477363090466,
|
201 |
+
"scr_dir1_threshold_5": 0.021739172687599125,
|
202 |
+
"scr_metric_threshold_5": 0.2823528174298009,
|
203 |
+
"scr_dir2_threshold_5": 0.2823528174298009,
|
204 |
+
"scr_dir1_threshold_10": 0.03260859706226871,
|
205 |
+
"scr_metric_threshold_10": 0.41176455463642336,
|
206 |
+
"scr_dir2_threshold_10": 0.41176455463642336,
|
207 |
+
"scr_dir1_threshold_20": 0.05434776974986783,
|
208 |
+
"scr_metric_threshold_20": 0.5254902189944813,
|
209 |
+
"scr_dir2_threshold_20": 0.5254902189944813,
|
210 |
+
"scr_dir1_threshold_50": -0.005434712187334791,
|
211 |
+
"scr_metric_threshold_50": 0.6117645546364233,
|
212 |
+
"scr_dir2_threshold_50": 0.6117645546364233,
|
213 |
+
"scr_dir1_threshold_100": -0.125,
|
214 |
+
"scr_metric_threshold_100": 0.6156863065922739,
|
215 |
+
"scr_dir2_threshold_100": 0.6156863065922739,
|
216 |
+
"scr_dir1_threshold_500": -0.016304460500264333,
|
217 |
+
"scr_metric_threshold_500": 0.643137167820971,
|
218 |
+
"scr_dir2_threshold_500": 0.643137167820971
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.07731951161061577,
|
223 |
+
"scr_metric_threshold_2": 0.060483807005948444,
|
224 |
+
"scr_dir2_threshold_2": 0.060483807005948444,
|
225 |
+
"scr_dir1_threshold_5": 0.12886585268435963,
|
226 |
+
"scr_metric_threshold_5": 0.12096785435318151,
|
227 |
+
"scr_dir2_threshold_5": 0.12096785435318151,
|
228 |
+
"scr_dir1_threshold_10": 0.15463902322123155,
|
229 |
+
"scr_metric_threshold_10": 0.1491935348194436,
|
230 |
+
"scr_dir2_threshold_10": 0.1491935348194436,
|
231 |
+
"scr_dir1_threshold_20": 0.190721646317109,
|
232 |
+
"scr_metric_threshold_20": 0.1935485188968335,
|
233 |
+
"scr_dir2_threshold_20": 0.1935485188968335,
|
234 |
+
"scr_dir1_threshold_50": 0.18556676641739225,
|
235 |
+
"scr_metric_threshold_50": 0.2661291834404855,
|
236 |
+
"scr_dir2_threshold_50": 0.2661291834404855,
|
237 |
+
"scr_dir1_threshold_100": 0.2010307916356866,
|
238 |
+
"scr_metric_threshold_100": 0.2862904524424683,
|
239 |
+
"scr_dir2_threshold_100": 0.2862904524424683,
|
240 |
+
"scr_dir1_threshold_500": 0.25257713270943044,
|
241 |
+
"scr_metric_threshold_500": 0.3629033227182599,
|
242 |
+
"scr_dir2_threshold_500": 0.3629033227182599
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.05405389441171559,
|
247 |
+
"scr_metric_threshold_2": 0.22666663840965287,
|
248 |
+
"scr_dir2_threshold_2": 0.22666663840965287,
|
249 |
+
"scr_dir1_threshold_5": 0.0810811101069608,
|
250 |
+
"scr_metric_threshold_5": 0.2800000741746612,
|
251 |
+
"scr_dir2_threshold_5": 0.2800000741746612,
|
252 |
+
"scr_dir1_threshold_10": 0.11261250376556366,
|
253 |
+
"scr_metric_threshold_10": 0.36888886887350414,
|
254 |
+
"scr_dir2_threshold_10": 0.36888886887350414,
|
255 |
+
"scr_dir1_threshold_20": 0.17117111311941172,
|
256 |
+
"scr_metric_threshold_20": 0.42222230463851246,
|
257 |
+
"scr_dir2_threshold_20": 0.42222230463851246,
|
258 |
+
"scr_dir1_threshold_50": 0.27927917043223033,
|
259 |
+
"scr_metric_threshold_50": 0.4711110887409752,
|
260 |
+
"scr_dir2_threshold_50": 0.4711110887409752,
|
261 |
+
"scr_dir1_threshold_100": 0.18018027451428928,
|
262 |
+
"scr_metric_threshold_100": 0.4755554754940165,
|
263 |
+
"scr_dir2_threshold_100": 0.4755554754940165,
|
264 |
+
"scr_dir1_threshold_500": 0.1756755595721568,
|
265 |
+
"scr_metric_threshold_500": 0.4977776741687272,
|
266 |
+
"scr_dir2_threshold_500": 0.4977776741687272
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.03862661712745569,
|
271 |
+
"scr_metric_threshold_2": 0.03862661712745569,
|
272 |
+
"scr_dir2_threshold_2": 0.08571430193323373,
|
273 |
+
"scr_dir1_threshold_5": 0.05150224144123495,
|
274 |
+
"scr_metric_threshold_5": 0.05150224144123495,
|
275 |
+
"scr_dir2_threshold_5": 0.11428564130044823,
|
276 |
+
"scr_dir1_threshold_10": 0.08154502375487709,
|
277 |
+
"scr_metric_threshold_10": 0.08154502375487709,
|
278 |
+
"scr_dir2_threshold_10": 0.14761910708852366,
|
279 |
+
"scr_dir1_threshold_20": 0.10300422706858779,
|
280 |
+
"scr_metric_threshold_20": 0.10300422706858779,
|
281 |
+
"scr_dir2_threshold_20": 0.19999994323368195,
|
282 |
+
"scr_dir1_threshold_50": 0.05579403094120067,
|
283 |
+
"scr_metric_threshold_50": 0.05579403094120067,
|
284 |
+
"scr_dir2_threshold_50": 0.14285698066766273,
|
285 |
+
"scr_dir1_threshold_100": 0.05150224144123495,
|
286 |
+
"scr_metric_threshold_100": 0.05150224144123495,
|
287 |
+
"scr_dir2_threshold_100": 0.21428575483308432,
|
288 |
+
"scr_dir1_threshold_500": 0.11158806188240133,
|
289 |
+
"scr_metric_threshold_500": 0.11158806188240133,
|
290 |
+
"scr_dir2_threshold_500": 0.27142843356751334
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "standard",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "c41f7329-1294-48a8-9743-0a7d0c4f653a",
|
73 |
+
"datetime_epoch_millis": 1738803092208,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.16662920298711476,
|
77 |
+
"scr_metric_threshold_2": 0.09319447246027145,
|
78 |
+
"scr_dir2_threshold_2": 0.09836765398605983,
|
79 |
+
"scr_dir1_threshold_5": 0.18595827828202774,
|
80 |
+
"scr_metric_threshold_5": 0.14338717270565676,
|
81 |
+
"scr_dir2_threshold_5": 0.14980960699387474,
|
82 |
+
"scr_dir1_threshold_10": 0.142764782657517,
|
83 |
+
"scr_metric_threshold_10": 0.18711664692078173,
|
84 |
+
"scr_dir2_threshold_10": 0.19460437162639055,
|
85 |
+
"scr_dir1_threshold_20": 0.16422745930077481,
|
86 |
+
"scr_metric_threshold_20": 0.22564147974603221,
|
87 |
+
"scr_dir2_threshold_20": 0.23467224389709979,
|
88 |
+
"scr_dir1_threshold_50": 0.07719326484466627,
|
89 |
+
"scr_metric_threshold_50": 0.27652624846177276,
|
90 |
+
"scr_dir2_threshold_50": 0.2778112550108285,
|
91 |
+
"scr_dir1_threshold_100": 0.048861431225419735,
|
92 |
+
"scr_metric_threshold_100": 0.3011672908990718,
|
93 |
+
"scr_dir2_threshold_100": 0.3098454762519983,
|
94 |
+
"scr_dir1_threshold_500": -0.16907412416599507,
|
95 |
+
"scr_metric_threshold_500": 0.2543551795776049,
|
96 |
+
"scr_dir2_threshold_500": 0.26570810838714465
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.32812537834973393,
|
103 |
+
"scr_metric_threshold_2": 0.017241303374415515,
|
104 |
+
"scr_dir2_threshold_2": 0.017241303374415515,
|
105 |
+
"scr_dir1_threshold_5": 0.3593749708961743,
|
106 |
+
"scr_metric_threshold_5": 0.03448275355829437,
|
107 |
+
"scr_dir2_threshold_5": 0.03448275355829437,
|
108 |
+
"scr_dir1_threshold_10": 0.3437501746229541,
|
109 |
+
"scr_metric_threshold_10": 0.044334989833444666,
|
110 |
+
"scr_dir2_threshold_10": 0.044334989833444666,
|
111 |
+
"scr_dir1_threshold_20": 0.3437501746229541,
|
112 |
+
"scr_metric_threshold_20": 0.06403931557428191,
|
113 |
+
"scr_dir2_threshold_20": 0.06403931557428191,
|
114 |
+
"scr_dir1_threshold_50": 0.2968748544808716,
|
115 |
+
"scr_metric_threshold_50": 0.07142852948301047,
|
116 |
+
"scr_dir2_threshold_50": 0.07142852948301047,
|
117 |
+
"scr_dir1_threshold_100": 0.2343756693879908,
|
118 |
+
"scr_metric_threshold_100": 0.09852206913257629,
|
119 |
+
"scr_dir2_threshold_100": 0.09852206913257629,
|
120 |
+
"scr_dir1_threshold_500": -0.14062502910382568,
|
121 |
+
"scr_metric_threshold_500": 0.05418707929913162,
|
122 |
+
"scr_dir2_threshold_500": 0.05418707929913162
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2897198135878545,
|
127 |
+
"scr_metric_threshold_2": 0.1498558666944707,
|
128 |
+
"scr_dir2_threshold_2": 0.1498558666944707,
|
129 |
+
"scr_dir1_threshold_5": 0.33644874390166457,
|
130 |
+
"scr_metric_threshold_5": 0.21325648513989226,
|
131 |
+
"scr_dir2_threshold_5": 0.21325648513989226,
|
132 |
+
"scr_dir1_threshold_10": 0.16822437195083229,
|
133 |
+
"scr_metric_threshold_10": 0.23631128671482904,
|
134 |
+
"scr_dir2_threshold_10": 0.23631128671482904,
|
135 |
+
"scr_dir1_threshold_20": 0.1588786972986282,
|
136 |
+
"scr_metric_threshold_20": 0.328530321243267,
|
137 |
+
"scr_dir2_threshold_20": 0.328530321243267,
|
138 |
+
"scr_dir1_threshold_50": -0.3177573945972564,
|
139 |
+
"scr_metric_threshold_50": 0.37175794536779155,
|
140 |
+
"scr_dir2_threshold_50": 0.37175794536779155,
|
141 |
+
"scr_dir1_threshold_100": -0.2149533022646424,
|
142 |
+
"scr_metric_threshold_100": 0.4034583404761569,
|
143 |
+
"scr_dir2_threshold_100": 0.4034583404761569,
|
144 |
+
"scr_dir1_threshold_500": -0.9532710696861899,
|
145 |
+
"scr_metric_threshold_500": 0.19596546984434426,
|
146 |
+
"scr_dir2_threshold_500": 0.19596546984434426
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.3593749708961743,
|
151 |
+
"scr_metric_threshold_2": 0.025316579852636207,
|
152 |
+
"scr_dir2_threshold_2": 0.025316579852636207,
|
153 |
+
"scr_dir1_threshold_5": 0.39062549476503666,
|
154 |
+
"scr_metric_threshold_5": 0.04050640704596242,
|
155 |
+
"scr_dir2_threshold_5": 0.04050640704596242,
|
156 |
+
"scr_dir1_threshold_10": 0.2656252619344312,
|
157 |
+
"scr_metric_threshold_10": 0.06329122328486143,
|
158 |
+
"scr_dir2_threshold_10": 0.06329122328486143,
|
159 |
+
"scr_dir1_threshold_20": 0.3125005820765137,
|
160 |
+
"scr_metric_threshold_20": 0.10379747943300446,
|
161 |
+
"scr_dir2_threshold_20": 0.10379747943300446,
|
162 |
+
"scr_dir1_threshold_50": 0.21874994179234863,
|
163 |
+
"scr_metric_threshold_50": 0.14683549919488467,
|
164 |
+
"scr_dir2_threshold_50": 0.14683549919488467,
|
165 |
+
"scr_dir1_threshold_100": 0.15624982537704588,
|
166 |
+
"scr_metric_threshold_100": 0.1848101426271099,
|
167 |
+
"scr_dir2_threshold_100": 0.1848101426271099,
|
168 |
+
"scr_dir1_threshold_500": -0.4062493597158349,
|
169 |
+
"scr_metric_threshold_500": 0.10379747943300446,
|
170 |
+
"scr_dir2_threshold_500": 0.10379747943300446
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.1968506228214391,
|
175 |
+
"scr_metric_threshold_2": 0.11572710058610643,
|
176 |
+
"scr_dir2_threshold_2": 0.11572710058610643,
|
177 |
+
"scr_dir1_threshold_5": 0.22834674124598078,
|
178 |
+
"scr_metric_threshold_5": 0.16913958133840545,
|
179 |
+
"scr_dir2_threshold_5": 0.16913958133840545,
|
180 |
+
"scr_dir1_threshold_10": 0.07874006139746104,
|
181 |
+
"scr_metric_threshold_10": 0.22551938282841538,
|
182 |
+
"scr_dir2_threshold_10": 0.22551938282841538,
|
183 |
+
"scr_dir1_threshold_20": 0.08661444299943634,
|
184 |
+
"scr_metric_threshold_20": 0.26112776228607326,
|
185 |
+
"scr_dir2_threshold_20": 0.26112776228607326,
|
186 |
+
"scr_dir1_threshold_50": -0.023621736822566405,
|
187 |
+
"scr_metric_threshold_50": 0.33531166507072435,
|
188 |
+
"scr_dir2_threshold_50": 0.33531166507072435,
|
189 |
+
"scr_dir1_threshold_100": -0.09448788594583864,
|
190 |
+
"scr_metric_threshold_100": 0.3887241458230234,
|
191 |
+
"scr_dir2_threshold_100": 0.3887241458230234,
|
192 |
+
"scr_dir1_threshold_500": -0.2283462719181943,
|
193 |
+
"scr_metric_threshold_500": 0.1899110033707575,
|
194 |
+
"scr_dir2_threshold_500": 0.1899110033707575
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.027173884874933916,
|
199 |
+
"scr_metric_threshold_2": 0.12549021899448132,
|
200 |
+
"scr_dir2_threshold_2": 0.12549021899448132,
|
201 |
+
"scr_dir1_threshold_5": 0.0380433092496035,
|
202 |
+
"scr_metric_threshold_5": 0.24313716782097103,
|
203 |
+
"scr_dir2_threshold_5": 0.24313716782097103,
|
204 |
+
"scr_dir1_threshold_10": 0.0380433092496035,
|
205 |
+
"scr_metric_threshold_10": 0.3686273868154523,
|
206 |
+
"scr_dir2_threshold_10": 0.3686273868154523,
|
207 |
+
"scr_dir1_threshold_20": 0.06521719412453741,
|
208 |
+
"scr_metric_threshold_20": 0.41176455463642336,
|
209 |
+
"scr_dir2_threshold_20": 0.41176455463642336,
|
210 |
+
"scr_dir1_threshold_50": -0.07065223025013216,
|
211 |
+
"scr_metric_threshold_50": 0.5294117372066224,
|
212 |
+
"scr_dir2_threshold_50": 0.5294117372066224,
|
213 |
+
"scr_dir1_threshold_100": -0.0815219785630617,
|
214 |
+
"scr_metric_threshold_100": 0.5647058686033112,
|
215 |
+
"scr_dir2_threshold_100": 0.5647058686033112,
|
216 |
+
"scr_dir1_threshold_500": -0.03804363318786346,
|
217 |
+
"scr_metric_threshold_500": 0.5882352116198672,
|
218 |
+
"scr_dir2_threshold_500": 0.5882352116198672
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.05670091373303265,
|
223 |
+
"scr_metric_threshold_2": 0.06854845881151235,
|
224 |
+
"scr_dir2_threshold_2": 0.06854845881151235,
|
225 |
+
"scr_dir1_threshold_5": 0.08247408426990457,
|
226 |
+
"scr_metric_threshold_5": 0.14112912335516434,
|
227 |
+
"scr_dir2_threshold_5": 0.14112912335516434,
|
228 |
+
"scr_dir1_threshold_10": 0.1340204253436484,
|
229 |
+
"scr_metric_threshold_10": 0.16129039235714715,
|
230 |
+
"scr_dir2_threshold_10": 0.16129039235714715,
|
231 |
+
"scr_dir1_threshold_20": 0.14432987790265395,
|
232 |
+
"scr_metric_threshold_20": 0.18145166135912996,
|
233 |
+
"scr_dir2_threshold_20": 0.18145166135912996,
|
234 |
+
"scr_dir1_threshold_50": 0.12886585268435963,
|
235 |
+
"scr_metric_threshold_50": 0.23790326263293876,
|
236 |
+
"scr_dir2_threshold_50": 0.23790326263293876,
|
237 |
+
"scr_dir1_threshold_100": 0.12886585268435963,
|
238 |
+
"scr_metric_threshold_100": 0.2782258006369044,
|
239 |
+
"scr_dir2_threshold_100": 0.2782258006369044,
|
240 |
+
"scr_dir1_threshold_500": 0.1030926821474877,
|
241 |
+
"scr_metric_threshold_500": 0.35483867091269605,
|
242 |
+
"scr_dir2_threshold_500": 0.35483867091269605
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.045045001506225466,
|
247 |
+
"scr_metric_threshold_2": 0.21333321324102472,
|
248 |
+
"scr_dir2_threshold_2": 0.21333321324102472,
|
249 |
+
"scr_dir1_threshold_5": 0.01801805430036767,
|
250 |
+
"scr_metric_threshold_5": 0.27111103575907436,
|
251 |
+
"scr_dir2_threshold_5": 0.27111103575907436,
|
252 |
+
"scr_dir1_threshold_10": 0.045045001506225466,
|
253 |
+
"scr_metric_threshold_10": 0.32888885827712394,
|
254 |
+
"scr_dir2_threshold_10": 0.32888885827712394,
|
255 |
+
"scr_dir1_threshold_20": 0.1081080573128186,
|
256 |
+
"scr_metric_threshold_20": 0.36000009536742156,
|
257 |
+
"scr_dir2_threshold_20": 0.36000009536742156,
|
258 |
+
"scr_dir1_threshold_50": 0.24774777677362747,
|
259 |
+
"scr_metric_threshold_50": 0.38222229404213226,
|
260 |
+
"scr_dir2_threshold_50": 0.38222229404213226,
|
261 |
+
"scr_dir1_threshold_100": 0.19369361387252446,
|
262 |
+
"scr_metric_threshold_100": 0.42222230463851246,
|
263 |
+
"scr_dir2_threshold_100": 0.42222230463851246,
|
264 |
+
"scr_dir1_threshold_500": 0.22072082956776967,
|
265 |
+
"scr_metric_threshold_500": 0.45777766357234706,
|
266 |
+
"scr_dir2_threshold_500": 0.45777766357234706
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.030043038127524242,
|
271 |
+
"scr_metric_threshold_2": 0.030043038127524242,
|
272 |
+
"scr_dir2_threshold_2": 0.07142849033383136,
|
273 |
+
"scr_dir1_threshold_5": 0.03433482762748996,
|
274 |
+
"scr_metric_threshold_5": 0.03433482762748996,
|
275 |
+
"scr_dir2_threshold_5": 0.08571430193323373,
|
276 |
+
"scr_dir1_threshold_10": 0.06866965525497992,
|
277 |
+
"scr_metric_threshold_10": 0.06866965525497992,
|
278 |
+
"scr_dir2_threshold_10": 0.12857145289985059,
|
279 |
+
"scr_dir1_threshold_20": 0.09442064806865635,
|
280 |
+
"scr_metric_threshold_20": 0.09442064806865635,
|
281 |
+
"scr_dir2_threshold_20": 0.16666676127719673,
|
282 |
+
"scr_dir1_threshold_50": 0.13733905469607777,
|
283 |
+
"scr_metric_threshold_50": 0.13733905469607777,
|
284 |
+
"scr_dir2_threshold_50": 0.14761910708852366,
|
285 |
+
"scr_dir1_threshold_100": 0.06866965525497992,
|
286 |
+
"scr_metric_threshold_100": 0.06866965525497992,
|
287 |
+
"scr_dir2_threshold_100": 0.13809513807839202,
|
288 |
+
"scr_dir1_threshold_500": 0.09012885856869063,
|
289 |
+
"scr_metric_threshold_500": 0.09012885856869063,
|
290 |
+
"scr_dir2_threshold_500": 0.18095228904500887
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "standard",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
old_relu_eval_results/scr/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "f172b11d-e73d-4265-bcff-b86b3f4483ba",
|
73 |
+
"datetime_epoch_millis": 1738801762536,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.1194317497817165,
|
77 |
+
"scr_metric_threshold_2": 0.07554747805773677,
|
78 |
+
"scr_dir2_threshold_2": 0.08125716524775614,
|
79 |
+
"scr_dir1_threshold_5": 0.14323898317939607,
|
80 |
+
"scr_metric_threshold_5": 0.11980903429955778,
|
81 |
+
"scr_dir2_threshold_5": 0.12575375153644316,
|
82 |
+
"scr_dir1_threshold_10": 0.1363385070457653,
|
83 |
+
"scr_metric_threshold_10": 0.1432143223046469,
|
84 |
+
"scr_dir2_threshold_10": 0.1479021298215021,
|
85 |
+
"scr_dir1_threshold_20": 0.09621199123652632,
|
86 |
+
"scr_metric_threshold_20": 0.16714578684669865,
|
87 |
+
"scr_dir2_threshold_20": 0.18160776541995755,
|
88 |
+
"scr_dir1_threshold_50": 0.009138466559556352,
|
89 |
+
"scr_metric_threshold_50": 0.22111083513489554,
|
90 |
+
"scr_dir2_threshold_50": 0.2409887142151447,
|
91 |
+
"scr_dir1_threshold_100": 0.0010560166167327316,
|
92 |
+
"scr_metric_threshold_100": 0.24574769061347318,
|
93 |
+
"scr_dir2_threshold_100": 0.2606209691420875,
|
94 |
+
"scr_dir1_threshold_500": -0.12225683433428539,
|
95 |
+
"scr_metric_threshold_500": 0.20851088490922143,
|
96 |
+
"scr_dir2_threshold_500": 0.22618408062658932
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.18750034924590825,
|
103 |
+
"scr_metric_threshold_2": 0.017241303374415515,
|
104 |
+
"scr_dir2_threshold_2": 0.017241303374415515,
|
105 |
+
"scr_dir1_threshold_5": 0.20312514551912844,
|
106 |
+
"scr_metric_threshold_5": 0.049261034566288144,
|
107 |
+
"scr_dir2_threshold_5": 0.049261034566288144,
|
108 |
+
"scr_dir1_threshold_10": 0.2968748544808716,
|
109 |
+
"scr_metric_threshold_10": 0.06157629320786018,
|
110 |
+
"scr_dir2_threshold_10": 0.06157629320786018,
|
111 |
+
"scr_dir1_threshold_20": 0.20312514551912844,
|
112 |
+
"scr_metric_threshold_20": 0.05911327084143844,
|
113 |
+
"scr_dir2_threshold_20": 0.05911327084143844,
|
114 |
+
"scr_dir1_threshold_50": 0.0937506402841651,
|
115 |
+
"scr_metric_threshold_50": 0.08128076575816078,
|
116 |
+
"scr_dir2_threshold_50": 0.08128076575816078,
|
117 |
+
"scr_dir1_threshold_100": 0.14062502910382568,
|
118 |
+
"scr_metric_threshold_100": 0.08620681049100425,
|
119 |
+
"scr_dir2_threshold_100": 0.08620681049100425,
|
120 |
+
"scr_dir1_threshold_500": -0.28125005820765137,
|
121 |
+
"scr_metric_threshold_500": 0.06403931557428191,
|
122 |
+
"scr_dir2_threshold_500": 0.06403931557428191
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2336446515690506,
|
127 |
+
"scr_metric_threshold_2": 0.11239202884933491,
|
128 |
+
"scr_dir2_threshold_2": 0.11239202884933491,
|
129 |
+
"scr_dir1_threshold_5": 0.2710279072306566,
|
130 |
+
"scr_metric_threshold_5": 0.1325648513989227,
|
131 |
+
"scr_dir2_threshold_5": 0.1325648513989227,
|
132 |
+
"scr_dir1_threshold_10": 0.14953246559363445,
|
133 |
+
"scr_metric_threshold_10": 0.16426524650728808,
|
134 |
+
"scr_dir2_threshold_10": 0.16426524650728808,
|
135 |
+
"scr_dir1_threshold_20": 0.16822437195083229,
|
136 |
+
"scr_metric_threshold_20": 0.1757924755234473,
|
137 |
+
"scr_dir2_threshold_20": 0.1757924755234473,
|
138 |
+
"scr_dir1_threshold_50": -0.2056076276124383,
|
139 |
+
"scr_metric_threshold_50": 0.3025937124142904,
|
140 |
+
"scr_dir2_threshold_50": 0.3025937124142904,
|
141 |
+
"scr_dir1_threshold_100": -0.24299088327404436,
|
142 |
+
"scr_metric_threshold_100": 0.328530321243267,
|
143 |
+
"scr_dir2_threshold_100": 0.328530321243267,
|
144 |
+
"scr_dir1_threshold_500": -0.46728986019089086,
|
145 |
+
"scr_metric_threshold_500": 0.21902027141928104,
|
146 |
+
"scr_dir2_threshold_500": 0.21902027141928104
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.250000465661211,
|
151 |
+
"scr_metric_threshold_2": 0.025316579852636207,
|
152 |
+
"scr_dir2_threshold_2": 0.025316579852636207,
|
153 |
+
"scr_dir1_threshold_5": 0.3125005820765137,
|
154 |
+
"scr_metric_threshold_5": 0.04303801976188022,
|
155 |
+
"scr_dir2_threshold_5": 0.04303801976188022,
|
156 |
+
"scr_dir1_threshold_10": 0.28125005820765137,
|
157 |
+
"scr_metric_threshold_10": 0.055696234239288635,
|
158 |
+
"scr_dir2_threshold_10": 0.055696234239288635,
|
159 |
+
"scr_dir1_threshold_20": 0.1250002328306055,
|
160 |
+
"scr_metric_threshold_20": 0.08860765223967824,
|
161 |
+
"scr_dir2_threshold_20": 0.08860765223967824,
|
162 |
+
"scr_dir1_threshold_50": 0.03125052386886235,
|
163 |
+
"scr_metric_threshold_50": 0.1265822956719035,
|
164 |
+
"scr_dir2_threshold_50": 0.1265822956719035,
|
165 |
+
"scr_dir1_threshold_100": -0.07812491268852294,
|
166 |
+
"scr_metric_threshold_100": 0.15696210095637528,
|
167 |
+
"scr_dir2_threshold_100": 0.15696210095637528,
|
168 |
+
"scr_dir1_threshold_500": -0.3593749708961743,
|
169 |
+
"scr_metric_threshold_500": 0.10126586671708666,
|
170 |
+
"scr_dir2_threshold_500": 0.10126586671708666
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.14960621052073325,
|
175 |
+
"scr_metric_threshold_2": 0.07715140039073762,
|
176 |
+
"scr_dir2_threshold_2": 0.07715140039073762,
|
177 |
+
"scr_dir1_threshold_5": 0.16535450439689736,
|
178 |
+
"scr_metric_threshold_5": 0.13946602022454502,
|
179 |
+
"scr_dir2_threshold_5": 0.13946602022454502,
|
180 |
+
"scr_dir1_threshold_10": 0.07874006139746104,
|
181 |
+
"scr_metric_threshold_10": 0.15727012151918615,
|
182 |
+
"scr_dir2_threshold_10": 0.15727012151918615,
|
183 |
+
"scr_dir1_threshold_20": 0.07874006139746104,
|
184 |
+
"scr_metric_threshold_20": 0.1988131424522659,
|
185 |
+
"scr_dir2_threshold_20": 0.1988131424522659,
|
186 |
+
"scr_dir1_threshold_50": 0.007874381601975306,
|
187 |
+
"scr_metric_threshold_50": 0.26112776228607326,
|
188 |
+
"scr_dir2_threshold_50": 0.26112776228607326,
|
189 |
+
"scr_dir1_threshold_100": -0.07874006139746104,
|
190 |
+
"scr_metric_threshold_100": 0.2759645428430035,
|
191 |
+
"scr_dir2_threshold_100": 0.2759645428430035,
|
192 |
+
"scr_dir1_threshold_500": -0.18110232894527498,
|
193 |
+
"scr_metric_threshold_500": 0.1364985226184585,
|
194 |
+
"scr_dir2_threshold_500": 0.1364985226184585
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.027173884874933916,
|
199 |
+
"scr_metric_threshold_2": 0.10588216044635698,
|
200 |
+
"scr_dir2_threshold_2": 0.10588216044635698,
|
201 |
+
"scr_dir1_threshold_5": 0.05434776974986783,
|
202 |
+
"scr_metric_threshold_5": 0.21568630659227384,
|
203 |
+
"scr_dir2_threshold_5": 0.21568630659227384,
|
204 |
+
"scr_dir1_threshold_10": 0.059782481937202626,
|
205 |
+
"scr_metric_threshold_10": 0.262744992625386,
|
206 |
+
"scr_dir2_threshold_10": 0.262744992625386,
|
207 |
+
"scr_dir1_threshold_20": 0.06521719412453741,
|
208 |
+
"scr_metric_threshold_20": 0.3647058686033112,
|
209 |
+
"scr_dir2_threshold_20": 0.3647058686033112,
|
210 |
+
"scr_dir1_threshold_50": -0.027173884874933916,
|
211 |
+
"scr_metric_threshold_50": 0.45490195620110374,
|
212 |
+
"scr_dir2_threshold_50": 0.45490195620110374,
|
213 |
+
"scr_dir1_threshold_100": 0.0,
|
214 |
+
"scr_metric_threshold_100": 0.4823528174298009,
|
215 |
+
"scr_dir2_threshold_100": 0.4823528174298009,
|
216 |
+
"scr_dir1_threshold_500": 0.05434776974986783,
|
217 |
+
"scr_metric_threshold_500": 0.5137254306143486,
|
218 |
+
"scr_dir2_threshold_500": 0.5137254306143486
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.04123688851473832,
|
223 |
+
"scr_metric_threshold_2": 0.08064531634921589,
|
224 |
+
"scr_dir2_threshold_2": 0.08064531634921589,
|
225 |
+
"scr_dir1_threshold_5": 0.08762865692919337,
|
226 |
+
"scr_metric_threshold_5": 0.10887099681547797,
|
227 |
+
"scr_dir2_threshold_5": 0.10887099681547797,
|
228 |
+
"scr_dir1_threshold_10": 0.0979381094881989,
|
229 |
+
"scr_metric_threshold_10": 0.10483879108333834,
|
230 |
+
"scr_dir2_threshold_10": 0.10483879108333834,
|
231 |
+
"scr_dir1_threshold_20": 0.030927743196160724,
|
232 |
+
"scr_metric_threshold_20": 0.1572581866250075,
|
233 |
+
"scr_dir2_threshold_20": 0.1572581866250075,
|
234 |
+
"scr_dir1_threshold_50": 0.09278322958848217,
|
235 |
+
"scr_metric_threshold_50": 0.20967734182539205,
|
236 |
+
"scr_dir2_threshold_50": 0.20967734182539205,
|
237 |
+
"scr_dir1_threshold_100": 0.13917499800293723,
|
238 |
+
"scr_metric_threshold_100": 0.2419354683650784,
|
239 |
+
"scr_dir2_threshold_100": 0.2419354683650784,
|
240 |
+
"scr_dir1_threshold_500": 0.04639146117402712,
|
241 |
+
"scr_metric_threshold_500": 0.2701613891726251,
|
242 |
+
"scr_dir2_threshold_500": 0.2701613891726251
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.0405405550534804,
|
247 |
+
"scr_metric_threshold_2": 0.16000004238552068,
|
248 |
+
"scr_dir2_threshold_2": 0.16000004238552068,
|
249 |
+
"scr_dir1_threshold_5": 0.009008892905490125,
|
250 |
+
"scr_metric_threshold_5": 0.22666663840965287,
|
251 |
+
"scr_dir2_threshold_5": 0.22666663840965287,
|
252 |
+
"scr_dir1_threshold_10": 0.045045001506225466,
|
253 |
+
"scr_metric_threshold_10": 0.25777787549995046,
|
254 |
+
"scr_dir2_threshold_10": 0.25777787549995046,
|
255 |
+
"scr_dir1_threshold_20": 0.08558555655970586,
|
256 |
+
"scr_metric_threshold_20": 0.2800000741746612,
|
257 |
+
"scr_dir2_threshold_20": 0.2800000741746612,
|
258 |
+
"scr_dir1_threshold_50": 0.06306305580659313,
|
259 |
+
"scr_metric_threshold_50": 0.3155554331084958,
|
260 |
+
"scr_dir2_threshold_50": 0.3155554331084958,
|
261 |
+
"scr_dir1_threshold_100": 0.08558555655970586,
|
262 |
+
"scr_metric_threshold_100": 0.3511110569518347,
|
263 |
+
"scr_dir2_threshold_100": 0.3511110569518347,
|
264 |
+
"scr_dir1_threshold_500": 0.18018027451428928,
|
265 |
+
"scr_metric_threshold_500": 0.33333324503016526,
|
266 |
+
"scr_dir2_threshold_500": 0.33333324503016526
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.025750992813676425,
|
271 |
+
"scr_metric_threshold_2": 0.025750992813676425,
|
272 |
+
"scr_dir2_threshold_2": 0.07142849033383136,
|
273 |
+
"scr_dir1_threshold_5": 0.042918406627421406,
|
274 |
+
"scr_metric_threshold_5": 0.042918406627421406,
|
275 |
+
"scr_dir2_threshold_5": 0.09047614452250444,
|
276 |
+
"scr_dir1_threshold_10": 0.08154502375487709,
|
277 |
+
"scr_metric_threshold_10": 0.08154502375487709,
|
278 |
+
"scr_dir2_threshold_10": 0.11904748388971893,
|
279 |
+
"scr_dir1_threshold_20": 0.012875624313779262,
|
280 |
+
"scr_metric_threshold_20": 0.012875624313779262,
|
281 |
+
"scr_dir2_threshold_20": 0.12857145289985059,
|
282 |
+
"scr_dir1_threshold_50": 0.01716741381374498,
|
283 |
+
"scr_metric_threshold_50": 0.01716741381374498,
|
284 |
+
"scr_dir2_threshold_50": 0.17619044645573817,
|
285 |
+
"scr_dir1_threshold_100": 0.042918406627421406,
|
286 |
+
"scr_metric_threshold_100": 0.042918406627421406,
|
287 |
+
"scr_dir2_threshold_100": 0.1619046348563358,
|
288 |
+
"scr_dir1_threshold_500": 0.030043038127524242,
|
289 |
+
"scr_metric_threshold_500": 0.030043038127524242,
|
290 |
+
"scr_dir2_threshold_500": 0.17142860386646747
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "standard",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "1d2f727d-ba91-4f28-ae1e-d5e769b99804",
|
30 |
+
"datetime_epoch_millis": 1738809581267,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9598687920719384,
|
34 |
+
"llm_top_1_test_accuracy": 0.6589812500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7174812500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.7826062500000001,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.959237540513277,
|
44 |
+
"sae_top_1_test_accuracy": 0.7985375,
|
45 |
+
"sae_top_2_test_accuracy": 0.83240625,
|
46 |
+
"sae_top_5_test_accuracy": 0.8882062500000001,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9688000321388245,
|
65 |
+
"sae_top_1_test_accuracy": 0.8117999999999999,
|
66 |
+
"sae_top_2_test_accuracy": 0.8220000000000001,
|
67 |
+
"sae_top_5_test_accuracy": 0.8633999999999998,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.67,
|
77 |
+
"llm_top_2_test_accuracy": 0.7148,
|
78 |
+
"llm_top_5_test_accuracy": 0.7716,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9506000518798828,
|
84 |
+
"sae_top_1_test_accuracy": 0.7866,
|
85 |
+
"sae_top_2_test_accuracy": 0.8413999999999999,
|
86 |
+
"sae_top_5_test_accuracy": 0.8710000000000001,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9320000410079956,
|
95 |
+
"llm_top_1_test_accuracy": 0.6918,
|
96 |
+
"llm_top_2_test_accuracy": 0.7338,
|
97 |
+
"llm_top_5_test_accuracy": 0.765,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9304000496864319,
|
103 |
+
"sae_top_1_test_accuracy": 0.8116,
|
104 |
+
"sae_top_2_test_accuracy": 0.8392000000000002,
|
105 |
+
"sae_top_5_test_accuracy": 0.8754,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9196000456809997,
|
114 |
+
"llm_top_1_test_accuracy": 0.6048,
|
115 |
+
"llm_top_2_test_accuracy": 0.6406000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6696,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9196000337600708,
|
122 |
+
"sae_top_1_test_accuracy": 0.7326,
|
123 |
+
"sae_top_2_test_accuracy": 0.776,
|
124 |
+
"sae_top_5_test_accuracy": 0.8576,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9805000424385071,
|
141 |
+
"sae_top_1_test_accuracy": 0.733,
|
142 |
+
"sae_top_2_test_accuracy": 0.773,
|
143 |
+
"sae_top_5_test_accuracy": 0.909,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.969200050830841,
|
152 |
+
"llm_top_1_test_accuracy": 0.6428,
|
153 |
+
"llm_top_2_test_accuracy": 0.6920000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7656000000000001,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9714000463485718,
|
160 |
+
"sae_top_1_test_accuracy": 0.7604,
|
161 |
+
"sae_top_2_test_accuracy": 0.8074,
|
162 |
+
"sae_top_5_test_accuracy": 0.8694000000000001,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9527500420808792,
|
171 |
+
"llm_top_1_test_accuracy": 0.69725,
|
172 |
+
"llm_top_2_test_accuracy": 0.76625,
|
173 |
+
"llm_top_5_test_accuracy": 0.8192499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9530000388622284,
|
179 |
+
"sae_top_1_test_accuracy": 0.8065,
|
180 |
+
"sae_top_2_test_accuracy": 0.8452500000000001,
|
181 |
+
"sae_top_5_test_accuracy": 0.86225,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6496000000000001,
|
191 |
+
"llm_top_2_test_accuracy": 0.7809999999999999,
|
192 |
+
"llm_top_5_test_accuracy": 0.913,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9996000289916992,
|
198 |
+
"sae_top_1_test_accuracy": 0.9458,
|
199 |
+
"sae_top_2_test_accuracy": 0.9549999999999998,
|
200 |
+
"sae_top_5_test_accuracy": 0.9975999999999999,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "standard",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.956000030040741,
|
240 |
+
"1": 0.968000054359436,
|
241 |
+
"2": 0.9570000171661377,
|
242 |
+
"6": 0.9880000352859497,
|
243 |
+
"9": 0.9750000238418579
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.625,
|
275 |
+
"1": 0.663,
|
276 |
+
"2": 0.863,
|
277 |
+
"6": 0.973,
|
278 |
+
"9": 0.935
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.66,
|
282 |
+
"1": 0.665,
|
283 |
+
"2": 0.863,
|
284 |
+
"6": 0.983,
|
285 |
+
"9": 0.939
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.741,
|
289 |
+
"1": 0.771,
|
290 |
+
"2": 0.876,
|
291 |
+
"6": 0.986,
|
292 |
+
"9": 0.943
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9650000333786011,
|
298 |
+
"13": 0.9490000605583191,
|
299 |
+
"14": 0.9540000557899475,
|
300 |
+
"18": 0.9300000667572021,
|
301 |
+
"19": 0.9550000429153442
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.971000075340271,
|
305 |
+
"13": 0.9520000219345093,
|
306 |
+
"14": 0.956000030040741,
|
307 |
+
"18": 0.9360000491142273,
|
308 |
+
"19": 0.9570000171661377
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.561,
|
312 |
+
"13": 0.672,
|
313 |
+
"14": 0.631,
|
314 |
+
"18": 0.7,
|
315 |
+
"19": 0.786
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.69,
|
319 |
+
"13": 0.72,
|
320 |
+
"14": 0.677,
|
321 |
+
"18": 0.721,
|
322 |
+
"19": 0.766
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.776,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.768,
|
328 |
+
"18": 0.731,
|
329 |
+
"19": 0.841
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.729,
|
333 |
+
"13": 0.751,
|
334 |
+
"14": 0.878,
|
335 |
+
"18": 0.732,
|
336 |
+
"19": 0.843
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.857,
|
340 |
+
"13": 0.773,
|
341 |
+
"14": 0.88,
|
342 |
+
"18": 0.855,
|
343 |
+
"19": 0.842
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.933,
|
347 |
+
"13": 0.831,
|
348 |
+
"14": 0.888,
|
349 |
+
"18": 0.856,
|
350 |
+
"19": 0.847
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9530000686645508,
|
356 |
+
"21": 0.9290000200271606,
|
357 |
+
"22": 0.9140000343322754,
|
358 |
+
"25": 0.968000054359436,
|
359 |
+
"26": 0.8880000710487366
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9260000586509705,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8970000147819519
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.699,
|
370 |
+
"21": 0.772,
|
371 |
+
"22": 0.641,
|
372 |
+
"25": 0.703,
|
373 |
+
"26": 0.644
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.757,
|
378 |
+
"22": 0.655,
|
379 |
+
"25": 0.762,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.861,
|
384 |
+
"21": 0.788,
|
385 |
+
"22": 0.712,
|
386 |
+
"25": 0.796,
|
387 |
+
"26": 0.668
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.856,
|
391 |
+
"21": 0.77,
|
392 |
+
"22": 0.844,
|
393 |
+
"25": 0.881,
|
394 |
+
"26": 0.707
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.899,
|
398 |
+
"21": 0.805,
|
399 |
+
"22": 0.846,
|
400 |
+
"25": 0.886,
|
401 |
+
"26": 0.76
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.92,
|
405 |
+
"21": 0.848,
|
406 |
+
"22": 0.891,
|
407 |
+
"25": 0.891,
|
408 |
+
"26": 0.827
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9570000171661377,
|
414 |
+
"2": 0.9410000443458557,
|
415 |
+
"3": 0.9240000247955322,
|
416 |
+
"5": 0.909000039100647,
|
417 |
+
"6": 0.8670000433921814
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.956000030040741,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9160000681877136,
|
423 |
+
"5": 0.9170000553131104,
|
424 |
+
"6": 0.8720000386238098
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.665,
|
428 |
+
"2": 0.596,
|
429 |
+
"3": 0.599,
|
430 |
+
"5": 0.576,
|
431 |
+
"6": 0.588
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.73,
|
435 |
+
"2": 0.632,
|
436 |
+
"3": 0.617,
|
437 |
+
"5": 0.615,
|
438 |
+
"6": 0.609
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.758,
|
442 |
+
"2": 0.646,
|
443 |
+
"3": 0.627,
|
444 |
+
"5": 0.646,
|
445 |
+
"6": 0.671
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.845,
|
449 |
+
"2": 0.669,
|
450 |
+
"3": 0.668,
|
451 |
+
"5": 0.717,
|
452 |
+
"6": 0.764
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.876,
|
456 |
+
"2": 0.698,
|
457 |
+
"3": 0.672,
|
458 |
+
"5": 0.856,
|
459 |
+
"6": 0.778
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.91,
|
463 |
+
"2": 0.907,
|
464 |
+
"3": 0.802,
|
465 |
+
"5": 0.879,
|
466 |
+
"6": 0.79
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9810000658035278,
|
472 |
+
"5.0": 0.9800000190734863
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.733,
|
492 |
+
"5.0": 0.733
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.773,
|
496 |
+
"5.0": 0.773
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.909,
|
500 |
+
"5.0": 0.909
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9550000429153442,
|
506 |
+
"Python": 0.9880000352859497,
|
507 |
+
"HTML": 0.9890000224113464,
|
508 |
+
"Java": 0.9670000672340393,
|
509 |
+
"PHP": 0.9580000638961792
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.9530000686645508,
|
513 |
+
"Python": 0.9860000610351562,
|
514 |
+
"HTML": 0.9880000352859497,
|
515 |
+
"Java": 0.9640000462532043,
|
516 |
+
"PHP": 0.9550000429153442
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.637,
|
521 |
+
"HTML": 0.714,
|
522 |
+
"Java": 0.624,
|
523 |
+
"PHP": 0.582
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.658,
|
527 |
+
"Python": 0.664,
|
528 |
+
"HTML": 0.801,
|
529 |
+
"Java": 0.697,
|
530 |
+
"PHP": 0.64
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.752,
|
534 |
+
"Python": 0.726,
|
535 |
+
"HTML": 0.928,
|
536 |
+
"Java": 0.728,
|
537 |
+
"PHP": 0.694
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.611,
|
541 |
+
"Python": 0.934,
|
542 |
+
"HTML": 0.68,
|
543 |
+
"Java": 0.655,
|
544 |
+
"PHP": 0.922
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.641,
|
548 |
+
"Python": 0.931,
|
549 |
+
"HTML": 0.887,
|
550 |
+
"Java": 0.655,
|
551 |
+
"PHP": 0.923
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.756,
|
555 |
+
"Python": 0.937,
|
556 |
+
"HTML": 0.93,
|
557 |
+
"Java": 0.805,
|
558 |
+
"PHP": 0.919
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9380000233650208,
|
564 |
+
"1": 0.984000027179718,
|
565 |
+
"2": 0.9360000491142273,
|
566 |
+
"3": 0.9540000557899475
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9380000233650208,
|
570 |
+
"1": 0.9880000352859497,
|
571 |
+
"2": 0.9300000667572021,
|
572 |
+
"3": 0.9550000429153442
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.805,
|
576 |
+
"1": 0.67,
|
577 |
+
"2": 0.648,
|
578 |
+
"3": 0.666
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.794,
|
582 |
+
"1": 0.795,
|
583 |
+
"2": 0.686,
|
584 |
+
"3": 0.79
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.818,
|
588 |
+
"1": 0.867,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.836
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.86,
|
594 |
+
"1": 0.958,
|
595 |
+
"2": 0.801,
|
596 |
+
"3": 0.607
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.859,
|
600 |
+
"1": 0.965,
|
601 |
+
"2": 0.833,
|
602 |
+
"3": 0.724
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.871,
|
606 |
+
"1": 0.964,
|
607 |
+
"2": 0.835,
|
608 |
+
"3": 0.779
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 1.0,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 0.999000072479248,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.748,
|
628 |
+
"fr": 0.59,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.662
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.822,
|
635 |
+
"fr": 0.596,
|
636 |
+
"de": 0.831,
|
637 |
+
"es": 0.91,
|
638 |
+
"nl": 0.746
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.887,
|
642 |
+
"fr": 0.922,
|
643 |
+
"de": 0.909,
|
644 |
+
"es": 0.982,
|
645 |
+
"nl": 0.865
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.995,
|
649 |
+
"fr": 0.993,
|
650 |
+
"de": 0.903,
|
651 |
+
"es": 0.92,
|
652 |
+
"nl": 0.918
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.997,
|
656 |
+
"fr": 0.992,
|
657 |
+
"de": 0.908,
|
658 |
+
"es": 0.944,
|
659 |
+
"nl": 0.934
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.999,
|
663 |
+
"fr": 0.998,
|
664 |
+
"de": 0.995,
|
665 |
+
"es": 0.997,
|
666 |
+
"nl": 0.999
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "12d3597f-e01f-4acd-b4e4-3916a8253103",
|
30 |
+
"datetime_epoch_millis": 1738809822946,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9598687920719384,
|
34 |
+
"llm_top_1_test_accuracy": 0.6589812500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7174812500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.7826062500000001,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9591187916696072,
|
44 |
+
"sae_top_1_test_accuracy": 0.7973625000000001,
|
45 |
+
"sae_top_2_test_accuracy": 0.8503124999999999,
|
46 |
+
"sae_top_5_test_accuracy": 0.8916437500000002,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9698000431060791,
|
65 |
+
"sae_top_1_test_accuracy": 0.8183999999999999,
|
66 |
+
"sae_top_2_test_accuracy": 0.8507999999999999,
|
67 |
+
"sae_top_5_test_accuracy": 0.8998000000000002,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.67,
|
77 |
+
"llm_top_2_test_accuracy": 0.7148,
|
78 |
+
"llm_top_5_test_accuracy": 0.7716,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9524000406265258,
|
84 |
+
"sae_top_1_test_accuracy": 0.7678,
|
85 |
+
"sae_top_2_test_accuracy": 0.7921999999999999,
|
86 |
+
"sae_top_5_test_accuracy": 0.885,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9320000410079956,
|
95 |
+
"llm_top_1_test_accuracy": 0.6918,
|
96 |
+
"llm_top_2_test_accuracy": 0.7338,
|
97 |
+
"llm_top_5_test_accuracy": 0.765,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9310000538825989,
|
103 |
+
"sae_top_1_test_accuracy": 0.8183999999999999,
|
104 |
+
"sae_top_2_test_accuracy": 0.8488,
|
105 |
+
"sae_top_5_test_accuracy": 0.8701999999999999,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9196000456809997,
|
114 |
+
"llm_top_1_test_accuracy": 0.6048,
|
115 |
+
"llm_top_2_test_accuracy": 0.6406000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6696,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9168000459671021,
|
122 |
+
"sae_top_1_test_accuracy": 0.737,
|
123 |
+
"sae_top_2_test_accuracy": 0.7807999999999999,
|
124 |
+
"sae_top_5_test_accuracy": 0.8092,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9775000512599945,
|
141 |
+
"sae_top_1_test_accuracy": 0.773,
|
142 |
+
"sae_top_2_test_accuracy": 0.903,
|
143 |
+
"sae_top_5_test_accuracy": 0.93,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.969200050830841,
|
152 |
+
"llm_top_1_test_accuracy": 0.6428,
|
153 |
+
"llm_top_2_test_accuracy": 0.6920000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7656000000000001,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9718000411987304,
|
160 |
+
"sae_top_1_test_accuracy": 0.7976000000000001,
|
161 |
+
"sae_top_2_test_accuracy": 0.8288,
|
162 |
+
"sae_top_5_test_accuracy": 0.8718,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9527500420808792,
|
171 |
+
"llm_top_1_test_accuracy": 0.69725,
|
172 |
+
"llm_top_2_test_accuracy": 0.76625,
|
173 |
+
"llm_top_5_test_accuracy": 0.8192499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9542500376701355,
|
179 |
+
"sae_top_1_test_accuracy": 0.7125,
|
180 |
+
"sae_top_2_test_accuracy": 0.8314999999999999,
|
181 |
+
"sae_top_5_test_accuracy": 0.87075,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6496000000000001,
|
191 |
+
"llm_top_2_test_accuracy": 0.7809999999999999,
|
192 |
+
"llm_top_5_test_accuracy": 0.913,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9994000196456909,
|
198 |
+
"sae_top_1_test_accuracy": 0.9542000000000002,
|
199 |
+
"sae_top_2_test_accuracy": 0.9666,
|
200 |
+
"sae_top_5_test_accuracy": 0.9964000000000001,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "standard",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9540000557899475,
|
240 |
+
"1": 0.9700000286102295,
|
241 |
+
"2": 0.9550000429153442,
|
242 |
+
"6": 0.9910000562667847,
|
243 |
+
"9": 0.9790000319480896
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.624,
|
275 |
+
"1": 0.693,
|
276 |
+
"2": 0.866,
|
277 |
+
"6": 0.98,
|
278 |
+
"9": 0.929
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.633,
|
282 |
+
"1": 0.819,
|
283 |
+
"2": 0.884,
|
284 |
+
"6": 0.981,
|
285 |
+
"9": 0.937
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.799,
|
289 |
+
"1": 0.878,
|
290 |
+
"2": 0.888,
|
291 |
+
"6": 0.986,
|
292 |
+
"9": 0.948
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9700000286102295,
|
298 |
+
"13": 0.9390000700950623,
|
299 |
+
"14": 0.9500000476837158,
|
300 |
+
"18": 0.9380000233650208,
|
301 |
+
"19": 0.9650000333786011
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.971000075340271,
|
305 |
+
"13": 0.9520000219345093,
|
306 |
+
"14": 0.956000030040741,
|
307 |
+
"18": 0.9360000491142273,
|
308 |
+
"19": 0.9570000171661377
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.561,
|
312 |
+
"13": 0.672,
|
313 |
+
"14": 0.631,
|
314 |
+
"18": 0.7,
|
315 |
+
"19": 0.786
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.69,
|
319 |
+
"13": 0.72,
|
320 |
+
"14": 0.677,
|
321 |
+
"18": 0.721,
|
322 |
+
"19": 0.766
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.776,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.768,
|
328 |
+
"18": 0.731,
|
329 |
+
"19": 0.841
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.731,
|
333 |
+
"13": 0.692,
|
334 |
+
"14": 0.858,
|
335 |
+
"18": 0.729,
|
336 |
+
"19": 0.829
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.844,
|
340 |
+
"13": 0.68,
|
341 |
+
"14": 0.861,
|
342 |
+
"18": 0.731,
|
343 |
+
"19": 0.845
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.951,
|
347 |
+
"13": 0.863,
|
348 |
+
"14": 0.88,
|
349 |
+
"18": 0.884,
|
350 |
+
"19": 0.847
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9620000720024109,
|
356 |
+
"21": 0.9230000376701355,
|
357 |
+
"22": 0.9220000505447388,
|
358 |
+
"25": 0.9590000510215759,
|
359 |
+
"26": 0.8890000581741333
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9260000586509705,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8970000147819519
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.699,
|
370 |
+
"21": 0.772,
|
371 |
+
"22": 0.641,
|
372 |
+
"25": 0.703,
|
373 |
+
"26": 0.644
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.757,
|
378 |
+
"22": 0.655,
|
379 |
+
"25": 0.762,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.861,
|
384 |
+
"21": 0.788,
|
385 |
+
"22": 0.712,
|
386 |
+
"25": 0.796,
|
387 |
+
"26": 0.668
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.881,
|
391 |
+
"21": 0.76,
|
392 |
+
"22": 0.862,
|
393 |
+
"25": 0.88,
|
394 |
+
"26": 0.709
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.898,
|
398 |
+
"21": 0.792,
|
399 |
+
"22": 0.897,
|
400 |
+
"25": 0.891,
|
401 |
+
"26": 0.766
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.938,
|
405 |
+
"21": 0.842,
|
406 |
+
"22": 0.889,
|
407 |
+
"25": 0.877,
|
408 |
+
"26": 0.805
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9570000171661377,
|
414 |
+
"2": 0.9300000667572021,
|
415 |
+
"3": 0.9110000729560852,
|
416 |
+
"5": 0.9120000600814819,
|
417 |
+
"6": 0.8740000128746033
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.956000030040741,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9160000681877136,
|
423 |
+
"5": 0.9170000553131104,
|
424 |
+
"6": 0.8720000386238098
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.665,
|
428 |
+
"2": 0.596,
|
429 |
+
"3": 0.599,
|
430 |
+
"5": 0.576,
|
431 |
+
"6": 0.588
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.73,
|
435 |
+
"2": 0.632,
|
436 |
+
"3": 0.617,
|
437 |
+
"5": 0.615,
|
438 |
+
"6": 0.609
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.758,
|
442 |
+
"2": 0.646,
|
443 |
+
"3": 0.627,
|
444 |
+
"5": 0.646,
|
445 |
+
"6": 0.671
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.85,
|
449 |
+
"2": 0.751,
|
450 |
+
"3": 0.697,
|
451 |
+
"5": 0.662,
|
452 |
+
"6": 0.725
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.858,
|
456 |
+
"2": 0.806,
|
457 |
+
"3": 0.696,
|
458 |
+
"5": 0.776,
|
459 |
+
"6": 0.768
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.901,
|
463 |
+
"2": 0.852,
|
464 |
+
"3": 0.73,
|
465 |
+
"5": 0.807,
|
466 |
+
"6": 0.756
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9770000576972961,
|
472 |
+
"5.0": 0.9780000448226929
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.773,
|
492 |
+
"5.0": 0.773
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.903,
|
496 |
+
"5.0": 0.903
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.93,
|
500 |
+
"5.0": 0.93
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9600000381469727,
|
506 |
+
"Python": 0.9900000691413879,
|
507 |
+
"HTML": 0.9880000352859497,
|
508 |
+
"Java": 0.9640000462532043,
|
509 |
+
"PHP": 0.9570000171661377
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.9530000686645508,
|
513 |
+
"Python": 0.9860000610351562,
|
514 |
+
"HTML": 0.9880000352859497,
|
515 |
+
"Java": 0.9640000462532043,
|
516 |
+
"PHP": 0.9550000429153442
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.637,
|
521 |
+
"HTML": 0.714,
|
522 |
+
"Java": 0.624,
|
523 |
+
"PHP": 0.582
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.658,
|
527 |
+
"Python": 0.664,
|
528 |
+
"HTML": 0.801,
|
529 |
+
"Java": 0.697,
|
530 |
+
"PHP": 0.64
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.752,
|
534 |
+
"Python": 0.726,
|
535 |
+
"HTML": 0.928,
|
536 |
+
"Java": 0.728,
|
537 |
+
"PHP": 0.694
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.615,
|
541 |
+
"Python": 0.93,
|
542 |
+
"HTML": 0.881,
|
543 |
+
"Java": 0.647,
|
544 |
+
"PHP": 0.915
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.734,
|
548 |
+
"Python": 0.935,
|
549 |
+
"HTML": 0.906,
|
550 |
+
"Java": 0.656,
|
551 |
+
"PHP": 0.913
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.738,
|
555 |
+
"Python": 0.957,
|
556 |
+
"HTML": 0.958,
|
557 |
+
"Java": 0.791,
|
558 |
+
"PHP": 0.915
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9360000491142273,
|
564 |
+
"1": 0.987000048160553,
|
565 |
+
"2": 0.9340000152587891,
|
566 |
+
"3": 0.9600000381469727
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9380000233650208,
|
570 |
+
"1": 0.9880000352859497,
|
571 |
+
"2": 0.9300000667572021,
|
572 |
+
"3": 0.9550000429153442
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.805,
|
576 |
+
"1": 0.67,
|
577 |
+
"2": 0.648,
|
578 |
+
"3": 0.666
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.794,
|
582 |
+
"1": 0.795,
|
583 |
+
"2": 0.686,
|
584 |
+
"3": 0.79
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.818,
|
588 |
+
"1": 0.867,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.836
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.849,
|
594 |
+
"1": 0.637,
|
595 |
+
"2": 0.739,
|
596 |
+
"3": 0.625
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.853,
|
600 |
+
"1": 0.858,
|
601 |
+
"2": 0.824,
|
602 |
+
"3": 0.791
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.866,
|
606 |
+
"1": 0.971,
|
607 |
+
"2": 0.837,
|
608 |
+
"3": 0.809
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 1.0,
|
614 |
+
"fr": 1.0,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 0.9980000257492065,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 0.999000072479248,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.748,
|
628 |
+
"fr": 0.59,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.662
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.822,
|
635 |
+
"fr": 0.596,
|
636 |
+
"de": 0.831,
|
637 |
+
"es": 0.91,
|
638 |
+
"nl": 0.746
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.887,
|
642 |
+
"fr": 0.922,
|
643 |
+
"de": 0.909,
|
644 |
+
"es": 0.982,
|
645 |
+
"nl": 0.865
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 1.0,
|
649 |
+
"fr": 0.991,
|
650 |
+
"de": 0.919,
|
651 |
+
"es": 0.937,
|
652 |
+
"nl": 0.924
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.996,
|
656 |
+
"fr": 0.988,
|
657 |
+
"de": 0.913,
|
658 |
+
"es": 0.94,
|
659 |
+
"nl": 0.996
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.99,
|
664 |
+
"de": 0.999,
|
665 |
+
"es": 0.995,
|
666 |
+
"nl": 1.0
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "b46e4809-b52d-4af3-a3b9-c16c6abcee8d",
|
30 |
+
"datetime_epoch_millis": 1738810625768,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9598687920719384,
|
34 |
+
"llm_top_1_test_accuracy": 0.6589812500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7174812500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.7826062500000001,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9566000409424305,
|
44 |
+
"sae_top_1_test_accuracy": 0.8076125000000001,
|
45 |
+
"sae_top_2_test_accuracy": 0.85893125,
|
46 |
+
"sae_top_5_test_accuracy": 0.8915875,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9660000443458557,
|
65 |
+
"sae_top_1_test_accuracy": 0.7590000000000001,
|
66 |
+
"sae_top_2_test_accuracy": 0.8640000000000001,
|
67 |
+
"sae_top_5_test_accuracy": 0.8916000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.67,
|
77 |
+
"llm_top_2_test_accuracy": 0.7148,
|
78 |
+
"llm_top_5_test_accuracy": 0.7716,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9500000596046447,
|
84 |
+
"sae_top_1_test_accuracy": 0.7979999999999999,
|
85 |
+
"sae_top_2_test_accuracy": 0.8022,
|
86 |
+
"sae_top_5_test_accuracy": 0.8744,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9320000410079956,
|
95 |
+
"llm_top_1_test_accuracy": 0.6918,
|
96 |
+
"llm_top_2_test_accuracy": 0.7338,
|
97 |
+
"llm_top_5_test_accuracy": 0.765,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9332000494003296,
|
103 |
+
"sae_top_1_test_accuracy": 0.8299999999999998,
|
104 |
+
"sae_top_2_test_accuracy": 0.8457999999999999,
|
105 |
+
"sae_top_5_test_accuracy": 0.8716000000000002,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9196000456809997,
|
114 |
+
"llm_top_1_test_accuracy": 0.6048,
|
115 |
+
"llm_top_2_test_accuracy": 0.6406000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6696,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9124000430107116,
|
122 |
+
"sae_top_1_test_accuracy": 0.7344000000000002,
|
123 |
+
"sae_top_2_test_accuracy": 0.7668,
|
124 |
+
"sae_top_5_test_accuracy": 0.8311999999999999,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9745000302791595,
|
141 |
+
"sae_top_1_test_accuracy": 0.858,
|
142 |
+
"sae_top_2_test_accuracy": 0.932,
|
143 |
+
"sae_top_5_test_accuracy": 0.932,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.969200050830841,
|
152 |
+
"llm_top_1_test_accuracy": 0.6428,
|
153 |
+
"llm_top_2_test_accuracy": 0.6920000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7656000000000001,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.963800048828125,
|
160 |
+
"sae_top_1_test_accuracy": 0.8106000000000002,
|
161 |
+
"sae_top_2_test_accuracy": 0.8326,
|
162 |
+
"sae_top_5_test_accuracy": 0.8598000000000001,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9527500420808792,
|
171 |
+
"llm_top_1_test_accuracy": 0.69725,
|
172 |
+
"llm_top_2_test_accuracy": 0.76625,
|
173 |
+
"llm_top_5_test_accuracy": 0.8192499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9535000324249268,
|
179 |
+
"sae_top_1_test_accuracy": 0.7625000000000001,
|
180 |
+
"sae_top_2_test_accuracy": 0.8552500000000001,
|
181 |
+
"sae_top_5_test_accuracy": 0.8755000000000001,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6496000000000001,
|
191 |
+
"llm_top_2_test_accuracy": 0.7809999999999999,
|
192 |
+
"llm_top_5_test_accuracy": 0.913,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9994000196456909,
|
198 |
+
"sae_top_1_test_accuracy": 0.9084,
|
199 |
+
"sae_top_2_test_accuracy": 0.9728,
|
200 |
+
"sae_top_5_test_accuracy": 0.9966000000000002,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "standard",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9510000348091125,
|
240 |
+
"1": 0.9630000591278076,
|
241 |
+
"2": 0.9520000219345093,
|
242 |
+
"6": 0.9910000562667847,
|
243 |
+
"9": 0.9730000495910645
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.617,
|
275 |
+
"1": 0.643,
|
276 |
+
"2": 0.852,
|
277 |
+
"6": 0.757,
|
278 |
+
"9": 0.926
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.855,
|
282 |
+
"1": 0.666,
|
283 |
+
"2": 0.888,
|
284 |
+
"6": 0.98,
|
285 |
+
"9": 0.931
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.886,
|
289 |
+
"1": 0.751,
|
290 |
+
"2": 0.908,
|
291 |
+
"6": 0.986,
|
292 |
+
"9": 0.927
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9590000510215759,
|
298 |
+
"13": 0.9530000686645508,
|
299 |
+
"14": 0.9580000638961792,
|
300 |
+
"18": 0.9250000715255737,
|
301 |
+
"19": 0.9550000429153442
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.971000075340271,
|
305 |
+
"13": 0.9520000219345093,
|
306 |
+
"14": 0.956000030040741,
|
307 |
+
"18": 0.9360000491142273,
|
308 |
+
"19": 0.9570000171661377
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.561,
|
312 |
+
"13": 0.672,
|
313 |
+
"14": 0.631,
|
314 |
+
"18": 0.7,
|
315 |
+
"19": 0.786
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.69,
|
319 |
+
"13": 0.72,
|
320 |
+
"14": 0.677,
|
321 |
+
"18": 0.721,
|
322 |
+
"19": 0.766
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.776,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.768,
|
328 |
+
"18": 0.731,
|
329 |
+
"19": 0.841
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.854,
|
333 |
+
"13": 0.707,
|
334 |
+
"14": 0.864,
|
335 |
+
"18": 0.722,
|
336 |
+
"19": 0.843
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.855,
|
340 |
+
"13": 0.686,
|
341 |
+
"14": 0.882,
|
342 |
+
"18": 0.725,
|
343 |
+
"19": 0.863
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.954,
|
347 |
+
"13": 0.792,
|
348 |
+
"14": 0.882,
|
349 |
+
"18": 0.898,
|
350 |
+
"19": 0.846
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9650000333786011,
|
356 |
+
"21": 0.9260000586509705,
|
357 |
+
"22": 0.909000039100647,
|
358 |
+
"25": 0.971000075340271,
|
359 |
+
"26": 0.8950000405311584
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9260000586509705,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8970000147819519
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.699,
|
370 |
+
"21": 0.772,
|
371 |
+
"22": 0.641,
|
372 |
+
"25": 0.703,
|
373 |
+
"26": 0.644
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.757,
|
378 |
+
"22": 0.655,
|
379 |
+
"25": 0.762,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.861,
|
384 |
+
"21": 0.788,
|
385 |
+
"22": 0.712,
|
386 |
+
"25": 0.796,
|
387 |
+
"26": 0.668
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.876,
|
391 |
+
"21": 0.789,
|
392 |
+
"22": 0.885,
|
393 |
+
"25": 0.896,
|
394 |
+
"26": 0.704
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.907,
|
398 |
+
"21": 0.799,
|
399 |
+
"22": 0.888,
|
400 |
+
"25": 0.889,
|
401 |
+
"26": 0.746
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.939,
|
405 |
+
"21": 0.847,
|
406 |
+
"22": 0.901,
|
407 |
+
"25": 0.904,
|
408 |
+
"26": 0.767
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9490000605583191,
|
414 |
+
"2": 0.9220000505447388,
|
415 |
+
"3": 0.9120000600814819,
|
416 |
+
"5": 0.9200000166893005,
|
417 |
+
"6": 0.859000027179718
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.956000030040741,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9160000681877136,
|
423 |
+
"5": 0.9170000553131104,
|
424 |
+
"6": 0.8720000386238098
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.665,
|
428 |
+
"2": 0.596,
|
429 |
+
"3": 0.599,
|
430 |
+
"5": 0.576,
|
431 |
+
"6": 0.588
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.73,
|
435 |
+
"2": 0.632,
|
436 |
+
"3": 0.617,
|
437 |
+
"5": 0.615,
|
438 |
+
"6": 0.609
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.758,
|
442 |
+
"2": 0.646,
|
443 |
+
"3": 0.627,
|
444 |
+
"5": 0.646,
|
445 |
+
"6": 0.671
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.854,
|
449 |
+
"2": 0.779,
|
450 |
+
"3": 0.676,
|
451 |
+
"5": 0.676,
|
452 |
+
"6": 0.687
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.859,
|
456 |
+
"2": 0.793,
|
457 |
+
"3": 0.695,
|
458 |
+
"5": 0.736,
|
459 |
+
"6": 0.751
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.885,
|
463 |
+
"2": 0.854,
|
464 |
+
"3": 0.805,
|
465 |
+
"5": 0.859,
|
466 |
+
"6": 0.753
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9740000367164612,
|
472 |
+
"5.0": 0.9750000238418579
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.858,
|
492 |
+
"5.0": 0.858
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.932,
|
496 |
+
"5.0": 0.932
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.932,
|
500 |
+
"5.0": 0.932
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9390000700950623,
|
506 |
+
"Python": 0.9830000400543213,
|
507 |
+
"HTML": 0.9860000610351562,
|
508 |
+
"Java": 0.9570000171661377,
|
509 |
+
"PHP": 0.9540000557899475
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.9530000686645508,
|
513 |
+
"Python": 0.9860000610351562,
|
514 |
+
"HTML": 0.9880000352859497,
|
515 |
+
"Java": 0.9640000462532043,
|
516 |
+
"PHP": 0.9550000429153442
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.637,
|
521 |
+
"HTML": 0.714,
|
522 |
+
"Java": 0.624,
|
523 |
+
"PHP": 0.582
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.658,
|
527 |
+
"Python": 0.664,
|
528 |
+
"HTML": 0.801,
|
529 |
+
"Java": 0.697,
|
530 |
+
"PHP": 0.64
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.752,
|
534 |
+
"Python": 0.726,
|
535 |
+
"HTML": 0.928,
|
536 |
+
"Java": 0.728,
|
537 |
+
"PHP": 0.694
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.632,
|
541 |
+
"Python": 0.914,
|
542 |
+
"HTML": 0.88,
|
543 |
+
"Java": 0.708,
|
544 |
+
"PHP": 0.919
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.639,
|
548 |
+
"Python": 0.923,
|
549 |
+
"HTML": 0.893,
|
550 |
+
"Java": 0.785,
|
551 |
+
"PHP": 0.923
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.714,
|
555 |
+
"Python": 0.942,
|
556 |
+
"HTML": 0.927,
|
557 |
+
"Java": 0.793,
|
558 |
+
"PHP": 0.923
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9420000314712524,
|
564 |
+
"1": 0.9890000224113464,
|
565 |
+
"2": 0.9290000200271606,
|
566 |
+
"3": 0.9540000557899475
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9380000233650208,
|
570 |
+
"1": 0.9880000352859497,
|
571 |
+
"2": 0.9300000667572021,
|
572 |
+
"3": 0.9550000429153442
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.805,
|
576 |
+
"1": 0.67,
|
577 |
+
"2": 0.648,
|
578 |
+
"3": 0.666
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.794,
|
582 |
+
"1": 0.795,
|
583 |
+
"2": 0.686,
|
584 |
+
"3": 0.79
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.818,
|
588 |
+
"1": 0.867,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.836
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.683,
|
594 |
+
"1": 0.936,
|
595 |
+
"2": 0.774,
|
596 |
+
"3": 0.657
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.848,
|
600 |
+
"1": 0.932,
|
601 |
+
"2": 0.828,
|
602 |
+
"3": 0.813
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.852,
|
606 |
+
"1": 0.946,
|
607 |
+
"2": 0.841,
|
608 |
+
"3": 0.863
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9980000257492065,
|
614 |
+
"fr": 1.0,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 1.0,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 0.999000072479248,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.748,
|
628 |
+
"fr": 0.59,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.662
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.822,
|
635 |
+
"fr": 0.596,
|
636 |
+
"de": 0.831,
|
637 |
+
"es": 0.91,
|
638 |
+
"nl": 0.746
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.887,
|
642 |
+
"fr": 0.922,
|
643 |
+
"de": 0.909,
|
644 |
+
"es": 0.982,
|
645 |
+
"nl": 0.865
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.69,
|
649 |
+
"fr": 0.993,
|
650 |
+
"de": 0.917,
|
651 |
+
"es": 0.944,
|
652 |
+
"nl": 0.998
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.999,
|
656 |
+
"fr": 0.994,
|
657 |
+
"de": 0.922,
|
658 |
+
"es": 0.951,
|
659 |
+
"nl": 0.998
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.993,
|
664 |
+
"de": 0.999,
|
665 |
+
"es": 0.995,
|
666 |
+
"nl": 0.998
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "6425df5a-b293-4597-bb77-df77861d6457",
|
30 |
+
"datetime_epoch_millis": 1738810319161,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9598687920719384,
|
34 |
+
"llm_top_1_test_accuracy": 0.6589812500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7174812500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.7826062500000001,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9553375504910946,
|
44 |
+
"sae_top_1_test_accuracy": 0.77961875,
|
45 |
+
"sae_top_2_test_accuracy": 0.853675,
|
46 |
+
"sae_top_5_test_accuracy": 0.89161875,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9662000417709351,
|
65 |
+
"sae_top_1_test_accuracy": 0.8164,
|
66 |
+
"sae_top_2_test_accuracy": 0.9014,
|
67 |
+
"sae_top_5_test_accuracy": 0.9178,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.67,
|
77 |
+
"llm_top_2_test_accuracy": 0.7148,
|
78 |
+
"llm_top_5_test_accuracy": 0.7716,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9474000453948974,
|
84 |
+
"sae_top_1_test_accuracy": 0.766,
|
85 |
+
"sae_top_2_test_accuracy": 0.7964,
|
86 |
+
"sae_top_5_test_accuracy": 0.8795999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9320000410079956,
|
95 |
+
"llm_top_1_test_accuracy": 0.6918,
|
96 |
+
"llm_top_2_test_accuracy": 0.7338,
|
97 |
+
"llm_top_5_test_accuracy": 0.765,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9244000434875488,
|
103 |
+
"sae_top_1_test_accuracy": 0.8320000000000001,
|
104 |
+
"sae_top_2_test_accuracy": 0.8568000000000001,
|
105 |
+
"sae_top_5_test_accuracy": 0.8774000000000001,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9196000456809997,
|
114 |
+
"llm_top_1_test_accuracy": 0.6048,
|
115 |
+
"llm_top_2_test_accuracy": 0.6406000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6696,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9170000433921814,
|
122 |
+
"sae_top_1_test_accuracy": 0.6744,
|
123 |
+
"sae_top_2_test_accuracy": 0.7302000000000001,
|
124 |
+
"sae_top_5_test_accuracy": 0.8102,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.971000075340271,
|
141 |
+
"sae_top_1_test_accuracy": 0.805,
|
142 |
+
"sae_top_2_test_accuracy": 0.934,
|
143 |
+
"sae_top_5_test_accuracy": 0.943,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.969200050830841,
|
152 |
+
"llm_top_1_test_accuracy": 0.6428,
|
153 |
+
"llm_top_2_test_accuracy": 0.6920000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7656000000000001,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9680000424385071,
|
160 |
+
"sae_top_1_test_accuracy": 0.7978000000000001,
|
161 |
+
"sae_top_2_test_accuracy": 0.8178000000000001,
|
162 |
+
"sae_top_5_test_accuracy": 0.8513999999999999,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9527500420808792,
|
171 |
+
"llm_top_1_test_accuracy": 0.69725,
|
172 |
+
"llm_top_2_test_accuracy": 0.76625,
|
173 |
+
"llm_top_5_test_accuracy": 0.8192499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9495000541210175,
|
179 |
+
"sae_top_1_test_accuracy": 0.6577500000000001,
|
180 |
+
"sae_top_2_test_accuracy": 0.813,
|
181 |
+
"sae_top_5_test_accuracy": 0.86075,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6496000000000001,
|
191 |
+
"llm_top_2_test_accuracy": 0.7809999999999999,
|
192 |
+
"llm_top_5_test_accuracy": 0.913,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9992000579833984,
|
198 |
+
"sae_top_1_test_accuracy": 0.8876,
|
199 |
+
"sae_top_2_test_accuracy": 0.9798,
|
200 |
+
"sae_top_5_test_accuracy": 0.9928000000000001,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "standard",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9500000476837158,
|
240 |
+
"1": 0.9640000462532043,
|
241 |
+
"2": 0.9520000219345093,
|
242 |
+
"6": 0.9880000352859497,
|
243 |
+
"9": 0.9770000576972961
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.617,
|
275 |
+
"1": 0.823,
|
276 |
+
"2": 0.843,
|
277 |
+
"6": 0.977,
|
278 |
+
"9": 0.822
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.881,
|
282 |
+
"1": 0.848,
|
283 |
+
"2": 0.884,
|
284 |
+
"6": 0.976,
|
285 |
+
"9": 0.918
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.896,
|
289 |
+
"1": 0.883,
|
290 |
+
"2": 0.904,
|
291 |
+
"6": 0.986,
|
292 |
+
"9": 0.92
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9580000638961792,
|
298 |
+
"13": 0.956000030040741,
|
299 |
+
"14": 0.9550000429153442,
|
300 |
+
"18": 0.9110000729560852,
|
301 |
+
"19": 0.9570000171661377
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.971000075340271,
|
305 |
+
"13": 0.9520000219345093,
|
306 |
+
"14": 0.956000030040741,
|
307 |
+
"18": 0.9360000491142273,
|
308 |
+
"19": 0.9570000171661377
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.561,
|
312 |
+
"13": 0.672,
|
313 |
+
"14": 0.631,
|
314 |
+
"18": 0.7,
|
315 |
+
"19": 0.786
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.69,
|
319 |
+
"13": 0.72,
|
320 |
+
"14": 0.677,
|
321 |
+
"18": 0.721,
|
322 |
+
"19": 0.766
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.776,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.768,
|
328 |
+
"18": 0.731,
|
329 |
+
"19": 0.841
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.733,
|
333 |
+
"13": 0.69,
|
334 |
+
"14": 0.839,
|
335 |
+
"18": 0.735,
|
336 |
+
"19": 0.833
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.859,
|
340 |
+
"13": 0.705,
|
341 |
+
"14": 0.85,
|
342 |
+
"18": 0.728,
|
343 |
+
"19": 0.84
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.947,
|
347 |
+
"13": 0.79,
|
348 |
+
"14": 0.885,
|
349 |
+
"18": 0.917,
|
350 |
+
"19": 0.859
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9550000429153442,
|
356 |
+
"21": 0.9220000505447388,
|
357 |
+
"22": 0.9100000262260437,
|
358 |
+
"25": 0.9540000557899475,
|
359 |
+
"26": 0.8810000419616699
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9260000586509705,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8970000147819519
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.699,
|
370 |
+
"21": 0.772,
|
371 |
+
"22": 0.641,
|
372 |
+
"25": 0.703,
|
373 |
+
"26": 0.644
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.757,
|
378 |
+
"22": 0.655,
|
379 |
+
"25": 0.762,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.861,
|
384 |
+
"21": 0.788,
|
385 |
+
"22": 0.712,
|
386 |
+
"25": 0.796,
|
387 |
+
"26": 0.668
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.882,
|
391 |
+
"21": 0.811,
|
392 |
+
"22": 0.888,
|
393 |
+
"25": 0.879,
|
394 |
+
"26": 0.7
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.899,
|
398 |
+
"21": 0.834,
|
399 |
+
"22": 0.899,
|
400 |
+
"25": 0.882,
|
401 |
+
"26": 0.77
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.937,
|
405 |
+
"21": 0.855,
|
406 |
+
"22": 0.885,
|
407 |
+
"25": 0.902,
|
408 |
+
"26": 0.808
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9420000314712524,
|
414 |
+
"2": 0.9360000491142273,
|
415 |
+
"3": 0.9260000586509705,
|
416 |
+
"5": 0.9170000553131104,
|
417 |
+
"6": 0.8640000224113464
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.956000030040741,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9160000681877136,
|
423 |
+
"5": 0.9170000553131104,
|
424 |
+
"6": 0.8720000386238098
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.665,
|
428 |
+
"2": 0.596,
|
429 |
+
"3": 0.599,
|
430 |
+
"5": 0.576,
|
431 |
+
"6": 0.588
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.73,
|
435 |
+
"2": 0.632,
|
436 |
+
"3": 0.617,
|
437 |
+
"5": 0.615,
|
438 |
+
"6": 0.609
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.758,
|
442 |
+
"2": 0.646,
|
443 |
+
"3": 0.627,
|
444 |
+
"5": 0.646,
|
445 |
+
"6": 0.671
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.861,
|
449 |
+
"2": 0.629,
|
450 |
+
"3": 0.597,
|
451 |
+
"5": 0.605,
|
452 |
+
"6": 0.68
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.903,
|
456 |
+
"2": 0.752,
|
457 |
+
"3": 0.648,
|
458 |
+
"5": 0.631,
|
459 |
+
"6": 0.717
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.918,
|
463 |
+
"2": 0.809,
|
464 |
+
"3": 0.698,
|
465 |
+
"5": 0.873,
|
466 |
+
"6": 0.753
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.971000075340271,
|
472 |
+
"5.0": 0.971000075340271
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.805,
|
492 |
+
"5.0": 0.805
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.934,
|
496 |
+
"5.0": 0.934
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.943,
|
500 |
+
"5.0": 0.943
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9620000720024109,
|
506 |
+
"Python": 0.984000027179718,
|
507 |
+
"HTML": 0.984000027179718,
|
508 |
+
"Java": 0.9550000429153442,
|
509 |
+
"PHP": 0.9550000429153442
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.9530000686645508,
|
513 |
+
"Python": 0.9860000610351562,
|
514 |
+
"HTML": 0.9880000352859497,
|
515 |
+
"Java": 0.9640000462532043,
|
516 |
+
"PHP": 0.9550000429153442
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.637,
|
521 |
+
"HTML": 0.714,
|
522 |
+
"Java": 0.624,
|
523 |
+
"PHP": 0.582
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.658,
|
527 |
+
"Python": 0.664,
|
528 |
+
"HTML": 0.801,
|
529 |
+
"Java": 0.697,
|
530 |
+
"PHP": 0.64
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.752,
|
534 |
+
"Python": 0.726,
|
535 |
+
"HTML": 0.928,
|
536 |
+
"Java": 0.728,
|
537 |
+
"PHP": 0.694
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.604,
|
541 |
+
"Python": 0.906,
|
542 |
+
"HTML": 0.876,
|
543 |
+
"Java": 0.713,
|
544 |
+
"PHP": 0.89
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.69,
|
548 |
+
"Python": 0.914,
|
549 |
+
"HTML": 0.889,
|
550 |
+
"Java": 0.699,
|
551 |
+
"PHP": 0.897
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.717,
|
555 |
+
"Python": 0.92,
|
556 |
+
"HTML": 0.915,
|
557 |
+
"Java": 0.79,
|
558 |
+
"PHP": 0.915
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9350000619888306,
|
564 |
+
"1": 0.9860000610351562,
|
565 |
+
"2": 0.9310000538825989,
|
566 |
+
"3": 0.9460000395774841
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9380000233650208,
|
570 |
+
"1": 0.9880000352859497,
|
571 |
+
"2": 0.9300000667572021,
|
572 |
+
"3": 0.9550000429153442
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.805,
|
576 |
+
"1": 0.67,
|
577 |
+
"2": 0.648,
|
578 |
+
"3": 0.666
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.794,
|
582 |
+
"1": 0.795,
|
583 |
+
"2": 0.686,
|
584 |
+
"3": 0.79
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.818,
|
588 |
+
"1": 0.867,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.836
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.715,
|
594 |
+
"1": 0.606,
|
595 |
+
"2": 0.695,
|
596 |
+
"3": 0.615
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.839,
|
600 |
+
"1": 0.837,
|
601 |
+
"2": 0.815,
|
602 |
+
"3": 0.761
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.871,
|
606 |
+
"1": 0.935,
|
607 |
+
"2": 0.818,
|
608 |
+
"3": 0.819
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 0.999000072479248,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 0.999000072479248,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.748,
|
628 |
+
"fr": 0.59,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.662
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.822,
|
635 |
+
"fr": 0.596,
|
636 |
+
"de": 0.831,
|
637 |
+
"es": 0.91,
|
638 |
+
"nl": 0.746
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.887,
|
642 |
+
"fr": 0.922,
|
643 |
+
"de": 0.909,
|
644 |
+
"es": 0.982,
|
645 |
+
"nl": 0.865
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.685,
|
649 |
+
"fr": 0.984,
|
650 |
+
"de": 0.918,
|
651 |
+
"es": 0.942,
|
652 |
+
"nl": 0.909
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.996,
|
656 |
+
"fr": 0.99,
|
657 |
+
"de": 0.926,
|
658 |
+
"es": 0.991,
|
659 |
+
"nl": 0.996
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.999,
|
663 |
+
"fr": 0.991,
|
664 |
+
"de": 0.986,
|
665 |
+
"es": 0.99,
|
666 |
+
"nl": 0.998
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "79d387eb-180f-4d7c-8356-24c5e2fb2eca",
|
30 |
+
"datetime_epoch_millis": 1738810868662,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9598687920719384,
|
34 |
+
"llm_top_1_test_accuracy": 0.6589812500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7174812500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.7826062500000001,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9545250445604324,
|
44 |
+
"sae_top_1_test_accuracy": 0.7856687499999999,
|
45 |
+
"sae_top_2_test_accuracy": 0.8240375,
|
46 |
+
"sae_top_5_test_accuracy": 0.8696375000000001,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9622000455856323,
|
65 |
+
"sae_top_1_test_accuracy": 0.7978000000000001,
|
66 |
+
"sae_top_2_test_accuracy": 0.8472,
|
67 |
+
"sae_top_5_test_accuracy": 0.8636000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.67,
|
77 |
+
"llm_top_2_test_accuracy": 0.7148,
|
78 |
+
"llm_top_5_test_accuracy": 0.7716,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9474000453948974,
|
84 |
+
"sae_top_1_test_accuracy": 0.7654,
|
85 |
+
"sae_top_2_test_accuracy": 0.7933999999999999,
|
86 |
+
"sae_top_5_test_accuracy": 0.8804000000000001,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9320000410079956,
|
95 |
+
"llm_top_1_test_accuracy": 0.6918,
|
96 |
+
"llm_top_2_test_accuracy": 0.7338,
|
97 |
+
"llm_top_5_test_accuracy": 0.765,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9278000354766845,
|
103 |
+
"sae_top_1_test_accuracy": 0.8138,
|
104 |
+
"sae_top_2_test_accuracy": 0.8508000000000001,
|
105 |
+
"sae_top_5_test_accuracy": 0.8704000000000001,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9196000456809997,
|
114 |
+
"llm_top_1_test_accuracy": 0.6048,
|
115 |
+
"llm_top_2_test_accuracy": 0.6406000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6696,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9180000424385071,
|
122 |
+
"sae_top_1_test_accuracy": 0.6996,
|
123 |
+
"sae_top_2_test_accuracy": 0.7652,
|
124 |
+
"sae_top_5_test_accuracy": 0.8160000000000001,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9690000414848328,
|
141 |
+
"sae_top_1_test_accuracy": 0.8,
|
142 |
+
"sae_top_2_test_accuracy": 0.813,
|
143 |
+
"sae_top_5_test_accuracy": 0.837,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.969200050830841,
|
152 |
+
"llm_top_1_test_accuracy": 0.6428,
|
153 |
+
"llm_top_2_test_accuracy": 0.6920000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7656000000000001,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9656000375747681,
|
160 |
+
"sae_top_1_test_accuracy": 0.7926,
|
161 |
+
"sae_top_2_test_accuracy": 0.8016,
|
162 |
+
"sae_top_5_test_accuracy": 0.8466000000000001,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9527500420808792,
|
171 |
+
"llm_top_1_test_accuracy": 0.69725,
|
172 |
+
"llm_top_2_test_accuracy": 0.76625,
|
173 |
+
"llm_top_5_test_accuracy": 0.8192499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9490000605583191,
|
179 |
+
"sae_top_1_test_accuracy": 0.74175,
|
180 |
+
"sae_top_2_test_accuracy": 0.7815,
|
181 |
+
"sae_top_5_test_accuracy": 0.8514999999999999,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6496000000000001,
|
191 |
+
"llm_top_2_test_accuracy": 0.7809999999999999,
|
192 |
+
"llm_top_5_test_accuracy": 0.913,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9972000479698181,
|
198 |
+
"sae_top_1_test_accuracy": 0.8744,
|
199 |
+
"sae_top_2_test_accuracy": 0.9395999999999999,
|
200 |
+
"sae_top_5_test_accuracy": 0.9916,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "standard",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9390000700950623,
|
240 |
+
"1": 0.9650000333786011,
|
241 |
+
"2": 0.9470000267028809,
|
242 |
+
"6": 0.9820000529289246,
|
243 |
+
"9": 0.9780000448226929
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.621,
|
275 |
+
"1": 0.644,
|
276 |
+
"2": 0.856,
|
277 |
+
"6": 0.978,
|
278 |
+
"9": 0.89
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.653,
|
282 |
+
"1": 0.817,
|
283 |
+
"2": 0.878,
|
284 |
+
"6": 0.976,
|
285 |
+
"9": 0.912
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.676,
|
289 |
+
"1": 0.864,
|
290 |
+
"2": 0.889,
|
291 |
+
"6": 0.987,
|
292 |
+
"9": 0.902
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9570000171661377,
|
298 |
+
"13": 0.9490000605583191,
|
299 |
+
"14": 0.9550000429153442,
|
300 |
+
"18": 0.9180000424385071,
|
301 |
+
"19": 0.9580000638961792
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.971000075340271,
|
305 |
+
"13": 0.9520000219345093,
|
306 |
+
"14": 0.956000030040741,
|
307 |
+
"18": 0.9360000491142273,
|
308 |
+
"19": 0.9570000171661377
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.561,
|
312 |
+
"13": 0.672,
|
313 |
+
"14": 0.631,
|
314 |
+
"18": 0.7,
|
315 |
+
"19": 0.786
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.69,
|
319 |
+
"13": 0.72,
|
320 |
+
"14": 0.677,
|
321 |
+
"18": 0.721,
|
322 |
+
"19": 0.766
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.776,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.768,
|
328 |
+
"18": 0.731,
|
329 |
+
"19": 0.841
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.721,
|
333 |
+
"13": 0.695,
|
334 |
+
"14": 0.859,
|
335 |
+
"18": 0.725,
|
336 |
+
"19": 0.827
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.859,
|
340 |
+
"13": 0.685,
|
341 |
+
"14": 0.868,
|
342 |
+
"18": 0.731,
|
343 |
+
"19": 0.824
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.927,
|
347 |
+
"13": 0.845,
|
348 |
+
"14": 0.885,
|
349 |
+
"18": 0.903,
|
350 |
+
"19": 0.842
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9540000557899475,
|
356 |
+
"21": 0.9200000166893005,
|
357 |
+
"22": 0.9140000343322754,
|
358 |
+
"25": 0.9610000252723694,
|
359 |
+
"26": 0.89000004529953
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9260000586509705,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8970000147819519
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.699,
|
370 |
+
"21": 0.772,
|
371 |
+
"22": 0.641,
|
372 |
+
"25": 0.703,
|
373 |
+
"26": 0.644
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.757,
|
378 |
+
"22": 0.655,
|
379 |
+
"25": 0.762,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.861,
|
384 |
+
"21": 0.788,
|
385 |
+
"22": 0.712,
|
386 |
+
"25": 0.796,
|
387 |
+
"26": 0.668
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.866,
|
391 |
+
"21": 0.729,
|
392 |
+
"22": 0.883,
|
393 |
+
"25": 0.887,
|
394 |
+
"26": 0.704
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.895,
|
398 |
+
"21": 0.821,
|
399 |
+
"22": 0.876,
|
400 |
+
"25": 0.893,
|
401 |
+
"26": 0.769
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.932,
|
405 |
+
"21": 0.851,
|
406 |
+
"22": 0.885,
|
407 |
+
"25": 0.905,
|
408 |
+
"26": 0.779
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9430000185966492,
|
414 |
+
"2": 0.9290000200271606,
|
415 |
+
"3": 0.921000063419342,
|
416 |
+
"5": 0.9260000586509705,
|
417 |
+
"6": 0.8710000514984131
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.956000030040741,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9160000681877136,
|
423 |
+
"5": 0.9170000553131104,
|
424 |
+
"6": 0.8720000386238098
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.665,
|
428 |
+
"2": 0.596,
|
429 |
+
"3": 0.599,
|
430 |
+
"5": 0.576,
|
431 |
+
"6": 0.588
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.73,
|
435 |
+
"2": 0.632,
|
436 |
+
"3": 0.617,
|
437 |
+
"5": 0.615,
|
438 |
+
"6": 0.609
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.758,
|
442 |
+
"2": 0.646,
|
443 |
+
"3": 0.627,
|
444 |
+
"5": 0.646,
|
445 |
+
"6": 0.671
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.893,
|
449 |
+
"2": 0.596,
|
450 |
+
"3": 0.569,
|
451 |
+
"5": 0.771,
|
452 |
+
"6": 0.669
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.892,
|
456 |
+
"2": 0.772,
|
457 |
+
"3": 0.669,
|
458 |
+
"5": 0.774,
|
459 |
+
"6": 0.719
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.888,
|
463 |
+
"2": 0.864,
|
464 |
+
"3": 0.711,
|
465 |
+
"5": 0.857,
|
466 |
+
"6": 0.76
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.968000054359436,
|
472 |
+
"5.0": 0.9700000286102295
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.8,
|
492 |
+
"5.0": 0.8
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.813,
|
496 |
+
"5.0": 0.813
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.837,
|
500 |
+
"5.0": 0.837
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9550000429153442,
|
506 |
+
"Python": 0.9750000238418579,
|
507 |
+
"HTML": 0.9820000529289246,
|
508 |
+
"Java": 0.9590000510215759,
|
509 |
+
"PHP": 0.9570000171661377
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.9530000686645508,
|
513 |
+
"Python": 0.9860000610351562,
|
514 |
+
"HTML": 0.9880000352859497,
|
515 |
+
"Java": 0.9640000462532043,
|
516 |
+
"PHP": 0.9550000429153442
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.637,
|
521 |
+
"HTML": 0.714,
|
522 |
+
"Java": 0.624,
|
523 |
+
"PHP": 0.582
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.658,
|
527 |
+
"Python": 0.664,
|
528 |
+
"HTML": 0.801,
|
529 |
+
"Java": 0.697,
|
530 |
+
"PHP": 0.64
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.752,
|
534 |
+
"Python": 0.726,
|
535 |
+
"HTML": 0.928,
|
536 |
+
"Java": 0.728,
|
537 |
+
"PHP": 0.694
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.606,
|
541 |
+
"Python": 0.904,
|
542 |
+
"HTML": 0.856,
|
543 |
+
"Java": 0.717,
|
544 |
+
"PHP": 0.88
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.626,
|
548 |
+
"Python": 0.903,
|
549 |
+
"HTML": 0.884,
|
550 |
+
"Java": 0.709,
|
551 |
+
"PHP": 0.886
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.742,
|
555 |
+
"Python": 0.925,
|
556 |
+
"HTML": 0.92,
|
557 |
+
"Java": 0.729,
|
558 |
+
"PHP": 0.917
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9360000491142273,
|
564 |
+
"1": 0.9860000610351562,
|
565 |
+
"2": 0.9260000586509705,
|
566 |
+
"3": 0.9480000734329224
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9380000233650208,
|
570 |
+
"1": 0.9880000352859497,
|
571 |
+
"2": 0.9300000667572021,
|
572 |
+
"3": 0.9550000429153442
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.805,
|
576 |
+
"1": 0.67,
|
577 |
+
"2": 0.648,
|
578 |
+
"3": 0.666
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.794,
|
582 |
+
"1": 0.795,
|
583 |
+
"2": 0.686,
|
584 |
+
"3": 0.79
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.818,
|
588 |
+
"1": 0.867,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.836
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.832,
|
594 |
+
"1": 0.817,
|
595 |
+
"2": 0.674,
|
596 |
+
"3": 0.644
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.842,
|
600 |
+
"1": 0.821,
|
601 |
+
"2": 0.813,
|
602 |
+
"3": 0.65
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.854,
|
606 |
+
"1": 0.925,
|
607 |
+
"2": 0.814,
|
608 |
+
"3": 0.813
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 0.9980000257492065,
|
615 |
+
"de": 0.9960000514984131,
|
616 |
+
"es": 0.9960000514984131,
|
617 |
+
"nl": 0.9970000386238098
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 0.999000072479248,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.748,
|
628 |
+
"fr": 0.59,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.662
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.822,
|
635 |
+
"fr": 0.596,
|
636 |
+
"de": 0.831,
|
637 |
+
"es": 0.91,
|
638 |
+
"nl": 0.746
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.887,
|
642 |
+
"fr": 0.922,
|
643 |
+
"de": 0.909,
|
644 |
+
"es": 0.982,
|
645 |
+
"nl": 0.865
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.696,
|
649 |
+
"fr": 0.986,
|
650 |
+
"de": 0.908,
|
651 |
+
"es": 0.895,
|
652 |
+
"nl": 0.887
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.802,
|
656 |
+
"fr": 0.988,
|
657 |
+
"de": 0.925,
|
658 |
+
"es": 0.986,
|
659 |
+
"nl": 0.997
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 1.0,
|
663 |
+
"fr": 0.988,
|
664 |
+
"de": 0.982,
|
665 |
+
"es": 0.991,
|
666 |
+
"nl": 0.997
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
old_relu_eval_results/sparse_probing/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "a24bedef-7962-4166-b149-7a1b6ad73ccc",
|
30 |
+
"datetime_epoch_millis": 1738810076954,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9598687920719384,
|
34 |
+
"llm_top_1_test_accuracy": 0.6589812500000001,
|
35 |
+
"llm_top_2_test_accuracy": 0.7174812500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.7826062500000001,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9497312925755977,
|
44 |
+
"sae_top_1_test_accuracy": 0.75526875,
|
45 |
+
"sae_top_2_test_accuracy": 0.8121375,
|
46 |
+
"sae_top_5_test_accuracy": 0.86029375,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9564000368118286,
|
65 |
+
"sae_top_1_test_accuracy": 0.7794,
|
66 |
+
"sae_top_2_test_accuracy": 0.8156000000000001,
|
67 |
+
"sae_top_5_test_accuracy": 0.8370000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9544000387191772,
|
76 |
+
"llm_top_1_test_accuracy": 0.67,
|
77 |
+
"llm_top_2_test_accuracy": 0.7148,
|
78 |
+
"llm_top_5_test_accuracy": 0.7716,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9434000492095947,
|
84 |
+
"sae_top_1_test_accuracy": 0.7693999999999999,
|
85 |
+
"sae_top_2_test_accuracy": 0.78,
|
86 |
+
"sae_top_5_test_accuracy": 0.8836,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9320000410079956,
|
95 |
+
"llm_top_1_test_accuracy": 0.6918,
|
96 |
+
"llm_top_2_test_accuracy": 0.7338,
|
97 |
+
"llm_top_5_test_accuracy": 0.765,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9236000418663025,
|
103 |
+
"sae_top_1_test_accuracy": 0.7532,
|
104 |
+
"sae_top_2_test_accuracy": 0.8549999999999999,
|
105 |
+
"sae_top_5_test_accuracy": 0.8664000000000002,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.9196000456809997,
|
114 |
+
"llm_top_1_test_accuracy": 0.6048,
|
115 |
+
"llm_top_2_test_accuracy": 0.6406000000000001,
|
116 |
+
"llm_top_5_test_accuracy": 0.6696,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9022000432014465,
|
122 |
+
"sae_top_1_test_accuracy": 0.6662000000000001,
|
123 |
+
"sae_top_2_test_accuracy": 0.6942,
|
124 |
+
"sae_top_5_test_accuracy": 0.819,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9645000398159027,
|
141 |
+
"sae_top_1_test_accuracy": 0.786,
|
142 |
+
"sae_top_2_test_accuracy": 0.813,
|
143 |
+
"sae_top_5_test_accuracy": 0.81,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.969200050830841,
|
152 |
+
"llm_top_1_test_accuracy": 0.6428,
|
153 |
+
"llm_top_2_test_accuracy": 0.6920000000000001,
|
154 |
+
"llm_top_5_test_accuracy": 0.7656000000000001,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9658000349998475,
|
160 |
+
"sae_top_1_test_accuracy": 0.7729999999999999,
|
161 |
+
"sae_top_2_test_accuracy": 0.7797999999999999,
|
162 |
+
"sae_top_5_test_accuracy": 0.8286,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9527500420808792,
|
171 |
+
"llm_top_1_test_accuracy": 0.69725,
|
172 |
+
"llm_top_2_test_accuracy": 0.76625,
|
173 |
+
"llm_top_5_test_accuracy": 0.8192499999999999,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9457500576972961,
|
179 |
+
"sae_top_1_test_accuracy": 0.74775,
|
180 |
+
"sae_top_2_test_accuracy": 0.8095,
|
181 |
+
"sae_top_5_test_accuracy": 0.8547499999999999,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000289916992,
|
190 |
+
"llm_top_1_test_accuracy": 0.6496000000000001,
|
191 |
+
"llm_top_2_test_accuracy": 0.7809999999999999,
|
192 |
+
"llm_top_5_test_accuracy": 0.913,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9962000370025634,
|
198 |
+
"sae_top_1_test_accuracy": 0.7672000000000001,
|
199 |
+
"sae_top_2_test_accuracy": 0.95,
|
200 |
+
"sae_top_5_test_accuracy": 0.983,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "standard",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.940000057220459,
|
240 |
+
"1": 0.9520000219345093,
|
241 |
+
"2": 0.9430000185966492,
|
242 |
+
"6": 0.9790000319480896,
|
243 |
+
"9": 0.968000054359436
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.629,
|
275 |
+
"1": 0.687,
|
276 |
+
"2": 0.83,
|
277 |
+
"6": 0.978,
|
278 |
+
"9": 0.773
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.647,
|
282 |
+
"1": 0.754,
|
283 |
+
"2": 0.895,
|
284 |
+
"6": 0.98,
|
285 |
+
"9": 0.802
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.656,
|
289 |
+
"1": 0.757,
|
290 |
+
"2": 0.894,
|
291 |
+
"6": 0.983,
|
292 |
+
"9": 0.895
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.956000030040741,
|
298 |
+
"13": 0.9450000524520874,
|
299 |
+
"14": 0.9540000557899475,
|
300 |
+
"18": 0.9030000567436218,
|
301 |
+
"19": 0.9590000510215759
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.971000075340271,
|
305 |
+
"13": 0.9520000219345093,
|
306 |
+
"14": 0.956000030040741,
|
307 |
+
"18": 0.9360000491142273,
|
308 |
+
"19": 0.9570000171661377
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.561,
|
312 |
+
"13": 0.672,
|
313 |
+
"14": 0.631,
|
314 |
+
"18": 0.7,
|
315 |
+
"19": 0.786
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.69,
|
319 |
+
"13": 0.72,
|
320 |
+
"14": 0.677,
|
321 |
+
"18": 0.721,
|
322 |
+
"19": 0.766
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.776,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.768,
|
328 |
+
"18": 0.731,
|
329 |
+
"19": 0.841
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.728,
|
333 |
+
"13": 0.705,
|
334 |
+
"14": 0.855,
|
335 |
+
"18": 0.726,
|
336 |
+
"19": 0.833
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.733,
|
340 |
+
"13": 0.706,
|
341 |
+
"14": 0.866,
|
342 |
+
"18": 0.729,
|
343 |
+
"19": 0.866
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.952,
|
347 |
+
"13": 0.818,
|
348 |
+
"14": 0.859,
|
349 |
+
"18": 0.909,
|
350 |
+
"19": 0.88
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9540000557899475,
|
356 |
+
"21": 0.9230000376701355,
|
357 |
+
"22": 0.906000018119812,
|
358 |
+
"25": 0.956000030040741,
|
359 |
+
"26": 0.8790000677108765
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.956000030040741,
|
363 |
+
"21": 0.9260000586509705,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9640000462532043,
|
366 |
+
"26": 0.8970000147819519
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.699,
|
370 |
+
"21": 0.772,
|
371 |
+
"22": 0.641,
|
372 |
+
"25": 0.703,
|
373 |
+
"26": 0.644
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.811,
|
377 |
+
"21": 0.757,
|
378 |
+
"22": 0.655,
|
379 |
+
"25": 0.762,
|
380 |
+
"26": 0.684
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.861,
|
384 |
+
"21": 0.788,
|
385 |
+
"22": 0.712,
|
386 |
+
"25": 0.796,
|
387 |
+
"26": 0.668
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.903,
|
391 |
+
"21": 0.751,
|
392 |
+
"22": 0.522,
|
393 |
+
"25": 0.882,
|
394 |
+
"26": 0.708
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.913,
|
398 |
+
"21": 0.826,
|
399 |
+
"22": 0.869,
|
400 |
+
"25": 0.89,
|
401 |
+
"26": 0.777
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.937,
|
405 |
+
"21": 0.838,
|
406 |
+
"22": 0.878,
|
407 |
+
"25": 0.909,
|
408 |
+
"26": 0.77
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9430000185966492,
|
414 |
+
"2": 0.9270000457763672,
|
415 |
+
"3": 0.9040000438690186,
|
416 |
+
"5": 0.893000066280365,
|
417 |
+
"6": 0.8440000414848328
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.956000030040741,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9160000681877136,
|
423 |
+
"5": 0.9170000553131104,
|
424 |
+
"6": 0.8720000386238098
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.665,
|
428 |
+
"2": 0.596,
|
429 |
+
"3": 0.599,
|
430 |
+
"5": 0.576,
|
431 |
+
"6": 0.588
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.73,
|
435 |
+
"2": 0.632,
|
436 |
+
"3": 0.617,
|
437 |
+
"5": 0.615,
|
438 |
+
"6": 0.609
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.758,
|
442 |
+
"2": 0.646,
|
443 |
+
"3": 0.627,
|
444 |
+
"5": 0.646,
|
445 |
+
"6": 0.671
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.911,
|
449 |
+
"2": 0.59,
|
450 |
+
"3": 0.543,
|
451 |
+
"5": 0.651,
|
452 |
+
"6": 0.636
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.914,
|
456 |
+
"2": 0.65,
|
457 |
+
"3": 0.585,
|
458 |
+
"5": 0.645,
|
459 |
+
"6": 0.677
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.936,
|
463 |
+
"2": 0.841,
|
464 |
+
"3": 0.767,
|
465 |
+
"5": 0.81,
|
466 |
+
"6": 0.741
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9660000205039978,
|
472 |
+
"5.0": 0.9630000591278076
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.786,
|
492 |
+
"5.0": 0.786
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.813,
|
496 |
+
"5.0": 0.813
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.81,
|
500 |
+
"5.0": 0.81
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9510000348091125,
|
506 |
+
"Python": 0.9790000319480896,
|
507 |
+
"HTML": 0.9890000224113464,
|
508 |
+
"Java": 0.9600000381469727,
|
509 |
+
"PHP": 0.9500000476837158
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.9530000686645508,
|
513 |
+
"Python": 0.9860000610351562,
|
514 |
+
"HTML": 0.9880000352859497,
|
515 |
+
"Java": 0.9640000462532043,
|
516 |
+
"PHP": 0.9550000429153442
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.637,
|
521 |
+
"HTML": 0.714,
|
522 |
+
"Java": 0.624,
|
523 |
+
"PHP": 0.582
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.658,
|
527 |
+
"Python": 0.664,
|
528 |
+
"HTML": 0.801,
|
529 |
+
"Java": 0.697,
|
530 |
+
"PHP": 0.64
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.752,
|
534 |
+
"Python": 0.726,
|
535 |
+
"HTML": 0.928,
|
536 |
+
"Java": 0.728,
|
537 |
+
"PHP": 0.694
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.609,
|
541 |
+
"Python": 0.736,
|
542 |
+
"HTML": 0.892,
|
543 |
+
"Java": 0.712,
|
544 |
+
"PHP": 0.916
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.644,
|
548 |
+
"Python": 0.743,
|
549 |
+
"HTML": 0.889,
|
550 |
+
"Java": 0.712,
|
551 |
+
"PHP": 0.911
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.725,
|
555 |
+
"Python": 0.796,
|
556 |
+
"HTML": 0.914,
|
557 |
+
"Java": 0.794,
|
558 |
+
"PHP": 0.914
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9320000410079956,
|
564 |
+
"1": 0.9810000658035278,
|
565 |
+
"2": 0.9310000538825989,
|
566 |
+
"3": 0.9390000700950623
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9380000233650208,
|
570 |
+
"1": 0.9880000352859497,
|
571 |
+
"2": 0.9300000667572021,
|
572 |
+
"3": 0.9550000429153442
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.805,
|
576 |
+
"1": 0.67,
|
577 |
+
"2": 0.648,
|
578 |
+
"3": 0.666
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.794,
|
582 |
+
"1": 0.795,
|
583 |
+
"2": 0.686,
|
584 |
+
"3": 0.79
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.818,
|
588 |
+
"1": 0.867,
|
589 |
+
"2": 0.756,
|
590 |
+
"3": 0.836
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.688,
|
594 |
+
"1": 0.934,
|
595 |
+
"2": 0.74,
|
596 |
+
"3": 0.629
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.833,
|
600 |
+
"1": 0.93,
|
601 |
+
"2": 0.818,
|
602 |
+
"3": 0.657
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.858,
|
606 |
+
"1": 0.934,
|
607 |
+
"2": 0.827,
|
608 |
+
"3": 0.8
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 1.0,
|
614 |
+
"fr": 0.9950000643730164,
|
615 |
+
"de": 0.9970000386238098,
|
616 |
+
"es": 0.9930000305175781,
|
617 |
+
"nl": 0.9960000514984131
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 0.999000072479248,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.999000072479248
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.748,
|
628 |
+
"fr": 0.59,
|
629 |
+
"de": 0.754,
|
630 |
+
"es": 0.494,
|
631 |
+
"nl": 0.662
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.822,
|
635 |
+
"fr": 0.596,
|
636 |
+
"de": 0.831,
|
637 |
+
"es": 0.91,
|
638 |
+
"nl": 0.746
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.887,
|
642 |
+
"fr": 0.922,
|
643 |
+
"de": 0.909,
|
644 |
+
"es": 0.982,
|
645 |
+
"nl": 0.865
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.709,
|
649 |
+
"fr": 0.582,
|
650 |
+
"de": 0.906,
|
651 |
+
"es": 0.75,
|
652 |
+
"nl": 0.889
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.97,
|
656 |
+
"fr": 0.97,
|
657 |
+
"de": 0.909,
|
658 |
+
"es": 0.903,
|
659 |
+
"nl": 0.998
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.999,
|
663 |
+
"fr": 0.989,
|
664 |
+
"de": 0.943,
|
665 |
+
"es": 0.985,
|
666 |
+
"nl": 0.999
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "1fcdc3f3-fc8d-4e93-bfd2-d7725eac7b72",
|
73 |
+
"datetime_epoch_millis": 1738807905009,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.005549997091293335,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.008399999141693114,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.00285000205039978,
|
79 |
+
"tpp_threshold_5_total_metric": 0.00882500559091568,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.01300000548362732,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.004174999892711639,
|
82 |
+
"tpp_threshold_10_total_metric": 0.020424994826316833,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.02619999647140503,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.005775001645088196,
|
85 |
+
"tpp_threshold_20_total_metric": 0.030799996852874757,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.036699998378753665,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.005900001525878907,
|
88 |
+
"tpp_threshold_50_total_metric": 0.07465000301599503,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.08340000510215759,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.008750002086162566,
|
91 |
+
"tpp_threshold_100_total_metric": 0.14097500890493392,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.15220001339912415,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.011225004494190217,
|
94 |
+
"tpp_threshold_500_total_metric": 0.41430001705884933,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.4369000196456909,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.022600002586841583
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.003849995136260986,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.00559999942779541,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017500042915344239,
|
105 |
+
"tpp_threshold_5_total_metric": 0.006900015473365784,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.009200012683868409,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0022999972105026247,
|
108 |
+
"tpp_threshold_10_total_metric": 0.01449999213218689,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.017199993133544922,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0027000010013580322,
|
111 |
+
"tpp_threshold_20_total_metric": 0.02719999849796295,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.029799997806549072,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0025999993085861206,
|
114 |
+
"tpp_threshold_50_total_metric": 0.05915001034736633,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.0634000062942505,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.004249995946884156,
|
117 |
+
"tpp_threshold_100_total_metric": 0.12605001628398896,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.13100001811981202,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.004950001835823059,
|
120 |
+
"tpp_threshold_500_total_metric": 0.4442000240087509,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.45640002489089965,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.012200000882148742
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.007249999046325684,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.01119999885559082,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.003949999809265137,
|
129 |
+
"tpp_threshold_5_total_metric": 0.010749995708465576,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.01679999828338623,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.006050002574920654,
|
132 |
+
"tpp_threshold_10_total_metric": 0.026349997520446776,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.03519999980926514,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.008850002288818359,
|
135 |
+
"tpp_threshold_20_total_metric": 0.03439999520778656,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.04359999895095825,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.009200003743171693,
|
138 |
+
"tpp_threshold_50_total_metric": 0.09014999568462372,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.1034000039100647,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.013250008225440979,
|
141 |
+
"tpp_threshold_100_total_metric": 0.1559000015258789,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.17340000867843627,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.017500007152557374,
|
144 |
+
"tpp_threshold_500_total_metric": 0.38440001010894775,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.4174000144004822,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.03300000429153442
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "standard",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.005750015377998352,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.008000016212463379,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
|
184 |
+
"tpp_threshold_5_total_metric": 0.013500049710273743,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.016000032424926758,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.002499982714653015,
|
187 |
+
"tpp_threshold_10_total_metric": 0.013500019907951355,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.018000006675720215,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.00449998676776886,
|
190 |
+
"tpp_threshold_20_total_metric": 0.03125002980232239,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.0350000262260437,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0037499964237213135,
|
193 |
+
"tpp_threshold_50_total_metric": 0.0712500512599945,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.07500004768371582,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0037499964237213135,
|
196 |
+
"tpp_threshold_100_total_metric": 0.1625000536441803,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.16700005531311035,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.004500001668930054,
|
199 |
+
"tpp_threshold_500_total_metric": 0.4487500488758087,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.45500004291534424,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.0062499940395355225
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0029999762773513794,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.0010000020265579224,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0007500052452087402,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0007500052452087402,
|
210 |
+
"tpp_threshold_10_total_metric": 0.0034999698400497437,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.0029999613761901855,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0005000084638595581,
|
213 |
+
"tpp_threshold_20_total_metric": 0.0104999840259552,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.014999985694885254,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.004500001668930054,
|
216 |
+
"tpp_threshold_50_total_metric": 0.03200000524520874,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.03700000047683716,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.004999995231628418,
|
219 |
+
"tpp_threshold_100_total_metric": 0.07999999821186066,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.08700001239776611,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.0070000141859054565,
|
222 |
+
"tpp_threshold_500_total_metric": 0.43025001883506775,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.4390000104904175,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.008749991655349731
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.0012499988079071045,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.002750009298324585,
|
230 |
+
"tpp_threshold_5_total_metric": 0.008750006556510925,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250004887580872,
|
233 |
+
"tpp_threshold_10_total_metric": 0.01950003206729889,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.021000027656555176,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0014999955892562866,
|
236 |
+
"tpp_threshold_20_total_metric": 0.028249993920326233,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.02799999713897705,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": -0.00024999678134918213,
|
239 |
+
"tpp_threshold_50_total_metric": 0.03624999523162842,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.03799998760223389,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0017499923706054688,
|
242 |
+
"tpp_threshold_100_total_metric": 0.08100000023841858,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.08300000429153442,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0020000040531158447,
|
245 |
+
"tpp_threshold_500_total_metric": 0.4345000237226486,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.44700002670288086,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.012500002980232239
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0015000104904174805,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.003000020980834961,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
|
253 |
+
"tpp_threshold_5_total_metric": 0.0025000423192977905,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0005000084638595581,
|
256 |
+
"tpp_threshold_10_total_metric": 0.002499982714653015,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.004999995231628418,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.002500012516975403,
|
259 |
+
"tpp_threshold_20_total_metric": 0.00475001335144043,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0040000081062316895,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0007500052452087402,
|
262 |
+
"tpp_threshold_50_total_metric": 0.00950002670288086,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.012000024318695068,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.002499997615814209,
|
265 |
+
"tpp_threshold_100_total_metric": 0.015250012278556824,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.018000006675720215,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.002749994397163391,
|
268 |
+
"tpp_threshold_500_total_metric": 0.4517500102519989,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.4620000123977661,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.010250002145767212
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.007749974727630615,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
276 |
+
"tpp_threshold_5_total_metric": 0.0104999840259552,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.014999985694885254,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.004500001668930054,
|
279 |
+
"tpp_threshold_10_total_metric": 0.033499956130981445,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.038999974727630615,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.00550001859664917,
|
282 |
+
"tpp_threshold_20_total_metric": 0.06124997138977051,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.06699997186660767,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.005750000476837158,
|
285 |
+
"tpp_threshold_50_total_metric": 0.14674997329711914,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.1549999713897705,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.008249998092651367,
|
288 |
+
"tpp_threshold_100_total_metric": 0.2915000170469284,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.30000001192092896,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.00849999487400055,
|
291 |
+
"tpp_threshold_500_total_metric": 0.4557500183582306,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.4790000319480896,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.02325001358985901
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.012250036001205444,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.016000032424926758,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
301 |
+
"tpp_threshold_5_total_metric": 0.011750057339668274,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.01500004529953003,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0032499879598617554,
|
304 |
+
"tpp_threshold_10_total_metric": 0.012500032782554626,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.021000027656555176,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.00849999487400055,
|
307 |
+
"tpp_threshold_20_total_metric": 0.022000029683113098,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.03100001811981201,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.008999988436698914,
|
310 |
+
"tpp_threshold_50_total_metric": 0.06149999797344208,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.0690000057220459,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.007500007748603821,
|
313 |
+
"tpp_threshold_100_total_metric": 0.10575002431869507,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.11800003051757812,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.012250006198883057,
|
316 |
+
"tpp_threshold_500_total_metric": 0.41975003480911255,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.44200003147125244,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.022249996662139893
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0007499605417251587,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0012500137090682983,
|
324 |
+
"tpp_threshold_5_total_metric": -0.013000041246414185,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0029999613761901855,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.01600000262260437,
|
327 |
+
"tpp_threshold_10_total_metric": 0.0022499561309814453,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.01699995994567871,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.014750003814697266,
|
330 |
+
"tpp_threshold_20_total_metric": 0.0157499760389328,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.02399998903274536,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.008250012993812561,
|
333 |
+
"tpp_threshold_50_total_metric": 0.05649995803833008,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.08099997043609619,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.024500012397766113,
|
336 |
+
"tpp_threshold_100_total_metric": 0.12999998033046722,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.15799999237060547,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.028000012040138245,
|
339 |
+
"tpp_threshold_500_total_metric": 0.3737499713897705,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.4269999861717224,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.053250014781951904
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.0070000141859054565,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0040000081062316895,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000006079673767,
|
347 |
+
"tpp_threshold_5_total_metric": 0.004249989986419678,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.0002499818801879883,
|
350 |
+
"tpp_threshold_10_total_metric": 0.024749979376792908,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.02899998426437378,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.004250004887580872,
|
353 |
+
"tpp_threshold_20_total_metric": 0.014499977231025696,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.02399998903274536,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.009500011801719666,
|
356 |
+
"tpp_threshold_50_total_metric": 0.07250000536441803,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.0820000171661377,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.009500011801719666,
|
359 |
+
"tpp_threshold_100_total_metric": 0.12725001573562622,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.1420000195503235,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.014750003814697266,
|
362 |
+
"tpp_threshold_500_total_metric": 0.3969999998807907,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.4259999990463257,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.028999999165534973
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.006999999284744263,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.0040000081062316895,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0029999911785125732,
|
370 |
+
"tpp_threshold_5_total_metric": -0.0015000104904174805,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.0065000057220458984,
|
373 |
+
"tpp_threshold_10_total_metric": 0.007250010967254639,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.017000019550323486,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.009750008583068848,
|
376 |
+
"tpp_threshold_20_total_metric": 0.02724999189376831,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.03799998760223389,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.010749995708465576,
|
379 |
+
"tpp_threshold_50_total_metric": 0.07350002229213715,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.08600002527236938,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.012500002980232239,
|
382 |
+
"tpp_threshold_100_total_metric": 0.13650000095367432,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.15700000524520874,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.020500004291534424,
|
385 |
+
"tpp_threshold_500_total_metric": 0.3790000081062317,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.41600000858306885,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.03700000047683716
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.037250012159347534,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.046000003814697266,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.008749991655349731,
|
393 |
+
"tpp_threshold_5_total_metric": 0.052249982953071594,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.05699998140335083,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.004749998450279236,
|
396 |
+
"tpp_threshold_10_total_metric": 0.08500000834465027,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.09200000762939453,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.006999999284744263,
|
399 |
+
"tpp_threshold_20_total_metric": 0.0925000011920929,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.10100001096725464,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.008500009775161743,
|
402 |
+
"tpp_threshold_50_total_metric": 0.18674999475479126,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.19900000095367432,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.012250006198883057,
|
405 |
+
"tpp_threshold_100_total_metric": 0.2799999862909317,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.2919999957084656,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.012000009417533875,
|
408 |
+
"tpp_threshold_500_total_metric": 0.3525000363588333,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.3760000467300415,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.02350001037120819
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "dca94803-7711-4df2-8ffe-a1d99bedfde5",
|
73 |
+
"datetime_epoch_millis": 1738808022044,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.0034999996423721313,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.00690000057220459,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034000009298324587,
|
79 |
+
"tpp_threshold_5_total_metric": 0.007625006139278412,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.011600005626678466,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.003974999487400055,
|
82 |
+
"tpp_threshold_10_total_metric": 0.02060000002384186,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.026100003719329835,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.005500003695487976,
|
85 |
+
"tpp_threshold_20_total_metric": 0.03362499922513962,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.038699996471405027,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.005074997246265411,
|
88 |
+
"tpp_threshold_50_total_metric": 0.072200009226799,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.08010000586509705,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.007899996638298035,
|
91 |
+
"tpp_threshold_100_total_metric": 0.1294749990105629,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.14089999794960023,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.011424998939037322,
|
94 |
+
"tpp_threshold_500_total_metric": 0.40307500660419465,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.4214000105857849,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.01832500398159027
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.0038499802350997923,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.005799984931945801,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0019500046968460084,
|
105 |
+
"tpp_threshold_5_total_metric": 0.006150004267692566,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.008399999141693116,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0022499948740005494,
|
108 |
+
"tpp_threshold_10_total_metric": 0.017299994826316833,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.019599997997283937,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.002300003170967102,
|
111 |
+
"tpp_threshold_20_total_metric": 0.03334999978542328,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.035399997234344484,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0020499974489212036,
|
114 |
+
"tpp_threshold_50_total_metric": 0.05845000743865967,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.062400007247924806,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.003949999809265137,
|
117 |
+
"tpp_threshold_100_total_metric": 0.12464999556541442,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.12999999523162842,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.00534999966621399,
|
120 |
+
"tpp_threshold_500_total_metric": 0.43650000989437104,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.44520001411437987,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.00870000422000885
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.0031500190496444704,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.008000016212463379,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.0048499971628189085,
|
129 |
+
"tpp_threshold_5_total_metric": 0.009100008010864257,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.014800012111663818,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.00570000410079956,
|
132 |
+
"tpp_threshold_10_total_metric": 0.02390000522136688,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.032600009441375734,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.00870000422000885,
|
135 |
+
"tpp_threshold_20_total_metric": 0.033899998664855956,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.041999995708465576,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.008099997043609619,
|
138 |
+
"tpp_threshold_50_total_metric": 0.08595001101493835,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.09780000448226929,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.011849993467330932,
|
141 |
+
"tpp_threshold_100_total_metric": 0.13430000245571136,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.15180000066757202,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.017499998211860657,
|
144 |
+
"tpp_threshold_500_total_metric": 0.36965000331401826,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.39760000705718995,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.027950003743171692
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "standard",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.005000025033950806,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.00700002908706665,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0020000040531158447,
|
184 |
+
"tpp_threshold_5_total_metric": 0.010250017046928406,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.002749994397163391,
|
187 |
+
"tpp_threshold_10_total_metric": 0.017250031232833862,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.020000040531158447,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.002750009298324585,
|
190 |
+
"tpp_threshold_20_total_metric": 0.025500014424324036,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.02799999713897705,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.002499982714653015,
|
193 |
+
"tpp_threshold_50_total_metric": 0.053750038146972656,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.057000041007995605,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0032500028610229492,
|
196 |
+
"tpp_threshold_100_total_metric": 0.1327500194311142,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.1380000114440918,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.0052499920129776,
|
199 |
+
"tpp_threshold_500_total_metric": 0.43675003945827484,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.44200003147125244,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.0052499920129776
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0037499666213989258,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.0029999613761901855,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.0007500052452087402,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0012499988079071045,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0012499988079071045,
|
210 |
+
"tpp_threshold_10_total_metric": 0.005000010132789612,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0010000020265579224,
|
213 |
+
"tpp_threshold_20_total_metric": 0.017249956727027893,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.02199995517730713,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.004749998450279236,
|
216 |
+
"tpp_threshold_50_total_metric": 0.03124995529651642,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.035999953746795654,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.004749998450279236,
|
219 |
+
"tpp_threshold_100_total_metric": 0.07649995386600494,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.08199995756149292,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.005500003695487976,
|
222 |
+
"tpp_threshold_500_total_metric": 0.40824997425079346,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.4169999957084656,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.00875002145767212
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.003749951720237732,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.006999969482421875,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.003250017762184143,
|
230 |
+
"tpp_threshold_5_total_metric": 0.010749995708465576,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.014999985694885254,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.004249989986419678,
|
233 |
+
"tpp_threshold_10_total_metric": 0.022749975323677063,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.02499997615814209,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.002250000834465027,
|
236 |
+
"tpp_threshold_20_total_metric": 0.0339999794960022,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.03299999237060547,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": -0.0009999871253967285,
|
239 |
+
"tpp_threshold_50_total_metric": 0.03925001621246338,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.04000002145767212,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0007500052452087402,
|
242 |
+
"tpp_threshold_100_total_metric": 0.12300001084804535,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.12400001287460327,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0010000020265579224,
|
245 |
+
"tpp_threshold_500_total_metric": 0.43324999511241913,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.4390000104904175,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.005750015377998352
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.002499997615814209,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
|
253 |
+
"tpp_threshold_5_total_metric": 0.0030000507831573486,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000169277191162,
|
256 |
+
"tpp_threshold_10_total_metric": 0.0037499815225601196,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.002250000834465027,
|
259 |
+
"tpp_threshold_20_total_metric": 0.008750036358833313,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.00700002908706665,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0017500072717666626,
|
262 |
+
"tpp_threshold_50_total_metric": 0.01150001585483551,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.013000011444091797,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0014999955892562866,
|
265 |
+
"tpp_threshold_100_total_metric": 0.013750031590461731,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.017000019550323486,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.0032499879598617554,
|
268 |
+
"tpp_threshold_500_total_metric": 0.4442500174045563,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.4490000009536743,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.004749983549118042
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.00424996018409729,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.007999956607818604,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
276 |
+
"tpp_threshold_5_total_metric": 0.007999956607818604,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.011999964714050293,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0040000081062316895,
|
279 |
+
"tpp_threshold_10_total_metric": 0.03774997591972351,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.042999982833862305,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005250006914138794,
|
282 |
+
"tpp_threshold_20_total_metric": 0.08125001192092896,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.08700001239776611,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.005750000476837158,
|
285 |
+
"tpp_threshold_50_total_metric": 0.15650001168251038,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.16600000858306885,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.009499996900558472,
|
288 |
+
"tpp_threshold_100_total_metric": 0.2772499620914459,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.2889999747276306,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.011750012636184692,
|
291 |
+
"tpp_threshold_500_total_metric": 0.46000002324581146,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.4790000319480896,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.019000008702278137
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.009750023484230042,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.013000011444091797,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032499879598617554,
|
301 |
+
"tpp_threshold_5_total_metric": 0.007750034332275391,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.010000050067901611,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0022500157356262207,
|
304 |
+
"tpp_threshold_10_total_metric": 0.012500032782554626,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.021000027656555176,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.00849999487400055,
|
307 |
+
"tpp_threshold_20_total_metric": 0.014500007033348083,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.023000001907348633,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.00849999487400055,
|
310 |
+
"tpp_threshold_50_total_metric": 0.05975005030632019,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.06600004434585571,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0062499940395355225,
|
313 |
+
"tpp_threshold_100_total_metric": 0.09250001609325409,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.10600000619888306,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.013499990105628967,
|
316 |
+
"tpp_threshold_500_total_metric": 0.3750000149011612,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.3970000147819519,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.02199999988079071
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0045000165700912476,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.00449998676776886,
|
324 |
+
"tpp_threshold_5_total_metric": -0.002500012516975403,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.0155000239610672,
|
327 |
+
"tpp_threshold_10_total_metric": 0.010499954223632812,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.02199995517730713,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.011500000953674316,
|
330 |
+
"tpp_threshold_20_total_metric": 0.031749993562698364,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.03700000047683716,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.005250006914138794,
|
333 |
+
"tpp_threshold_50_total_metric": 0.06824998557567596,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.08899998664855957,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.020750001072883606,
|
336 |
+
"tpp_threshold_100_total_metric": 0.11849997937679291,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.14499998092651367,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.026500001549720764,
|
339 |
+
"tpp_threshold_500_total_metric": 0.37724997103214264,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.4179999828338623,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.040750011801719666
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.008250012993812561,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.004999995231628418,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.003250017762184143,
|
347 |
+
"tpp_threshold_5_total_metric": -0.002750024199485779,
|
348 |
+
"tpp_threshold_5_intended_diff_only": -0.0020000338554382324,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.0007499903440475464,
|
350 |
+
"tpp_threshold_10_total_metric": 0.018250003457069397,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.022000014781951904,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037500113248825073,
|
353 |
+
"tpp_threshold_20_total_metric": 0.010499969124794006,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.01699995994567871,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.006499990820884705,
|
356 |
+
"tpp_threshold_50_total_metric": 0.06024998426437378,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.06699997186660767,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.006749987602233887,
|
359 |
+
"tpp_threshold_100_total_metric": 0.11424997448921204,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.12699997425079346,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.012749999761581421,
|
362 |
+
"tpp_threshold_500_total_metric": 0.3764999955892563,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.39800000190734863,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.021500006318092346
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.0057499706745147705,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.001999974250793457,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
370 |
+
"tpp_threshold_5_total_metric": 0.009500041604042053,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.016000032424926758,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.006499990820884705,
|
373 |
+
"tpp_threshold_10_total_metric": 0.012000009417533875,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.022000014781951904,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.01000000536441803,
|
376 |
+
"tpp_threshold_20_total_metric": 0.027250006794929504,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.03700000047683716,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.009749993681907654,
|
379 |
+
"tpp_threshold_50_total_metric": 0.07799999415874481,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.08899998664855957,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.010999992489814758,
|
382 |
+
"tpp_threshold_100_total_metric": 0.11225004494190216,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.13100004196166992,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.01874999701976776,
|
385 |
+
"tpp_threshold_500_total_metric": 0.36875003576278687,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.40000003576278687,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.03125
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.015500038862228394,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.025000035762786865,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.009499996900558472,
|
393 |
+
"tpp_threshold_5_total_metric": 0.03350000083446503,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.03700000047683716,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0034999996423721313,
|
396 |
+
"tpp_threshold_10_total_metric": 0.0662500262260437,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.07600003480911255,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.009750008583068848,
|
399 |
+
"tpp_threshold_20_total_metric": 0.08550001680850983,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.09600001573562622,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.010499998927116394,
|
402 |
+
"tpp_threshold_50_total_metric": 0.16350004076957703,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.17800003290176392,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.01449999213218689,
|
405 |
+
"tpp_threshold_100_total_metric": 0.23399999737739563,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.25,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.01600000262260437,
|
408 |
+
"tpp_threshold_500_total_metric": 0.35074999928474426,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.375,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.024250000715255737
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "bafd917a-113b-4ade-93b9-fc280e1e64e8",
|
73 |
+
"datetime_epoch_millis": 1738808371674,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.004999996721744537,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.008099997043609619,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.003100000321865082,
|
79 |
+
"tpp_threshold_5_total_metric": 0.004899995028972625,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.008599996566772461,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0037000015377998356,
|
82 |
+
"tpp_threshold_10_total_metric": 0.017099998891353607,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.02160000205039978,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.004500003159046173,
|
85 |
+
"tpp_threshold_20_total_metric": 0.028300000727176665,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.03340000510215759,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.0051000043749809265,
|
88 |
+
"tpp_threshold_50_total_metric": 0.059400005638599394,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.0662000060081482,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.0068000003695487974,
|
91 |
+
"tpp_threshold_100_total_metric": 0.10572500675916671,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.11560000777244568,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.00987500101327896,
|
94 |
+
"tpp_threshold_500_total_metric": 0.3575750187039375,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.3726000189781189,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.015025000274181365
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.005649995803833008,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.0075999975204467775,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0019500017166137694,
|
105 |
+
"tpp_threshold_5_total_metric": 0.005199992656707763,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.0075999975204467775,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0024000048637390138,
|
108 |
+
"tpp_threshold_10_total_metric": 0.014100000262260437,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.01640000343322754,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.002300003170967102,
|
111 |
+
"tpp_threshold_20_total_metric": 0.028600004315376282,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.031000006198883056,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.002400001883506775,
|
114 |
+
"tpp_threshold_50_total_metric": 0.053450003266334534,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.05700000524520874,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.0035500019788742066,
|
117 |
+
"tpp_threshold_100_total_metric": 0.10355001091957092,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.10860000848770142,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.005049997568130493,
|
120 |
+
"tpp_threshold_500_total_metric": 0.39755001962184905,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.4052000164985657,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.007649996876716613
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.004349997639656067,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.008599996566772461,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.004249998927116394,
|
129 |
+
"tpp_threshold_5_total_metric": 0.0045999974012374874,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.009599995613098145,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.004999998211860657,
|
132 |
+
"tpp_threshold_10_total_metric": 0.020099997520446777,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.026800000667572023,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.006700003147125244,
|
135 |
+
"tpp_threshold_20_total_metric": 0.02799999713897705,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.03580000400543213,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.007800006866455078,
|
138 |
+
"tpp_threshold_50_total_metric": 0.06535000801086426,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.07540000677108764,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.010049998760223389,
|
141 |
+
"tpp_threshold_100_total_metric": 0.1079000025987625,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.12260000705718994,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.014700004458427429,
|
144 |
+
"tpp_threshold_500_total_metric": 0.317600017786026,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.3400000214576721,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.02240000367164612
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "standard",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.01075001060962677,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.013000011444091797,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
|
184 |
+
"tpp_threshold_5_total_metric": 0.01600000262260437,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.018999993801116943,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029999911785125732,
|
187 |
+
"tpp_threshold_10_total_metric": 0.013500034809112549,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.016000032424926758,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
|
190 |
+
"tpp_threshold_20_total_metric": 0.023750022053718567,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.027000010013580322,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032499879598617554,
|
193 |
+
"tpp_threshold_50_total_metric": 0.057750046253204346,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.061000049114227295,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0032500028610229492,
|
196 |
+
"tpp_threshold_100_total_metric": 0.11575004458427429,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.12200003862380981,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.0062499940395355225,
|
199 |
+
"tpp_threshold_500_total_metric": 0.4100000709295273,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.41200006008148193,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.001999989151954651
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0014999955892562866,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.0005000084638595581,
|
207 |
+
"tpp_threshold_5_total_metric": -0.005250021815299988,
|
208 |
+
"tpp_threshold_5_intended_diff_only": -0.0040000081062316895,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0012500137090682983,
|
210 |
+
"tpp_threshold_10_total_metric": -0.0005000382661819458,
|
211 |
+
"tpp_threshold_10_intended_diff_only": -0.001000046730041504,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0005000084638595581,
|
213 |
+
"tpp_threshold_20_total_metric": 0.006749957799911499,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.010999977588653564,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.004250019788742065,
|
216 |
+
"tpp_threshold_50_total_metric": 0.02424997091293335,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.02899998426437378,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.00475001335144043,
|
219 |
+
"tpp_threshold_100_total_metric": 0.047499969601631165,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.05199998617172241,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.0045000165700912476,
|
222 |
+
"tpp_threshold_500_total_metric": 0.2900000065565109,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.29500001668930054,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.005000010132789612
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.00875002145767212,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.012000024318695068,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
230 |
+
"tpp_threshold_5_total_metric": 0.007250010967254639,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.012000024318695068,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.00475001335144043,
|
233 |
+
"tpp_threshold_10_total_metric": 0.024500012397766113,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.026000022888183594,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0015000104904174805,
|
236 |
+
"tpp_threshold_20_total_metric": 0.03749999403953552,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.03700000047683716,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": -0.0004999935626983643,
|
239 |
+
"tpp_threshold_50_total_metric": 0.04449997842311859,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.042999982833862305,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": -0.0014999955892562866,
|
242 |
+
"tpp_threshold_100_total_metric": 0.08000004291534424,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.08100003004074097,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0009999871253967285,
|
245 |
+
"tpp_threshold_500_total_metric": 0.4047499895095825,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.4129999876022339,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.008249998092651367
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0029999911785125732,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0010000169277191162,
|
253 |
+
"tpp_threshold_5_total_metric": 0.003750026226043701,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.003000020980834961,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0007500052452087402,
|
256 |
+
"tpp_threshold_10_total_metric": 0.002249985933303833,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.004999995231628418,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.002750009298324585,
|
259 |
+
"tpp_threshold_20_total_metric": 0.008000046014785767,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.00700002908706665,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0010000169277191162,
|
262 |
+
"tpp_threshold_50_total_metric": 0.012750029563903809,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.017000019550323486,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.004249989986419678,
|
265 |
+
"tpp_threshold_100_total_metric": 0.04250001907348633,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.046000003814697266,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.0034999847412109375,
|
268 |
+
"tpp_threshold_500_total_metric": 0.42250001430511475,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.4300000071525574,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.007499992847442627
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.00424996018409729,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.007999956607818604,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
276 |
+
"tpp_threshold_5_total_metric": 0.004249945282936096,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.007999956607818604,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0037500113248825073,
|
279 |
+
"tpp_threshold_10_total_metric": 0.030750006437301636,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.03600001335144043,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005250006914138794,
|
282 |
+
"tpp_threshold_20_total_metric": 0.06700000166893005,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.07300001382827759,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.006000012159347534,
|
285 |
+
"tpp_threshold_50_total_metric": 0.12799999117851257,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.13499999046325684,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.006999999284744263,
|
288 |
+
"tpp_threshold_100_total_metric": 0.2319999784231186,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.24199998378753662,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.01000000536441803,
|
291 |
+
"tpp_threshold_500_total_metric": 0.4605000168085098,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.47600001096725464,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.015499994158744812
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.013250023126602173,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.017000019550323486,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
301 |
+
"tpp_threshold_5_total_metric": 0.011250004172325134,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.013999998569488525,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.002749994397163391,
|
304 |
+
"tpp_threshold_10_total_metric": 0.01075001060962677,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.018000006675720215,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.007249996066093445,
|
307 |
+
"tpp_threshold_20_total_metric": 0.015000015497207642,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.021000027656555176,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.006000012159347534,
|
310 |
+
"tpp_threshold_50_total_metric": 0.03725004196166992,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.04300004243850708,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.005750000476837158,
|
313 |
+
"tpp_threshold_100_total_metric": 0.0612499862909317,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.07400000095367432,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.012750014662742615,
|
316 |
+
"tpp_threshold_500_total_metric": 0.28825002908706665,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.2990000247955322,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.010749995708465576
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.008249983191490173,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.002749994397163391,
|
324 |
+
"tpp_threshold_5_total_metric": -0.0034999996423721313,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.009000003337860107,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.012500002980232239,
|
327 |
+
"tpp_threshold_10_total_metric": 0.015499994158744812,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.02399998903274536,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.00849999487400055,
|
330 |
+
"tpp_threshold_20_total_metric": 0.02699996531009674,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.03299999237060547,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.006000027060508728,
|
333 |
+
"tpp_threshold_50_total_metric": 0.0677499920129776,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.07999998331069946,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.012249991297721863,
|
336 |
+
"tpp_threshold_100_total_metric": 0.10849995911121368,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.12699997425079346,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.018500015139579773,
|
339 |
+
"tpp_threshold_500_total_metric": 0.3362499922513962,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.3669999837875366,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.030749991536140442
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.005500048398971558,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0020000338554382324,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.003500014543533325,
|
347 |
+
"tpp_threshold_5_total_metric": 0.0037499964237213135,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.0012499988079071045,
|
350 |
+
"tpp_threshold_10_total_metric": 0.0209999680519104,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.02399998903274536,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000020980834961,
|
353 |
+
"tpp_threshold_20_total_metric": 0.010250002145767212,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.018000006675720215,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.007750004529953003,
|
356 |
+
"tpp_threshold_50_total_metric": 0.06074999272823334,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.06999999284744263,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.00925000011920929,
|
359 |
+
"tpp_threshold_100_total_metric": 0.08075001835823059,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.09600001573562622,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.01524999737739563,
|
362 |
+
"tpp_threshold_500_total_metric": 0.3087500035762787,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.3320000171661377,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.02325001358985901
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.006499990820884705,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.0040000081062316895,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.002499982714653015,
|
370 |
+
"tpp_threshold_5_total_metric": -0.0052499920129776,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.0052499920129776,
|
373 |
+
"tpp_threshold_10_total_metric": -0.0022500157356262207,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.008249998092651367,
|
376 |
+
"tpp_threshold_20_total_metric": 0.013750016689300537,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.023000001907348633,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.009249985218048096,
|
379 |
+
"tpp_threshold_50_total_metric": 0.03574998676776886,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.046999990940093994,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.011250004172325134,
|
382 |
+
"tpp_threshold_100_total_metric": 0.08550001680850983,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.10100001096725464,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.015499994158744812,
|
385 |
+
"tpp_threshold_500_total_metric": 0.3072500377893448,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.33500003814697266,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.02775000035762787
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.01225002110004425,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.021000027656555176,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.008750006556510925,
|
393 |
+
"tpp_threshold_5_total_metric": 0.016749978065490723,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.019999980926513672,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0032500028610229492,
|
396 |
+
"tpp_threshold_10_total_metric": 0.055500030517578125,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.06200003623962402,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0065000057220458984,
|
399 |
+
"tpp_threshold_20_total_metric": 0.07399998605251312,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.08399999141693115,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.01000000536441803,
|
402 |
+
"tpp_threshold_50_total_metric": 0.12525002658367157,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.13700002431869507,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.011749997735023499,
|
405 |
+
"tpp_threshold_100_total_metric": 0.20350003242492676,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.21500003337860107,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.011500000953674316,
|
408 |
+
"tpp_threshold_500_total_metric": 0.3475000262260437,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.3670000433921814,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.019500017166137695
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "a9df393e-0561-4737-9c16-8f03033409cb",
|
73 |
+
"datetime_epoch_millis": 1738808254874,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.0035749942064285277,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.0061999976634979255,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.002625003457069397,
|
79 |
+
"tpp_threshold_5_total_metric": 0.005500002205371857,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.009100002050399781,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.003599999845027924,
|
82 |
+
"tpp_threshold_10_total_metric": 0.01294999271631241,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.017399996519088745,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.004450003802776336,
|
85 |
+
"tpp_threshold_20_total_metric": 0.021499992907047273,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.0262999951839447,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.00480000227689743,
|
88 |
+
"tpp_threshold_50_total_metric": 0.043899996578693395,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.04889999628067017,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.004999999701976777,
|
91 |
+
"tpp_threshold_100_total_metric": 0.07275000959634781,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.08150001168251038,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.008750002086162566,
|
94 |
+
"tpp_threshold_500_total_metric": 0.2600750118494034,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.27110000848770144,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.011024996638298035
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.003349986672401428,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.0053999900817871095,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.002050003409385681,
|
105 |
+
"tpp_threshold_5_total_metric": 0.006400004029273987,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.008800005912780762,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.002400001883506775,
|
108 |
+
"tpp_threshold_10_total_metric": 0.011899998784065247,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.014800000190734863,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0029000014066696165,
|
111 |
+
"tpp_threshold_20_total_metric": 0.023699989914894103,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.02659999132156372,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0029000014066696165,
|
114 |
+
"tpp_threshold_50_total_metric": 0.04629998803138733,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.049799990653991696,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.0035000026226043703,
|
117 |
+
"tpp_threshold_100_total_metric": 0.07975000441074372,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.08480000495910645,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.005050000548362732,
|
120 |
+
"tpp_threshold_500_total_metric": 0.31270001232624056,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.3202000021934509,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.007499989867210388
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.0038000017404556273,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.007000005245208741,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.003200003504753113,
|
129 |
+
"tpp_threshold_5_total_metric": 0.004600000381469726,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.009399998188018798,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.004799997806549073,
|
132 |
+
"tpp_threshold_10_total_metric": 0.01399998664855957,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.019999992847442628,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.006000006198883056,
|
135 |
+
"tpp_threshold_20_total_metric": 0.01929999589920044,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.025999999046325682,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.006700003147125244,
|
138 |
+
"tpp_threshold_50_total_metric": 0.04150000512599945,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.048000001907348634,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.0064999967813491825,
|
141 |
+
"tpp_threshold_100_total_metric": 0.0657500147819519,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.07820001840591431,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.012450003623962402,
|
144 |
+
"tpp_threshold_500_total_metric": 0.20745001137256622,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.22200001478195192,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.01455000340938568
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "standard",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.007749989628791809,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.009999990463256836,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
|
184 |
+
"tpp_threshold_5_total_metric": 0.017250046133995056,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.020000040531158447,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.002749994397163391,
|
187 |
+
"tpp_threshold_10_total_metric": 0.014750003814697266,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.018000006675720215,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
|
190 |
+
"tpp_threshold_20_total_metric": 0.023250028491020203,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.026000022888183594,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.002749994397163391,
|
193 |
+
"tpp_threshold_50_total_metric": 0.04175001382827759,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.04500001668930054,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0032500028610229492,
|
196 |
+
"tpp_threshold_100_total_metric": 0.07125002145767212,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.07700002193450928,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.005750000476837158,
|
199 |
+
"tpp_threshold_500_total_metric": 0.24525001645088196,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.24500000476837158,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": -0.000250011682510376
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0029999613761901855,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.0029999613761901855,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.0,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0010000169277191162,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0020000040531158447,
|
210 |
+
"tpp_threshold_10_total_metric": 0.00925000011920929,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.00024999678134918213,
|
213 |
+
"tpp_threshold_20_total_metric": 0.010249972343444824,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.014999985694885254,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.00475001335144043,
|
216 |
+
"tpp_threshold_50_total_metric": 0.025749951601028442,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.030999958515167236,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.005250006914138794,
|
219 |
+
"tpp_threshold_100_total_metric": 0.05324995517730713,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.05799996852874756,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.00475001335144043,
|
222 |
+
"tpp_threshold_500_total_metric": 0.26924997568130493,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.2789999842643738,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.009750008583068848
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.0027499794960021973,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
230 |
+
"tpp_threshold_5_total_metric": 0.006249964237213135,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.010999977588653564,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.00475001335144043,
|
233 |
+
"tpp_threshold_10_total_metric": 0.021499991416931152,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.02399998903274536,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
|
236 |
+
"tpp_threshold_20_total_metric": 0.03274999558925629,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.03299999237060547,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.00024999678134918213,
|
239 |
+
"tpp_threshold_50_total_metric": 0.03449997305870056,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.0339999794960022,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": -0.0004999935626983643,
|
242 |
+
"tpp_threshold_100_total_metric": 0.05425000190734863,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.0559999942779541,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0017499923706054688,
|
245 |
+
"tpp_threshold_500_total_metric": 0.3055000454187393,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.312000036239624,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.006499990820884705
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0010000169277191162,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0020000338554382324,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0010000169277191162,
|
253 |
+
"tpp_threshold_5_total_metric": 0.003000035881996155,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
|
256 |
+
"tpp_threshold_10_total_metric": 0.0010000020265579224,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.004999995231628418,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.003999993205070496,
|
259 |
+
"tpp_threshold_20_total_metric": 0.005499988794326782,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.0004999935626983643,
|
262 |
+
"tpp_threshold_50_total_metric": 0.006500035524368286,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.012000024318695068,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.005499988794326782,
|
265 |
+
"tpp_threshold_100_total_metric": 0.04650004208087921,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.053000032901763916,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.006499990820884705,
|
268 |
+
"tpp_threshold_500_total_metric": 0.30650003254413605,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.3190000057220459,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.012499973177909851
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.002249985933303833,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
276 |
+
"tpp_threshold_5_total_metric": 0.006499990820884705,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.009999990463256836,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0034999996423721313,
|
279 |
+
"tpp_threshold_10_total_metric": 0.012999996542930603,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.018000006675720215,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005000010132789612,
|
282 |
+
"tpp_threshold_20_total_metric": 0.046749964356422424,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.05299997329711914,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.006250008940696716,
|
285 |
+
"tpp_threshold_50_total_metric": 0.12299996614456177,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.12699997425079346,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
|
288 |
+
"tpp_threshold_100_total_metric": 0.17350000143051147,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.18000000715255737,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.0065000057220458984,
|
291 |
+
"tpp_threshold_500_total_metric": 0.43699999153614044,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.44599997997283936,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.008999988436698914
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.01175004243850708,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.01500004529953003,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
301 |
+
"tpp_threshold_5_total_metric": 0.011750012636184692,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.013999998569488525,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.002249985933303833,
|
304 |
+
"tpp_threshold_10_total_metric": 0.007500052452087402,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.01500004529953003,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.007499992847442627,
|
307 |
+
"tpp_threshold_20_total_metric": 0.011250004172325134,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.018000006675720215,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
|
310 |
+
"tpp_threshold_50_total_metric": 0.03525002300739288,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.03600001335144043,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0007499903440475464,
|
313 |
+
"tpp_threshold_100_total_metric": 0.03375004231929779,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.04300004243850708,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.00925000011920929,
|
316 |
+
"tpp_threshold_500_total_metric": 0.15350006520748138,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.16200006008148193,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.00849999487400055
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0024999380111694336,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.0029999613761901855,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.000500023365020752,
|
324 |
+
"tpp_threshold_5_total_metric": -0.011750012636184692,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.012749999761581421,
|
327 |
+
"tpp_threshold_10_total_metric": 0.0024999380111694336,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.007999956607818604,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.00550001859664917,
|
330 |
+
"tpp_threshold_20_total_metric": 0.007499963045120239,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.011999964714050293,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.004500001668930054,
|
333 |
+
"tpp_threshold_50_total_metric": 0.021749988198280334,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.03299999237060547,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.011250004172325134,
|
336 |
+
"tpp_threshold_100_total_metric": 0.04649996757507324,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.06599998474121094,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.019500017166137695,
|
339 |
+
"tpp_threshold_500_total_metric": 0.195499986410141,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.21299999952316284,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.01750001311302185
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.003500014543533325,
|
345 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.004500001668930054,
|
347 |
+
"tpp_threshold_5_total_metric": 0.004499956965446472,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.002500012516975403,
|
350 |
+
"tpp_threshold_10_total_metric": 0.019999995827674866,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.023000001907348633,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000006079673767,
|
353 |
+
"tpp_threshold_20_total_metric": 0.008000001311302185,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.00599999725818634,
|
356 |
+
"tpp_threshold_50_total_metric": 0.03449995815753937,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.034999966621398926,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.0005000084638595581,
|
359 |
+
"tpp_threshold_100_total_metric": 0.06475000083446503,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.07400000095367432,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.00925000011920929,
|
362 |
+
"tpp_threshold_500_total_metric": 0.18399998545646667,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.19900000095367432,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.015000015497207642
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.004999980330467224,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.0009999871253967285,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.003999993205070496,
|
370 |
+
"tpp_threshold_5_total_metric": -0.002499982714653015,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.006499990820884705,
|
373 |
+
"tpp_threshold_10_total_metric": -0.002750009298324585,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.008749991655349731,
|
376 |
+
"tpp_threshold_20_total_metric": 0.003000020980834961,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.012000024318695068,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.009000003337860107,
|
379 |
+
"tpp_threshold_50_total_metric": 0.027250006794929504,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.03700000047683716,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.009749993681907654,
|
382 |
+
"tpp_threshold_100_total_metric": 0.04350002110004425,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.058000028133392334,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.014500007033348083,
|
385 |
+
"tpp_threshold_500_total_metric": 0.18325002491474152,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.20200002193450928,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.01874999701976776
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.013250023126602173,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.017000019550323486,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
393 |
+
"tpp_threshold_5_total_metric": 0.021000027656555176,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.021000027656555176,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0,
|
396 |
+
"tpp_threshold_10_total_metric": 0.042749956250190735,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.04799997806549072,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.005250021815299988,
|
399 |
+
"tpp_threshold_20_total_metric": 0.06674998998641968,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.07400000095367432,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.007250010967254639,
|
402 |
+
"tpp_threshold_50_total_metric": 0.08875004947185516,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.09900003671646118,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.010249987244606018,
|
405 |
+
"tpp_threshold_100_total_metric": 0.1402500420808792,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.15000003576278687,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.009749993681907654,
|
408 |
+
"tpp_threshold_500_total_metric": 0.32099999487400055,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.33399999141693115,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.012999996542930603
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "d8d50b3f-efea-453e-80ea-09e656823d9c",
|
73 |
+
"datetime_epoch_millis": 1738808485682,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.003474992513656616,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.005799990892410278,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.002324998378753662,
|
79 |
+
"tpp_threshold_5_total_metric": 0.0039999991655349735,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.006999999284744263,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0030000001192092896,
|
82 |
+
"tpp_threshold_10_total_metric": 0.01015000194311142,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.013400000333786011,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032499983906745912,
|
85 |
+
"tpp_threshold_20_total_metric": 0.016299988329410552,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.019599992036819457,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.003300003707408905,
|
88 |
+
"tpp_threshold_50_total_metric": 0.03060000091791153,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.03460000157356262,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.0040000006556510925,
|
91 |
+
"tpp_threshold_100_total_metric": 0.047974994778633116,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.054599994421005243,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.0066249996423721315,
|
94 |
+
"tpp_threshold_500_total_metric": 0.18647501319646836,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.19520001411437987,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.00872500091791153
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.0038499891757965087,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.005599987506866455,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017499983310699462,
|
105 |
+
"tpp_threshold_5_total_metric": 0.005899989604949951,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.008799993991851806,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029000043869018555,
|
108 |
+
"tpp_threshold_10_total_metric": 0.008850002288818359,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.01119999885559082,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.002349996566772461,
|
111 |
+
"tpp_threshold_20_total_metric": 0.01929998993873596,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.021199989318847656,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0018999993801116944,
|
114 |
+
"tpp_threshold_50_total_metric": 0.03360001742839813,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.03660001754760742,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.0030000001192092896,
|
117 |
+
"tpp_threshold_100_total_metric": 0.05629999041557312,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.06019998788833618,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.0038999974727630614,
|
120 |
+
"tpp_threshold_500_total_metric": 0.23835001289844512,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.2446000099182129,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.006249997019767761
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.003099995851516724,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.005999994277954101,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.002899998426437378,
|
129 |
+
"tpp_threshold_5_total_metric": 0.0021000087261199953,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.005200004577636719,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.003099995851516724,
|
132 |
+
"tpp_threshold_10_total_metric": 0.01145000159740448,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.015600001811981202,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.004150000214576721,
|
135 |
+
"tpp_threshold_20_total_metric": 0.013299986720085144,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.01799999475479126,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.004700008034706116,
|
138 |
+
"tpp_threshold_50_total_metric": 0.027599984407424928,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.03259998559951782,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.005000001192092896,
|
141 |
+
"tpp_threshold_100_total_metric": 0.039649999141693114,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.049000000953674315,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.009350001811981201,
|
144 |
+
"tpp_threshold_500_total_metric": 0.13460001349449158,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.14580001831054687,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.011200004816055298
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "standard",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.0070000141859054565,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.001999989151954651,
|
184 |
+
"tpp_threshold_5_total_metric": 0.014750033617019653,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.017000019550323486,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.002249985933303833,
|
187 |
+
"tpp_threshold_10_total_metric": 0.005250036716461182,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.00700002908706665,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017499923706054688,
|
190 |
+
"tpp_threshold_20_total_metric": 0.015250027179718018,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.017000019550323486,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0017499923706054688,
|
193 |
+
"tpp_threshold_50_total_metric": 0.02700003981590271,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.030000030994415283,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0029999911785125732,
|
196 |
+
"tpp_threshold_100_total_metric": 0.04625000059604645,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.050999999046325684,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.004749998450279236,
|
199 |
+
"tpp_threshold_500_total_metric": 0.18425005674362183,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.18700003623962402,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.0027499794960021973
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0027499794960021973,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.0007500052452087402,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0025000721216201782,
|
208 |
+
"tpp_threshold_5_intended_diff_only": -0.001000046730041504,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0015000253915786743,
|
210 |
+
"tpp_threshold_10_total_metric": 0.00475001335144043,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0007500052452087402,
|
213 |
+
"tpp_threshold_20_total_metric": 0.0029999911785125732,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.0029999911785125732,
|
216 |
+
"tpp_threshold_50_total_metric": 0.02799999713897705,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.03200000524520874,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
|
219 |
+
"tpp_threshold_100_total_metric": 0.04924996197223663,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.05399996042251587,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.004749998450279236,
|
222 |
+
"tpp_threshold_500_total_metric": 0.164249986410141,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.171999990940094,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.007750004529953003
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.005499988794326782,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.003500014543533325,
|
230 |
+
"tpp_threshold_5_total_metric": 0.008000001311302185,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
|
233 |
+
"tpp_threshold_10_total_metric": 0.020500019192695618,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.023000001907348633,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.002499982714653015,
|
236 |
+
"tpp_threshold_20_total_metric": 0.029999971389770508,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.029999971389770508,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0,
|
239 |
+
"tpp_threshold_50_total_metric": 0.028249993920326233,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.02799999713897705,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": -0.00024999678134918213,
|
242 |
+
"tpp_threshold_100_total_metric": 0.040000006556510925,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.04100000858306885,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0010000020265579224,
|
245 |
+
"tpp_threshold_500_total_metric": 0.21049997210502625,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.21399998664855957,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.003500014543533325
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.00024999678134918213,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0007499903440475464,
|
253 |
+
"tpp_threshold_5_total_metric": 0.0012500286102294922,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.003000020980834961,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017499923706054688,
|
256 |
+
"tpp_threshold_10_total_metric": 0.002249985933303833,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037499964237213135,
|
259 |
+
"tpp_threshold_20_total_metric": 0.005749985575675964,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.00024999678134918213,
|
262 |
+
"tpp_threshold_50_total_metric": 0.00575004518032074,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.01100003719329834,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.0052499920129776,
|
265 |
+
"tpp_threshold_100_total_metric": 0.026749998331069946,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.02899998426437378,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.002249985933303833,
|
268 |
+
"tpp_threshold_500_total_metric": 0.2757500410079956,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.2850000262260437,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.009249985218048096
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.0037499666213989258,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.006999969482421875,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
276 |
+
"tpp_threshold_5_total_metric": 0.007999956607818604,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.011999964714050293,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0040000081062316895,
|
279 |
+
"tpp_threshold_10_total_metric": 0.011499956250190735,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.015999972820281982,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.0045000165700912476,
|
282 |
+
"tpp_threshold_20_total_metric": 0.04249997437000275,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.046999990940093994,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.0045000165700912476,
|
285 |
+
"tpp_threshold_50_total_metric": 0.07900001108646393,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.0820000171661377,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.003000006079673767,
|
288 |
+
"tpp_threshold_100_total_metric": 0.11924998462200165,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.12599998712539673,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.006750002503395081,
|
291 |
+
"tpp_threshold_500_total_metric": 0.357000008225441,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.36500000953674316,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.008000001311302185
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.010500013828277588,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.013999998569488525,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034999847412109375,
|
301 |
+
"tpp_threshold_5_total_metric": 0.009250059723854065,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.010000050067901611,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0007499903440475464,
|
304 |
+
"tpp_threshold_10_total_metric": 0.0065000057220458984,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.013000011444091797,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.0065000057220458984,
|
307 |
+
"tpp_threshold_20_total_metric": 0.008249998092651367,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.013000011444091797,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.00475001335144043,
|
310 |
+
"tpp_threshold_50_total_metric": 0.02350001037120819,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.023000001907348633,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": -0.0005000084638595581,
|
313 |
+
"tpp_threshold_100_total_metric": 0.016500025987625122,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.025000035762786865,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.008500009775161743,
|
316 |
+
"tpp_threshold_500_total_metric": 0.07200004160404205,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.07600003480911255,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.003999993205070496
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0009999573230743408,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0010000169277191162,
|
324 |
+
"tpp_threshold_5_total_metric": -0.007750019431114197,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.001999974250793457,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.009749993681907654,
|
327 |
+
"tpp_threshold_10_total_metric": 0.007749974727630615,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.010999977588653564,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
|
330 |
+
"tpp_threshold_20_total_metric": 0.005499944090843201,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.007999956607818604,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.002500012516975403,
|
333 |
+
"tpp_threshold_50_total_metric": 0.013999953866004944,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.0209999680519104,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.0070000141859054565,
|
336 |
+
"tpp_threshold_100_total_metric": 0.02374996244907379,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.034999966621398926,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.011250004172325134,
|
339 |
+
"tpp_threshold_500_total_metric": 0.11574997007846832,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.12699997425079346,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.011250004172325134
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.007500007748603821,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0040000081062316895,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034999996423721313,
|
347 |
+
"tpp_threshold_5_total_metric": 0.0004999935626983643,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.0004999935626983643,
|
350 |
+
"tpp_threshold_10_total_metric": 0.012000009417533875,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.013999998569488525,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.001999989151954651,
|
353 |
+
"tpp_threshold_20_total_metric": 0.002749994397163391,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.009000003337860107,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.006250008940696716,
|
356 |
+
"tpp_threshold_50_total_metric": 0.016749992966651917,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.018999993801116943,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.002250000834465027,
|
359 |
+
"tpp_threshold_100_total_metric": 0.040249988436698914,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.046999990940093994,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.006750002503395081,
|
362 |
+
"tpp_threshold_500_total_metric": 0.12049998342990875,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.1340000033378601,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.013500019907951355
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.0054999589920043945,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.001999974250793457,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034999847412109375,
|
370 |
+
"tpp_threshold_5_total_metric": -0.0027499794960021973,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.003000020980834961,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.005750000476837158,
|
373 |
+
"tpp_threshold_10_total_metric": -0.007249996066093445,
|
374 |
+
"tpp_threshold_10_intended_diff_only": -0.0009999871253967285,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.006250008940696716,
|
376 |
+
"tpp_threshold_20_total_metric": 0.008249983191490173,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.014999985694885254,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
|
379 |
+
"tpp_threshold_50_total_metric": 0.020749986171722412,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.02899998426437378,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.008249998092651367,
|
382 |
+
"tpp_threshold_100_total_metric": 0.03675001859664917,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.04900002479553223,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.012250006198883057,
|
385 |
+
"tpp_threshold_500_total_metric": 0.11750003695487976,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.13600003719329834,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.01850000023841858
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.016999974846839905,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.019999980926513672,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000006079673767,
|
393 |
+
"tpp_threshold_5_total_metric": 0.01124998927116394,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.009999990463256836,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
|
396 |
+
"tpp_threshold_10_total_metric": 0.03825001418590546,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.04100000858306885,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.002749994397163391,
|
399 |
+
"tpp_threshold_20_total_metric": 0.04175001382827759,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.04500001668930054,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
|
402 |
+
"tpp_threshold_50_total_metric": 0.06299997866153717,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.07099997997283936,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.008000001311302185,
|
405 |
+
"tpp_threshold_100_total_metric": 0.08100000023841858,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.08899998664855957,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.007999986410140991,
|
408 |
+
"tpp_threshold_500_total_metric": 0.247250035405159,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.2560000419616699,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.008750006556510925
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
old_relu_eval_results/tpp/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "9cc117bf-8880-4c1e-be3d-07555801ce77",
|
73 |
+
"datetime_epoch_millis": 1738808138146,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.0010750010609626769,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.0034000039100646976,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.00232500284910202,
|
79 |
+
"tpp_threshold_5_total_metric": 0.0006999984383583069,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.0031000018119812013,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0024000033736228943,
|
82 |
+
"tpp_threshold_10_total_metric": 0.005324994027614593,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.008099997043609619,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0027750030159950255,
|
85 |
+
"tpp_threshold_20_total_metric": 0.009074991941452027,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.01199999451637268,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.0029250025749206544,
|
88 |
+
"tpp_threshold_50_total_metric": 0.019300003349781037,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.02290000319480896,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.003599999845027924,
|
91 |
+
"tpp_threshold_100_total_metric": 0.029224996268749238,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.03529999852180481,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.0060750022530555725,
|
94 |
+
"tpp_threshold_500_total_metric": 0.11080000549554825,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.11790000796318054,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.007100002467632293
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.002349993586540222,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.0039999961853027345,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0016500025987625123,
|
105 |
+
"tpp_threshold_5_total_metric": 0.003099992871284485,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.001900002360343933,
|
108 |
+
"tpp_threshold_10_total_metric": 0.006099992990493774,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.007799994945526123,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0017000019550323486,
|
111 |
+
"tpp_threshold_20_total_metric": 0.011999988555908203,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.013399994373321534,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.0014000058174133301,
|
114 |
+
"tpp_threshold_50_total_metric": 0.02170000672340393,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.02380000352859497,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.00209999680519104,
|
117 |
+
"tpp_threshold_100_total_metric": 0.03549998998641968,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.03879998922348023,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.003299999237060547,
|
120 |
+
"tpp_threshold_500_total_metric": 0.1364999920129776,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.14199999570846558,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.005500003695487976
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": -0.00019999146461486817,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.0028000116348266602,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000003099441528,
|
129 |
+
"tpp_threshold_5_total_metric": -0.0016999959945678711,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.0012000083923339843,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029000043869018555,
|
132 |
+
"tpp_threshold_10_total_metric": 0.004549995064735413,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.008399999141693116,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.0038500040769577025,
|
135 |
+
"tpp_threshold_20_total_metric": 0.00614999532699585,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.010599994659423828,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.004449999332427979,
|
138 |
+
"tpp_threshold_50_total_metric": 0.016899999976158143,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.02200000286102295,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.0051000028848648075,
|
141 |
+
"tpp_threshold_100_total_metric": 0.022950002551078798,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.03180000782012939,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.008850005269050599,
|
144 |
+
"tpp_threshold_500_total_metric": 0.08510001897811889,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.0938000202178955,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.00870000123977661
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "standard",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.005500033497810364,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.00700002908706665,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0014999955892562866,
|
184 |
+
"tpp_threshold_5_total_metric": 0.010000035166740417,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.01100003719329834,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0010000020265579224,
|
187 |
+
"tpp_threshold_10_total_metric": 0.002500012516975403,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.0040000081062316895,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0014999955892562866,
|
190 |
+
"tpp_threshold_20_total_metric": 0.008999988436698914,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.009999990463256836,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0010000020265579224,
|
193 |
+
"tpp_threshold_50_total_metric": 0.016499996185302734,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.018999993801116943,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.002499997615814209,
|
196 |
+
"tpp_threshold_100_total_metric": 0.02399998903274536,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.02799999713897705,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.0040000081062316895,
|
199 |
+
"tpp_threshold_500_total_metric": 0.12150004506111145,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.12200003862380981,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.0004999935626983643
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0032499581575393677,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.0029999613761901855,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.00024999678134918213,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0025000572204589844,
|
208 |
+
"tpp_threshold_5_intended_diff_only": -0.001000046730041504,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0015000104904174805,
|
210 |
+
"tpp_threshold_10_total_metric": 0.002499982714653015,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.0009999871253967285,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0014999955892562866,
|
213 |
+
"tpp_threshold_20_total_metric": 0.0029999911785125732,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.004999995231628418,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.0020000040531158447,
|
216 |
+
"tpp_threshold_50_total_metric": 0.02799999713897705,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.03200000524520874,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.0040000081062316895,
|
219 |
+
"tpp_threshold_100_total_metric": 0.039999961853027344,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.04499995708465576,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.004999995231628418,
|
222 |
+
"tpp_threshold_500_total_metric": 0.11849997937679291,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.12599998712539673,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.007500007748603821
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": -0.000500023365020752,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.002499997615814209,
|
230 |
+
"tpp_threshold_5_total_metric": 0.002249971032142639,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.0059999823570251465,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.0037500113248825073,
|
233 |
+
"tpp_threshold_10_total_metric": 0.01600000262260437,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.018000006675720215,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
|
236 |
+
"tpp_threshold_20_total_metric": 0.02374999225139618,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.02399998903274536,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.00024999678134918213,
|
239 |
+
"tpp_threshold_50_total_metric": 0.01800002157688141,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.017000019550323486,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": -0.0010000020265579224,
|
242 |
+
"tpp_threshold_100_total_metric": 0.01874999701976776,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.018999993801116943,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.00024999678134918213,
|
245 |
+
"tpp_threshold_500_total_metric": 0.0807499885559082,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.07999998331069946,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": -0.0007500052452087402
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0017500072717666626,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.003000020980834961,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0012500137090682983,
|
253 |
+
"tpp_threshold_5_total_metric": 0.0022500455379486084,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.000250011682510376,
|
256 |
+
"tpp_threshold_10_total_metric": 0.0032499730587005615,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.0059999823570251465,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.002750009298324585,
|
259 |
+
"tpp_threshold_20_total_metric": 0.006749972701072693,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.0059999823570251465,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": -0.0007499903440475464,
|
262 |
+
"tpp_threshold_50_total_metric": 0.009000048041343689,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.01100003719329834,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.001999989151954651,
|
265 |
+
"tpp_threshold_100_total_metric": 0.016749978065490723,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.019999980926513672,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.0032500028610229492,
|
268 |
+
"tpp_threshold_500_total_metric": 0.16824999451637268,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.1809999942779541,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.012749999761581421
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.0017499923706054688,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.004999995231628418,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
276 |
+
"tpp_threshold_5_total_metric": 0.0034999698400497437,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0034999996423721313,
|
279 |
+
"tpp_threshold_10_total_metric": 0.0062499940395355225,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037499964237213135,
|
282 |
+
"tpp_threshold_20_total_metric": 0.017499998211860657,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.022000014781951904,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.0045000165700912476,
|
285 |
+
"tpp_threshold_50_total_metric": 0.03699997067451477,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.039999961853027344,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.0029999911785125732,
|
288 |
+
"tpp_threshold_100_total_metric": 0.0780000239610672,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.0820000171661377,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.003999993205070496,
|
291 |
+
"tpp_threshold_500_total_metric": 0.19349995255470276,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.20099997520446777,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.007500022649765015
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.01175004243850708,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.01500004529953003,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
301 |
+
"tpp_threshold_5_total_metric": 0.006250008940696716,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.008000016212463379,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017500072717666626,
|
304 |
+
"tpp_threshold_10_total_metric": 0.003000006079673767,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.00599999725818634,
|
307 |
+
"tpp_threshold_20_total_metric": 0.003750041127204895,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.00700002908706665,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032499879598617554,
|
310 |
+
"tpp_threshold_50_total_metric": 0.015000015497207642,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.013999998569488525,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": -0.0010000169277191162,
|
313 |
+
"tpp_threshold_100_total_metric": 0.00850003957748413,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.01500004529953003,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.0065000057220458984,
|
316 |
+
"tpp_threshold_500_total_metric": 0.033750057220458984,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.03800004720687866,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.004249989986419678
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.0012499690055847168,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0007500052452087402,
|
324 |
+
"tpp_threshold_5_total_metric": -0.009000003337860107,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.009000003337860107,
|
327 |
+
"tpp_threshold_10_total_metric": 0.004499971866607666,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.006999969482421875,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
|
330 |
+
"tpp_threshold_20_total_metric": -0.0012499988079071045,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.0,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.0012499988079071045,
|
333 |
+
"tpp_threshold_50_total_metric": 0.00349995493888855,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.010999977588653564,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.007500022649765015,
|
336 |
+
"tpp_threshold_100_total_metric": 0.00899997353553772,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.019999980926513672,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.011000007390975952,
|
339 |
+
"tpp_threshold_500_total_metric": 0.07375001907348633,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.0820000171661377,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.008249998092651367
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.009000003337860107,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.004999995231628418,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
|
347 |
+
"tpp_threshold_5_total_metric": -0.0052499920129776,
|
348 |
+
"tpp_threshold_5_intended_diff_only": -0.0059999823570251465,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": -0.0007499903440475464,
|
350 |
+
"tpp_threshold_10_total_metric": 0.007499992847442627,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0015000104904174805,
|
353 |
+
"tpp_threshold_20_total_metric": -0.004000037908554077,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.001999974250793457,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.006000012159347534,
|
356 |
+
"tpp_threshold_50_total_metric": 0.010999992489814758,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.013999998569488525,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.003000006079673767,
|
359 |
+
"tpp_threshold_100_total_metric": 0.030500009655952454,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.03600001335144043,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.005500003695487976,
|
362 |
+
"tpp_threshold_500_total_metric": 0.07175000011920929,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.08300000429153442,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.011250004172325134
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": -0.009249985218048096,
|
368 |
+
"tpp_threshold_2_intended_diff_only": -0.0059999823570251465,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
370 |
+
"tpp_threshold_5_total_metric": -0.006749987602233887,
|
371 |
+
"tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.005750000476837158,
|
373 |
+
"tpp_threshold_10_total_metric": -0.012249991297721863,
|
374 |
+
"tpp_threshold_10_intended_diff_only": -0.006999969482421875,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.005250021815299988,
|
376 |
+
"tpp_threshold_20_total_metric": -0.005250006914138794,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.0009999871253967285,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.0062499940395355225,
|
379 |
+
"tpp_threshold_50_total_metric": 0.005750015377998352,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.013000011444091797,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.007249996066093445,
|
382 |
+
"tpp_threshold_100_total_metric": 0.009499996900558472,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.023000001907348633,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.013500005006790161,
|
385 |
+
"tpp_threshold_500_total_metric": 0.06675000488758087,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.078000009059906,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.011250004172325134
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.004250019788742065,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.008000016212463379,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
393 |
+
"tpp_threshold_5_total_metric": 0.0062499940395355225,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0012499988079071045,
|
396 |
+
"tpp_threshold_10_total_metric": 0.019999995827674866,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.02399998903274536,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.003999993205070496,
|
399 |
+
"tpp_threshold_20_total_metric": 0.03749997913837433,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.042999982833862305,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.005500003695487976,
|
402 |
+
"tpp_threshold_50_total_metric": 0.04925002157688141,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.058000028133392334,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.008750006556510925,
|
405 |
+
"tpp_threshold_100_total_metric": 0.057249993085861206,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.06499999761581421,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.007750004529953003,
|
408 |
+
"tpp_threshold_500_total_metric": 0.179500013589859,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.18800002336502075,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.008500009775161743
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "27152fa7-494d-44cf-ac35-e2e33052a1b4",
|
37 |
+
"datetime_epoch_millis": 1738815248274,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.007504701614379883
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_0",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "standard",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "b6ab776e-c77a-46bb-9b89-5bd9905335f9",
|
37 |
+
"datetime_epoch_millis": 1738815974660,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.015009403228759766
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_1",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "standard",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "4e677628-eb8d-40f5-ade8-e25072775bf1",
|
37 |
+
"datetime_epoch_millis": 1738818108714,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.06378984451293945
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_2",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "standard",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "9001c398-92b1-4daf-8d14-cd80c5147849",
|
37 |
+
"datetime_epoch_millis": 1738817408434,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.0863039493560791
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_3",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "standard",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "68919444-b3fe-445b-b7d6-7eadd952f2a7",
|
37 |
+
"datetime_epoch_millis": 1738818814082,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.09943711757659912
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_4",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "standard",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
old_relu_eval_results/unlearning/temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "39440e8a-6927-4b07-b9d8-8bf75861a3b2",
|
37 |
+
"datetime_epoch_millis": 1738816689242,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.23452156782150269
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_old_relu_google_gemma-2-2b_standard_resid_post_layer_12_trainer_5",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "standard",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|