Add files using upload-large-folder tool
Browse files- .gitattributes +5 -0
- random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +268 -0
- random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +268 -0
- random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +268 -0
- random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +268 -0
- random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +268 -0
- random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +3 -0
- random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +3 -0
- random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +3 -0
- random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +3 -0
- random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +3 -0
- random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +0 -0
- random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +0 -0
- random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +0 -0
- random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +0 -0
- random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +0 -0
- random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +323 -0
- random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +323 -0
- random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +323 -0
- random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +323 -0
- random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +323 -0
- random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +670 -0
- random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +670 -0
- random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +670 -0
- random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +670 -0
- random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +670 -0
- random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +414 -0
- random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +414 -0
- random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +414 -0
- random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +414 -0
- random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +414 -0
- random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json +74 -0
- random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json +74 -0
- random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json +74 -0
- random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json +74 -0
- random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json +74 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
39 |
+
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
40 |
+
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json filter=lfs diff=lfs merge=lfs -text
|
random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "48f426fa-d13b-4265-8871-5585ec45578b",
|
17 |
+
"datetime_epoch_millis": 1738784829328,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.1714686461548928,
|
21 |
+
"mean_full_absorption_score": 0.15386796931592747,
|
22 |
+
"mean_num_split_features": 1.2307692307692308,
|
23 |
+
"std_dev_absorption_fraction_score": 0.15554948458633766,
|
24 |
+
"std_dev_full_absorption_score": 0.14785339675818804,
|
25 |
+
"std_dev_num_split_features": 0.5870395085642742
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.10821684724823694,
|
32 |
+
"full_absorption_rate": 0.05433932759968726,
|
33 |
+
"num_full_absorption": 139,
|
34 |
+
"num_probe_true_positives": 2558,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.030210348040788013,
|
40 |
+
"full_absorption_rate": 0.017879948914431672,
|
41 |
+
"num_full_absorption": 28,
|
42 |
+
"num_probe_true_positives": 1566,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.41961031215817146,
|
48 |
+
"full_absorption_rate": 0.3772369362920544,
|
49 |
+
"num_full_absorption": 1054,
|
50 |
+
"num_probe_true_positives": 2794,
|
51 |
+
"num_split_features": 3
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.3467083150960684,
|
56 |
+
"full_absorption_rate": 0.232981220657277,
|
57 |
+
"num_full_absorption": 397,
|
58 |
+
"num_probe_true_positives": 1704,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.2815750993327208,
|
64 |
+
"full_absorption_rate": 0.2759493670886076,
|
65 |
+
"num_full_absorption": 436,
|
66 |
+
"num_probe_true_positives": 1580,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.15487083167478738,
|
72 |
+
"full_absorption_rate": 0.11884550084889643,
|
73 |
+
"num_full_absorption": 140,
|
74 |
+
"num_probe_true_positives": 1178,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.07421657717906203,
|
80 |
+
"full_absorption_rate": 0.07180851063829788,
|
81 |
+
"num_full_absorption": 81,
|
82 |
+
"num_probe_true_positives": 1128,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.05684645316227957,
|
88 |
+
"full_absorption_rate": 0.026449643947100712,
|
89 |
+
"num_full_absorption": 26,
|
90 |
+
"num_probe_true_positives": 983,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.36796796561960116,
|
96 |
+
"full_absorption_rate": 0.39185140802875973,
|
97 |
+
"num_full_absorption": 654,
|
98 |
+
"num_probe_true_positives": 1669,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.001114532296360666,
|
104 |
+
"full_absorption_rate": 0.01366742596810934,
|
105 |
+
"num_full_absorption": 6,
|
106 |
+
"num_probe_true_positives": 439,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0005988111448050127,
|
112 |
+
"full_absorption_rate": 0.004310344827586207,
|
113 |
+
"num_full_absorption": 3,
|
114 |
+
"num_probe_true_positives": 696,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.18851606271745078,
|
120 |
+
"full_absorption_rate": 0.18158347676419967,
|
121 |
+
"num_full_absorption": 211,
|
122 |
+
"num_probe_true_positives": 1162,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.2570751753813015,
|
128 |
+
"full_absorption_rate": 0.1906318082788671,
|
129 |
+
"num_full_absorption": 350,
|
130 |
+
"num_probe_true_positives": 1836,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.06637348713907784,
|
136 |
+
"full_absorption_rate": 0.05115712545676005,
|
137 |
+
"num_full_absorption": 42,
|
138 |
+
"num_probe_true_positives": 821,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.18381166786624634,
|
144 |
+
"full_absorption_rate": 0.21673003802281368,
|
145 |
+
"num_full_absorption": 228,
|
146 |
+
"num_probe_true_positives": 1052,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.4767444460243892,
|
152 |
+
"full_absorption_rate": 0.4237362637362637,
|
153 |
+
"num_full_absorption": 964,
|
154 |
+
"num_probe_true_positives": 2275,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.014007514920590125,
|
160 |
+
"full_absorption_rate": 0.020512820512820513,
|
161 |
+
"num_full_absorption": 4,
|
162 |
+
"num_probe_true_positives": 195,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.3248090798755296,
|
168 |
+
"full_absorption_rate": 0.2704773129051267,
|
169 |
+
"num_full_absorption": 459,
|
170 |
+
"num_probe_true_positives": 1697,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.47029052751768835,
|
176 |
+
"full_absorption_rate": 0.4606123151014792,
|
177 |
+
"num_full_absorption": 1339,
|
178 |
+
"num_probe_true_positives": 2907,
|
179 |
+
"num_split_features": 3
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.17209049881072833,
|
184 |
+
"full_absorption_rate": 0.09523809523809523,
|
185 |
+
"num_full_absorption": 160,
|
186 |
+
"num_probe_true_positives": 1680,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.282661389531355,
|
192 |
+
"full_absorption_rate": 0.33077905491698595,
|
193 |
+
"num_full_absorption": 259,
|
194 |
+
"num_probe_true_positives": 783,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.05199868006616804,
|
200 |
+
"full_absorption_rate": 0.05089058524173028,
|
201 |
+
"num_full_absorption": 40,
|
202 |
+
"num_probe_true_positives": 786,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.06104382806297723,
|
208 |
+
"full_absorption_rate": 0.08006279434850863,
|
209 |
+
"num_full_absorption": 51,
|
210 |
+
"num_probe_true_positives": 637,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.05422639180774973,
|
216 |
+
"full_absorption_rate": 0.019417475728155338,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 103,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.01259995735307954,
|
224 |
+
"full_absorption_rate": 0.011560693641618497,
|
225 |
+
"num_full_absorption": 2,
|
226 |
+
"num_probe_true_positives": 173,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0,
|
232 |
+
"full_absorption_rate": 0.011857707509881422,
|
233 |
+
"num_full_absorption": 3,
|
234 |
+
"num_probe_true_positives": 253,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "65edad50-a831-4080-9ef3-0543dc2bdb6d",
|
17 |
+
"datetime_epoch_millis": 1738783917967,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.1711332913395652,
|
21 |
+
"mean_full_absorption_score": 0.15146027486674538,
|
22 |
+
"mean_num_split_features": 1.3076923076923077,
|
23 |
+
"std_dev_absorption_fraction_score": 0.16355798531094715,
|
24 |
+
"std_dev_full_absorption_score": 0.15005451741951137,
|
25 |
+
"std_dev_num_split_features": 0.7358929688062399
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.13034142230683501,
|
32 |
+
"full_absorption_rate": 0.0727130570758405,
|
33 |
+
"num_full_absorption": 186,
|
34 |
+
"num_probe_true_positives": 2558,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.03860032573759929,
|
40 |
+
"full_absorption_rate": 0.02234993614303959,
|
41 |
+
"num_full_absorption": 35,
|
42 |
+
"num_probe_true_positives": 1566,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.48111974773989397,
|
48 |
+
"full_absorption_rate": 0.4359341445955619,
|
49 |
+
"num_full_absorption": 1218,
|
50 |
+
"num_probe_true_positives": 2794,
|
51 |
+
"num_split_features": 3
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.490319824411387,
|
56 |
+
"full_absorption_rate": 0.4025821596244131,
|
57 |
+
"num_full_absorption": 686,
|
58 |
+
"num_probe_true_positives": 1704,
|
59 |
+
"num_split_features": 1
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.2562253140403955,
|
64 |
+
"full_absorption_rate": 0.24746835443037973,
|
65 |
+
"num_full_absorption": 391,
|
66 |
+
"num_probe_true_positives": 1580,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.13061784782524904,
|
72 |
+
"full_absorption_rate": 0.09083191850594227,
|
73 |
+
"num_full_absorption": 107,
|
74 |
+
"num_probe_true_positives": 1178,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.08564969312088162,
|
80 |
+
"full_absorption_rate": 0.08067375886524823,
|
81 |
+
"num_full_absorption": 91,
|
82 |
+
"num_probe_true_positives": 1128,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.05383191935060708,
|
88 |
+
"full_absorption_rate": 0.024415055951169887,
|
89 |
+
"num_full_absorption": 24,
|
90 |
+
"num_probe_true_positives": 983,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.30973405186275194,
|
96 |
+
"full_absorption_rate": 0.32594367884961056,
|
97 |
+
"num_full_absorption": 544,
|
98 |
+
"num_probe_true_positives": 1669,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0012255374185012769,
|
104 |
+
"full_absorption_rate": 0.01366742596810934,
|
105 |
+
"num_full_absorption": 6,
|
106 |
+
"num_probe_true_positives": 439,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0002946951428704943,
|
112 |
+
"full_absorption_rate": 0.005747126436781609,
|
113 |
+
"num_full_absorption": 4,
|
114 |
+
"num_probe_true_positives": 696,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.20008709482189874,
|
120 |
+
"full_absorption_rate": 0.18932874354561102,
|
121 |
+
"num_full_absorption": 220,
|
122 |
+
"num_probe_true_positives": 1162,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.12115143225674394,
|
128 |
+
"full_absorption_rate": 0.08387799564270153,
|
129 |
+
"num_full_absorption": 154,
|
130 |
+
"num_probe_true_positives": 1836,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.05617340818231512,
|
136 |
+
"full_absorption_rate": 0.040194884287454324,
|
137 |
+
"num_full_absorption": 33,
|
138 |
+
"num_probe_true_positives": 821,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.19799995405281123,
|
144 |
+
"full_absorption_rate": 0.21387832699619772,
|
145 |
+
"num_full_absorption": 225,
|
146 |
+
"num_probe_true_positives": 1052,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.42373552854634816,
|
152 |
+
"full_absorption_rate": 0.3665934065934066,
|
153 |
+
"num_full_absorption": 834,
|
154 |
+
"num_probe_true_positives": 2275,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.019513367217384434,
|
160 |
+
"full_absorption_rate": 0.020512820512820513,
|
161 |
+
"num_full_absorption": 4,
|
162 |
+
"num_probe_true_positives": 195,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.38678566067757764,
|
168 |
+
"full_absorption_rate": 0.3252799057159694,
|
169 |
+
"num_full_absorption": 552,
|
170 |
+
"num_probe_true_positives": 1697,
|
171 |
+
"num_split_features": 1
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.4675735623770602,
|
176 |
+
"full_absorption_rate": 0.4196766425868593,
|
177 |
+
"num_full_absorption": 1220,
|
178 |
+
"num_probe_true_positives": 2907,
|
179 |
+
"num_split_features": 4
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.19115083064649022,
|
184 |
+
"full_absorption_rate": 0.11964285714285715,
|
185 |
+
"num_full_absorption": 201,
|
186 |
+
"num_probe_true_positives": 1680,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.24797733259678253,
|
192 |
+
"full_absorption_rate": 0.30140485312899107,
|
193 |
+
"num_full_absorption": 236,
|
194 |
+
"num_probe_true_positives": 783,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.045272538797872366,
|
200 |
+
"full_absorption_rate": 0.043256997455470736,
|
201 |
+
"num_full_absorption": 34,
|
202 |
+
"num_probe_true_positives": 786,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.06742398501652283,
|
208 |
+
"full_absorption_rate": 0.06279434850863422,
|
209 |
+
"num_full_absorption": 40,
|
210 |
+
"num_probe_true_positives": 637,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.016888002349278078,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 103,
|
219 |
+
"num_split_features": 2
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.02889315611561085,
|
224 |
+
"full_absorption_rate": 0.017341040462427744,
|
225 |
+
"num_full_absorption": 3,
|
226 |
+
"num_probe_true_positives": 173,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0008793422170269473,
|
232 |
+
"full_absorption_rate": 0.011857707509881422,
|
233 |
+
"num_full_absorption": 3,
|
234 |
+
"num_probe_true_positives": 253,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "6aeabb62-1ccd-4e13-acae-ec063ce1b514",
|
17 |
+
"datetime_epoch_millis": 1738785735688,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.1680556606743435,
|
21 |
+
"mean_full_absorption_score": 0.15191023322678118,
|
22 |
+
"mean_num_split_features": 1.4615384615384615,
|
23 |
+
"std_dev_absorption_fraction_score": 0.14863662312386292,
|
24 |
+
"std_dev_full_absorption_score": 0.13647390993364086,
|
25 |
+
"std_dev_num_split_features": 0.9891721480417544
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.13047466988525097,
|
32 |
+
"full_absorption_rate": 0.0766223612197029,
|
33 |
+
"num_full_absorption": 196,
|
34 |
+
"num_probe_true_positives": 2558,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.023848564485708528,
|
40 |
+
"full_absorption_rate": 0.017241379310344827,
|
41 |
+
"num_full_absorption": 27,
|
42 |
+
"num_probe_true_positives": 1566,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.44742713898495284,
|
48 |
+
"full_absorption_rate": 0.3890479599141016,
|
49 |
+
"num_full_absorption": 1087,
|
50 |
+
"num_probe_true_positives": 2794,
|
51 |
+
"num_split_features": 4
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.3708607264352453,
|
56 |
+
"full_absorption_rate": 0.2711267605633803,
|
57 |
+
"num_full_absorption": 462,
|
58 |
+
"num_probe_true_positives": 1704,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.32176505353839885,
|
64 |
+
"full_absorption_rate": 0.3158227848101266,
|
65 |
+
"num_full_absorption": 499,
|
66 |
+
"num_probe_true_positives": 1580,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.14714927842788375,
|
72 |
+
"full_absorption_rate": 0.11120543293718166,
|
73 |
+
"num_full_absorption": 131,
|
74 |
+
"num_probe_true_positives": 1178,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.08731746860539064,
|
80 |
+
"full_absorption_rate": 0.08156028368794327,
|
81 |
+
"num_full_absorption": 92,
|
82 |
+
"num_probe_true_positives": 1128,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.055969492596627826,
|
88 |
+
"full_absorption_rate": 0.03153611393692777,
|
89 |
+
"num_full_absorption": 31,
|
90 |
+
"num_probe_true_positives": 983,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.2785833366797728,
|
96 |
+
"full_absorption_rate": 0.2995805871779509,
|
97 |
+
"num_full_absorption": 500,
|
98 |
+
"num_probe_true_positives": 1669,
|
99 |
+
"num_split_features": 2
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0017460357133270942,
|
104 |
+
"full_absorption_rate": 0.01366742596810934,
|
105 |
+
"num_full_absorption": 6,
|
106 |
+
"num_probe_true_positives": 439,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0007196477033161077,
|
112 |
+
"full_absorption_rate": 0.005747126436781609,
|
113 |
+
"num_full_absorption": 4,
|
114 |
+
"num_probe_true_positives": 696,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.13922218898438524,
|
120 |
+
"full_absorption_rate": 0.1333907056798623,
|
121 |
+
"num_full_absorption": 155,
|
122 |
+
"num_probe_true_positives": 1162,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.18222620640573445,
|
128 |
+
"full_absorption_rate": 0.12418300653594772,
|
129 |
+
"num_full_absorption": 228,
|
130 |
+
"num_probe_true_positives": 1836,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.06866407604449835,
|
136 |
+
"full_absorption_rate": 0.05481120584652863,
|
137 |
+
"num_full_absorption": 45,
|
138 |
+
"num_probe_true_positives": 821,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.2887489800731137,
|
144 |
+
"full_absorption_rate": 0.3612167300380228,
|
145 |
+
"num_full_absorption": 380,
|
146 |
+
"num_probe_true_positives": 1052,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.44873846584887894,
|
152 |
+
"full_absorption_rate": 0.378021978021978,
|
153 |
+
"num_full_absorption": 860,
|
154 |
+
"num_probe_true_positives": 2275,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.041601474064769904,
|
160 |
+
"full_absorption_rate": 0.05641025641025641,
|
161 |
+
"num_full_absorption": 11,
|
162 |
+
"num_probe_true_positives": 195,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.3102650866902534,
|
168 |
+
"full_absorption_rate": 0.25987035945786685,
|
169 |
+
"num_full_absorption": 441,
|
170 |
+
"num_probe_true_positives": 1697,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.4414031454169644,
|
176 |
+
"full_absorption_rate": 0.4004127966976264,
|
177 |
+
"num_full_absorption": 1164,
|
178 |
+
"num_probe_true_positives": 2907,
|
179 |
+
"num_split_features": 5
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.2128788270041167,
|
184 |
+
"full_absorption_rate": 0.14345238095238094,
|
185 |
+
"num_full_absorption": 241,
|
186 |
+
"num_probe_true_positives": 1680,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.15207178611487968,
|
192 |
+
"full_absorption_rate": 0.22988505747126436,
|
193 |
+
"num_full_absorption": 180,
|
194 |
+
"num_probe_true_positives": 783,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.02597114917923278,
|
200 |
+
"full_absorption_rate": 0.03307888040712468,
|
201 |
+
"num_full_absorption": 26,
|
202 |
+
"num_probe_true_positives": 786,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.08939131791526586,
|
208 |
+
"full_absorption_rate": 0.08791208791208792,
|
209 |
+
"num_full_absorption": 56,
|
210 |
+
"num_probe_true_positives": 637,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.0674159775843633,
|
216 |
+
"full_absorption_rate": 0.019417475728155338,
|
217 |
+
"num_full_absorption": 2,
|
218 |
+
"num_probe_true_positives": 103,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.034987083150599306,
|
224 |
+
"full_absorption_rate": 0.03468208092485549,
|
225 |
+
"num_full_absorption": 6,
|
226 |
+
"num_probe_true_positives": 173,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.0,
|
232 |
+
"full_absorption_rate": 0.019762845849802372,
|
233 |
+
"num_full_absorption": 5,
|
234 |
+
"num_probe_true_positives": 253,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "a615335e-36f1-40d4-80c8-a632dba17d19",
|
17 |
+
"datetime_epoch_millis": 1738786653640,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.17538157541809793,
|
21 |
+
"mean_full_absorption_score": 0.1514468976607655,
|
22 |
+
"mean_num_split_features": 1.2692307692307692,
|
23 |
+
"std_dev_absorption_fraction_score": 0.16584926866768673,
|
24 |
+
"std_dev_full_absorption_score": 0.15332607817709293,
|
25 |
+
"std_dev_num_split_features": 0.6667948594698258
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.1843346778809071,
|
32 |
+
"full_absorption_rate": 0.0965598123534011,
|
33 |
+
"num_full_absorption": 247,
|
34 |
+
"num_probe_true_positives": 2558,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.027441769972842704,
|
40 |
+
"full_absorption_rate": 0.01532567049808429,
|
41 |
+
"num_full_absorption": 24,
|
42 |
+
"num_probe_true_positives": 1566,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.5391824266226292,
|
48 |
+
"full_absorption_rate": 0.4874731567644954,
|
49 |
+
"num_full_absorption": 1362,
|
50 |
+
"num_probe_true_positives": 2794,
|
51 |
+
"num_split_features": 1
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.3774188362026092,
|
56 |
+
"full_absorption_rate": 0.2494131455399061,
|
57 |
+
"num_full_absorption": 425,
|
58 |
+
"num_probe_true_positives": 1704,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.33716358784198536,
|
64 |
+
"full_absorption_rate": 0.3120253164556962,
|
65 |
+
"num_full_absorption": 493,
|
66 |
+
"num_probe_true_positives": 1580,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.1350976074162719,
|
72 |
+
"full_absorption_rate": 0.100169779286927,
|
73 |
+
"num_full_absorption": 118,
|
74 |
+
"num_probe_true_positives": 1178,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.08468221916767832,
|
80 |
+
"full_absorption_rate": 0.0797872340425532,
|
81 |
+
"num_full_absorption": 90,
|
82 |
+
"num_probe_true_positives": 1128,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.0547531455889232,
|
88 |
+
"full_absorption_rate": 0.021363173957273652,
|
89 |
+
"num_full_absorption": 21,
|
90 |
+
"num_probe_true_positives": 983,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.3798311805573998,
|
96 |
+
"full_absorption_rate": 0.3996405032953865,
|
97 |
+
"num_full_absorption": 667,
|
98 |
+
"num_probe_true_positives": 1669,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0035791146766769285,
|
104 |
+
"full_absorption_rate": 0.01366742596810934,
|
105 |
+
"num_full_absorption": 6,
|
106 |
+
"num_probe_true_positives": 439,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0033612951454096854,
|
112 |
+
"full_absorption_rate": 0.007183908045977011,
|
113 |
+
"num_full_absorption": 5,
|
114 |
+
"num_probe_true_positives": 696,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.21961957691643827,
|
120 |
+
"full_absorption_rate": 0.1919104991394148,
|
121 |
+
"num_full_absorption": 223,
|
122 |
+
"num_probe_true_positives": 1162,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.08392659102380813,
|
128 |
+
"full_absorption_rate": 0.04847494553376906,
|
129 |
+
"num_full_absorption": 89,
|
130 |
+
"num_probe_true_positives": 1836,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.08025181361638342,
|
136 |
+
"full_absorption_rate": 0.06090133982947625,
|
137 |
+
"num_full_absorption": 50,
|
138 |
+
"num_probe_true_positives": 821,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.16317207769449307,
|
144 |
+
"full_absorption_rate": 0.17680608365019013,
|
145 |
+
"num_full_absorption": 186,
|
146 |
+
"num_probe_true_positives": 1052,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.5201401741517296,
|
152 |
+
"full_absorption_rate": 0.4320879120879121,
|
153 |
+
"num_full_absorption": 983,
|
154 |
+
"num_probe_true_positives": 2275,
|
155 |
+
"num_split_features": 2
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.033809387805282545,
|
160 |
+
"full_absorption_rate": 0.046153846153846156,
|
161 |
+
"num_full_absorption": 9,
|
162 |
+
"num_probe_true_positives": 195,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.265286753988254,
|
168 |
+
"full_absorption_rate": 0.2139068945197407,
|
169 |
+
"num_full_absorption": 363,
|
170 |
+
"num_probe_true_positives": 1697,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.4506998861290704,
|
176 |
+
"full_absorption_rate": 0.4213966288269694,
|
177 |
+
"num_full_absorption": 1225,
|
178 |
+
"num_probe_true_positives": 2907,
|
179 |
+
"num_split_features": 4
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.20785809991779755,
|
184 |
+
"full_absorption_rate": 0.12083333333333333,
|
185 |
+
"num_full_absorption": 203,
|
186 |
+
"num_probe_true_positives": 1680,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.21319969783745676,
|
192 |
+
"full_absorption_rate": 0.30140485312899107,
|
193 |
+
"num_full_absorption": 236,
|
194 |
+
"num_probe_true_positives": 783,
|
195 |
+
"num_split_features": 2
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.04640054552065163,
|
200 |
+
"full_absorption_rate": 0.03689567430025445,
|
201 |
+
"num_full_absorption": 29,
|
202 |
+
"num_probe_true_positives": 786,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.08355275384554323,
|
208 |
+
"full_absorption_rate": 0.0847723704866562,
|
209 |
+
"num_full_absorption": 54,
|
210 |
+
"num_probe_true_positives": 637,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.03616653815832325,
|
216 |
+
"full_absorption_rate": 0.0,
|
217 |
+
"num_full_absorption": 0,
|
218 |
+
"num_probe_true_positives": 103,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.018065386940751115,
|
224 |
+
"full_absorption_rate": 0.011560693641618497,
|
225 |
+
"num_full_absorption": 2,
|
226 |
+
"num_probe_true_positives": 173,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.010925816251230009,
|
232 |
+
"full_absorption_rate": 0.007905138339920948,
|
233 |
+
"num_full_absorption": 2,
|
234 |
+
"num_probe_true_positives": 253,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
random_seed_eval_results/absorption/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "absorption_first_letter",
|
3 |
+
"eval_config": {
|
4 |
+
"model_name": "gemma-2-2b",
|
5 |
+
"random_seed": 42,
|
6 |
+
"f1_jump_threshold": 0.03,
|
7 |
+
"max_k_value": 10,
|
8 |
+
"prompt_template": "{word} has the first letter:",
|
9 |
+
"prompt_token_pos": -6,
|
10 |
+
"llm_batch_size": 32,
|
11 |
+
"llm_dtype": "bfloat16",
|
12 |
+
"k_sparse_probe_l1_decay": 0.01,
|
13 |
+
"k_sparse_probe_batch_size": 4096,
|
14 |
+
"k_sparse_probe_num_epochs": 50
|
15 |
+
},
|
16 |
+
"eval_id": "4ab555a5-5c14-4096-985d-fbfcd95bad5d",
|
17 |
+
"datetime_epoch_millis": 1738787577551,
|
18 |
+
"eval_result_metrics": {
|
19 |
+
"mean": {
|
20 |
+
"mean_absorption_fraction_score": 0.16402077829695136,
|
21 |
+
"mean_full_absorption_score": 0.1455081445066387,
|
22 |
+
"mean_num_split_features": 1.2307692307692308,
|
23 |
+
"std_dev_absorption_fraction_score": 0.15788027063450677,
|
24 |
+
"std_dev_full_absorption_score": 0.1410482155509194,
|
25 |
+
"std_dev_num_split_features": 0.5870395085642742
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"eval_result_details": [
|
29 |
+
{
|
30 |
+
"first_letter": "a",
|
31 |
+
"mean_absorption_fraction": 0.12991450124362738,
|
32 |
+
"full_absorption_rate": 0.07584050039093042,
|
33 |
+
"num_full_absorption": 194,
|
34 |
+
"num_probe_true_positives": 2558,
|
35 |
+
"num_split_features": 1
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"first_letter": "b",
|
39 |
+
"mean_absorption_fraction": 0.023356857769147497,
|
40 |
+
"full_absorption_rate": 0.01277139208173691,
|
41 |
+
"num_full_absorption": 20,
|
42 |
+
"num_probe_true_positives": 1566,
|
43 |
+
"num_split_features": 1
|
44 |
+
},
|
45 |
+
{
|
46 |
+
"first_letter": "c",
|
47 |
+
"mean_absorption_fraction": 0.42441637423084033,
|
48 |
+
"full_absorption_rate": 0.36435218324982105,
|
49 |
+
"num_full_absorption": 1018,
|
50 |
+
"num_probe_true_positives": 2794,
|
51 |
+
"num_split_features": 3
|
52 |
+
},
|
53 |
+
{
|
54 |
+
"first_letter": "d",
|
55 |
+
"mean_absorption_fraction": 0.3983830197774112,
|
56 |
+
"full_absorption_rate": 0.27582159624413144,
|
57 |
+
"num_full_absorption": 470,
|
58 |
+
"num_probe_true_positives": 1704,
|
59 |
+
"num_split_features": 2
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"first_letter": "e",
|
63 |
+
"mean_absorption_fraction": 0.29735930296831414,
|
64 |
+
"full_absorption_rate": 0.28417721518987343,
|
65 |
+
"num_full_absorption": 449,
|
66 |
+
"num_probe_true_positives": 1580,
|
67 |
+
"num_split_features": 1
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"first_letter": "f",
|
71 |
+
"mean_absorption_fraction": 0.14754169713635484,
|
72 |
+
"full_absorption_rate": 0.10696095076400679,
|
73 |
+
"num_full_absorption": 126,
|
74 |
+
"num_probe_true_positives": 1178,
|
75 |
+
"num_split_features": 1
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"first_letter": "g",
|
79 |
+
"mean_absorption_fraction": 0.09041740227134817,
|
80 |
+
"full_absorption_rate": 0.09042553191489362,
|
81 |
+
"num_full_absorption": 102,
|
82 |
+
"num_probe_true_positives": 1128,
|
83 |
+
"num_split_features": 1
|
84 |
+
},
|
85 |
+
{
|
86 |
+
"first_letter": "h",
|
87 |
+
"mean_absorption_fraction": 0.043464844239960984,
|
88 |
+
"full_absorption_rate": 0.018311291963377416,
|
89 |
+
"num_full_absorption": 18,
|
90 |
+
"num_probe_true_positives": 983,
|
91 |
+
"num_split_features": 1
|
92 |
+
},
|
93 |
+
{
|
94 |
+
"first_letter": "i",
|
95 |
+
"mean_absorption_fraction": 0.28105418653291725,
|
96 |
+
"full_absorption_rate": 0.28819652486518876,
|
97 |
+
"num_full_absorption": 481,
|
98 |
+
"num_probe_true_positives": 1669,
|
99 |
+
"num_split_features": 1
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"first_letter": "j",
|
103 |
+
"mean_absorption_fraction": 0.0011799746030814323,
|
104 |
+
"full_absorption_rate": 0.011389521640091117,
|
105 |
+
"num_full_absorption": 5,
|
106 |
+
"num_probe_true_positives": 439,
|
107 |
+
"num_split_features": 1
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"first_letter": "k",
|
111 |
+
"mean_absorption_fraction": 0.0011871688335180757,
|
112 |
+
"full_absorption_rate": 0.010057471264367816,
|
113 |
+
"num_full_absorption": 7,
|
114 |
+
"num_probe_true_positives": 696,
|
115 |
+
"num_split_features": 1
|
116 |
+
},
|
117 |
+
{
|
118 |
+
"first_letter": "l",
|
119 |
+
"mean_absorption_fraction": 0.14993970659401568,
|
120 |
+
"full_absorption_rate": 0.1333907056798623,
|
121 |
+
"num_full_absorption": 155,
|
122 |
+
"num_probe_true_positives": 1162,
|
123 |
+
"num_split_features": 1
|
124 |
+
},
|
125 |
+
{
|
126 |
+
"first_letter": "m",
|
127 |
+
"mean_absorption_fraction": 0.12240690347298436,
|
128 |
+
"full_absorption_rate": 0.07788671023965142,
|
129 |
+
"num_full_absorption": 143,
|
130 |
+
"num_probe_true_positives": 1836,
|
131 |
+
"num_split_features": 1
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"first_letter": "n",
|
135 |
+
"mean_absorption_fraction": 0.07982834734533917,
|
136 |
+
"full_absorption_rate": 0.06090133982947625,
|
137 |
+
"num_full_absorption": 50,
|
138 |
+
"num_probe_true_positives": 821,
|
139 |
+
"num_split_features": 1
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"first_letter": "o",
|
143 |
+
"mean_absorption_fraction": 0.19346406307592456,
|
144 |
+
"full_absorption_rate": 0.21577946768060838,
|
145 |
+
"num_full_absorption": 227,
|
146 |
+
"num_probe_true_positives": 1052,
|
147 |
+
"num_split_features": 1
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"first_letter": "p",
|
151 |
+
"mean_absorption_fraction": 0.5301150273469434,
|
152 |
+
"full_absorption_rate": 0.4553846153846154,
|
153 |
+
"num_full_absorption": 1036,
|
154 |
+
"num_probe_true_positives": 2275,
|
155 |
+
"num_split_features": 1
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"first_letter": "q",
|
159 |
+
"mean_absorption_fraction": 0.01507865766197323,
|
160 |
+
"full_absorption_rate": 0.03076923076923077,
|
161 |
+
"num_full_absorption": 6,
|
162 |
+
"num_probe_true_positives": 195,
|
163 |
+
"num_split_features": 1
|
164 |
+
},
|
165 |
+
{
|
166 |
+
"first_letter": "r",
|
167 |
+
"mean_absorption_fraction": 0.2573794005707176,
|
168 |
+
"full_absorption_rate": 0.21096051856216852,
|
169 |
+
"num_full_absorption": 358,
|
170 |
+
"num_probe_true_positives": 1697,
|
171 |
+
"num_split_features": 2
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"first_letter": "s",
|
175 |
+
"mean_absorption_fraction": 0.5040795030415306,
|
176 |
+
"full_absorption_rate": 0.4915720674234606,
|
177 |
+
"num_full_absorption": 1429,
|
178 |
+
"num_probe_true_positives": 2907,
|
179 |
+
"num_split_features": 3
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"first_letter": "t",
|
183 |
+
"mean_absorption_fraction": 0.15276755271078457,
|
184 |
+
"full_absorption_rate": 0.08928571428571429,
|
185 |
+
"num_full_absorption": 150,
|
186 |
+
"num_probe_true_positives": 1680,
|
187 |
+
"num_split_features": 1
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"first_letter": "u",
|
191 |
+
"mean_absorption_fraction": 0.22381275073634874,
|
192 |
+
"full_absorption_rate": 0.24521072796934865,
|
193 |
+
"num_full_absorption": 192,
|
194 |
+
"num_probe_true_positives": 783,
|
195 |
+
"num_split_features": 1
|
196 |
+
},
|
197 |
+
{
|
198 |
+
"first_letter": "v",
|
199 |
+
"mean_absorption_fraction": 0.026830089617880883,
|
200 |
+
"full_absorption_rate": 0.026717557251908396,
|
201 |
+
"num_full_absorption": 21,
|
202 |
+
"num_probe_true_positives": 786,
|
203 |
+
"num_split_features": 1
|
204 |
+
},
|
205 |
+
{
|
206 |
+
"first_letter": "w",
|
207 |
+
"mean_absorption_fraction": 0.07490041912777874,
|
208 |
+
"full_absorption_rate": 0.07849293563579278,
|
209 |
+
"num_full_absorption": 50,
|
210 |
+
"num_probe_true_positives": 637,
|
211 |
+
"num_split_features": 1
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"first_letter": "x",
|
215 |
+
"mean_absorption_fraction": 0.05162827599896595,
|
216 |
+
"full_absorption_rate": 0.038834951456310676,
|
217 |
+
"num_full_absorption": 4,
|
218 |
+
"num_probe_true_positives": 103,
|
219 |
+
"num_split_features": 1
|
220 |
+
},
|
221 |
+
{
|
222 |
+
"first_letter": "y",
|
223 |
+
"mean_absorption_fraction": 0.042818109707770516,
|
224 |
+
"full_absorption_rate": 0.046242774566473986,
|
225 |
+
"num_full_absorption": 8,
|
226 |
+
"num_probe_true_positives": 173,
|
227 |
+
"num_split_features": 1
|
228 |
+
},
|
229 |
+
{
|
230 |
+
"first_letter": "z",
|
231 |
+
"mean_absorption_fraction": 0.001216099105256222,
|
232 |
+
"full_absorption_rate": 0.043478260869565216,
|
233 |
+
"num_full_absorption": 11,
|
234 |
+
"num_probe_true_positives": 253,
|
235 |
+
"num_split_features": 1
|
236 |
+
}
|
237 |
+
],
|
238 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
239 |
+
"sae_lens_id": "custom_sae",
|
240 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4",
|
241 |
+
"sae_lens_version": "5.4.1",
|
242 |
+
"sae_cfg_dict": {
|
243 |
+
"model_name": "gemma-2-2b",
|
244 |
+
"d_in": 2304,
|
245 |
+
"d_sae": 16384,
|
246 |
+
"hook_layer": 12,
|
247 |
+
"hook_name": "blocks.12.hook_resid_post",
|
248 |
+
"context_size": null,
|
249 |
+
"hook_head_index": null,
|
250 |
+
"architecture": "topk",
|
251 |
+
"apply_b_dec_to_input": null,
|
252 |
+
"finetuning_scaling_factor": null,
|
253 |
+
"activation_fn_str": "",
|
254 |
+
"prepend_bos": true,
|
255 |
+
"normalize_activations": "none",
|
256 |
+
"dtype": "bfloat16",
|
257 |
+
"device": "",
|
258 |
+
"dataset_path": "",
|
259 |
+
"dataset_trust_remote_code": true,
|
260 |
+
"seqpos_slice": [
|
261 |
+
null
|
262 |
+
],
|
263 |
+
"training_tokens": -100000,
|
264 |
+
"sae_lens_training_version": null,
|
265 |
+
"neuronpedia_id": null
|
266 |
+
},
|
267 |
+
"eval_result_unstructured": null
|
268 |
+
}
|
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9cd264c6bcbb7a785af8168b12a582a0b00e8b6ec78f32f4df8a68432daf4c3e
|
3 |
+
size 26730928
|
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a22e244c1bd987a1dbf0b6048cc06a8d67a12488adc40a7c99cbc266996f2a41
|
3 |
+
size 26750492
|
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ffa3c81081876da4dad8b3ac18f00fe7a94d3fc56428680351a75bf3cff061f7
|
3 |
+
size 26599749
|
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a8b84354daa6b2e2a7289ed82da6317b1ae7d598f5847285622afaa7a2d2549c
|
3 |
+
size 26846759
|
random_seed_eval_results/autointerp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82f546c16abf1b1eb58446c47e29a02b5e4143f01437db21e85faa030dd77276
|
3 |
+
size 26752322
|
random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
random_seed_eval_results/core/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "9dee0d3c-103c-4d65-953a-0f4f306c5fe1",
|
73 |
+
"datetime_epoch_millis": 1738791866252,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.22513964187620733,
|
77 |
+
"scr_metric_threshold_2": 0.1385347875882101,
|
78 |
+
"scr_dir2_threshold_2": 0.14459702159963525,
|
79 |
+
"scr_dir1_threshold_5": 0.21983111343139253,
|
80 |
+
"scr_metric_threshold_5": 0.20326640785460648,
|
81 |
+
"scr_dir2_threshold_5": 0.21158442490747542,
|
82 |
+
"scr_dir1_threshold_10": 0.25192846118074297,
|
83 |
+
"scr_metric_threshold_10": 0.2749545350332416,
|
84 |
+
"scr_dir2_threshold_10": 0.2841028124566354,
|
85 |
+
"scr_dir1_threshold_20": 0.20284025239431253,
|
86 |
+
"scr_metric_threshold_20": 0.32735915703734847,
|
87 |
+
"scr_dir2_threshold_20": 0.33435385077642354,
|
88 |
+
"scr_dir1_threshold_50": 0.13344059547499754,
|
89 |
+
"scr_metric_threshold_50": 0.3972973820182854,
|
90 |
+
"scr_dir2_threshold_50": 0.39891958599459704,
|
91 |
+
"scr_dir1_threshold_100": 0.13881520081445878,
|
92 |
+
"scr_metric_threshold_100": 0.28708448453918034,
|
93 |
+
"scr_dir2_threshold_100": 0.3102170234845004,
|
94 |
+
"scr_dir1_threshold_500": -0.48690647479983323,
|
95 |
+
"scr_metric_threshold_500": 0.3190861754204293,
|
96 |
+
"scr_dir2_threshold_500": 0.3425712257082064
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.43749988358469727,
|
103 |
+
"scr_metric_threshold_2": 0.022167494916722333,
|
104 |
+
"scr_dir2_threshold_2": 0.022167494916722333,
|
105 |
+
"scr_dir1_threshold_5": 0.4687504074535596,
|
106 |
+
"scr_metric_threshold_5": 0.039408798291137845,
|
107 |
+
"scr_dir2_threshold_5": 0.039408798291137845,
|
108 |
+
"scr_dir1_threshold_10": 0.4687504074535596,
|
109 |
+
"scr_metric_threshold_10": 0.07389155184943222,
|
110 |
+
"scr_dir2_threshold_10": 0.07389155184943222,
|
111 |
+
"scr_dir1_threshold_20": 0.42187508731147705,
|
112 |
+
"scr_metric_threshold_20": 0.07389155184943222,
|
113 |
+
"scr_dir2_threshold_20": 0.07389155184943222,
|
114 |
+
"scr_dir1_threshold_50": 0.43749988358469727,
|
115 |
+
"scr_metric_threshold_50": 0.15270929524117124,
|
116 |
+
"scr_dir2_threshold_50": 0.15270929524117124,
|
117 |
+
"scr_dir1_threshold_100": 0.3749997671693945,
|
118 |
+
"scr_metric_threshold_100": 0.21428558844903142,
|
119 |
+
"scr_dir2_threshold_100": 0.21428558844903142,
|
120 |
+
"scr_dir1_threshold_500": -1.406249359715835,
|
121 |
+
"scr_metric_threshold_500": 0.34236451321652195,
|
122 |
+
"scr_dir2_threshold_500": 0.34236451321652195
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2803735818828607,
|
127 |
+
"scr_metric_threshold_2": 0.2622478955438056,
|
128 |
+
"scr_dir2_threshold_2": 0.2622478955438056,
|
129 |
+
"scr_dir1_threshold_5": 0.2616822325784525,
|
130 |
+
"scr_metric_threshold_5": 0.33717591477669556,
|
131 |
+
"scr_dir2_threshold_5": 0.33717591477669556,
|
132 |
+
"scr_dir1_threshold_10": 0.2710279072306566,
|
133 |
+
"scr_metric_threshold_10": 0.41498556949231613,
|
134 |
+
"scr_dir2_threshold_10": 0.41498556949231613,
|
135 |
+
"scr_dir1_threshold_20": 0.12149544163702218,
|
136 |
+
"scr_metric_threshold_20": 0.4265129702797845,
|
137 |
+
"scr_dir2_threshold_20": 0.4265129702797845,
|
138 |
+
"scr_dir1_threshold_50": -0.46728986019089086,
|
139 |
+
"scr_metric_threshold_50": 0.4207493557717049,
|
140 |
+
"scr_dir2_threshold_50": 0.4207493557717049,
|
141 |
+
"scr_dir1_threshold_100": -0.7102807434649352,
|
142 |
+
"scr_metric_threshold_100": -0.02305480157493678,
|
143 |
+
"scr_dir2_threshold_100": -0.02305480157493678,
|
144 |
+
"scr_dir1_threshold_500": -1.4953274412002928,
|
145 |
+
"scr_metric_threshold_500": -0.11527366433206554,
|
146 |
+
"scr_dir2_threshold_500": -0.11527366433206554
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5312505238688624,
|
151 |
+
"scr_metric_threshold_2": 0.04556963247779803,
|
152 |
+
"scr_dir2_threshold_2": 0.04556963247779803,
|
153 |
+
"scr_dir1_threshold_5": 0.5156247962732202,
|
154 |
+
"scr_metric_threshold_5": 0.10379747943300446,
|
155 |
+
"scr_dir2_threshold_5": 0.10379747943300446,
|
156 |
+
"scr_dir1_threshold_10": 0.5468753201420825,
|
157 |
+
"scr_metric_threshold_10": 0.13670889743339407,
|
158 |
+
"scr_dir2_threshold_10": 0.13670889743339407,
|
159 |
+
"scr_dir1_threshold_20": 0.42187508731147705,
|
160 |
+
"scr_metric_threshold_20": 0.22278493695715454,
|
161 |
+
"scr_dir2_threshold_20": 0.22278493695715454,
|
162 |
+
"scr_dir1_threshold_50": 0.2656252619344312,
|
163 |
+
"scr_metric_threshold_50": 0.26582280582121537,
|
164 |
+
"scr_dir2_threshold_50": 0.26582280582121537,
|
165 |
+
"scr_dir1_threshold_100": 0.07812491268852294,
|
166 |
+
"scr_metric_threshold_100": -0.0025316127159178037,
|
167 |
+
"scr_dir2_threshold_100": -0.0025316127159178037,
|
168 |
+
"scr_dir1_threshold_500": -2.1406240977814037,
|
169 |
+
"scr_metric_threshold_500": -0.005063225431835607,
|
170 |
+
"scr_dir2_threshold_500": -0.005063225431835607
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.259842390342736,
|
175 |
+
"scr_metric_threshold_2": 0.14836798243767776,
|
176 |
+
"scr_dir2_threshold_2": 0.14836798243767776,
|
177 |
+
"scr_dir1_threshold_5": -0.007873912274188802,
|
178 |
+
"scr_metric_threshold_5": 0.25816026467998676,
|
179 |
+
"scr_dir2_threshold_5": 0.25816026467998676,
|
180 |
+
"scr_dir1_threshold_10": -0.04724394297291932,
|
181 |
+
"scr_metric_threshold_10": 0.32047488451379413,
|
182 |
+
"scr_dir2_threshold_10": 0.32047488451379413,
|
183 |
+
"scr_dir1_threshold_20": -0.2283462719181943,
|
184 |
+
"scr_metric_threshold_20": 0.28486650505613625,
|
185 |
+
"scr_dir2_threshold_20": 0.28486650505613625,
|
186 |
+
"scr_dir1_threshold_50": -0.13385791664456914,
|
187 |
+
"scr_metric_threshold_50": 0.3916914665607343,
|
188 |
+
"scr_dir2_threshold_50": 0.3916914665607343,
|
189 |
+
"scr_dir1_threshold_100": 0.40157468858928047,
|
190 |
+
"scr_metric_threshold_100": 0.09792299929146529,
|
191 |
+
"scr_dir2_threshold_100": 0.09792299929146529,
|
192 |
+
"scr_dir1_threshold_500": 0.14960621052073325,
|
193 |
+
"scr_metric_threshold_500": 0.18100904115762476,
|
194 |
+
"scr_dir2_threshold_500": 0.18100904115762476
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.021739172687599125,
|
199 |
+
"scr_metric_threshold_2": 0.13333325541876354,
|
200 |
+
"scr_dir2_threshold_2": 0.13333325541876354,
|
201 |
+
"scr_dir1_threshold_5": 0.048913057562533044,
|
202 |
+
"scr_metric_threshold_5": 0.22745086122869718,
|
203 |
+
"scr_dir2_threshold_5": 0.22745086122869718,
|
204 |
+
"scr_dir1_threshold_10": 0.11413025168707046,
|
205 |
+
"scr_metric_threshold_10": 0.4470586860331121,
|
206 |
+
"scr_dir2_threshold_10": 0.4470586860331121,
|
207 |
+
"scr_dir1_threshold_20": 0.05434776974986783,
|
208 |
+
"scr_metric_threshold_20": 0.5960784817878589,
|
209 |
+
"scr_dir2_threshold_20": 0.5960784817878589,
|
210 |
+
"scr_dir1_threshold_50": -0.048913057562533044,
|
211 |
+
"scr_metric_threshold_50": 0.6862745693856515,
|
212 |
+
"scr_dir2_threshold_50": 0.6862745693856515,
|
213 |
+
"scr_dir1_threshold_100": -0.021739172687599125,
|
214 |
+
"scr_metric_threshold_100": 0.7568625984353196,
|
215 |
+
"scr_dir2_threshold_100": 0.7568625984353196,
|
216 |
+
"scr_dir1_threshold_500": 0.010869424374669583,
|
217 |
+
"scr_metric_threshold_500": 0.7411765255867552,
|
218 |
+
"scr_dir2_threshold_500": 0.7411765255867552
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.09278322958848217,
|
223 |
+
"scr_metric_threshold_2": 0.08064531634921589,
|
224 |
+
"scr_dir2_threshold_2": 0.08064531634921589,
|
225 |
+
"scr_dir1_threshold_5": 0.2010307916356866,
|
226 |
+
"scr_metric_threshold_5": 0.12096785435318151,
|
227 |
+
"scr_dir2_threshold_5": 0.12096785435318151,
|
228 |
+
"scr_dir1_threshold_10": 0.25257713270943044,
|
229 |
+
"scr_metric_threshold_10": 0.16129039235714715,
|
230 |
+
"scr_dir2_threshold_10": 0.16129039235714715,
|
231 |
+
"scr_dir1_threshold_20": 0.3350515242197629,
|
232 |
+
"scr_metric_threshold_20": 0.23387105690079912,
|
233 |
+
"scr_dir2_threshold_20": 0.23387105690079912,
|
234 |
+
"scr_dir1_threshold_50": 0.3195874990014686,
|
235 |
+
"scr_metric_threshold_50": 0.3588711169861203,
|
236 |
+
"scr_dir2_threshold_50": 0.3588711169861203,
|
237 |
+
"scr_dir1_threshold_100": 0.34536066953834055,
|
238 |
+
"scr_metric_threshold_100": 0.47983873099801716,
|
239 |
+
"scr_dir2_threshold_100": 0.47983873099801716,
|
240 |
+
"scr_dir1_threshold_500": 0.2731957305870136,
|
241 |
+
"scr_metric_threshold_500": 0.5604838070059485,
|
242 |
+
"scr_dir2_threshold_500": 0.5604838070059485
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.12612611161318626,
|
247 |
+
"scr_metric_threshold_2": 0.3644444821204628,
|
248 |
+
"scr_dir2_threshold_2": 0.3644444821204628,
|
249 |
+
"scr_dir1_threshold_5": 0.18468472096703434,
|
250 |
+
"scr_metric_threshold_5": 0.45333327681930574,
|
251 |
+
"scr_dir2_threshold_5": 0.45333327681930574,
|
252 |
+
"scr_dir1_threshold_10": 0.3063063861274755,
|
253 |
+
"scr_metric_threshold_10": 0.5422220715181487,
|
254 |
+
"scr_dir2_threshold_10": 0.5422220715181487,
|
255 |
+
"scr_dir1_threshold_20": 0.3333333333333333,
|
256 |
+
"scr_metric_threshold_20": 0.6177777059578677,
|
257 |
+
"scr_dir2_threshold_20": 0.6177777059578677,
|
258 |
+
"scr_dir1_threshold_50": 0.445945837098897,
|
259 |
+
"scr_metric_threshold_50": 0.6533333298012066,
|
260 |
+
"scr_dir2_threshold_50": 0.6533333298012066,
|
261 |
+
"scr_dir1_threshold_100": 0.5180180543003676,
|
262 |
+
"scr_metric_threshold_100": 0.6488889430481654,
|
263 |
+
"scr_dir2_threshold_100": 0.6488889430481654,
|
264 |
+
"scr_dir1_threshold_500": 0.5630630558065931,
|
265 |
+
"scr_metric_threshold_500": 0.697777727150628,
|
266 |
+
"scr_dir2_threshold_500": 0.697777727150628
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.05150224144123495,
|
271 |
+
"scr_metric_threshold_2": 0.05150224144123495,
|
272 |
+
"scr_dir2_threshold_2": 0.10000011353263609,
|
273 |
+
"scr_dir1_threshold_5": 0.08583681325484281,
|
274 |
+
"scr_metric_threshold_5": 0.08583681325484281,
|
275 |
+
"scr_dir2_threshold_5": 0.1523809496777944,
|
276 |
+
"scr_dir1_threshold_10": 0.10300422706858779,
|
277 |
+
"scr_metric_threshold_10": 0.10300422706858779,
|
278 |
+
"scr_dir2_threshold_10": 0.17619044645573817,
|
279 |
+
"scr_dir1_threshold_20": 0.16309004750975417,
|
280 |
+
"scr_metric_threshold_20": 0.16309004750975417,
|
281 |
+
"scr_dir2_threshold_20": 0.21904759742235502,
|
282 |
+
"scr_dir1_threshold_50": 0.2489271165784791,
|
283 |
+
"scr_metric_threshold_50": 0.2489271165784791,
|
284 |
+
"scr_dir2_threshold_50": 0.2619047483889719,
|
285 |
+
"scr_dir1_threshold_100": 0.1244634303822985,
|
286 |
+
"scr_metric_threshold_100": 0.1244634303822985,
|
287 |
+
"scr_dir2_threshold_100": 0.3095237419448595,
|
288 |
+
"scr_dir1_threshold_500": 0.15021467900985702,
|
289 |
+
"scr_metric_threshold_500": 0.15021467900985702,
|
290 |
+
"scr_dir2_threshold_500": 0.33809508131207394
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "60baa903-f878-4b23-a5cd-ec96f2332e41",
|
73 |
+
"datetime_epoch_millis": 1738791541544,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.2361877325388411,
|
77 |
+
"scr_metric_threshold_2": 0.13791798180534542,
|
78 |
+
"scr_dir2_threshold_2": 0.1371311321790473,
|
79 |
+
"scr_dir1_threshold_5": 0.22353532292083783,
|
80 |
+
"scr_metric_threshold_5": 0.19684522910292013,
|
81 |
+
"scr_dir2_threshold_5": 0.20170931003047823,
|
82 |
+
"scr_dir1_threshold_10": 0.2429635217288672,
|
83 |
+
"scr_metric_threshold_10": 0.27304872008261916,
|
84 |
+
"scr_dir2_threshold_10": 0.276714683405259,
|
85 |
+
"scr_dir1_threshold_20": 0.2338835500774749,
|
86 |
+
"scr_metric_threshold_20": 0.3322860879225477,
|
87 |
+
"scr_dir2_threshold_20": 0.33648086797508275,
|
88 |
+
"scr_dir1_threshold_50": 0.30544138126010734,
|
89 |
+
"scr_metric_threshold_50": 0.3970974400299999,
|
90 |
+
"scr_dir2_threshold_50": 0.39961631797060004,
|
91 |
+
"scr_dir1_threshold_100": 0.16496334174074267,
|
92 |
+
"scr_metric_threshold_100": 0.290651594443317,
|
93 |
+
"scr_dir2_threshold_100": 0.30121004864569334,
|
94 |
+
"scr_dir1_threshold_500": -0.3407507620134751,
|
95 |
+
"scr_metric_threshold_500": 0.3261204064251065,
|
96 |
+
"scr_dir2_threshold_500": 0.34041889450148693
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.43749988358469727,
|
103 |
+
"scr_metric_threshold_2": 0.024630517283144072,
|
104 |
+
"scr_dir2_threshold_2": 0.024630517283144072,
|
105 |
+
"scr_dir1_threshold_5": 0.4843752037267798,
|
106 |
+
"scr_metric_threshold_5": 0.049261034566288144,
|
107 |
+
"scr_dir2_threshold_5": 0.049261034566288144,
|
108 |
+
"scr_dir1_threshold_10": 0.5156247962732202,
|
109 |
+
"scr_metric_threshold_10": 0.06650233794070366,
|
110 |
+
"scr_dir2_threshold_10": 0.06650233794070366,
|
111 |
+
"scr_dir1_threshold_20": 0.4062502910382569,
|
112 |
+
"scr_metric_threshold_20": 0.10591128304130484,
|
113 |
+
"scr_dir2_threshold_20": 0.10591128304130484,
|
114 |
+
"scr_dir1_threshold_50": 0.4062502910382569,
|
115 |
+
"scr_metric_threshold_50": 0.13793101423317747,
|
116 |
+
"scr_dir2_threshold_50": 0.13793101423317747,
|
117 |
+
"scr_dir1_threshold_100": 0.4062502910382569,
|
118 |
+
"scr_metric_threshold_100": 0.16748757624916502,
|
119 |
+
"scr_dir2_threshold_100": 0.16748757624916502,
|
120 |
+
"scr_dir1_threshold_500": -1.406249359715835,
|
121 |
+
"scr_metric_threshold_500": 0.23891625254163884,
|
122 |
+
"scr_dir2_threshold_500": 0.23891625254163884
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2803735818828607,
|
127 |
+
"scr_metric_threshold_2": 0.273775296331274,
|
128 |
+
"scr_dir2_threshold_2": 0.273775296331274,
|
129 |
+
"scr_dir1_threshold_5": 0.2616822325784525,
|
130 |
+
"scr_metric_threshold_5": 0.291066311626822,
|
131 |
+
"scr_dir2_threshold_5": 0.291066311626822,
|
132 |
+
"scr_dir1_threshold_10": 0.2616822325784525,
|
133 |
+
"scr_metric_threshold_10": 0.36311235183436297,
|
134 |
+
"scr_dir2_threshold_10": 0.36311235183436297,
|
135 |
+
"scr_dir1_threshold_20": 0.2336446515690506,
|
136 |
+
"scr_metric_threshold_20": 0.4582133653881499,
|
137 |
+
"scr_dir2_threshold_20": 0.4582133653881499,
|
138 |
+
"scr_dir1_threshold_50": 0.35514009320607276,
|
139 |
+
"scr_metric_threshold_50": 0.4351585638132131,
|
140 |
+
"scr_dir2_threshold_50": 0.4351585638132131,
|
141 |
+
"scr_dir1_threshold_100": -0.514018790504701,
|
142 |
+
"scr_metric_threshold_100": -0.01152740078746839,
|
143 |
+
"scr_dir2_threshold_100": -0.01152740078746839,
|
144 |
+
"scr_dir1_threshold_500": -1.1682243719508323,
|
145 |
+
"scr_metric_threshold_500": -0.1815560900315269,
|
146 |
+
"scr_dir2_threshold_500": -0.1815560900315269
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5625001164153027,
|
151 |
+
"scr_metric_threshold_2": 0.037974794330044616,
|
152 |
+
"scr_dir2_threshold_2": 0.037974794330044616,
|
153 |
+
"scr_dir1_threshold_5": 0.5625001164153027,
|
154 |
+
"scr_metric_threshold_5": 0.08354442680784264,
|
155 |
+
"scr_dir2_threshold_5": 0.08354442680784264,
|
156 |
+
"scr_dir1_threshold_10": 0.5156247962732202,
|
157 |
+
"scr_metric_threshold_10": 0.15696210095637528,
|
158 |
+
"scr_dir2_threshold_10": 0.15696210095637528,
|
159 |
+
"scr_dir1_threshold_20": 0.43749988358469727,
|
160 |
+
"scr_metric_threshold_20": 0.20000012071825551,
|
161 |
+
"scr_dir2_threshold_20": 0.20000012071825551,
|
162 |
+
"scr_dir1_threshold_50": 0.2968748544808716,
|
163 |
+
"scr_metric_threshold_50": 0.28101278391236095,
|
164 |
+
"scr_dir2_threshold_50": 0.28101278391236095,
|
165 |
+
"scr_dir1_threshold_100": 0.2343756693879908,
|
166 |
+
"scr_metric_threshold_100": 0.04303801976188022,
|
167 |
+
"scr_dir2_threshold_100": 0.04303801976188022,
|
168 |
+
"scr_dir1_threshold_500": -1.421874155989055,
|
169 |
+
"scr_metric_threshold_500": 0.015189978091145603,
|
170 |
+
"scr_dir2_threshold_500": 0.015189978091145603
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.2440945657943584,
|
175 |
+
"scr_metric_threshold_2": 0.18100904115762476,
|
176 |
+
"scr_dir2_threshold_2": 0.18100904115762476,
|
177 |
+
"scr_dir1_threshold_5": -0.03149611842454171,
|
178 |
+
"scr_metric_threshold_5": 0.2759645428430035,
|
179 |
+
"scr_dir2_threshold_5": 0.2759645428430035,
|
180 |
+
"scr_dir1_threshold_10": -0.10236179822002743,
|
181 |
+
"scr_metric_threshold_10": 0.3382789858084353,
|
182 |
+
"scr_dir2_threshold_10": 0.3382789858084353,
|
183 |
+
"scr_dir1_threshold_20": -0.16535403506911087,
|
184 |
+
"scr_metric_threshold_20": 0.32640952598921597,
|
185 |
+
"scr_dir2_threshold_20": 0.32640952598921597,
|
186 |
+
"scr_dir1_threshold_50": 0.37007903949252524,
|
187 |
+
"scr_metric_threshold_50": 0.4272998460183922,
|
188 |
+
"scr_dir2_threshold_50": 0.4272998460183922,
|
189 |
+
"scr_dir1_threshold_100": 0.22834674124598078,
|
190 |
+
"scr_metric_threshold_100": 0.1364985226184585,
|
191 |
+
"scr_dir2_threshold_100": 0.1364985226184585,
|
192 |
+
"scr_dir1_threshold_500": 0.10236226754781394,
|
193 |
+
"scr_metric_threshold_500": 0.19287850097684403,
|
194 |
+
"scr_dir2_threshold_500": 0.19287850097684403
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.08152165462480175,
|
199 |
+
"scr_metric_threshold_2": 0.13333325541876354,
|
200 |
+
"scr_dir2_threshold_2": 0.13333325541876354,
|
201 |
+
"scr_dir1_threshold_5": 0.06521719412453741,
|
202 |
+
"scr_metric_threshold_5": 0.262744992625386,
|
203 |
+
"scr_dir2_threshold_5": 0.262744992625386,
|
204 |
+
"scr_dir1_threshold_10": 0.10326082731240087,
|
205 |
+
"scr_metric_threshold_10": 0.4784312992176598,
|
206 |
+
"scr_dir2_threshold_10": 0.4784312992176598,
|
207 |
+
"scr_dir1_threshold_20": 0.07608694243746696,
|
208 |
+
"scr_metric_threshold_20": 0.6039215182121411,
|
209 |
+
"scr_dir2_threshold_20": 0.6039215182121411,
|
210 |
+
"scr_dir1_threshold_50": -0.04347834537519825,
|
211 |
+
"scr_metric_threshold_50": 0.7254902189944813,
|
212 |
+
"scr_dir2_threshold_50": 0.7254902189944813,
|
213 |
+
"scr_dir1_threshold_100": -0.032608921000528666,
|
214 |
+
"scr_metric_threshold_100": 0.780392175195585,
|
215 |
+
"scr_dir2_threshold_100": 0.780392175195585,
|
216 |
+
"scr_dir1_threshold_500": 0.10326082731240087,
|
217 |
+
"scr_metric_threshold_500": 0.7411765255867552,
|
218 |
+
"scr_dir2_threshold_500": 0.7411765255867552
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.0979381094881989,
|
223 |
+
"scr_metric_threshold_2": 0.06854845881151235,
|
224 |
+
"scr_dir2_threshold_2": 0.06854845881151235,
|
225 |
+
"scr_dir1_threshold_5": 0.18041219375810347,
|
226 |
+
"scr_metric_threshold_5": 0.10887099681547797,
|
227 |
+
"scr_dir2_threshold_5": 0.10887099681547797,
|
228 |
+
"scr_dir1_threshold_10": 0.24226798739085284,
|
229 |
+
"scr_metric_threshold_10": 0.1370969176230247,
|
230 |
+
"scr_dir2_threshold_10": 0.1370969176230247,
|
231 |
+
"scr_dir1_threshold_20": 0.3195874990014686,
|
232 |
+
"scr_metric_threshold_20": 0.2056451360932524,
|
233 |
+
"scr_dir2_threshold_20": 0.2056451360932524,
|
234 |
+
"scr_dir1_threshold_50": 0.34020609687905173,
|
235 |
+
"scr_metric_threshold_50": 0.30241927537102686,
|
236 |
+
"scr_dir2_threshold_50": 0.30241927537102686,
|
237 |
+
"scr_dir1_threshold_100": 0.381443292634218,
|
238 |
+
"scr_metric_threshold_100": 0.3588711169861203,
|
239 |
+
"scr_dir2_threshold_100": 0.3588711169861203,
|
240 |
+
"scr_dir1_threshold_500": 0.3659792674159237,
|
241 |
+
"scr_metric_threshold_500": 0.6250000600853212,
|
242 |
+
"scr_dir2_threshold_500": 0.6250000600853212
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.11261250376556366,
|
247 |
+
"scr_metric_threshold_2": 0.3111110463554545,
|
248 |
+
"scr_dir2_threshold_2": 0.3111110463554545,
|
249 |
+
"scr_dir1_threshold_5": 0.17117111311941172,
|
250 |
+
"scr_metric_threshold_5": 0.4088888794698843,
|
251 |
+
"scr_dir2_threshold_5": 0.4088888794698843,
|
252 |
+
"scr_dir1_threshold_10": 0.2702702775267402,
|
253 |
+
"scr_metric_threshold_10": 0.5066667125843141,
|
254 |
+
"scr_dir2_threshold_10": 0.5066667125843141,
|
255 |
+
"scr_dir1_threshold_20": 0.38738749623443636,
|
256 |
+
"scr_metric_threshold_20": 0.5822220821145289,
|
257 |
+
"scr_dir2_threshold_20": 0.5822220821145289,
|
258 |
+
"scr_dir1_threshold_50": 0.4909911070945099,
|
259 |
+
"scr_metric_threshold_50": 0.6399999046325785,
|
260 |
+
"scr_dir2_threshold_50": 0.6399999046325785,
|
261 |
+
"scr_dir1_threshold_100": 0.4099099969875491,
|
262 |
+
"scr_metric_threshold_100": 0.6444442913856198,
|
263 |
+
"scr_dir2_threshold_100": 0.6444442913856198,
|
264 |
+
"scr_dir1_threshold_500": 0.43693694419340684,
|
265 |
+
"scr_metric_threshold_500": 0.7155555390722975,
|
266 |
+
"scr_dir2_threshold_500": 0.7155555390722975
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.07296144475494565,
|
271 |
+
"scr_metric_threshold_2": 0.07296144475494565,
|
272 |
+
"scr_dir2_threshold_2": 0.06666664774456064,
|
273 |
+
"scr_dir1_threshold_5": 0.09442064806865635,
|
274 |
+
"scr_metric_threshold_5": 0.09442064806865635,
|
275 |
+
"scr_dir2_threshold_5": 0.1333332954891213,
|
276 |
+
"scr_dir1_threshold_10": 0.13733905469607777,
|
277 |
+
"scr_metric_threshold_10": 0.13733905469607777,
|
278 |
+
"scr_dir2_threshold_10": 0.16666676127719673,
|
279 |
+
"scr_dir1_threshold_20": 0.17596567182353345,
|
280 |
+
"scr_metric_threshold_20": 0.17596567182353345,
|
281 |
+
"scr_dir2_threshold_20": 0.20952391224381361,
|
282 |
+
"scr_dir1_threshold_50": 0.2274679132647684,
|
283 |
+
"scr_metric_threshold_50": 0.2274679132647684,
|
284 |
+
"scr_dir2_threshold_50": 0.24761893678956953,
|
285 |
+
"scr_dir1_threshold_100": 0.20600845413717558,
|
286 |
+
"scr_metric_threshold_100": 0.20600845413717558,
|
287 |
+
"scr_dir2_threshold_100": 0.2904760877561864,
|
288 |
+
"scr_dir1_threshold_500": 0.2618024850783763,
|
289 |
+
"scr_metric_threshold_500": 0.2618024850783763,
|
290 |
+
"scr_dir2_threshold_500": 0.37619038968942015
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "abc3a7fe-901e-41b7-b863-d7c2fe187f5d",
|
73 |
+
"datetime_epoch_millis": 1738792191381,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.27189662143546267,
|
77 |
+
"scr_metric_threshold_2": 0.14004249795768345,
|
78 |
+
"scr_dir2_threshold_2": 0.1368670346002006,
|
79 |
+
"scr_dir1_threshold_5": 0.31141524878678495,
|
80 |
+
"scr_metric_threshold_5": 0.2115742250577737,
|
81 |
+
"scr_dir2_threshold_5": 0.211903765527429,
|
82 |
+
"scr_dir1_threshold_10": 0.3010599506808038,
|
83 |
+
"scr_metric_threshold_10": 0.29065234137699436,
|
84 |
+
"scr_dir2_threshold_10": 0.2925836779282715,
|
85 |
+
"scr_dir1_threshold_20": 0.25286159416685644,
|
86 |
+
"scr_metric_threshold_20": 0.3374504224840684,
|
87 |
+
"scr_dir2_threshold_20": 0.33961678908221155,
|
88 |
+
"scr_dir1_threshold_50": 0.32881244000862797,
|
89 |
+
"scr_metric_threshold_50": 0.385774710992042,
|
90 |
+
"scr_dir2_threshold_50": 0.3761384965816031,
|
91 |
+
"scr_dir1_threshold_100": 0.1704072584532131,
|
92 |
+
"scr_metric_threshold_100": 0.3010885087468888,
|
93 |
+
"scr_dir2_threshold_100": 0.29610434806667896,
|
94 |
+
"scr_dir1_threshold_500": -0.3535588273019515,
|
95 |
+
"scr_metric_threshold_500": 0.32484492101420326,
|
96 |
+
"scr_dir2_threshold_500": 0.33724782632593275
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.42187508731147705,
|
103 |
+
"scr_metric_threshold_2": 0.02709353964956581,
|
104 |
+
"scr_dir2_threshold_2": 0.02709353964956581,
|
105 |
+
"scr_dir1_threshold_5": 0.42187508731147705,
|
106 |
+
"scr_metric_threshold_5": 0.044334989833444666,
|
107 |
+
"scr_dir2_threshold_5": 0.044334989833444666,
|
108 |
+
"scr_dir1_threshold_10": 0.4062502910382569,
|
109 |
+
"scr_metric_threshold_10": 0.08128076575816078,
|
110 |
+
"scr_dir2_threshold_10": 0.08128076575816078,
|
111 |
+
"scr_dir1_threshold_20": 0.4062502910382569,
|
112 |
+
"scr_metric_threshold_20": 0.12315258641572036,
|
113 |
+
"scr_dir2_threshold_20": 0.12315258641572036,
|
114 |
+
"scr_dir1_threshold_50": 0.4062502910382569,
|
115 |
+
"scr_metric_threshold_50": 0.2635467698247829,
|
116 |
+
"scr_dir2_threshold_50": 0.2635467698247829,
|
117 |
+
"scr_dir1_threshold_100": 0.42187508731147705,
|
118 |
+
"scr_metric_threshold_100": 0.3349752993077934,
|
119 |
+
"scr_dir2_threshold_100": 0.3349752993077934,
|
120 |
+
"scr_dir1_threshold_500": -1.01562479627322,
|
121 |
+
"scr_metric_threshold_500": 0.40147778405796036,
|
122 |
+
"scr_dir2_threshold_500": 0.40147778405796036
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2803735818828607,
|
127 |
+
"scr_metric_threshold_2": 0.273775296331274,
|
128 |
+
"scr_dir2_threshold_2": 0.273775296331274,
|
129 |
+
"scr_dir1_threshold_5": 0.35514009320607276,
|
130 |
+
"scr_metric_threshold_5": 0.35158512281820375,
|
131 |
+
"scr_dir2_threshold_5": 0.35158512281820375,
|
132 |
+
"scr_dir1_threshold_10": 0.36448576785827685,
|
133 |
+
"scr_metric_threshold_10": 0.41498556949231613,
|
134 |
+
"scr_dir2_threshold_10": 0.41498556949231613,
|
135 |
+
"scr_dir1_threshold_20": 0.2990654882400586,
|
136 |
+
"scr_metric_threshold_20": 0.4899135887252061,
|
137 |
+
"scr_dir2_threshold_20": 0.4899135887252061,
|
138 |
+
"scr_dir1_threshold_50": 0.35514009320607276,
|
139 |
+
"scr_metric_threshold_50": 0.38328534615525994,
|
140 |
+
"scr_dir2_threshold_50": 0.38328534615525994,
|
141 |
+
"scr_dir1_threshold_100": -0.6635518131511251,
|
142 |
+
"scr_metric_threshold_100": -0.09221903452843794,
|
143 |
+
"scr_dir2_threshold_100": -0.09221903452843794,
|
144 |
+
"scr_dir1_threshold_500": -1.4299066045292848,
|
145 |
+
"scr_metric_threshold_500": -0.24495670847694848,
|
146 |
+
"scr_dir2_threshold_500": -0.24495670847694848
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5468753201420825,
|
151 |
+
"scr_metric_threshold_2": 0.037974794330044616,
|
152 |
+
"scr_dir2_threshold_2": 0.037974794330044616,
|
153 |
+
"scr_dir1_threshold_5": 0.5625001164153027,
|
154 |
+
"scr_metric_threshold_5": 0.08860765223967824,
|
155 |
+
"scr_dir2_threshold_5": 0.08860765223967824,
|
156 |
+
"scr_dir1_threshold_10": 0.5468753201420825,
|
157 |
+
"scr_metric_threshold_10": 0.13924051014931188,
|
158 |
+
"scr_dir2_threshold_10": 0.13924051014931188,
|
159 |
+
"scr_dir1_threshold_20": 0.28125005820765137,
|
160 |
+
"scr_metric_threshold_20": 0.20253173343417333,
|
161 |
+
"scr_dir2_threshold_20": 0.20253173343417333,
|
162 |
+
"scr_dir1_threshold_50": 0.39062549476503666,
|
163 |
+
"scr_metric_threshold_50": 0.25063297862788914,
|
164 |
+
"scr_dir2_threshold_50": 0.25063297862788914,
|
165 |
+
"scr_dir1_threshold_100": 0.2343756693879908,
|
166 |
+
"scr_metric_threshold_100": 0.05316462152337083,
|
167 |
+
"scr_dir2_threshold_100": 0.05316462152337083,
|
168 |
+
"scr_dir1_threshold_500": -1.2656243306120092,
|
169 |
+
"scr_metric_threshold_500": 0.037974794330044616,
|
170 |
+
"scr_dir2_threshold_500": 0.037974794330044616
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.2519684780685472,
|
175 |
+
"scr_metric_threshold_2": 0.15430280078147524,
|
176 |
+
"scr_dir2_threshold_2": 0.15430280078147524,
|
177 |
+
"scr_dir1_threshold_5": 0.33070853946600826,
|
178 |
+
"scr_metric_threshold_5": 0.2640950830237842,
|
179 |
+
"scr_dir2_threshold_5": 0.2640950830237842,
|
180 |
+
"scr_dir1_threshold_10": 0.08661444299943634,
|
181 |
+
"scr_metric_threshold_10": 0.34421380415223274,
|
182 |
+
"scr_dir2_threshold_10": 0.34421380415223274,
|
183 |
+
"scr_dir1_threshold_20": -0.11023617982200275,
|
184 |
+
"scr_metric_threshold_20": 0.2908013233999337,
|
185 |
+
"scr_dir2_threshold_20": 0.2908013233999337,
|
186 |
+
"scr_dir1_threshold_50": 0.40944907019125576,
|
187 |
+
"scr_metric_threshold_50": 0.40059360564224267,
|
188 |
+
"scr_dir2_threshold_50": 0.40059360564224267,
|
189 |
+
"scr_dir1_threshold_100": 0.33858292106798354,
|
190 |
+
"scr_metric_threshold_100": 0.1988131424522659,
|
191 |
+
"scr_dir2_threshold_100": 0.1988131424522659,
|
192 |
+
"scr_dir1_threshold_500": 0.11811056142397805,
|
193 |
+
"scr_metric_threshold_500": 0.2997032856130665,
|
194 |
+
"scr_dir2_threshold_500": 0.2997032856130665
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.048913057562533044,
|
199 |
+
"scr_metric_threshold_2": 0.1490195620110374,
|
200 |
+
"scr_dir2_threshold_2": 0.1490195620110374,
|
201 |
+
"scr_dir1_threshold_5": 0.06521719412453741,
|
202 |
+
"scr_metric_threshold_5": 0.28627433564194205,
|
203 |
+
"scr_dir2_threshold_5": 0.28627433564194205,
|
204 |
+
"scr_dir1_threshold_10": 0.08695636681213655,
|
205 |
+
"scr_metric_threshold_10": 0.49019608759779254,
|
206 |
+
"scr_dir2_threshold_10": 0.49019608759779254,
|
207 |
+
"scr_dir1_threshold_20": 0.05434776974986783,
|
208 |
+
"scr_metric_threshold_20": 0.6,
|
209 |
+
"scr_dir2_threshold_20": 0.6,
|
210 |
+
"scr_dir1_threshold_50": -0.07065223025013216,
|
211 |
+
"scr_metric_threshold_50": 0.7176469488264897,
|
212 |
+
"scr_dir2_threshold_50": 0.7176469488264897,
|
213 |
+
"scr_dir1_threshold_100": -0.06521751806279738,
|
214 |
+
"scr_metric_threshold_100": 0.7529410802231785,
|
215 |
+
"scr_dir2_threshold_100": 0.7529410802231785,
|
216 |
+
"scr_dir1_threshold_500": -0.04347834537519825,
|
217 |
+
"scr_metric_threshold_500": 0.7176469488264897,
|
218 |
+
"scr_dir2_threshold_500": 0.7176469488264897
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.21649481685398092,
|
223 |
+
"scr_metric_threshold_2": 0.06451625307937271,
|
224 |
+
"scr_dir2_threshold_2": 0.06451625307937271,
|
225 |
+
"scr_dir1_threshold_5": 0.26804115792772476,
|
226 |
+
"scr_metric_threshold_5": 0.12096785435318151,
|
227 |
+
"scr_dir2_threshold_5": 0.12096785435318151,
|
228 |
+
"scr_dir1_threshold_10": 0.34536066953834055,
|
229 |
+
"scr_metric_threshold_10": 0.17741945562699032,
|
230 |
+
"scr_dir2_threshold_10": 0.17741945562699032,
|
231 |
+
"scr_dir1_threshold_20": 0.4175256084896675,
|
232 |
+
"scr_metric_threshold_20": 0.2137097878988163,
|
233 |
+
"scr_dir2_threshold_20": 0.2137097878988163,
|
234 |
+
"scr_dir1_threshold_50": 0.43814420636725065,
|
235 |
+
"scr_metric_threshold_50": 0.2943548639067476,
|
236 |
+
"scr_dir2_threshold_50": 0.2943548639067476,
|
237 |
+
"scr_dir1_threshold_100": 0.43814420636725065,
|
238 |
+
"scr_metric_threshold_100": 0.38306459172024276,
|
239 |
+
"scr_dir2_threshold_100": 0.38306459172024276,
|
240 |
+
"scr_dir1_threshold_500": 0.453608231585545,
|
241 |
+
"scr_metric_threshold_500": 0.5766128702757917,
|
242 |
+
"scr_dir2_threshold_500": 0.5766128702757917
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.29279277827985295,
|
247 |
+
"scr_metric_threshold_2": 0.29777788609633066,
|
248 |
+
"scr_dir2_threshold_2": 0.29777788609633066,
|
249 |
+
"scr_dir1_threshold_5": 0.3333333333333333,
|
250 |
+
"scr_metric_threshold_5": 0.38222229404213226,
|
251 |
+
"scr_dir2_threshold_5": 0.38222229404213226,
|
252 |
+
"scr_dir1_threshold_10": 0.38738749623443636,
|
253 |
+
"scr_metric_threshold_10": 0.49333328741568594,
|
254 |
+
"scr_dir2_threshold_10": 0.49333328741568594,
|
255 |
+
"scr_dir1_threshold_20": 0.4729730527941422,
|
256 |
+
"scr_metric_threshold_20": 0.5777776953614876,
|
257 |
+
"scr_dir2_threshold_20": 0.5777776953614876,
|
258 |
+
"scr_dir1_threshold_50": 0.4054052820454166,
|
259 |
+
"scr_metric_threshold_50": 0.47999986224705776,
|
260 |
+
"scr_dir2_threshold_50": 0.47999986224705776,
|
261 |
+
"scr_dir1_threshold_100": 0.41441444344029416,
|
262 |
+
"scr_metric_threshold_100": 0.5333332980120661,
|
263 |
+
"scr_dir2_threshold_100": 0.5333332980120661,
|
264 |
+
"scr_dir1_threshold_500": 0.14414416591355395,
|
265 |
+
"scr_metric_threshold_500": 0.5999998940361982,
|
266 |
+
"scr_dir2_threshold_500": 0.5999998940361982
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.11587985138236706,
|
271 |
+
"scr_metric_threshold_2": 0.11587985138236706,
|
272 |
+
"scr_dir2_threshold_2": 0.09047614452250444,
|
273 |
+
"scr_dir1_threshold_5": 0.15450646850982275,
|
274 |
+
"scr_metric_threshold_5": 0.15450646850982275,
|
275 |
+
"scr_dir2_threshold_5": 0.1571427922670651,
|
276 |
+
"scr_dir1_threshold_10": 0.18454925082346488,
|
277 |
+
"scr_metric_threshold_10": 0.18454925082346488,
|
278 |
+
"scr_dir2_threshold_10": 0.19999994323368195,
|
279 |
+
"scr_dir1_threshold_20": 0.20171666463720986,
|
280 |
+
"scr_metric_threshold_20": 0.20171666463720986,
|
281 |
+
"scr_dir2_threshold_20": 0.21904759742235502,
|
282 |
+
"scr_dir1_threshold_50": 0.29613731270586624,
|
283 |
+
"scr_metric_threshold_50": 0.29613731270586624,
|
284 |
+
"scr_dir2_threshold_50": 0.21904759742235502,
|
285 |
+
"scr_dir1_threshold_100": 0.24463507126463127,
|
286 |
+
"scr_metric_threshold_100": 0.24463507126463127,
|
287 |
+
"scr_dir2_threshold_100": 0.20476178582295265,
|
288 |
+
"scr_dir1_threshold_500": 0.21030049945102341,
|
289 |
+
"scr_metric_threshold_500": 0.21030049945102341,
|
290 |
+
"scr_dir2_threshold_500": 0.3095237419448595
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "ba20839f-14dc-439c-bb3f-cfa46001940a",
|
73 |
+
"datetime_epoch_millis": 1738792516361,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.2122048197936822,
|
77 |
+
"scr_metric_threshold_2": 0.1276442565195558,
|
78 |
+
"scr_dir2_threshold_2": 0.13525715145509146,
|
79 |
+
"scr_dir1_threshold_5": 0.20309333729784668,
|
80 |
+
"scr_metric_threshold_5": 0.1880617094849222,
|
81 |
+
"scr_dir2_threshold_5": 0.19781287418957544,
|
82 |
+
"scr_dir1_threshold_10": 0.2087873066702555,
|
83 |
+
"scr_metric_threshold_10": 0.25796852124284025,
|
84 |
+
"scr_dir2_threshold_10": 0.28046238714353955,
|
85 |
+
"scr_dir1_threshold_20": 0.2119416002085242,
|
86 |
+
"scr_metric_threshold_20": 0.30689936831597747,
|
87 |
+
"scr_dir2_threshold_20": 0.32259525025751606,
|
88 |
+
"scr_dir1_threshold_50": 0.301772100798797,
|
89 |
+
"scr_metric_threshold_50": 0.3936194698709037,
|
90 |
+
"scr_dir2_threshold_50": 0.40370783200281274,
|
91 |
+
"scr_dir1_threshold_100": 0.06892208153648197,
|
92 |
+
"scr_metric_threshold_100": 0.30548528244548945,
|
93 |
+
"scr_dir2_threshold_100": 0.32957328747021003,
|
94 |
+
"scr_dir1_threshold_500": -0.5674361096258931,
|
95 |
+
"scr_metric_threshold_500": 0.31796247845360437,
|
96 |
+
"scr_dir2_threshold_500": 0.3613484461501164
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.42187508731147705,
|
103 |
+
"scr_metric_threshold_2": 0.024630517283144072,
|
104 |
+
"scr_dir2_threshold_2": 0.024630517283144072,
|
105 |
+
"scr_dir1_threshold_5": 0.4843752037267798,
|
106 |
+
"scr_metric_threshold_5": 0.039408798291137845,
|
107 |
+
"scr_dir2_threshold_5": 0.039408798291137845,
|
108 |
+
"scr_dir1_threshold_10": 0.5156247962732202,
|
109 |
+
"scr_metric_threshold_10": 0.0566502484750167,
|
110 |
+
"scr_dir2_threshold_10": 0.0566502484750167,
|
111 |
+
"scr_dir1_threshold_20": 0.4843752037267798,
|
112 |
+
"scr_metric_threshold_20": 0.07389155184943222,
|
113 |
+
"scr_dir2_threshold_20": 0.07389155184943222,
|
114 |
+
"scr_dir1_threshold_50": 0.42187508731147705,
|
115 |
+
"scr_metric_threshold_50": 0.16256153151632155,
|
116 |
+
"scr_dir2_threshold_50": 0.16256153151632155,
|
117 |
+
"scr_dir1_threshold_100": 0.39062549476503666,
|
118 |
+
"scr_metric_threshold_100": 0.3029555681159207,
|
119 |
+
"scr_dir2_threshold_100": 0.3029555681159207,
|
120 |
+
"scr_dir1_threshold_500": -1.2656243306120092,
|
121 |
+
"scr_metric_threshold_500": 0.3817733115076598,
|
122 |
+
"scr_dir2_threshold_500": 0.3817733115076598
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2803735818828607,
|
127 |
+
"scr_metric_threshold_2": 0.20461106337777285,
|
128 |
+
"scr_dir2_threshold_2": 0.20461106337777285,
|
129 |
+
"scr_dir1_threshold_5": 0.2897198135878545,
|
130 |
+
"scr_metric_threshold_5": 0.21902027141928104,
|
131 |
+
"scr_dir2_threshold_5": 0.21902027141928104,
|
132 |
+
"scr_dir1_threshold_10": 0.2616822325784525,
|
133 |
+
"scr_metric_threshold_10": 0.273775296331274,
|
134 |
+
"scr_dir2_threshold_10": 0.273775296331274,
|
135 |
+
"scr_dir1_threshold_20": 0.2242989769168465,
|
136 |
+
"scr_metric_threshold_20": 0.35158512281820375,
|
137 |
+
"scr_dir2_threshold_20": 0.35158512281820375,
|
138 |
+
"scr_dir1_threshold_50": 0.2242989769168465,
|
139 |
+
"scr_metric_threshold_50": 0.37463975262183136,
|
140 |
+
"scr_dir2_threshold_50": 0.37463975262183136,
|
141 |
+
"scr_dir1_threshold_100": -0.9345797203817817,
|
142 |
+
"scr_metric_threshold_100": -0.014409208041508192,
|
143 |
+
"scr_dir2_threshold_100": -0.014409208041508192,
|
144 |
+
"scr_dir1_threshold_500": -1.1962619529602343,
|
145 |
+
"scr_metric_threshold_500": -0.1325648513989227,
|
146 |
+
"scr_dir2_threshold_500": -0.1325648513989227
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5468753201420825,
|
151 |
+
"scr_metric_threshold_2": 0.04810139609153522,
|
152 |
+
"scr_dir2_threshold_2": 0.04810139609153522,
|
153 |
+
"scr_dir1_threshold_5": 0.5468753201420825,
|
154 |
+
"scr_metric_threshold_5": 0.09367087767151386,
|
155 |
+
"scr_dir2_threshold_5": 0.09367087767151386,
|
156 |
+
"scr_dir1_threshold_10": 0.5468753201420825,
|
157 |
+
"scr_metric_threshold_10": 0.1721519281497015,
|
158 |
+
"scr_dir2_threshold_10": 0.1721519281497015,
|
159 |
+
"scr_dir1_threshold_20": 0.5156247962732202,
|
160 |
+
"scr_metric_threshold_20": 0.20759495886600893,
|
161 |
+
"scr_dir2_threshold_20": 0.20759495886600893,
|
162 |
+
"scr_dir1_threshold_50": 0.4687504074535596,
|
163 |
+
"scr_metric_threshold_50": 0.29113923477603215,
|
164 |
+
"scr_dir2_threshold_50": 0.29113923477603215,
|
165 |
+
"scr_dir1_threshold_100": 0.21874994179234863,
|
166 |
+
"scr_metric_threshold_100": 0.022784816238899015,
|
167 |
+
"scr_dir2_threshold_100": 0.022784816238899015,
|
168 |
+
"scr_dir1_threshold_500": -2.8906245634426146,
|
169 |
+
"scr_metric_threshold_500": 0.0075949890455728015,
|
170 |
+
"scr_dir2_threshold_500": 0.0075949890455728015
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.2519684780685472,
|
175 |
+
"scr_metric_threshold_2": 0.1780415435515382,
|
176 |
+
"scr_dir2_threshold_2": 0.1780415435515382,
|
177 |
+
"scr_dir1_threshold_5": -0.007873912274188802,
|
178 |
+
"scr_metric_threshold_5": 0.249258302466854,
|
179 |
+
"scr_dir2_threshold_5": 0.249258302466854,
|
180 |
+
"scr_dir1_threshold_10": -0.04724394297291932,
|
181 |
+
"scr_metric_threshold_10": 0.2997032856130665,
|
182 |
+
"scr_dir2_threshold_10": 0.2997032856130665,
|
183 |
+
"scr_dir1_threshold_20": -0.16535403506911087,
|
184 |
+
"scr_metric_threshold_20": 0.28486650505613625,
|
185 |
+
"scr_dir2_threshold_20": 0.28486650505613625,
|
186 |
+
"scr_dir1_threshold_50": 0.4724408377125527,
|
187 |
+
"scr_metric_threshold_50": 0.3531157663653655,
|
188 |
+
"scr_dir2_threshold_50": 0.3531157663653655,
|
189 |
+
"scr_dir1_threshold_100": 0.5354330745616361,
|
190 |
+
"scr_metric_threshold_100": 0.10682496150459805,
|
191 |
+
"scr_dir2_threshold_100": 0.10682496150459805,
|
192 |
+
"scr_dir1_threshold_500": 0.33070853946600826,
|
193 |
+
"scr_metric_threshold_500": 0.20474778392768772,
|
194 |
+
"scr_dir2_threshold_500": 0.20474778392768772
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.010869424374669583,
|
199 |
+
"scr_metric_threshold_2": 0.10980391240220748,
|
200 |
+
"scr_dir2_threshold_2": 0.10980391240220748,
|
201 |
+
"scr_dir1_threshold_5": 0.03260859706226871,
|
202 |
+
"scr_metric_threshold_5": 0.262744992625386,
|
203 |
+
"scr_dir2_threshold_5": 0.262744992625386,
|
204 |
+
"scr_dir1_threshold_10": 0.04347802143693829,
|
205 |
+
"scr_metric_threshold_10": 0.5333332554187635,
|
206 |
+
"scr_dir2_threshold_10": 0.5333332554187635,
|
207 |
+
"scr_dir1_threshold_20": 0.03260859706226871,
|
208 |
+
"scr_metric_threshold_20": 0.6156863065922739,
|
209 |
+
"scr_dir2_threshold_20": 0.6156863065922739,
|
210 |
+
"scr_dir1_threshold_50": -0.04347834537519825,
|
211 |
+
"scr_metric_threshold_50": 0.7098039124022075,
|
212 |
+
"scr_dir2_threshold_50": 0.7098039124022075,
|
213 |
+
"scr_dir1_threshold_100": -0.03804363318786346,
|
214 |
+
"scr_metric_threshold_100": 0.7686273868154523,
|
215 |
+
"scr_dir2_threshold_100": 0.7686273868154523,
|
216 |
+
"scr_dir1_threshold_500": 0.059782481937202626,
|
217 |
+
"scr_metric_threshold_500": 0.6549019562011037,
|
218 |
+
"scr_dir2_threshold_500": 0.6549019562011037
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.09278322958848217,
|
223 |
+
"scr_metric_threshold_2": 0.0927419335456348,
|
224 |
+
"scr_dir2_threshold_2": 0.0927419335456348,
|
225 |
+
"scr_dir1_threshold_5": 0.12886585268435963,
|
226 |
+
"scr_metric_threshold_5": 0.1491935348194436,
|
227 |
+
"scr_dir2_threshold_5": 0.1491935348194436,
|
228 |
+
"scr_dir1_threshold_10": 0.18041219375810347,
|
229 |
+
"scr_metric_threshold_10": 0.16935480382142643,
|
230 |
+
"scr_dir2_threshold_10": 0.16935480382142643,
|
231 |
+
"scr_dir1_threshold_20": 0.25773170536871925,
|
232 |
+
"scr_metric_threshold_20": 0.20967734182539205,
|
233 |
+
"scr_dir2_threshold_20": 0.20967734182539205,
|
234 |
+
"scr_dir1_threshold_50": 0.3247420716607574,
|
235 |
+
"scr_metric_threshold_50": 0.41935492399206875,
|
236 |
+
"scr_dir2_threshold_50": 0.41935492399206875,
|
237 |
+
"scr_dir1_threshold_100": 0.26288658526843595,
|
238 |
+
"scr_metric_threshold_100": 0.4475806044583308,
|
239 |
+
"scr_dir2_threshold_100": 0.4475806044583308,
|
240 |
+
"scr_dir1_threshold_500": 0.2938143284645967,
|
241 |
+
"scr_metric_threshold_500": 0.6330644715496004,
|
242 |
+
"scr_dir2_threshold_500": 0.6330644715496004
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.05855860935384807,
|
247 |
+
"scr_metric_threshold_2": 0.32888885827712394,
|
248 |
+
"scr_dir2_threshold_2": 0.32888885827712394,
|
249 |
+
"scr_dir1_threshold_5": 0.09009000301245093,
|
250 |
+
"scr_metric_threshold_5": 0.43111107814459504,
|
251 |
+
"scr_dir2_threshold_5": 0.43111107814459504,
|
252 |
+
"scr_dir1_threshold_10": 0.13513500451867638,
|
253 |
+
"scr_metric_threshold_10": 0.5244445245059836,
|
254 |
+
"scr_dir2_threshold_10": 0.5244445245059836,
|
255 |
+
"scr_dir1_threshold_20": 0.2432433303208824,
|
256 |
+
"scr_metric_threshold_20": 0.6088889324517851,
|
257 |
+
"scr_dir2_threshold_20": 0.6088889324517851,
|
258 |
+
"scr_dir1_threshold_50": 0.37387388838681374,
|
259 |
+
"scr_metric_threshold_50": 0.6666667549698347,
|
260 |
+
"scr_dir2_threshold_50": 0.6666667549698347,
|
261 |
+
"scr_dir1_threshold_100": 0.009008892905490125,
|
262 |
+
"scr_metric_threshold_100": 0.7022221139036694,
|
263 |
+
"scr_dir2_threshold_100": 0.7022221139036694,
|
264 |
+
"scr_dir1_threshold_500": 0.09009000301245093,
|
265 |
+
"scr_metric_threshold_500": 0.7555555496686777,
|
266 |
+
"scr_dir2_threshold_500": 0.7555555496686777
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.03433482762748996,
|
271 |
+
"scr_metric_threshold_2": 0.03433482762748996,
|
272 |
+
"scr_dir2_threshold_2": 0.09523798711177515,
|
273 |
+
"scr_dir1_threshold_5": 0.060085820441166386,
|
274 |
+
"scr_metric_threshold_5": 0.060085820441166386,
|
275 |
+
"scr_dir2_threshold_5": 0.13809513807839202,
|
276 |
+
"scr_dir1_threshold_10": 0.03433482762748996,
|
277 |
+
"scr_metric_threshold_10": 0.03433482762748996,
|
278 |
+
"scr_dir2_threshold_10": 0.21428575483308432,
|
279 |
+
"scr_dir1_threshold_20": 0.10300422706858779,
|
280 |
+
"scr_metric_threshold_20": 0.10300422706858779,
|
281 |
+
"scr_dir2_threshold_20": 0.22857128260089646,
|
282 |
+
"scr_dir1_threshold_50": 0.17167388232356773,
|
283 |
+
"scr_metric_threshold_50": 0.17167388232356773,
|
284 |
+
"scr_dir2_threshold_50": 0.25238077937884024,
|
285 |
+
"scr_dir1_threshold_100": 0.10729601656855352,
|
286 |
+
"scr_metric_threshold_100": 0.10729601656855352,
|
287 |
+
"scr_dir2_threshold_100": 0.300000056766318,
|
288 |
+
"scr_dir1_threshold_500": 0.03862661712745569,
|
289 |
+
"scr_metric_threshold_500": 0.03862661712745569,
|
290 |
+
"scr_dir2_threshold_500": 0.3857143586995518
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
random_seed_eval_results/scr/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "scr",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": true,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "34ee72b9-1763-4633-b984-223c06ba4460",
|
73 |
+
"datetime_epoch_millis": 1738792841644,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"scr_metrics": {
|
76 |
+
"scr_dir1_threshold_2": 0.23138581033127065,
|
77 |
+
"scr_metric_threshold_2": 0.13280867947941138,
|
78 |
+
"scr_dir2_threshold_2": 0.1354757659784241,
|
79 |
+
"scr_dir1_threshold_5": 0.25065267944291175,
|
80 |
+
"scr_metric_threshold_5": 0.19412669695057988,
|
81 |
+
"scr_dir2_threshold_5": 0.201731934928515,
|
82 |
+
"scr_dir1_threshold_10": 0.26031112509300863,
|
83 |
+
"scr_metric_threshold_10": 0.27721479325569365,
|
84 |
+
"scr_dir2_threshold_10": 0.2875535313264051,
|
85 |
+
"scr_dir1_threshold_20": 0.22049703318560473,
|
86 |
+
"scr_metric_threshold_20": 0.3181290025820012,
|
87 |
+
"scr_dir2_threshold_20": 0.3235142432818539,
|
88 |
+
"scr_dir1_threshold_50": 0.3025873216306098,
|
89 |
+
"scr_metric_threshold_50": 0.3842515908876025,
|
90 |
+
"scr_dir2_threshold_50": 0.3905181916365426,
|
91 |
+
"scr_dir1_threshold_100": 0.16621406066629962,
|
92 |
+
"scr_metric_threshold_100": 0.32000395023975886,
|
93 |
+
"scr_dir2_threshold_100": 0.3322893742558974,
|
94 |
+
"scr_dir1_threshold_500": -0.4676586781273777,
|
95 |
+
"scr_metric_threshold_500": 0.3251383965129933,
|
96 |
+
"scr_dir2_threshold_500": 0.3427886213575595
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_professor_nurse_results",
|
102 |
+
"scr_dir1_threshold_2": 0.4531256111803394,
|
103 |
+
"scr_metric_threshold_2": 0.022167494916722333,
|
104 |
+
"scr_dir2_threshold_2": 0.022167494916722333,
|
105 |
+
"scr_dir1_threshold_5": 0.5156247962732202,
|
106 |
+
"scr_metric_threshold_5": 0.044334989833444666,
|
107 |
+
"scr_dir2_threshold_5": 0.044334989833444666,
|
108 |
+
"scr_dir1_threshold_10": 0.4843752037267798,
|
109 |
+
"scr_metric_threshold_10": 0.05418707929913162,
|
110 |
+
"scr_dir2_threshold_10": 0.05418707929913162,
|
111 |
+
"scr_dir1_threshold_20": 0.43749988358469727,
|
112 |
+
"scr_metric_threshold_20": 0.07389155184943222,
|
113 |
+
"scr_dir2_threshold_20": 0.07389155184943222,
|
114 |
+
"scr_dir1_threshold_50": 0.42187508731147705,
|
115 |
+
"scr_metric_threshold_50": 0.12561575559160543,
|
116 |
+
"scr_dir2_threshold_50": 0.12561575559160543,
|
117 |
+
"scr_dir1_threshold_100": 0.42187508731147705,
|
118 |
+
"scr_metric_threshold_100": 0.19211824034177244,
|
119 |
+
"scr_dir2_threshold_100": 0.19211824034177244,
|
120 |
+
"scr_dir1_threshold_500": -1.3749997671693945,
|
121 |
+
"scr_metric_threshold_500": 0.35960581659093743,
|
122 |
+
"scr_dir2_threshold_500": 0.35960581659093743
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_architect_journalist_results",
|
126 |
+
"scr_dir1_threshold_2": 0.2803735818828607,
|
127 |
+
"scr_metric_threshold_2": 0.24495688024825762,
|
128 |
+
"scr_dir2_threshold_2": 0.24495688024825762,
|
129 |
+
"scr_dir1_threshold_5": 0.2803735818828607,
|
130 |
+
"scr_metric_threshold_5": 0.2997119051602506,
|
131 |
+
"scr_dir2_threshold_5": 0.2997119051602506,
|
132 |
+
"scr_dir1_threshold_10": 0.33644874390166457,
|
133 |
+
"scr_metric_threshold_10": 0.37175794536779155,
|
134 |
+
"scr_dir2_threshold_10": 0.37175794536779155,
|
135 |
+
"scr_dir1_threshold_20": 0.2149533022646424,
|
136 |
+
"scr_metric_threshold_20": 0.4293947775338243,
|
137 |
+
"scr_dir2_threshold_20": 0.4293947775338243,
|
138 |
+
"scr_dir1_threshold_50": 0.2149533022646424,
|
139 |
+
"scr_metric_threshold_50": 0.38328534615525994,
|
140 |
+
"scr_dir2_threshold_50": 0.38328534615525994,
|
141 |
+
"scr_dir1_threshold_100": -0.6448599067939272,
|
142 |
+
"scr_metric_threshold_100": -0.05475502491199297,
|
143 |
+
"scr_dir2_threshold_100": -0.05475502491199297,
|
144 |
+
"scr_dir1_threshold_500": -1.4018695805726726,
|
145 |
+
"scr_metric_threshold_500": -0.1527376739485105,
|
146 |
+
"scr_dir2_threshold_500": -0.1527376739485105
|
147 |
+
},
|
148 |
+
{
|
149 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_surgeon_psychologist_results",
|
150 |
+
"scr_dir1_threshold_2": 0.5468753201420825,
|
151 |
+
"scr_metric_threshold_2": 0.037974794330044616,
|
152 |
+
"scr_dir2_threshold_2": 0.037974794330044616,
|
153 |
+
"scr_dir1_threshold_5": 0.5625001164153027,
|
154 |
+
"scr_metric_threshold_5": 0.06075961056894363,
|
155 |
+
"scr_dir2_threshold_5": 0.06075961056894363,
|
156 |
+
"scr_dir1_threshold_10": 0.5625001164153027,
|
157 |
+
"scr_metric_threshold_10": 0.10632924304674166,
|
158 |
+
"scr_dir2_threshold_10": 0.10632924304674166,
|
159 |
+
"scr_dir1_threshold_20": 0.43749988358469727,
|
160 |
+
"scr_metric_threshold_20": 0.14936711191080249,
|
161 |
+
"scr_dir2_threshold_20": 0.14936711191080249,
|
162 |
+
"scr_dir1_threshold_50": 0.42187508731147705,
|
163 |
+
"scr_metric_threshold_50": 0.24303798958231634,
|
164 |
+
"scr_dir2_threshold_50": 0.24303798958231634,
|
165 |
+
"scr_dir1_threshold_100": 0.2656252619344312,
|
166 |
+
"scr_metric_threshold_100": 0.2860760093441966,
|
167 |
+
"scr_dir2_threshold_100": 0.2860760093441966,
|
168 |
+
"scr_dir1_threshold_500": -2.1718746216502662,
|
169 |
+
"scr_metric_threshold_500": 0.02784819256855401,
|
170 |
+
"scr_dir2_threshold_500": 0.02784819256855401
|
171 |
+
},
|
172 |
+
{
|
173 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_scr_attorney_teacher_results",
|
174 |
+
"scr_dir1_threshold_2": 0.2519684780685472,
|
175 |
+
"scr_metric_threshold_2": 0.16913958133840545,
|
176 |
+
"scr_dir2_threshold_2": 0.16913958133840545,
|
177 |
+
"scr_dir1_threshold_5": 0.06299223684908342,
|
178 |
+
"scr_metric_threshold_5": 0.25816026467998676,
|
179 |
+
"scr_dir2_threshold_5": 0.25816026467998676,
|
180 |
+
"scr_dir1_threshold_10": -0.03937003069873052,
|
181 |
+
"scr_metric_threshold_10": 0.32047488451379413,
|
182 |
+
"scr_dir2_threshold_10": 0.32047488451379413,
|
183 |
+
"scr_dir1_threshold_20": -0.18897624121946377,
|
184 |
+
"scr_metric_threshold_20": 0.2818991843184253,
|
185 |
+
"scr_dir2_threshold_20": 0.2818991843184253,
|
186 |
+
"scr_dir1_threshold_50": 0.46456692543836386,
|
187 |
+
"scr_metric_threshold_50": 0.3531157663653655,
|
188 |
+
"scr_dir2_threshold_50": 0.3531157663653655,
|
189 |
+
"scr_dir1_threshold_100": 0.5826770175345555,
|
190 |
+
"scr_metric_threshold_100": 0.10385764076688712,
|
191 |
+
"scr_dir2_threshold_100": 0.10385764076688712,
|
192 |
+
"scr_dir1_threshold_500": 0.4488191008899863,
|
193 |
+
"scr_metric_threshold_500": 0.19584582171455495,
|
194 |
+
"scr_dir2_threshold_500": 0.19584582171455495
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Books_CDs_and_Vinyl_results",
|
198 |
+
"scr_dir1_threshold_2": 0.08695636681213655,
|
199 |
+
"scr_metric_threshold_2": 0.13333325541876354,
|
200 |
+
"scr_dir2_threshold_2": 0.13333325541876354,
|
201 |
+
"scr_dir1_threshold_5": 0.08152165462480175,
|
202 |
+
"scr_metric_threshold_5": 0.24705868603311212,
|
203 |
+
"scr_dir2_threshold_5": 0.24705868603311212,
|
204 |
+
"scr_dir1_threshold_10": 0.08695636681213655,
|
205 |
+
"scr_metric_threshold_10": 0.5411765255867551,
|
206 |
+
"scr_dir2_threshold_10": 0.5411765255867551,
|
207 |
+
"scr_dir1_threshold_20": 0.09782611512506609,
|
208 |
+
"scr_metric_threshold_20": 0.6470586860331121,
|
209 |
+
"scr_dir2_threshold_20": 0.6470586860331121,
|
210 |
+
"scr_dir1_threshold_50": -0.05434776974986783,
|
211 |
+
"scr_metric_threshold_50": 0.7176469488264897,
|
212 |
+
"scr_dir2_threshold_50": 0.7176469488264897,
|
213 |
+
"scr_dir1_threshold_100": -0.05434776974986783,
|
214 |
+
"scr_metric_threshold_100": 0.7372547736309046,
|
215 |
+
"scr_dir2_threshold_100": 0.7372547736309046,
|
216 |
+
"scr_dir1_threshold_500": 0.05434776974986783,
|
217 |
+
"scr_metric_threshold_500": 0.7450980437988962,
|
218 |
+
"scr_dir2_threshold_500": 0.7450980437988962
|
219 |
+
},
|
220 |
+
{
|
221 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Software_Electronics_results",
|
222 |
+
"scr_dir1_threshold_2": 0.07731951161061577,
|
223 |
+
"scr_metric_threshold_2": 0.0927419335456348,
|
224 |
+
"scr_dir2_threshold_2": 0.0927419335456348,
|
225 |
+
"scr_dir1_threshold_5": 0.190721646317109,
|
226 |
+
"scr_metric_threshold_5": 0.16129039235714715,
|
227 |
+
"scr_dir2_threshold_5": 0.16129039235714715,
|
228 |
+
"scr_dir1_threshold_10": 0.24226798739085284,
|
229 |
+
"scr_metric_threshold_10": 0.20967734182539205,
|
230 |
+
"scr_dir2_threshold_10": 0.20967734182539205,
|
231 |
+
"scr_dir1_threshold_20": 0.3144329263421798,
|
232 |
+
"scr_metric_threshold_20": 0.20967734182539205,
|
233 |
+
"scr_dir2_threshold_20": 0.20967734182539205,
|
234 |
+
"scr_dir1_threshold_50": 0.28350487590559115,
|
235 |
+
"scr_metric_threshold_50": 0.3709677341825392,
|
236 |
+
"scr_dir2_threshold_50": 0.3709677341825392,
|
237 |
+
"scr_dir1_threshold_100": 0.309278353682891,
|
238 |
+
"scr_metric_threshold_100": 0.3870967974523824,
|
239 |
+
"scr_dir2_threshold_100": 0.3870967974523824,
|
240 |
+
"scr_dir1_threshold_500": 0.36082469475663487,
|
241 |
+
"scr_metric_threshold_500": 0.608870996815478,
|
242 |
+
"scr_dir2_threshold_500": 0.608870996815478
|
243 |
+
},
|
244 |
+
{
|
245 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Pet_Supplies_Office_Products_results",
|
246 |
+
"scr_dir1_threshold_2": 0.09009000301245093,
|
247 |
+
"scr_metric_threshold_2": 0.29777788609633066,
|
248 |
+
"scr_dir2_threshold_2": 0.29777788609633066,
|
249 |
+
"scr_dir1_threshold_5": 0.23423416892600485,
|
250 |
+
"scr_metric_threshold_5": 0.404444492716843,
|
251 |
+
"scr_dir2_threshold_5": 0.404444492716843,
|
252 |
+
"scr_dir1_threshold_10": 0.3063063861274755,
|
253 |
+
"scr_metric_threshold_10": 0.5111110993373553,
|
254 |
+
"scr_dir2_threshold_10": 0.5111110993373553,
|
255 |
+
"scr_dir1_threshold_20": 0.27477472397948527,
|
256 |
+
"scr_metric_threshold_20": 0.5777776953614876,
|
257 |
+
"scr_dir2_threshold_20": 0.5777776953614876,
|
258 |
+
"scr_dir1_threshold_50": 0.42792778279852933,
|
259 |
+
"scr_metric_threshold_50": 0.6399999046325785,
|
260 |
+
"scr_dir2_threshold_50": 0.6399999046325785,
|
261 |
+
"scr_dir1_threshold_100": 0.24774777677362747,
|
262 |
+
"scr_metric_threshold_100": 0.7066665006567107,
|
263 |
+
"scr_dir2_threshold_100": 0.7066665006567107,
|
264 |
+
"scr_dir1_threshold_500": 0.18468472096703434,
|
265 |
+
"scr_metric_threshold_500": 0.6577777165542479,
|
266 |
+
"scr_dir2_threshold_500": 0.6577777165542479
|
267 |
+
},
|
268 |
+
{
|
269 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_scr_Industrial_and_Scientific_Toys_and_Games_results",
|
270 |
+
"scr_dir1_threshold_2": 0.0643776099411321,
|
271 |
+
"scr_metric_threshold_2": 0.0643776099411321,
|
272 |
+
"scr_dir2_threshold_2": 0.08571430193323373,
|
273 |
+
"scr_dir1_threshold_5": 0.07725323425491137,
|
274 |
+
"scr_metric_threshold_5": 0.07725323425491137,
|
275 |
+
"scr_dir2_threshold_5": 0.13809513807839202,
|
276 |
+
"scr_dir1_threshold_10": 0.10300422706858779,
|
277 |
+
"scr_metric_threshold_10": 0.10300422706858779,
|
278 |
+
"scr_dir2_threshold_10": 0.18571413163427958,
|
279 |
+
"scr_dir1_threshold_20": 0.17596567182353345,
|
280 |
+
"scr_metric_threshold_20": 0.17596567182353345,
|
281 |
+
"scr_dir2_threshold_20": 0.21904759742235502,
|
282 |
+
"scr_dir1_threshold_50": 0.24034328176466555,
|
283 |
+
"scr_metric_threshold_50": 0.24034328176466555,
|
284 |
+
"scr_dir2_threshold_50": 0.2904760877561864,
|
285 |
+
"scr_dir1_threshold_100": 0.20171666463720986,
|
286 |
+
"scr_metric_threshold_100": 0.20171666463720986,
|
287 |
+
"scr_dir2_threshold_100": 0.300000056766318,
|
288 |
+
"scr_dir1_threshold_500": 0.15879825800978847,
|
289 |
+
"scr_metric_threshold_500": 0.15879825800978847,
|
290 |
+
"scr_dir2_threshold_500": 0.300000056766318
|
291 |
+
}
|
292 |
+
],
|
293 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
294 |
+
"sae_lens_id": "custom_sae",
|
295 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4",
|
296 |
+
"sae_lens_version": "5.4.1",
|
297 |
+
"sae_cfg_dict": {
|
298 |
+
"model_name": "gemma-2-2b",
|
299 |
+
"d_in": 2304,
|
300 |
+
"d_sae": 16384,
|
301 |
+
"hook_layer": 12,
|
302 |
+
"hook_name": "blocks.12.hook_resid_post",
|
303 |
+
"context_size": null,
|
304 |
+
"hook_head_index": null,
|
305 |
+
"architecture": "topk",
|
306 |
+
"apply_b_dec_to_input": null,
|
307 |
+
"finetuning_scaling_factor": null,
|
308 |
+
"activation_fn_str": "",
|
309 |
+
"prepend_bos": true,
|
310 |
+
"normalize_activations": "none",
|
311 |
+
"dtype": "bfloat16",
|
312 |
+
"device": "",
|
313 |
+
"dataset_path": "",
|
314 |
+
"dataset_trust_remote_code": true,
|
315 |
+
"seqpos_slice": [
|
316 |
+
null
|
317 |
+
],
|
318 |
+
"training_tokens": -100000,
|
319 |
+
"sae_lens_training_version": null,
|
320 |
+
"neuronpedia_id": null
|
321 |
+
},
|
322 |
+
"eval_result_unstructured": null
|
323 |
+
}
|
random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "6b237941-8790-4424-841f-44d2b2d4b18c",
|
30 |
+
"datetime_epoch_millis": 1738794566740,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9582500416785479,
|
34 |
+
"llm_top_1_test_accuracy": 0.6746375,
|
35 |
+
"llm_top_2_test_accuracy": 0.7199437500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.78408125,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9553687926381826,
|
44 |
+
"sae_top_1_test_accuracy": 0.73944375,
|
45 |
+
"sae_top_2_test_accuracy": 0.7974,
|
46 |
+
"sae_top_5_test_accuracy": 0.8732937500000001,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9626000404357911,
|
65 |
+
"sae_top_1_test_accuracy": 0.7674,
|
66 |
+
"sae_top_2_test_accuracy": 0.842,
|
67 |
+
"sae_top_5_test_accuracy": 0.8918000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9524000525474549,
|
76 |
+
"llm_top_1_test_accuracy": 0.6764,
|
77 |
+
"llm_top_2_test_accuracy": 0.7150000000000001,
|
78 |
+
"llm_top_5_test_accuracy": 0.7592000000000001,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9502000451087952,
|
84 |
+
"sae_top_1_test_accuracy": 0.6858000000000001,
|
85 |
+
"sae_top_2_test_accuracy": 0.7654,
|
86 |
+
"sae_top_5_test_accuracy": 0.8273999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9290000438690186,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864,
|
96 |
+
"llm_top_2_test_accuracy": 0.7316,
|
97 |
+
"llm_top_5_test_accuracy": 0.7666000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9254000306129455,
|
103 |
+
"sae_top_1_test_accuracy": 0.746,
|
104 |
+
"sae_top_2_test_accuracy": 0.8208,
|
105 |
+
"sae_top_5_test_accuracy": 0.8624,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.916200053691864,
|
114 |
+
"llm_top_1_test_accuracy": 0.6113999999999999,
|
115 |
+
"llm_top_2_test_accuracy": 0.6481999999999999,
|
116 |
+
"llm_top_5_test_accuracy": 0.6894,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.915000057220459,
|
122 |
+
"sae_top_1_test_accuracy": 0.6953999999999999,
|
123 |
+
"sae_top_2_test_accuracy": 0.7772000000000001,
|
124 |
+
"sae_top_5_test_accuracy": 0.826,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9730000495910645,
|
141 |
+
"sae_top_1_test_accuracy": 0.832,
|
142 |
+
"sae_top_2_test_accuracy": 0.83,
|
143 |
+
"sae_top_5_test_accuracy": 0.948,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9714000344276428,
|
152 |
+
"llm_top_1_test_accuracy": 0.6452000000000001,
|
153 |
+
"llm_top_2_test_accuracy": 0.6928,
|
154 |
+
"llm_top_5_test_accuracy": 0.7726,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9684000372886657,
|
160 |
+
"sae_top_1_test_accuracy": 0.6340000000000001,
|
161 |
+
"sae_top_2_test_accuracy": 0.7074,
|
162 |
+
"sae_top_5_test_accuracy": 0.8161999999999999,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9460000544786453,
|
171 |
+
"llm_top_1_test_accuracy": 0.7325,
|
172 |
+
"llm_top_2_test_accuracy": 0.77375,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9497500509023666,
|
179 |
+
"sae_top_1_test_accuracy": 0.68075,
|
180 |
+
"sae_top_2_test_accuracy": 0.737,
|
181 |
+
"sae_top_5_test_accuracy": 0.82075,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000051498413,
|
190 |
+
"llm_top_1_test_accuracy": 0.7296,
|
191 |
+
"llm_top_2_test_accuracy": 0.7868,
|
192 |
+
"llm_top_5_test_accuracy": 0.9067999999999999,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9986000299453736,
|
198 |
+
"sae_top_1_test_accuracy": 0.8742000000000001,
|
199 |
+
"sae_top_2_test_accuracy": 0.8994,
|
200 |
+
"sae_top_5_test_accuracy": 0.9938,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9440000653266907,
|
240 |
+
"1": 0.9600000381469727,
|
241 |
+
"2": 0.9520000219345093,
|
242 |
+
"6": 0.9820000529289246,
|
243 |
+
"9": 0.9750000238418579
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.593,
|
275 |
+
"1": 0.615,
|
276 |
+
"2": 0.871,
|
277 |
+
"6": 0.833,
|
278 |
+
"9": 0.925
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.627,
|
282 |
+
"1": 0.799,
|
283 |
+
"2": 0.874,
|
284 |
+
"6": 0.981,
|
285 |
+
"9": 0.929
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.846,
|
289 |
+
"1": 0.81,
|
290 |
+
"2": 0.885,
|
291 |
+
"6": 0.981,
|
292 |
+
"9": 0.937
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9650000333786011,
|
298 |
+
"13": 0.9470000267028809,
|
299 |
+
"14": 0.9540000557899475,
|
300 |
+
"18": 0.9260000586509705,
|
301 |
+
"19": 0.9590000510215759
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9620000720024109,
|
305 |
+
"13": 0.9470000267028809,
|
306 |
+
"14": 0.9580000638961792,
|
307 |
+
"18": 0.9310000538825989,
|
308 |
+
"19": 0.9640000462532043
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.558,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.656,
|
314 |
+
"18": 0.702,
|
315 |
+
"19": 0.793
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.686,
|
319 |
+
"13": 0.713,
|
320 |
+
"14": 0.687,
|
321 |
+
"18": 0.724,
|
322 |
+
"19": 0.765
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.782,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.716,
|
328 |
+
"18": 0.725,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.596,
|
333 |
+
"13": 0.692,
|
334 |
+
"14": 0.648,
|
335 |
+
"18": 0.701,
|
336 |
+
"19": 0.792
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.745,
|
340 |
+
"13": 0.681,
|
341 |
+
"14": 0.862,
|
342 |
+
"18": 0.698,
|
343 |
+
"19": 0.841
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.95,
|
347 |
+
"13": 0.713,
|
348 |
+
"14": 0.88,
|
349 |
+
"18": 0.739,
|
350 |
+
"19": 0.855
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9530000686645508,
|
356 |
+
"21": 0.9200000166893005,
|
357 |
+
"22": 0.906000018119812,
|
358 |
+
"25": 0.9520000219345093,
|
359 |
+
"26": 0.8960000276565552
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9610000252723694,
|
363 |
+
"21": 0.9140000343322754,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9630000591278076,
|
366 |
+
"26": 0.89000004529953
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.711,
|
370 |
+
"21": 0.771,
|
371 |
+
"22": 0.637,
|
372 |
+
"25": 0.687,
|
373 |
+
"26": 0.626
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.809,
|
377 |
+
"21": 0.764,
|
378 |
+
"22": 0.659,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.66
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.858,
|
384 |
+
"21": 0.795,
|
385 |
+
"22": 0.715,
|
386 |
+
"25": 0.786,
|
387 |
+
"26": 0.679
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.873,
|
391 |
+
"21": 0.521,
|
392 |
+
"22": 0.854,
|
393 |
+
"25": 0.884,
|
394 |
+
"26": 0.598
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.886,
|
398 |
+
"21": 0.824,
|
399 |
+
"22": 0.88,
|
400 |
+
"25": 0.875,
|
401 |
+
"26": 0.639
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.915,
|
405 |
+
"21": 0.843,
|
406 |
+
"22": 0.884,
|
407 |
+
"25": 0.894,
|
408 |
+
"26": 0.776
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9440000653266907,
|
414 |
+
"2": 0.9280000329017639,
|
415 |
+
"3": 0.9310000538825989,
|
416 |
+
"5": 0.9070000648498535,
|
417 |
+
"6": 0.8650000691413879
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9500000476837158,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9260000586509705,
|
423 |
+
"5": 0.9120000600814819,
|
424 |
+
"6": 0.8560000658035278
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.693,
|
428 |
+
"2": 0.607,
|
429 |
+
"3": 0.579,
|
430 |
+
"5": 0.577,
|
431 |
+
"6": 0.601
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.747,
|
435 |
+
"2": 0.64,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.628,
|
438 |
+
"6": 0.619
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.78,
|
442 |
+
"2": 0.657,
|
443 |
+
"3": 0.667,
|
444 |
+
"5": 0.659,
|
445 |
+
"6": 0.684
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.871,
|
449 |
+
"2": 0.598,
|
450 |
+
"3": 0.56,
|
451 |
+
"5": 0.861,
|
452 |
+
"6": 0.587
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.899,
|
456 |
+
"2": 0.822,
|
457 |
+
"3": 0.647,
|
458 |
+
"5": 0.873,
|
459 |
+
"6": 0.645
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.929,
|
463 |
+
"2": 0.861,
|
464 |
+
"3": 0.756,
|
465 |
+
"5": 0.863,
|
466 |
+
"6": 0.721
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9720000624656677,
|
472 |
+
"5.0": 0.9740000367164612
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.832,
|
492 |
+
"5.0": 0.832
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.83,
|
496 |
+
"5.0": 0.83
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.948,
|
500 |
+
"5.0": 0.948
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9540000557899475,
|
506 |
+
"Python": 0.984000027179718,
|
507 |
+
"HTML": 0.9800000190734863,
|
508 |
+
"Java": 0.9670000672340393,
|
509 |
+
"PHP": 0.9570000171661377
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.987000048160553,
|
514 |
+
"HTML": 0.9940000176429749,
|
515 |
+
"Java": 0.9610000252723694,
|
516 |
+
"PHP": 0.9590000510215759
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.636,
|
521 |
+
"HTML": 0.733,
|
522 |
+
"Java": 0.616,
|
523 |
+
"PHP": 0.584
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.671,
|
527 |
+
"Python": 0.668,
|
528 |
+
"HTML": 0.803,
|
529 |
+
"Java": 0.68,
|
530 |
+
"PHP": 0.642
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.765,
|
534 |
+
"Python": 0.727,
|
535 |
+
"HTML": 0.943,
|
536 |
+
"Java": 0.735,
|
537 |
+
"PHP": 0.693
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.616,
|
541 |
+
"Python": 0.632,
|
542 |
+
"HTML": 0.699,
|
543 |
+
"Java": 0.631,
|
544 |
+
"PHP": 0.592
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.62,
|
548 |
+
"Python": 0.923,
|
549 |
+
"HTML": 0.753,
|
550 |
+
"Java": 0.643,
|
551 |
+
"PHP": 0.598
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.678,
|
555 |
+
"Python": 0.931,
|
556 |
+
"HTML": 0.882,
|
557 |
+
"Java": 0.663,
|
558 |
+
"PHP": 0.927
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.940000057220459,
|
564 |
+
"1": 0.987000048160553,
|
565 |
+
"2": 0.9250000715255737,
|
566 |
+
"3": 0.9470000267028809
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9390000700950623,
|
570 |
+
"1": 0.984000027179718,
|
571 |
+
"2": 0.9160000681877136,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.806,
|
576 |
+
"1": 0.662,
|
577 |
+
"2": 0.671,
|
578 |
+
"3": 0.791
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.796,
|
582 |
+
"1": 0.796,
|
583 |
+
"2": 0.694,
|
584 |
+
"3": 0.809
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.816,
|
588 |
+
"1": 0.885,
|
589 |
+
"2": 0.744,
|
590 |
+
"3": 0.84
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.725,
|
594 |
+
"1": 0.701,
|
595 |
+
"2": 0.675,
|
596 |
+
"3": 0.622
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.809,
|
600 |
+
"1": 0.69,
|
601 |
+
"2": 0.811,
|
602 |
+
"3": 0.638
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.842,
|
606 |
+
"1": 0.838,
|
607 |
+
"2": 0.837,
|
608 |
+
"3": 0.766
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9980000257492065,
|
614 |
+
"fr": 1.0,
|
615 |
+
"de": 0.999000072479248,
|
616 |
+
"es": 0.9980000257492065,
|
617 |
+
"nl": 0.9980000257492065
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.9980000257492065
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.749,
|
628 |
+
"fr": 0.605,
|
629 |
+
"de": 0.741,
|
630 |
+
"es": 0.913,
|
631 |
+
"nl": 0.64
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.831,
|
635 |
+
"fr": 0.607,
|
636 |
+
"de": 0.828,
|
637 |
+
"es": 0.915,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.888,
|
642 |
+
"fr": 0.924,
|
643 |
+
"de": 0.882,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.86
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.838,
|
649 |
+
"fr": 0.992,
|
650 |
+
"de": 0.914,
|
651 |
+
"es": 0.87,
|
652 |
+
"nl": 0.757
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.838,
|
656 |
+
"fr": 0.99,
|
657 |
+
"de": 0.925,
|
658 |
+
"es": 0.99,
|
659 |
+
"nl": 0.754
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.998,
|
663 |
+
"fr": 0.994,
|
664 |
+
"de": 0.984,
|
665 |
+
"es": 0.995,
|
666 |
+
"nl": 0.998
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "0e9c84dc-6835-48de-9a61-d54bcf48b3aa",
|
30 |
+
"datetime_epoch_millis": 1738794472536,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9582500416785479,
|
34 |
+
"llm_top_1_test_accuracy": 0.6746375,
|
35 |
+
"llm_top_2_test_accuracy": 0.7199437500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.78408125,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9562937960028648,
|
44 |
+
"sae_top_1_test_accuracy": 0.7659312499999998,
|
45 |
+
"sae_top_2_test_accuracy": 0.8051812500000001,
|
46 |
+
"sae_top_5_test_accuracy": 0.87295625,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9636000394821167,
|
65 |
+
"sae_top_1_test_accuracy": 0.767,
|
66 |
+
"sae_top_2_test_accuracy": 0.8443999999999999,
|
67 |
+
"sae_top_5_test_accuracy": 0.9029999999999999,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9524000525474549,
|
76 |
+
"llm_top_1_test_accuracy": 0.6764,
|
77 |
+
"llm_top_2_test_accuracy": 0.7150000000000001,
|
78 |
+
"llm_top_5_test_accuracy": 0.7592000000000001,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9478000402450562,
|
84 |
+
"sae_top_1_test_accuracy": 0.6898,
|
85 |
+
"sae_top_2_test_accuracy": 0.7598,
|
86 |
+
"sae_top_5_test_accuracy": 0.8517999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9290000438690186,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864,
|
96 |
+
"llm_top_2_test_accuracy": 0.7316,
|
97 |
+
"llm_top_5_test_accuracy": 0.7666000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9302000403404236,
|
103 |
+
"sae_top_1_test_accuracy": 0.7866,
|
104 |
+
"sae_top_2_test_accuracy": 0.8008,
|
105 |
+
"sae_top_5_test_accuracy": 0.8625999999999999,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.916200053691864,
|
114 |
+
"llm_top_1_test_accuracy": 0.6113999999999999,
|
115 |
+
"llm_top_2_test_accuracy": 0.6481999999999999,
|
116 |
+
"llm_top_5_test_accuracy": 0.6894,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9182000517845154,
|
122 |
+
"sae_top_1_test_accuracy": 0.7419999999999999,
|
123 |
+
"sae_top_2_test_accuracy": 0.8026,
|
124 |
+
"sae_top_5_test_accuracy": 0.8196,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9730000495910645,
|
141 |
+
"sae_top_1_test_accuracy": 0.913,
|
142 |
+
"sae_top_2_test_accuracy": 0.913,
|
143 |
+
"sae_top_5_test_accuracy": 0.942,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9714000344276428,
|
152 |
+
"llm_top_1_test_accuracy": 0.6452000000000001,
|
153 |
+
"llm_top_2_test_accuracy": 0.6928,
|
154 |
+
"llm_top_5_test_accuracy": 0.7726,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9676000356674195,
|
160 |
+
"sae_top_1_test_accuracy": 0.6417999999999999,
|
161 |
+
"sae_top_2_test_accuracy": 0.6686000000000001,
|
162 |
+
"sae_top_5_test_accuracy": 0.8206,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9460000544786453,
|
171 |
+
"llm_top_1_test_accuracy": 0.7325,
|
172 |
+
"llm_top_2_test_accuracy": 0.77375,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9507500529289246,
|
179 |
+
"sae_top_1_test_accuracy": 0.7132499999999999,
|
180 |
+
"sae_top_2_test_accuracy": 0.73725,
|
181 |
+
"sae_top_5_test_accuracy": 0.78725,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000051498413,
|
190 |
+
"llm_top_1_test_accuracy": 0.7296,
|
191 |
+
"llm_top_2_test_accuracy": 0.7868,
|
192 |
+
"llm_top_5_test_accuracy": 0.9067999999999999,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9992000579833984,
|
198 |
+
"sae_top_1_test_accuracy": 0.874,
|
199 |
+
"sae_top_2_test_accuracy": 0.915,
|
200 |
+
"sae_top_5_test_accuracy": 0.9968,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9520000219345093,
|
240 |
+
"1": 0.9610000252723694,
|
241 |
+
"2": 0.9450000524520874,
|
242 |
+
"6": 0.9860000610351562,
|
243 |
+
"9": 0.9740000367164612
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.58,
|
275 |
+
"1": 0.632,
|
276 |
+
"2": 0.862,
|
277 |
+
"6": 0.825,
|
278 |
+
"9": 0.936
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.619,
|
282 |
+
"1": 0.814,
|
283 |
+
"2": 0.88,
|
284 |
+
"6": 0.982,
|
285 |
+
"9": 0.927
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.856,
|
289 |
+
"1": 0.85,
|
290 |
+
"2": 0.876,
|
291 |
+
"6": 0.982,
|
292 |
+
"9": 0.951
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.956000030040741,
|
298 |
+
"13": 0.9490000605583191,
|
299 |
+
"14": 0.9500000476837158,
|
300 |
+
"18": 0.9190000295639038,
|
301 |
+
"19": 0.9650000333786011
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9620000720024109,
|
305 |
+
"13": 0.9470000267028809,
|
306 |
+
"14": 0.9580000638961792,
|
307 |
+
"18": 0.9310000538825989,
|
308 |
+
"19": 0.9640000462532043
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.558,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.656,
|
314 |
+
"18": 0.702,
|
315 |
+
"19": 0.793
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.686,
|
319 |
+
"13": 0.713,
|
320 |
+
"14": 0.687,
|
321 |
+
"18": 0.724,
|
322 |
+
"19": 0.765
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.782,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.716,
|
328 |
+
"18": 0.725,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.59,
|
333 |
+
"13": 0.684,
|
334 |
+
"14": 0.637,
|
335 |
+
"18": 0.692,
|
336 |
+
"19": 0.846
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.735,
|
340 |
+
"13": 0.657,
|
341 |
+
"14": 0.878,
|
342 |
+
"18": 0.683,
|
343 |
+
"19": 0.846
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.949,
|
347 |
+
"13": 0.69,
|
348 |
+
"14": 0.881,
|
349 |
+
"18": 0.892,
|
350 |
+
"19": 0.847
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9590000510215759,
|
356 |
+
"21": 0.9340000152587891,
|
357 |
+
"22": 0.9120000600814819,
|
358 |
+
"25": 0.9610000252723694,
|
359 |
+
"26": 0.8850000500679016
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9610000252723694,
|
363 |
+
"21": 0.9140000343322754,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9630000591278076,
|
366 |
+
"26": 0.89000004529953
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.711,
|
370 |
+
"21": 0.771,
|
371 |
+
"22": 0.637,
|
372 |
+
"25": 0.687,
|
373 |
+
"26": 0.626
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.809,
|
377 |
+
"21": 0.764,
|
378 |
+
"22": 0.659,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.66
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.858,
|
384 |
+
"21": 0.795,
|
385 |
+
"22": 0.715,
|
386 |
+
"25": 0.786,
|
387 |
+
"26": 0.679
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.874,
|
391 |
+
"21": 0.763,
|
392 |
+
"22": 0.817,
|
393 |
+
"25": 0.883,
|
394 |
+
"26": 0.596
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.9,
|
398 |
+
"21": 0.758,
|
399 |
+
"22": 0.859,
|
400 |
+
"25": 0.863,
|
401 |
+
"26": 0.624
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.93,
|
405 |
+
"21": 0.846,
|
406 |
+
"22": 0.858,
|
407 |
+
"25": 0.887,
|
408 |
+
"26": 0.792
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9440000653266907,
|
414 |
+
"2": 0.9320000410079956,
|
415 |
+
"3": 0.9170000553131104,
|
416 |
+
"5": 0.9250000715255737,
|
417 |
+
"6": 0.8730000257492065
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9500000476837158,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9260000586509705,
|
423 |
+
"5": 0.9120000600814819,
|
424 |
+
"6": 0.8560000658035278
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.693,
|
428 |
+
"2": 0.607,
|
429 |
+
"3": 0.579,
|
430 |
+
"5": 0.577,
|
431 |
+
"6": 0.601
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.747,
|
435 |
+
"2": 0.64,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.628,
|
438 |
+
"6": 0.619
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.78,
|
442 |
+
"2": 0.657,
|
443 |
+
"3": 0.667,
|
444 |
+
"5": 0.659,
|
445 |
+
"6": 0.684
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.811,
|
449 |
+
"2": 0.867,
|
450 |
+
"3": 0.578,
|
451 |
+
"5": 0.86,
|
452 |
+
"6": 0.594
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.89,
|
456 |
+
"2": 0.876,
|
457 |
+
"3": 0.675,
|
458 |
+
"5": 0.87,
|
459 |
+
"6": 0.702
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.911,
|
463 |
+
"2": 0.874,
|
464 |
+
"3": 0.709,
|
465 |
+
"5": 0.879,
|
466 |
+
"6": 0.725
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9720000624656677,
|
472 |
+
"5.0": 0.9740000367164612
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.913,
|
492 |
+
"5.0": 0.913
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.913,
|
496 |
+
"5.0": 0.913
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.942,
|
500 |
+
"5.0": 0.942
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9430000185966492,
|
506 |
+
"Python": 0.987000048160553,
|
507 |
+
"HTML": 0.9910000562667847,
|
508 |
+
"Java": 0.9600000381469727,
|
509 |
+
"PHP": 0.9570000171661377
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.987000048160553,
|
514 |
+
"HTML": 0.9940000176429749,
|
515 |
+
"Java": 0.9610000252723694,
|
516 |
+
"PHP": 0.9590000510215759
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.636,
|
521 |
+
"HTML": 0.733,
|
522 |
+
"Java": 0.616,
|
523 |
+
"PHP": 0.584
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.671,
|
527 |
+
"Python": 0.668,
|
528 |
+
"HTML": 0.803,
|
529 |
+
"Java": 0.68,
|
530 |
+
"PHP": 0.642
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.765,
|
534 |
+
"Python": 0.727,
|
535 |
+
"HTML": 0.943,
|
536 |
+
"Java": 0.735,
|
537 |
+
"PHP": 0.693
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.635,
|
541 |
+
"Python": 0.636,
|
542 |
+
"HTML": 0.686,
|
543 |
+
"Java": 0.643,
|
544 |
+
"PHP": 0.609
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.657,
|
548 |
+
"Python": 0.66,
|
549 |
+
"HTML": 0.777,
|
550 |
+
"Java": 0.647,
|
551 |
+
"PHP": 0.602
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.687,
|
555 |
+
"Python": 0.938,
|
556 |
+
"HTML": 0.896,
|
557 |
+
"Java": 0.656,
|
558 |
+
"PHP": 0.926
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9450000524520874,
|
564 |
+
"1": 0.9860000610351562,
|
565 |
+
"2": 0.9250000715255737,
|
566 |
+
"3": 0.9470000267028809
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9390000700950623,
|
570 |
+
"1": 0.984000027179718,
|
571 |
+
"2": 0.9160000681877136,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.806,
|
576 |
+
"1": 0.662,
|
577 |
+
"2": 0.671,
|
578 |
+
"3": 0.791
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.796,
|
582 |
+
"1": 0.796,
|
583 |
+
"2": 0.694,
|
584 |
+
"3": 0.809
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.816,
|
588 |
+
"1": 0.885,
|
589 |
+
"2": 0.744,
|
590 |
+
"3": 0.84
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.799,
|
594 |
+
"1": 0.687,
|
595 |
+
"2": 0.724,
|
596 |
+
"3": 0.643
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.814,
|
600 |
+
"1": 0.675,
|
601 |
+
"2": 0.819,
|
602 |
+
"3": 0.641
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.849,
|
606 |
+
"1": 0.798,
|
607 |
+
"2": 0.811,
|
608 |
+
"3": 0.691
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 0.999000072479248,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.9980000257492065
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.749,
|
628 |
+
"fr": 0.605,
|
629 |
+
"de": 0.741,
|
630 |
+
"es": 0.913,
|
631 |
+
"nl": 0.64
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.831,
|
635 |
+
"fr": 0.607,
|
636 |
+
"de": 0.828,
|
637 |
+
"es": 0.915,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.888,
|
642 |
+
"fr": 0.924,
|
643 |
+
"de": 0.882,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.86
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.83,
|
649 |
+
"fr": 0.995,
|
650 |
+
"de": 0.905,
|
651 |
+
"es": 0.886,
|
652 |
+
"nl": 0.754
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.856,
|
656 |
+
"fr": 0.997,
|
657 |
+
"de": 0.991,
|
658 |
+
"es": 0.991,
|
659 |
+
"nl": 0.74
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.999,
|
663 |
+
"fr": 0.997,
|
664 |
+
"de": 0.994,
|
665 |
+
"es": 0.995,
|
666 |
+
"nl": 0.999
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "44a6fcc3-3e44-4e32-aedf-75d4817b1efc",
|
30 |
+
"datetime_epoch_millis": 1738794660341,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9582500416785479,
|
34 |
+
"llm_top_1_test_accuracy": 0.6746375,
|
35 |
+
"llm_top_2_test_accuracy": 0.7199437500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.78408125,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9547500442713499,
|
44 |
+
"sae_top_1_test_accuracy": 0.7546875000000001,
|
45 |
+
"sae_top_2_test_accuracy": 0.8126875,
|
46 |
+
"sae_top_5_test_accuracy": 0.87828125,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9614000439643859,
|
65 |
+
"sae_top_1_test_accuracy": 0.7754000000000001,
|
66 |
+
"sae_top_2_test_accuracy": 0.8552,
|
67 |
+
"sae_top_5_test_accuracy": 0.9076000000000001,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9524000525474549,
|
76 |
+
"llm_top_1_test_accuracy": 0.6764,
|
77 |
+
"llm_top_2_test_accuracy": 0.7150000000000001,
|
78 |
+
"llm_top_5_test_accuracy": 0.7592000000000001,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9456000447273254,
|
84 |
+
"sae_top_1_test_accuracy": 0.6892,
|
85 |
+
"sae_top_2_test_accuracy": 0.7634000000000001,
|
86 |
+
"sae_top_5_test_accuracy": 0.8103999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9290000438690186,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864,
|
96 |
+
"llm_top_2_test_accuracy": 0.7316,
|
97 |
+
"llm_top_5_test_accuracy": 0.7666000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9258000254631042,
|
103 |
+
"sae_top_1_test_accuracy": 0.7434,
|
104 |
+
"sae_top_2_test_accuracy": 0.7851999999999999,
|
105 |
+
"sae_top_5_test_accuracy": 0.8620000000000001,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.916200053691864,
|
114 |
+
"llm_top_1_test_accuracy": 0.6113999999999999,
|
115 |
+
"llm_top_2_test_accuracy": 0.6481999999999999,
|
116 |
+
"llm_top_5_test_accuracy": 0.6894,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9164000391960144,
|
122 |
+
"sae_top_1_test_accuracy": 0.751,
|
123 |
+
"sae_top_2_test_accuracy": 0.7876000000000001,
|
124 |
+
"sae_top_5_test_accuracy": 0.8395999999999999,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9715000689029694,
|
141 |
+
"sae_top_1_test_accuracy": 0.89,
|
142 |
+
"sae_top_2_test_accuracy": 0.891,
|
143 |
+
"sae_top_5_test_accuracy": 0.936,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9714000344276428,
|
152 |
+
"llm_top_1_test_accuracy": 0.6452000000000001,
|
153 |
+
"llm_top_2_test_accuracy": 0.6928,
|
154 |
+
"llm_top_5_test_accuracy": 0.7726,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9690000414848328,
|
160 |
+
"sae_top_1_test_accuracy": 0.635,
|
161 |
+
"sae_top_2_test_accuracy": 0.7045999999999999,
|
162 |
+
"sae_top_5_test_accuracy": 0.8392000000000002,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9460000544786453,
|
171 |
+
"llm_top_1_test_accuracy": 0.7325,
|
172 |
+
"llm_top_2_test_accuracy": 0.77375,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9495000392198563,
|
179 |
+
"sae_top_1_test_accuracy": 0.6825,
|
180 |
+
"sae_top_2_test_accuracy": 0.7375,
|
181 |
+
"sae_top_5_test_accuracy": 0.8352499999999999,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000051498413,
|
190 |
+
"llm_top_1_test_accuracy": 0.7296,
|
191 |
+
"llm_top_2_test_accuracy": 0.7868,
|
192 |
+
"llm_top_5_test_accuracy": 0.9067999999999999,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9988000512123107,
|
198 |
+
"sae_top_1_test_accuracy": 0.8710000000000001,
|
199 |
+
"sae_top_2_test_accuracy": 0.977,
|
200 |
+
"sae_top_5_test_accuracy": 0.9962,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9450000524520874,
|
240 |
+
"1": 0.956000030040741,
|
241 |
+
"2": 0.9490000605583191,
|
242 |
+
"6": 0.9830000400543213,
|
243 |
+
"9": 0.9740000367164612
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.606,
|
275 |
+
"1": 0.633,
|
276 |
+
"2": 0.878,
|
277 |
+
"6": 0.826,
|
278 |
+
"9": 0.934
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.662,
|
282 |
+
"1": 0.819,
|
283 |
+
"2": 0.885,
|
284 |
+
"6": 0.981,
|
285 |
+
"9": 0.929
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.867,
|
289 |
+
"1": 0.854,
|
290 |
+
"2": 0.89,
|
291 |
+
"6": 0.984,
|
292 |
+
"9": 0.943
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9570000171661377,
|
298 |
+
"13": 0.940000057220459,
|
299 |
+
"14": 0.9530000686645508,
|
300 |
+
"18": 0.9180000424385071,
|
301 |
+
"19": 0.9600000381469727
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9620000720024109,
|
305 |
+
"13": 0.9470000267028809,
|
306 |
+
"14": 0.9580000638961792,
|
307 |
+
"18": 0.9310000538825989,
|
308 |
+
"19": 0.9640000462532043
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.558,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.656,
|
314 |
+
"18": 0.702,
|
315 |
+
"19": 0.793
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.686,
|
319 |
+
"13": 0.713,
|
320 |
+
"14": 0.687,
|
321 |
+
"18": 0.724,
|
322 |
+
"19": 0.765
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.782,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.716,
|
328 |
+
"18": 0.725,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.582,
|
333 |
+
"13": 0.684,
|
334 |
+
"14": 0.643,
|
335 |
+
"18": 0.698,
|
336 |
+
"19": 0.839
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.745,
|
340 |
+
"13": 0.668,
|
341 |
+
"14": 0.868,
|
342 |
+
"18": 0.695,
|
343 |
+
"19": 0.841
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.865,
|
347 |
+
"13": 0.694,
|
348 |
+
"14": 0.871,
|
349 |
+
"18": 0.729,
|
350 |
+
"19": 0.893
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9570000171661377,
|
356 |
+
"21": 0.9200000166893005,
|
357 |
+
"22": 0.9100000262260437,
|
358 |
+
"25": 0.9570000171661377,
|
359 |
+
"26": 0.8850000500679016
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9610000252723694,
|
363 |
+
"21": 0.9140000343322754,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9630000591278076,
|
366 |
+
"26": 0.89000004529953
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.711,
|
370 |
+
"21": 0.771,
|
371 |
+
"22": 0.637,
|
372 |
+
"25": 0.687,
|
373 |
+
"26": 0.626
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.809,
|
377 |
+
"21": 0.764,
|
378 |
+
"22": 0.659,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.66
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.858,
|
384 |
+
"21": 0.795,
|
385 |
+
"22": 0.715,
|
386 |
+
"25": 0.786,
|
387 |
+
"26": 0.679
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.874,
|
391 |
+
"21": 0.532,
|
392 |
+
"22": 0.817,
|
393 |
+
"25": 0.876,
|
394 |
+
"26": 0.618
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.884,
|
398 |
+
"21": 0.602,
|
399 |
+
"22": 0.862,
|
400 |
+
"25": 0.875,
|
401 |
+
"26": 0.703
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.935,
|
405 |
+
"21": 0.847,
|
406 |
+
"22": 0.85,
|
407 |
+
"25": 0.887,
|
408 |
+
"26": 0.791
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9410000443458557,
|
414 |
+
"2": 0.9360000491142273,
|
415 |
+
"3": 0.9240000247955322,
|
416 |
+
"5": 0.9130000472068787,
|
417 |
+
"6": 0.8680000305175781
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9500000476837158,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9260000586509705,
|
423 |
+
"5": 0.9120000600814819,
|
424 |
+
"6": 0.8560000658035278
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.693,
|
428 |
+
"2": 0.607,
|
429 |
+
"3": 0.579,
|
430 |
+
"5": 0.577,
|
431 |
+
"6": 0.601
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.747,
|
435 |
+
"2": 0.64,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.628,
|
438 |
+
"6": 0.619
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.78,
|
442 |
+
"2": 0.657,
|
443 |
+
"3": 0.667,
|
444 |
+
"5": 0.659,
|
445 |
+
"6": 0.684
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.872,
|
449 |
+
"2": 0.869,
|
450 |
+
"3": 0.561,
|
451 |
+
"5": 0.86,
|
452 |
+
"6": 0.593
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.911,
|
456 |
+
"2": 0.873,
|
457 |
+
"3": 0.685,
|
458 |
+
"5": 0.869,
|
459 |
+
"6": 0.6
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.924,
|
463 |
+
"2": 0.879,
|
464 |
+
"3": 0.767,
|
465 |
+
"5": 0.881,
|
466 |
+
"6": 0.747
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.971000075340271,
|
472 |
+
"5.0": 0.9720000624656677
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.89,
|
492 |
+
"5.0": 0.89
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.891,
|
496 |
+
"5.0": 0.891
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.936,
|
500 |
+
"5.0": 0.936
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.956000030040741,
|
506 |
+
"Python": 0.9820000529289246,
|
507 |
+
"HTML": 0.9880000352859497,
|
508 |
+
"Java": 0.9570000171661377,
|
509 |
+
"PHP": 0.9620000720024109
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.987000048160553,
|
514 |
+
"HTML": 0.9940000176429749,
|
515 |
+
"Java": 0.9610000252723694,
|
516 |
+
"PHP": 0.9590000510215759
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.636,
|
521 |
+
"HTML": 0.733,
|
522 |
+
"Java": 0.616,
|
523 |
+
"PHP": 0.584
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.671,
|
527 |
+
"Python": 0.668,
|
528 |
+
"HTML": 0.803,
|
529 |
+
"Java": 0.68,
|
530 |
+
"PHP": 0.642
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.765,
|
534 |
+
"Python": 0.727,
|
535 |
+
"HTML": 0.943,
|
536 |
+
"Java": 0.735,
|
537 |
+
"PHP": 0.693
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.624,
|
541 |
+
"Python": 0.629,
|
542 |
+
"HTML": 0.701,
|
543 |
+
"Java": 0.62,
|
544 |
+
"PHP": 0.601
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.631,
|
548 |
+
"Python": 0.911,
|
549 |
+
"HTML": 0.734,
|
550 |
+
"Java": 0.653,
|
551 |
+
"PHP": 0.594
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.684,
|
555 |
+
"Python": 0.936,
|
556 |
+
"HTML": 0.955,
|
557 |
+
"Java": 0.7,
|
558 |
+
"PHP": 0.921
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9340000152587891,
|
564 |
+
"1": 0.984000027179718,
|
565 |
+
"2": 0.9320000410079956,
|
566 |
+
"3": 0.9480000734329224
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9390000700950623,
|
570 |
+
"1": 0.984000027179718,
|
571 |
+
"2": 0.9160000681877136,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.806,
|
576 |
+
"1": 0.662,
|
577 |
+
"2": 0.671,
|
578 |
+
"3": 0.791
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.796,
|
582 |
+
"1": 0.796,
|
583 |
+
"2": 0.694,
|
584 |
+
"3": 0.809
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.816,
|
588 |
+
"1": 0.885,
|
589 |
+
"2": 0.744,
|
590 |
+
"3": 0.84
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.743,
|
594 |
+
"1": 0.692,
|
595 |
+
"2": 0.671,
|
596 |
+
"3": 0.624
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.828,
|
600 |
+
"1": 0.682,
|
601 |
+
"2": 0.8,
|
602 |
+
"3": 0.64
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.842,
|
606 |
+
"1": 0.841,
|
607 |
+
"2": 0.836,
|
608 |
+
"3": 0.822
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 0.9970000386238098,
|
616 |
+
"es": 0.999000072479248,
|
617 |
+
"nl": 1.0
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.9980000257492065
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.749,
|
628 |
+
"fr": 0.605,
|
629 |
+
"de": 0.741,
|
630 |
+
"es": 0.913,
|
631 |
+
"nl": 0.64
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.831,
|
635 |
+
"fr": 0.607,
|
636 |
+
"de": 0.828,
|
637 |
+
"es": 0.915,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.888,
|
642 |
+
"fr": 0.924,
|
643 |
+
"de": 0.882,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.86
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.84,
|
649 |
+
"fr": 0.992,
|
650 |
+
"de": 0.9,
|
651 |
+
"es": 0.877,
|
652 |
+
"nl": 0.746
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.998,
|
656 |
+
"fr": 0.99,
|
657 |
+
"de": 0.907,
|
658 |
+
"es": 0.991,
|
659 |
+
"nl": 0.999
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 1.0,
|
663 |
+
"fr": 0.994,
|
664 |
+
"de": 0.992,
|
665 |
+
"es": 0.996,
|
666 |
+
"nl": 0.999
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "c9ab252b-9c9e-44fa-bcb2-af3a1b348f2b",
|
30 |
+
"datetime_epoch_millis": 1738794751340,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9582500416785479,
|
34 |
+
"llm_top_1_test_accuracy": 0.6746375,
|
35 |
+
"llm_top_2_test_accuracy": 0.7199437500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.78408125,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9557312924414874,
|
44 |
+
"sae_top_1_test_accuracy": 0.74515625,
|
45 |
+
"sae_top_2_test_accuracy": 0.8067187499999999,
|
46 |
+
"sae_top_5_test_accuracy": 0.8559499999999999,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.9648000359535217,
|
65 |
+
"sae_top_1_test_accuracy": 0.7744,
|
66 |
+
"sae_top_2_test_accuracy": 0.8164,
|
67 |
+
"sae_top_5_test_accuracy": 0.8907999999999999,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9524000525474549,
|
76 |
+
"llm_top_1_test_accuracy": 0.6764,
|
77 |
+
"llm_top_2_test_accuracy": 0.7150000000000001,
|
78 |
+
"llm_top_5_test_accuracy": 0.7592000000000001,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9498000383377075,
|
84 |
+
"sae_top_1_test_accuracy": 0.6796,
|
85 |
+
"sae_top_2_test_accuracy": 0.7634,
|
86 |
+
"sae_top_5_test_accuracy": 0.8123999999999999,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9290000438690186,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864,
|
96 |
+
"llm_top_2_test_accuracy": 0.7316,
|
97 |
+
"llm_top_5_test_accuracy": 0.7666000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9282000422477722,
|
103 |
+
"sae_top_1_test_accuracy": 0.7568,
|
104 |
+
"sae_top_2_test_accuracy": 0.8108000000000001,
|
105 |
+
"sae_top_5_test_accuracy": 0.8694,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.916200053691864,
|
114 |
+
"llm_top_1_test_accuracy": 0.6113999999999999,
|
115 |
+
"llm_top_2_test_accuracy": 0.6481999999999999,
|
116 |
+
"llm_top_5_test_accuracy": 0.6894,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9122000455856323,
|
122 |
+
"sae_top_1_test_accuracy": 0.7558,
|
123 |
+
"sae_top_2_test_accuracy": 0.7975999999999999,
|
124 |
+
"sae_top_5_test_accuracy": 0.8458,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9745000302791595,
|
141 |
+
"sae_top_1_test_accuracy": 0.819,
|
142 |
+
"sae_top_2_test_accuracy": 0.821,
|
143 |
+
"sae_top_5_test_accuracy": 0.833,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9714000344276428,
|
152 |
+
"llm_top_1_test_accuracy": 0.6452000000000001,
|
153 |
+
"llm_top_2_test_accuracy": 0.6928,
|
154 |
+
"llm_top_5_test_accuracy": 0.7726,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.9682000517845154,
|
160 |
+
"sae_top_1_test_accuracy": 0.6362,
|
161 |
+
"sae_top_2_test_accuracy": 0.7532,
|
162 |
+
"sae_top_5_test_accuracy": 0.8176,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9460000544786453,
|
171 |
+
"llm_top_1_test_accuracy": 0.7325,
|
172 |
+
"llm_top_2_test_accuracy": 0.77375,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9497500509023666,
|
179 |
+
"sae_top_1_test_accuracy": 0.6652500000000001,
|
180 |
+
"sae_top_2_test_accuracy": 0.71375,
|
181 |
+
"sae_top_5_test_accuracy": 0.782,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000051498413,
|
190 |
+
"llm_top_1_test_accuracy": 0.7296,
|
191 |
+
"llm_top_2_test_accuracy": 0.7868,
|
192 |
+
"llm_top_5_test_accuracy": 0.9067999999999999,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9984000444412231,
|
198 |
+
"sae_top_1_test_accuracy": 0.8741999999999999,
|
199 |
+
"sae_top_2_test_accuracy": 0.9776,
|
200 |
+
"sae_top_5_test_accuracy": 0.9966000000000002,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9470000267028809,
|
240 |
+
"1": 0.968000054359436,
|
241 |
+
"2": 0.9520000219345093,
|
242 |
+
"6": 0.9880000352859497,
|
243 |
+
"9": 0.9690000414848328
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.599,
|
275 |
+
"1": 0.631,
|
276 |
+
"2": 0.893,
|
277 |
+
"6": 0.828,
|
278 |
+
"9": 0.921
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.678,
|
282 |
+
"1": 0.609,
|
283 |
+
"2": 0.891,
|
284 |
+
"6": 0.977,
|
285 |
+
"9": 0.927
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.842,
|
289 |
+
"1": 0.775,
|
290 |
+
"2": 0.91,
|
291 |
+
"6": 0.981,
|
292 |
+
"9": 0.946
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9570000171661377,
|
298 |
+
"13": 0.9500000476837158,
|
299 |
+
"14": 0.9540000557899475,
|
300 |
+
"18": 0.9280000329017639,
|
301 |
+
"19": 0.9600000381469727
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9620000720024109,
|
305 |
+
"13": 0.9470000267028809,
|
306 |
+
"14": 0.9580000638961792,
|
307 |
+
"18": 0.9310000538825989,
|
308 |
+
"19": 0.9640000462532043
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.558,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.656,
|
314 |
+
"18": 0.702,
|
315 |
+
"19": 0.793
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.686,
|
319 |
+
"13": 0.713,
|
320 |
+
"14": 0.687,
|
321 |
+
"18": 0.724,
|
322 |
+
"19": 0.765
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.782,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.716,
|
328 |
+
"18": 0.725,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.583,
|
333 |
+
"13": 0.686,
|
334 |
+
"14": 0.647,
|
335 |
+
"18": 0.679,
|
336 |
+
"19": 0.803
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.741,
|
340 |
+
"13": 0.676,
|
341 |
+
"14": 0.868,
|
342 |
+
"18": 0.696,
|
343 |
+
"19": 0.836
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.865,
|
347 |
+
"13": 0.688,
|
348 |
+
"14": 0.892,
|
349 |
+
"18": 0.732,
|
350 |
+
"19": 0.885
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9510000348091125,
|
356 |
+
"21": 0.9260000586509705,
|
357 |
+
"22": 0.9150000214576721,
|
358 |
+
"25": 0.9600000381469727,
|
359 |
+
"26": 0.8890000581741333
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9610000252723694,
|
363 |
+
"21": 0.9140000343322754,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9630000591278076,
|
366 |
+
"26": 0.89000004529953
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.711,
|
370 |
+
"21": 0.771,
|
371 |
+
"22": 0.637,
|
372 |
+
"25": 0.687,
|
373 |
+
"26": 0.626
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.809,
|
377 |
+
"21": 0.764,
|
378 |
+
"22": 0.659,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.66
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.858,
|
384 |
+
"21": 0.795,
|
385 |
+
"22": 0.715,
|
386 |
+
"25": 0.786,
|
387 |
+
"26": 0.679
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.872,
|
391 |
+
"21": 0.521,
|
392 |
+
"22": 0.888,
|
393 |
+
"25": 0.875,
|
394 |
+
"26": 0.628
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.917,
|
398 |
+
"21": 0.744,
|
399 |
+
"22": 0.894,
|
400 |
+
"25": 0.869,
|
401 |
+
"26": 0.63
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.942,
|
405 |
+
"21": 0.846,
|
406 |
+
"22": 0.888,
|
407 |
+
"25": 0.889,
|
408 |
+
"26": 0.782
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9480000734329224,
|
414 |
+
"2": 0.9260000586509705,
|
415 |
+
"3": 0.9200000166893005,
|
416 |
+
"5": 0.909000039100647,
|
417 |
+
"6": 0.8580000400543213
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9500000476837158,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9260000586509705,
|
423 |
+
"5": 0.9120000600814819,
|
424 |
+
"6": 0.8560000658035278
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.693,
|
428 |
+
"2": 0.607,
|
429 |
+
"3": 0.579,
|
430 |
+
"5": 0.577,
|
431 |
+
"6": 0.601
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.747,
|
435 |
+
"2": 0.64,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.628,
|
438 |
+
"6": 0.619
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.78,
|
442 |
+
"2": 0.657,
|
443 |
+
"3": 0.667,
|
444 |
+
"5": 0.659,
|
445 |
+
"6": 0.684
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.868,
|
449 |
+
"2": 0.846,
|
450 |
+
"3": 0.579,
|
451 |
+
"5": 0.889,
|
452 |
+
"6": 0.597
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.899,
|
456 |
+
"2": 0.856,
|
457 |
+
"3": 0.674,
|
458 |
+
"5": 0.888,
|
459 |
+
"6": 0.671
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.923,
|
463 |
+
"2": 0.882,
|
464 |
+
"3": 0.767,
|
465 |
+
"5": 0.896,
|
466 |
+
"6": 0.761
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9750000238418579,
|
472 |
+
"5.0": 0.9740000367164612
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.819,
|
492 |
+
"5.0": 0.819
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.821,
|
496 |
+
"5.0": 0.821
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.833,
|
500 |
+
"5.0": 0.833
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9540000557899475,
|
506 |
+
"Python": 0.9810000658035278,
|
507 |
+
"HTML": 0.9880000352859497,
|
508 |
+
"Java": 0.9620000720024109,
|
509 |
+
"PHP": 0.956000030040741
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.987000048160553,
|
514 |
+
"HTML": 0.9940000176429749,
|
515 |
+
"Java": 0.9610000252723694,
|
516 |
+
"PHP": 0.9590000510215759
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.636,
|
521 |
+
"HTML": 0.733,
|
522 |
+
"Java": 0.616,
|
523 |
+
"PHP": 0.584
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.671,
|
527 |
+
"Python": 0.668,
|
528 |
+
"HTML": 0.803,
|
529 |
+
"Java": 0.68,
|
530 |
+
"PHP": 0.642
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.765,
|
534 |
+
"Python": 0.727,
|
535 |
+
"HTML": 0.943,
|
536 |
+
"Java": 0.735,
|
537 |
+
"PHP": 0.693
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.615,
|
541 |
+
"Python": 0.631,
|
542 |
+
"HTML": 0.687,
|
543 |
+
"Java": 0.642,
|
544 |
+
"PHP": 0.606
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.587,
|
548 |
+
"Python": 0.921,
|
549 |
+
"HTML": 0.692,
|
550 |
+
"Java": 0.653,
|
551 |
+
"PHP": 0.913
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.681,
|
555 |
+
"Python": 0.935,
|
556 |
+
"HTML": 0.87,
|
557 |
+
"Java": 0.685,
|
558 |
+
"PHP": 0.917
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9410000443458557,
|
564 |
+
"1": 0.9810000658035278,
|
565 |
+
"2": 0.9290000200271606,
|
566 |
+
"3": 0.9480000734329224
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9390000700950623,
|
570 |
+
"1": 0.984000027179718,
|
571 |
+
"2": 0.9160000681877136,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.806,
|
576 |
+
"1": 0.662,
|
577 |
+
"2": 0.671,
|
578 |
+
"3": 0.791
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.796,
|
582 |
+
"1": 0.796,
|
583 |
+
"2": 0.694,
|
584 |
+
"3": 0.809
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.816,
|
588 |
+
"1": 0.885,
|
589 |
+
"2": 0.744,
|
590 |
+
"3": 0.84
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.687,
|
594 |
+
"1": 0.7,
|
595 |
+
"2": 0.651,
|
596 |
+
"3": 0.623
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.839,
|
600 |
+
"1": 0.693,
|
601 |
+
"2": 0.695,
|
602 |
+
"3": 0.628
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.853,
|
606 |
+
"1": 0.808,
|
607 |
+
"2": 0.747,
|
608 |
+
"3": 0.72
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.9980000257492065,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 0.9980000257492065,
|
616 |
+
"es": 0.999000072479248,
|
617 |
+
"nl": 0.9980000257492065
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.9980000257492065
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.749,
|
628 |
+
"fr": 0.605,
|
629 |
+
"de": 0.741,
|
630 |
+
"es": 0.913,
|
631 |
+
"nl": 0.64
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.831,
|
635 |
+
"fr": 0.607,
|
636 |
+
"de": 0.828,
|
637 |
+
"es": 0.915,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.888,
|
642 |
+
"fr": 0.924,
|
643 |
+
"de": 0.882,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.86
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.847,
|
649 |
+
"fr": 0.993,
|
650 |
+
"de": 0.908,
|
651 |
+
"es": 0.881,
|
652 |
+
"nl": 0.742
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.999,
|
656 |
+
"fr": 0.995,
|
657 |
+
"de": 0.997,
|
658 |
+
"es": 0.899,
|
659 |
+
"nl": 0.998
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 1.0,
|
663 |
+
"fr": 0.995,
|
664 |
+
"de": 0.996,
|
665 |
+
"es": 0.996,
|
666 |
+
"nl": 0.996
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
random_seed_eval_results/sparse_probing/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,670 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "sparse_probing",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"LabHC/bias_in_bios_class_set2",
|
8 |
+
"LabHC/bias_in_bios_class_set3",
|
9 |
+
"canrager/amazon_reviews_mcauley_1and5",
|
10 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment",
|
11 |
+
"codeparrot/github-code",
|
12 |
+
"fancyzhx/ag_news",
|
13 |
+
"Helsinki-NLP/europarl"
|
14 |
+
],
|
15 |
+
"probe_train_set_size": 4000,
|
16 |
+
"probe_test_set_size": 1000,
|
17 |
+
"context_length": 128,
|
18 |
+
"sae_batch_size": 125,
|
19 |
+
"llm_batch_size": 32,
|
20 |
+
"llm_dtype": "bfloat16",
|
21 |
+
"model_name": "gemma-2-2b",
|
22 |
+
"k_values": [
|
23 |
+
1,
|
24 |
+
2,
|
25 |
+
5
|
26 |
+
],
|
27 |
+
"lower_vram_usage": false
|
28 |
+
},
|
29 |
+
"eval_id": "a539ae76-2f7a-40a7-a02f-2a8ba952f201",
|
30 |
+
"datetime_epoch_millis": 1738794843835,
|
31 |
+
"eval_result_metrics": {
|
32 |
+
"llm": {
|
33 |
+
"llm_test_accuracy": 0.9582500416785479,
|
34 |
+
"llm_top_1_test_accuracy": 0.6746375,
|
35 |
+
"llm_top_2_test_accuracy": 0.7199437500000001,
|
36 |
+
"llm_top_5_test_accuracy": 0.78408125,
|
37 |
+
"llm_top_10_test_accuracy": null,
|
38 |
+
"llm_top_20_test_accuracy": null,
|
39 |
+
"llm_top_50_test_accuracy": null,
|
40 |
+
"llm_top_100_test_accuracy": null
|
41 |
+
},
|
42 |
+
"sae": {
|
43 |
+
"sae_test_accuracy": 0.9551625456660985,
|
44 |
+
"sae_top_1_test_accuracy": 0.7446249999999999,
|
45 |
+
"sae_top_2_test_accuracy": 0.8137749999999999,
|
46 |
+
"sae_top_5_test_accuracy": 0.87466875,
|
47 |
+
"sae_top_10_test_accuracy": null,
|
48 |
+
"sae_top_20_test_accuracy": null,
|
49 |
+
"sae_top_50_test_accuracy": null,
|
50 |
+
"sae_top_100_test_accuracy": null
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"eval_result_details": [
|
54 |
+
{
|
55 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_results",
|
56 |
+
"llm_test_accuracy": 0.9694000363349915,
|
57 |
+
"llm_top_1_test_accuracy": 0.6436000000000001,
|
58 |
+
"llm_top_2_test_accuracy": 0.6874,
|
59 |
+
"llm_top_5_test_accuracy": 0.7908,
|
60 |
+
"llm_top_10_test_accuracy": null,
|
61 |
+
"llm_top_20_test_accuracy": null,
|
62 |
+
"llm_top_50_test_accuracy": null,
|
63 |
+
"llm_top_100_test_accuracy": null,
|
64 |
+
"sae_test_accuracy": 0.963200044631958,
|
65 |
+
"sae_top_1_test_accuracy": 0.7737999999999999,
|
66 |
+
"sae_top_2_test_accuracy": 0.8460000000000001,
|
67 |
+
"sae_top_5_test_accuracy": 0.9004,
|
68 |
+
"sae_top_10_test_accuracy": null,
|
69 |
+
"sae_top_20_test_accuracy": null,
|
70 |
+
"sae_top_50_test_accuracy": null,
|
71 |
+
"sae_top_100_test_accuracy": null
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"dataset_name": "LabHC/bias_in_bios_class_set2_results",
|
75 |
+
"llm_test_accuracy": 0.9524000525474549,
|
76 |
+
"llm_top_1_test_accuracy": 0.6764,
|
77 |
+
"llm_top_2_test_accuracy": 0.7150000000000001,
|
78 |
+
"llm_top_5_test_accuracy": 0.7592000000000001,
|
79 |
+
"llm_top_10_test_accuracy": null,
|
80 |
+
"llm_top_20_test_accuracy": null,
|
81 |
+
"llm_top_50_test_accuracy": null,
|
82 |
+
"llm_top_100_test_accuracy": null,
|
83 |
+
"sae_test_accuracy": 0.9440000414848327,
|
84 |
+
"sae_top_1_test_accuracy": 0.6816000000000001,
|
85 |
+
"sae_top_2_test_accuracy": 0.79,
|
86 |
+
"sae_top_5_test_accuracy": 0.8244,
|
87 |
+
"sae_top_10_test_accuracy": null,
|
88 |
+
"sae_top_20_test_accuracy": null,
|
89 |
+
"sae_top_50_test_accuracy": null,
|
90 |
+
"sae_top_100_test_accuracy": null
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"dataset_name": "LabHC/bias_in_bios_class_set3_results",
|
94 |
+
"llm_test_accuracy": 0.9290000438690186,
|
95 |
+
"llm_top_1_test_accuracy": 0.6864,
|
96 |
+
"llm_top_2_test_accuracy": 0.7316,
|
97 |
+
"llm_top_5_test_accuracy": 0.7666000000000001,
|
98 |
+
"llm_top_10_test_accuracy": null,
|
99 |
+
"llm_top_20_test_accuracy": null,
|
100 |
+
"llm_top_50_test_accuracy": null,
|
101 |
+
"llm_top_100_test_accuracy": null,
|
102 |
+
"sae_test_accuracy": 0.9278000473976136,
|
103 |
+
"sae_top_1_test_accuracy": 0.6816,
|
104 |
+
"sae_top_2_test_accuracy": 0.783,
|
105 |
+
"sae_top_5_test_accuracy": 0.8649999999999999,
|
106 |
+
"sae_top_10_test_accuracy": null,
|
107 |
+
"sae_top_20_test_accuracy": null,
|
108 |
+
"sae_top_50_test_accuracy": null,
|
109 |
+
"sae_top_100_test_accuracy": null
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_results",
|
113 |
+
"llm_test_accuracy": 0.916200053691864,
|
114 |
+
"llm_top_1_test_accuracy": 0.6113999999999999,
|
115 |
+
"llm_top_2_test_accuracy": 0.6481999999999999,
|
116 |
+
"llm_top_5_test_accuracy": 0.6894,
|
117 |
+
"llm_top_10_test_accuracy": null,
|
118 |
+
"llm_top_20_test_accuracy": null,
|
119 |
+
"llm_top_50_test_accuracy": null,
|
120 |
+
"llm_top_100_test_accuracy": null,
|
121 |
+
"sae_test_accuracy": 0.9196000576019288,
|
122 |
+
"sae_top_1_test_accuracy": 0.748,
|
123 |
+
"sae_top_2_test_accuracy": 0.7790000000000001,
|
124 |
+
"sae_top_5_test_accuracy": 0.8288,
|
125 |
+
"sae_top_10_test_accuracy": null,
|
126 |
+
"sae_top_20_test_accuracy": null,
|
127 |
+
"sae_top_50_test_accuracy": null,
|
128 |
+
"sae_top_100_test_accuracy": null
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_sentiment_results",
|
132 |
+
"llm_test_accuracy": 0.9820000529289246,
|
133 |
+
"llm_top_1_test_accuracy": 0.672,
|
134 |
+
"llm_top_2_test_accuracy": 0.724,
|
135 |
+
"llm_top_5_test_accuracy": 0.766,
|
136 |
+
"llm_top_10_test_accuracy": null,
|
137 |
+
"llm_top_20_test_accuracy": null,
|
138 |
+
"llm_top_50_test_accuracy": null,
|
139 |
+
"llm_top_100_test_accuracy": null,
|
140 |
+
"sae_test_accuracy": 0.9735000431537628,
|
141 |
+
"sae_top_1_test_accuracy": 0.882,
|
142 |
+
"sae_top_2_test_accuracy": 0.909,
|
143 |
+
"sae_top_5_test_accuracy": 0.947,
|
144 |
+
"sae_top_10_test_accuracy": null,
|
145 |
+
"sae_top_20_test_accuracy": null,
|
146 |
+
"sae_top_50_test_accuracy": null,
|
147 |
+
"sae_top_100_test_accuracy": null
|
148 |
+
},
|
149 |
+
{
|
150 |
+
"dataset_name": "codeparrot/github-code_results",
|
151 |
+
"llm_test_accuracy": 0.9714000344276428,
|
152 |
+
"llm_top_1_test_accuracy": 0.6452000000000001,
|
153 |
+
"llm_top_2_test_accuracy": 0.6928,
|
154 |
+
"llm_top_5_test_accuracy": 0.7726,
|
155 |
+
"llm_top_10_test_accuracy": null,
|
156 |
+
"llm_top_20_test_accuracy": null,
|
157 |
+
"llm_top_50_test_accuracy": null,
|
158 |
+
"llm_top_100_test_accuracy": null,
|
159 |
+
"sae_test_accuracy": 0.966200053691864,
|
160 |
+
"sae_top_1_test_accuracy": 0.6402,
|
161 |
+
"sae_top_2_test_accuracy": 0.7772,
|
162 |
+
"sae_top_5_test_accuracy": 0.8308,
|
163 |
+
"sae_top_10_test_accuracy": null,
|
164 |
+
"sae_top_20_test_accuracy": null,
|
165 |
+
"sae_top_50_test_accuracy": null,
|
166 |
+
"sae_top_100_test_accuracy": null
|
167 |
+
},
|
168 |
+
{
|
169 |
+
"dataset_name": "fancyzhx/ag_news_results",
|
170 |
+
"llm_test_accuracy": 0.9460000544786453,
|
171 |
+
"llm_top_1_test_accuracy": 0.7325,
|
172 |
+
"llm_top_2_test_accuracy": 0.77375,
|
173 |
+
"llm_top_5_test_accuracy": 0.82125,
|
174 |
+
"llm_top_10_test_accuracy": null,
|
175 |
+
"llm_top_20_test_accuracy": null,
|
176 |
+
"llm_top_50_test_accuracy": null,
|
177 |
+
"llm_top_100_test_accuracy": null,
|
178 |
+
"sae_test_accuracy": 0.9480000287294388,
|
179 |
+
"sae_top_1_test_accuracy": 0.693,
|
180 |
+
"sae_top_2_test_accuracy": 0.696,
|
181 |
+
"sae_top_5_test_accuracy": 0.81675,
|
182 |
+
"sae_top_10_test_accuracy": null,
|
183 |
+
"sae_top_20_test_accuracy": null,
|
184 |
+
"sae_top_50_test_accuracy": null,
|
185 |
+
"sae_top_100_test_accuracy": null
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"dataset_name": "Helsinki-NLP/europarl_results",
|
189 |
+
"llm_test_accuracy": 0.9996000051498413,
|
190 |
+
"llm_top_1_test_accuracy": 0.7296,
|
191 |
+
"llm_top_2_test_accuracy": 0.7868,
|
192 |
+
"llm_top_5_test_accuracy": 0.9067999999999999,
|
193 |
+
"llm_top_10_test_accuracy": null,
|
194 |
+
"llm_top_20_test_accuracy": null,
|
195 |
+
"llm_top_50_test_accuracy": null,
|
196 |
+
"llm_top_100_test_accuracy": null,
|
197 |
+
"sae_test_accuracy": 0.9990000486373901,
|
198 |
+
"sae_top_1_test_accuracy": 0.8568,
|
199 |
+
"sae_top_2_test_accuracy": 0.9299999999999999,
|
200 |
+
"sae_top_5_test_accuracy": 0.9842000000000001,
|
201 |
+
"sae_top_10_test_accuracy": null,
|
202 |
+
"sae_top_20_test_accuracy": null,
|
203 |
+
"sae_top_50_test_accuracy": null,
|
204 |
+
"sae_top_100_test_accuracy": null
|
205 |
+
}
|
206 |
+
],
|
207 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
208 |
+
"sae_lens_id": "custom_sae",
|
209 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4",
|
210 |
+
"sae_lens_version": "5.4.1",
|
211 |
+
"sae_cfg_dict": {
|
212 |
+
"model_name": "gemma-2-2b",
|
213 |
+
"d_in": 2304,
|
214 |
+
"d_sae": 16384,
|
215 |
+
"hook_layer": 12,
|
216 |
+
"hook_name": "blocks.12.hook_resid_post",
|
217 |
+
"context_size": null,
|
218 |
+
"hook_head_index": null,
|
219 |
+
"architecture": "topk",
|
220 |
+
"apply_b_dec_to_input": null,
|
221 |
+
"finetuning_scaling_factor": null,
|
222 |
+
"activation_fn_str": "",
|
223 |
+
"prepend_bos": true,
|
224 |
+
"normalize_activations": "none",
|
225 |
+
"dtype": "bfloat16",
|
226 |
+
"device": "",
|
227 |
+
"dataset_path": "",
|
228 |
+
"dataset_trust_remote_code": true,
|
229 |
+
"seqpos_slice": [
|
230 |
+
null
|
231 |
+
],
|
232 |
+
"training_tokens": -100000,
|
233 |
+
"sae_lens_training_version": null,
|
234 |
+
"neuronpedia_id": null
|
235 |
+
},
|
236 |
+
"eval_result_unstructured": {
|
237 |
+
"LabHC/bias_in_bios_class_set1_results": {
|
238 |
+
"sae_test_accuracy": {
|
239 |
+
"0": 0.9430000185966492,
|
240 |
+
"1": 0.9600000381469727,
|
241 |
+
"2": 0.9480000734329224,
|
242 |
+
"6": 0.9880000352859497,
|
243 |
+
"9": 0.9770000576972961
|
244 |
+
},
|
245 |
+
"llm_test_accuracy": {
|
246 |
+
"0": 0.9510000348091125,
|
247 |
+
"1": 0.9670000672340393,
|
248 |
+
"2": 0.9520000219345093,
|
249 |
+
"6": 0.9930000305175781,
|
250 |
+
"9": 0.984000027179718
|
251 |
+
},
|
252 |
+
"llm_top_1_test_accuracy": {
|
253 |
+
"0": 0.568,
|
254 |
+
"1": 0.629,
|
255 |
+
"2": 0.679,
|
256 |
+
"6": 0.791,
|
257 |
+
"9": 0.551
|
258 |
+
},
|
259 |
+
"llm_top_2_test_accuracy": {
|
260 |
+
"0": 0.585,
|
261 |
+
"1": 0.666,
|
262 |
+
"2": 0.673,
|
263 |
+
"6": 0.801,
|
264 |
+
"9": 0.712
|
265 |
+
},
|
266 |
+
"llm_top_5_test_accuracy": {
|
267 |
+
"0": 0.72,
|
268 |
+
"1": 0.707,
|
269 |
+
"2": 0.764,
|
270 |
+
"6": 0.899,
|
271 |
+
"9": 0.864
|
272 |
+
},
|
273 |
+
"sae_top_1_test_accuracy": {
|
274 |
+
"0": 0.596,
|
275 |
+
"1": 0.637,
|
276 |
+
"2": 0.871,
|
277 |
+
"6": 0.832,
|
278 |
+
"9": 0.933
|
279 |
+
},
|
280 |
+
"sae_top_2_test_accuracy": {
|
281 |
+
"0": 0.645,
|
282 |
+
"1": 0.798,
|
283 |
+
"2": 0.882,
|
284 |
+
"6": 0.98,
|
285 |
+
"9": 0.925
|
286 |
+
},
|
287 |
+
"sae_top_5_test_accuracy": {
|
288 |
+
"0": 0.843,
|
289 |
+
"1": 0.841,
|
290 |
+
"2": 0.893,
|
291 |
+
"6": 0.981,
|
292 |
+
"9": 0.944
|
293 |
+
}
|
294 |
+
},
|
295 |
+
"LabHC/bias_in_bios_class_set2_results": {
|
296 |
+
"sae_test_accuracy": {
|
297 |
+
"11": 0.9590000510215759,
|
298 |
+
"13": 0.9450000524520874,
|
299 |
+
"14": 0.9540000557899475,
|
300 |
+
"18": 0.9100000262260437,
|
301 |
+
"19": 0.9520000219345093
|
302 |
+
},
|
303 |
+
"llm_test_accuracy": {
|
304 |
+
"11": 0.9620000720024109,
|
305 |
+
"13": 0.9470000267028809,
|
306 |
+
"14": 0.9580000638961792,
|
307 |
+
"18": 0.9310000538825989,
|
308 |
+
"19": 0.9640000462532043
|
309 |
+
},
|
310 |
+
"llm_top_1_test_accuracy": {
|
311 |
+
"11": 0.558,
|
312 |
+
"13": 0.673,
|
313 |
+
"14": 0.656,
|
314 |
+
"18": 0.702,
|
315 |
+
"19": 0.793
|
316 |
+
},
|
317 |
+
"llm_top_2_test_accuracy": {
|
318 |
+
"11": 0.686,
|
319 |
+
"13": 0.713,
|
320 |
+
"14": 0.687,
|
321 |
+
"18": 0.724,
|
322 |
+
"19": 0.765
|
323 |
+
},
|
324 |
+
"llm_top_5_test_accuracy": {
|
325 |
+
"11": 0.782,
|
326 |
+
"13": 0.742,
|
327 |
+
"14": 0.716,
|
328 |
+
"18": 0.725,
|
329 |
+
"19": 0.831
|
330 |
+
},
|
331 |
+
"sae_top_1_test_accuracy": {
|
332 |
+
"11": 0.59,
|
333 |
+
"13": 0.684,
|
334 |
+
"14": 0.636,
|
335 |
+
"18": 0.702,
|
336 |
+
"19": 0.796
|
337 |
+
},
|
338 |
+
"sae_top_2_test_accuracy": {
|
339 |
+
"11": 0.855,
|
340 |
+
"13": 0.678,
|
341 |
+
"14": 0.891,
|
342 |
+
"18": 0.685,
|
343 |
+
"19": 0.841
|
344 |
+
},
|
345 |
+
"sae_top_5_test_accuracy": {
|
346 |
+
"11": 0.944,
|
347 |
+
"13": 0.688,
|
348 |
+
"14": 0.901,
|
349 |
+
"18": 0.744,
|
350 |
+
"19": 0.845
|
351 |
+
}
|
352 |
+
},
|
353 |
+
"LabHC/bias_in_bios_class_set3_results": {
|
354 |
+
"sae_test_accuracy": {
|
355 |
+
"20": 0.9570000171661377,
|
356 |
+
"21": 0.921000063419342,
|
357 |
+
"22": 0.9140000343322754,
|
358 |
+
"25": 0.9580000638961792,
|
359 |
+
"26": 0.8890000581741333
|
360 |
+
},
|
361 |
+
"llm_test_accuracy": {
|
362 |
+
"20": 0.9610000252723694,
|
363 |
+
"21": 0.9140000343322754,
|
364 |
+
"22": 0.9170000553131104,
|
365 |
+
"25": 0.9630000591278076,
|
366 |
+
"26": 0.89000004529953
|
367 |
+
},
|
368 |
+
"llm_top_1_test_accuracy": {
|
369 |
+
"20": 0.711,
|
370 |
+
"21": 0.771,
|
371 |
+
"22": 0.637,
|
372 |
+
"25": 0.687,
|
373 |
+
"26": 0.626
|
374 |
+
},
|
375 |
+
"llm_top_2_test_accuracy": {
|
376 |
+
"20": 0.809,
|
377 |
+
"21": 0.764,
|
378 |
+
"22": 0.659,
|
379 |
+
"25": 0.766,
|
380 |
+
"26": 0.66
|
381 |
+
},
|
382 |
+
"llm_top_5_test_accuracy": {
|
383 |
+
"20": 0.858,
|
384 |
+
"21": 0.795,
|
385 |
+
"22": 0.715,
|
386 |
+
"25": 0.786,
|
387 |
+
"26": 0.679
|
388 |
+
},
|
389 |
+
"sae_top_1_test_accuracy": {
|
390 |
+
"20": 0.874,
|
391 |
+
"21": 0.514,
|
392 |
+
"22": 0.681,
|
393 |
+
"25": 0.713,
|
394 |
+
"26": 0.626
|
395 |
+
},
|
396 |
+
"sae_top_2_test_accuracy": {
|
397 |
+
"20": 0.881,
|
398 |
+
"21": 0.829,
|
399 |
+
"22": 0.703,
|
400 |
+
"25": 0.871,
|
401 |
+
"26": 0.631
|
402 |
+
},
|
403 |
+
"sae_top_5_test_accuracy": {
|
404 |
+
"20": 0.942,
|
405 |
+
"21": 0.842,
|
406 |
+
"22": 0.872,
|
407 |
+
"25": 0.885,
|
408 |
+
"26": 0.784
|
409 |
+
}
|
410 |
+
},
|
411 |
+
"canrager/amazon_reviews_mcauley_1and5_results": {
|
412 |
+
"sae_test_accuracy": {
|
413 |
+
"1": 0.9480000734329224,
|
414 |
+
"2": 0.9460000395774841,
|
415 |
+
"3": 0.9170000553131104,
|
416 |
+
"5": 0.9080000519752502,
|
417 |
+
"6": 0.8790000677108765
|
418 |
+
},
|
419 |
+
"llm_test_accuracy": {
|
420 |
+
"1": 0.9500000476837158,
|
421 |
+
"2": 0.937000036239624,
|
422 |
+
"3": 0.9260000586509705,
|
423 |
+
"5": 0.9120000600814819,
|
424 |
+
"6": 0.8560000658035278
|
425 |
+
},
|
426 |
+
"llm_top_1_test_accuracy": {
|
427 |
+
"1": 0.693,
|
428 |
+
"2": 0.607,
|
429 |
+
"3": 0.579,
|
430 |
+
"5": 0.577,
|
431 |
+
"6": 0.601
|
432 |
+
},
|
433 |
+
"llm_top_2_test_accuracy": {
|
434 |
+
"1": 0.747,
|
435 |
+
"2": 0.64,
|
436 |
+
"3": 0.607,
|
437 |
+
"5": 0.628,
|
438 |
+
"6": 0.619
|
439 |
+
},
|
440 |
+
"llm_top_5_test_accuracy": {
|
441 |
+
"1": 0.78,
|
442 |
+
"2": 0.657,
|
443 |
+
"3": 0.667,
|
444 |
+
"5": 0.659,
|
445 |
+
"6": 0.684
|
446 |
+
},
|
447 |
+
"sae_top_1_test_accuracy": {
|
448 |
+
"1": 0.859,
|
449 |
+
"2": 0.858,
|
450 |
+
"3": 0.578,
|
451 |
+
"5": 0.862,
|
452 |
+
"6": 0.583
|
453 |
+
},
|
454 |
+
"sae_top_2_test_accuracy": {
|
455 |
+
"1": 0.888,
|
456 |
+
"2": 0.869,
|
457 |
+
"3": 0.589,
|
458 |
+
"5": 0.857,
|
459 |
+
"6": 0.692
|
460 |
+
},
|
461 |
+
"sae_top_5_test_accuracy": {
|
462 |
+
"1": 0.925,
|
463 |
+
"2": 0.865,
|
464 |
+
"3": 0.739,
|
465 |
+
"5": 0.866,
|
466 |
+
"6": 0.749
|
467 |
+
}
|
468 |
+
},
|
469 |
+
"canrager/amazon_reviews_mcauley_1and5_sentiment_results": {
|
470 |
+
"sae_test_accuracy": {
|
471 |
+
"1.0": 0.9740000367164612,
|
472 |
+
"5.0": 0.9730000495910645
|
473 |
+
},
|
474 |
+
"llm_test_accuracy": {
|
475 |
+
"1.0": 0.9820000529289246,
|
476 |
+
"5.0": 0.9820000529289246
|
477 |
+
},
|
478 |
+
"llm_top_1_test_accuracy": {
|
479 |
+
"1.0": 0.672,
|
480 |
+
"5.0": 0.672
|
481 |
+
},
|
482 |
+
"llm_top_2_test_accuracy": {
|
483 |
+
"1.0": 0.724,
|
484 |
+
"5.0": 0.724
|
485 |
+
},
|
486 |
+
"llm_top_5_test_accuracy": {
|
487 |
+
"1.0": 0.766,
|
488 |
+
"5.0": 0.766
|
489 |
+
},
|
490 |
+
"sae_top_1_test_accuracy": {
|
491 |
+
"1.0": 0.882,
|
492 |
+
"5.0": 0.882
|
493 |
+
},
|
494 |
+
"sae_top_2_test_accuracy": {
|
495 |
+
"1.0": 0.909,
|
496 |
+
"5.0": 0.909
|
497 |
+
},
|
498 |
+
"sae_top_5_test_accuracy": {
|
499 |
+
"1.0": 0.947,
|
500 |
+
"5.0": 0.947
|
501 |
+
}
|
502 |
+
},
|
503 |
+
"codeparrot/github-code_results": {
|
504 |
+
"sae_test_accuracy": {
|
505 |
+
"C": 0.9450000524520874,
|
506 |
+
"Python": 0.9820000529289246,
|
507 |
+
"HTML": 0.984000027179718,
|
508 |
+
"Java": 0.9620000720024109,
|
509 |
+
"PHP": 0.9580000638961792
|
510 |
+
},
|
511 |
+
"llm_test_accuracy": {
|
512 |
+
"C": 0.956000030040741,
|
513 |
+
"Python": 0.987000048160553,
|
514 |
+
"HTML": 0.9940000176429749,
|
515 |
+
"Java": 0.9610000252723694,
|
516 |
+
"PHP": 0.9590000510215759
|
517 |
+
},
|
518 |
+
"llm_top_1_test_accuracy": {
|
519 |
+
"C": 0.657,
|
520 |
+
"Python": 0.636,
|
521 |
+
"HTML": 0.733,
|
522 |
+
"Java": 0.616,
|
523 |
+
"PHP": 0.584
|
524 |
+
},
|
525 |
+
"llm_top_2_test_accuracy": {
|
526 |
+
"C": 0.671,
|
527 |
+
"Python": 0.668,
|
528 |
+
"HTML": 0.803,
|
529 |
+
"Java": 0.68,
|
530 |
+
"PHP": 0.642
|
531 |
+
},
|
532 |
+
"llm_top_5_test_accuracy": {
|
533 |
+
"C": 0.765,
|
534 |
+
"Python": 0.727,
|
535 |
+
"HTML": 0.943,
|
536 |
+
"Java": 0.735,
|
537 |
+
"PHP": 0.693
|
538 |
+
},
|
539 |
+
"sae_top_1_test_accuracy": {
|
540 |
+
"C": 0.631,
|
541 |
+
"Python": 0.643,
|
542 |
+
"HTML": 0.687,
|
543 |
+
"Java": 0.64,
|
544 |
+
"PHP": 0.6
|
545 |
+
},
|
546 |
+
"sae_top_2_test_accuracy": {
|
547 |
+
"C": 0.647,
|
548 |
+
"Python": 0.921,
|
549 |
+
"HTML": 0.737,
|
550 |
+
"Java": 0.656,
|
551 |
+
"PHP": 0.925
|
552 |
+
},
|
553 |
+
"sae_top_5_test_accuracy": {
|
554 |
+
"C": 0.697,
|
555 |
+
"Python": 0.934,
|
556 |
+
"HTML": 0.905,
|
557 |
+
"Java": 0.69,
|
558 |
+
"PHP": 0.928
|
559 |
+
}
|
560 |
+
},
|
561 |
+
"fancyzhx/ag_news_results": {
|
562 |
+
"sae_test_accuracy": {
|
563 |
+
"0": 0.9280000329017639,
|
564 |
+
"1": 0.984000027179718,
|
565 |
+
"2": 0.9330000281333923,
|
566 |
+
"3": 0.9470000267028809
|
567 |
+
},
|
568 |
+
"llm_test_accuracy": {
|
569 |
+
"0": 0.9390000700950623,
|
570 |
+
"1": 0.984000027179718,
|
571 |
+
"2": 0.9160000681877136,
|
572 |
+
"3": 0.9450000524520874
|
573 |
+
},
|
574 |
+
"llm_top_1_test_accuracy": {
|
575 |
+
"0": 0.806,
|
576 |
+
"1": 0.662,
|
577 |
+
"2": 0.671,
|
578 |
+
"3": 0.791
|
579 |
+
},
|
580 |
+
"llm_top_2_test_accuracy": {
|
581 |
+
"0": 0.796,
|
582 |
+
"1": 0.796,
|
583 |
+
"2": 0.694,
|
584 |
+
"3": 0.809
|
585 |
+
},
|
586 |
+
"llm_top_5_test_accuracy": {
|
587 |
+
"0": 0.816,
|
588 |
+
"1": 0.885,
|
589 |
+
"2": 0.744,
|
590 |
+
"3": 0.84
|
591 |
+
},
|
592 |
+
"sae_top_1_test_accuracy": {
|
593 |
+
"0": 0.781,
|
594 |
+
"1": 0.691,
|
595 |
+
"2": 0.67,
|
596 |
+
"3": 0.63
|
597 |
+
},
|
598 |
+
"sae_top_2_test_accuracy": {
|
599 |
+
"0": 0.762,
|
600 |
+
"1": 0.674,
|
601 |
+
"2": 0.698,
|
602 |
+
"3": 0.65
|
603 |
+
},
|
604 |
+
"sae_top_5_test_accuracy": {
|
605 |
+
"0": 0.84,
|
606 |
+
"1": 0.904,
|
607 |
+
"2": 0.796,
|
608 |
+
"3": 0.727
|
609 |
+
}
|
610 |
+
},
|
611 |
+
"Helsinki-NLP/europarl_results": {
|
612 |
+
"sae_test_accuracy": {
|
613 |
+
"en": 0.999000072479248,
|
614 |
+
"fr": 0.999000072479248,
|
615 |
+
"de": 1.0,
|
616 |
+
"es": 0.9980000257492065,
|
617 |
+
"nl": 0.999000072479248
|
618 |
+
},
|
619 |
+
"llm_test_accuracy": {
|
620 |
+
"en": 1.0,
|
621 |
+
"fr": 1.0,
|
622 |
+
"de": 1.0,
|
623 |
+
"es": 1.0,
|
624 |
+
"nl": 0.9980000257492065
|
625 |
+
},
|
626 |
+
"llm_top_1_test_accuracy": {
|
627 |
+
"en": 0.749,
|
628 |
+
"fr": 0.605,
|
629 |
+
"de": 0.741,
|
630 |
+
"es": 0.913,
|
631 |
+
"nl": 0.64
|
632 |
+
},
|
633 |
+
"llm_top_2_test_accuracy": {
|
634 |
+
"en": 0.831,
|
635 |
+
"fr": 0.607,
|
636 |
+
"de": 0.828,
|
637 |
+
"es": 0.915,
|
638 |
+
"nl": 0.753
|
639 |
+
},
|
640 |
+
"llm_top_5_test_accuracy": {
|
641 |
+
"en": 0.888,
|
642 |
+
"fr": 0.924,
|
643 |
+
"de": 0.882,
|
644 |
+
"es": 0.98,
|
645 |
+
"nl": 0.86
|
646 |
+
},
|
647 |
+
"sae_top_1_test_accuracy": {
|
648 |
+
"en": 0.852,
|
649 |
+
"fr": 0.994,
|
650 |
+
"de": 0.9,
|
651 |
+
"es": 0.802,
|
652 |
+
"nl": 0.736
|
653 |
+
},
|
654 |
+
"sae_top_2_test_accuracy": {
|
655 |
+
"en": 0.838,
|
656 |
+
"fr": 0.994,
|
657 |
+
"de": 0.92,
|
658 |
+
"es": 0.9,
|
659 |
+
"nl": 0.998
|
660 |
+
},
|
661 |
+
"sae_top_5_test_accuracy": {
|
662 |
+
"en": 0.997,
|
663 |
+
"fr": 0.994,
|
664 |
+
"de": 0.938,
|
665 |
+
"es": 0.995,
|
666 |
+
"nl": 0.997
|
667 |
+
}
|
668 |
+
}
|
669 |
+
}
|
670 |
+
}
|
random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "945968cc-e6d4-4ea1-9ea9-30e085bc5389",
|
73 |
+
"datetime_epoch_millis": 1738793323541,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.008049994707107544,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.010899996757507325,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0028500020503997806,
|
79 |
+
"tpp_threshold_5_total_metric": 0.011550003290176391,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.01510000228881836,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.0035499989986419677,
|
82 |
+
"tpp_threshold_10_total_metric": 0.02980000078678131,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.03420000672340393,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.00440000593662262,
|
85 |
+
"tpp_threshold_20_total_metric": 0.058100007474422455,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.06480000615119935,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.006699998676776887,
|
88 |
+
"tpp_threshold_50_total_metric": 0.14129999876022337,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.15059999823570253,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.009299999475479126,
|
91 |
+
"tpp_threshold_100_total_metric": 0.2181250184774399,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.2313000202178955,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.013175001740455626,
|
94 |
+
"tpp_threshold_500_total_metric": 0.3984750136733055,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.41890001893043516,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.020425005257129668
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.007599985599517823,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.009999990463256836,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0024000048637390138,
|
105 |
+
"tpp_threshold_5_total_metric": 0.013449999690055846,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.016600000858306884,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0031500011682510376,
|
108 |
+
"tpp_threshold_10_total_metric": 0.028599995374679565,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.031599998474121094,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000003099441528,
|
111 |
+
"tpp_threshold_20_total_metric": 0.07145001292228699,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.07780001163482667,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.006349998712539673,
|
114 |
+
"tpp_threshold_50_total_metric": 0.15539998710155487,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.16179999113082885,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.006400004029273987,
|
117 |
+
"tpp_threshold_100_total_metric": 0.2591500222682953,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.2678000211715698,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.008649998903274536,
|
120 |
+
"tpp_threshold_500_total_metric": 0.4453500181436539,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.4556000232696533,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.01025000512599945
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.008500003814697265,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.011800003051757813,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.003299999237060547,
|
129 |
+
"tpp_threshold_5_total_metric": 0.009650006890296936,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.013600003719329835,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.003949996829032898,
|
132 |
+
"tpp_threshold_10_total_metric": 0.031000006198883056,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.03680001497268677,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.005800008773803711,
|
135 |
+
"tpp_threshold_20_total_metric": 0.04475000202655792,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.051800000667572024,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.0070499986410140995,
|
138 |
+
"tpp_threshold_50_total_metric": 0.1272000104188919,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.13940000534057617,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.012199994921684266,
|
141 |
+
"tpp_threshold_100_total_metric": 0.17710001468658448,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.1948000192642212,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.017700004577636718,
|
144 |
+
"tpp_threshold_500_total_metric": 0.35160000920295714,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.38220001459121705,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.03060000538825989
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.01075001060962677,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.013000011444091797,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
|
184 |
+
"tpp_threshold_5_total_metric": 0.014250010251998901,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.018000006675720215,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0037499964237213135,
|
187 |
+
"tpp_threshold_10_total_metric": 0.011250033974647522,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.01500004529953003,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037500113248825073,
|
190 |
+
"tpp_threshold_20_total_metric": 0.02625003457069397,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.029000043869018555,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.002750009298324585,
|
193 |
+
"tpp_threshold_50_total_metric": 0.05324999988079071,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.0559999942779541,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.002749994397163391,
|
196 |
+
"tpp_threshold_100_total_metric": 0.12700004875659943,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.13100004196166992,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.003999993205070496,
|
199 |
+
"tpp_threshold_500_total_metric": 0.4242500364780426,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.4280000329017639,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.0037499964237213135
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.0059999823570251465,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.0059999823570251465,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.0,
|
207 |
+
"tpp_threshold_5_total_metric": 0.0052499473094940186,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017500221729278564,
|
210 |
+
"tpp_threshold_10_total_metric": 0.013749957084655762,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.012999951839447021,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0007500052452087402,
|
213 |
+
"tpp_threshold_20_total_metric": 0.05449996888637543,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.06199997663497925,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.007500007748603821,
|
216 |
+
"tpp_threshold_50_total_metric": 0.143249973654747,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.1499999761581421,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.006750002503395081,
|
219 |
+
"tpp_threshold_100_total_metric": 0.22350002825260162,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.23000001907348633,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.006499990820884705,
|
222 |
+
"tpp_threshold_500_total_metric": 0.4404999762773514,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.44999998807907104,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.009500011801719666
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.006999969482421875,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
|
230 |
+
"tpp_threshold_5_total_metric": 0.021250009536743164,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.027000010013580322,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.005750000476837158,
|
233 |
+
"tpp_threshold_10_total_metric": 0.04700000584125519,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.050000011920928955,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000006079673767,
|
236 |
+
"tpp_threshold_20_total_metric": 0.06775002181529999,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.0690000057220459,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0012499839067459106,
|
239 |
+
"tpp_threshold_50_total_metric": 0.127749964594841,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.12699997425079346,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": -0.0007499903440475464,
|
242 |
+
"tpp_threshold_100_total_metric": 0.2237500101327896,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.22699999809265137,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0032499879598617554,
|
245 |
+
"tpp_threshold_500_total_metric": 0.43700000643730164,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.4440000057220459,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.006999999284744263
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.002749994397163391,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0012500137090682983,
|
253 |
+
"tpp_threshold_5_total_metric": 0.0030000507831573486,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000169277191162,
|
256 |
+
"tpp_threshold_10_total_metric": 0.0065000057220458984,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.002499997615814209,
|
259 |
+
"tpp_threshold_20_total_metric": 0.0975000262260437,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.11000001430511475,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.012499988079071045,
|
262 |
+
"tpp_threshold_50_total_metric": 0.23750002682209015,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.25200003385543823,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.014500007033348083,
|
265 |
+
"tpp_threshold_100_total_metric": 0.34950003027915955,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.3670000433921814,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.01750001311302185,
|
268 |
+
"tpp_threshold_500_total_metric": 0.46025004982948303,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.47700005769729614,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.01675000786781311
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.011499971151351929,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.015999972820281982,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.004500001668930054,
|
276 |
+
"tpp_threshold_5_total_metric": 0.023499980568885803,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.02899998426437378,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.005500003695487976,
|
279 |
+
"tpp_threshold_10_total_metric": 0.06449997425079346,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.07099997997283936,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.0065000057220458984,
|
282 |
+
"tpp_threshold_20_total_metric": 0.11125001311302185,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.11900001764297485,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.007750004529953003,
|
285 |
+
"tpp_threshold_50_total_metric": 0.21524997055530548,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.2239999771118164,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.008750006556510925,
|
288 |
+
"tpp_threshold_100_total_metric": 0.37199999392032623,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.3840000033378601,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.012000009417533875,
|
291 |
+
"tpp_threshold_500_total_metric": 0.4647500216960907,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.4790000319480896,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.014250010251998901
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.006000041961669922,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.010000050067901611,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
|
301 |
+
"tpp_threshold_5_total_metric": 0.010250017046928406,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.002749994397163391,
|
304 |
+
"tpp_threshold_10_total_metric": 0.011750057339668274,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.01900005340576172,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.007249996066093445,
|
307 |
+
"tpp_threshold_20_total_metric": 0.026500031352043152,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.03100001811981201,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.00449998676776886,
|
310 |
+
"tpp_threshold_50_total_metric": 0.0532500296831131,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.057000041007995605,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0037500113248825073,
|
313 |
+
"tpp_threshold_100_total_metric": 0.09050005674362183,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.10200005769729614,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.011500000953674316,
|
316 |
+
"tpp_threshold_500_total_metric": 0.37300005555152893,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.38600003719329834,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.01299998164176941
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.006749987602233887,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.006999969482421875,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0002499818801879883,
|
324 |
+
"tpp_threshold_5_total_metric": -0.0059999823570251465,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.009999990463256836,
|
327 |
+
"tpp_threshold_10_total_metric": 0.029999956488609314,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.0339999794960022,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.004000023007392883,
|
330 |
+
"tpp_threshold_20_total_metric": 0.04174995422363281,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.04499995708465576,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.0032500028610229492,
|
333 |
+
"tpp_threshold_50_total_metric": 0.09049999713897705,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.10699999332427979,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.016499996185302734,
|
336 |
+
"tpp_threshold_100_total_metric": 0.14699998497962952,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.171999990940094,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.025000005960464478,
|
339 |
+
"tpp_threshold_500_total_metric": 0.3572499603033066,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.390999972820282,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.0337500125169754
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.00849999487400055,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.004999995231628418,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034999996423721313,
|
347 |
+
"tpp_threshold_5_total_metric": -0.0012499839067459106,
|
348 |
+
"tpp_threshold_5_intended_diff_only": -0.0009999871253967285,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.00024999678134918213,
|
350 |
+
"tpp_threshold_10_total_metric": 0.01599995791912079,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.0209999680519104,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.005000010132789612,
|
353 |
+
"tpp_threshold_20_total_metric": 0.0052499920129776,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.013999998569488525,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.008750006556510925,
|
356 |
+
"tpp_threshold_50_total_metric": 0.07024997472763062,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.08099997043609619,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.010749995708465576,
|
359 |
+
"tpp_threshold_100_total_metric": 0.1054999977350235,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.12099999189376831,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.015499994158744812,
|
362 |
+
"tpp_threshold_500_total_metric": 0.33024996519088745,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.3619999885559082,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.03175002336502075
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.019249990582466125,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.02399998903274536,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.004749998450279236,
|
370 |
+
"tpp_threshold_5_total_metric": 0.026249989867210388,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.03299999237060547,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.006750002503395081,
|
373 |
+
"tpp_threshold_10_total_metric": 0.03400002419948578,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.04300004243850708,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.009000018239021301,
|
376 |
+
"tpp_threshold_20_total_metric": 0.054000020027160645,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.06300002336502075,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.009000003337860107,
|
379 |
+
"tpp_threshold_50_total_metric": 0.16875000298023224,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.1809999942779541,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.012249991297721863,
|
382 |
+
"tpp_threshold_100_total_metric": 0.2290000468492508,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.24600005149841309,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.017000004649162292,
|
385 |
+
"tpp_threshold_500_total_metric": 0.35500001907348633,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.3960000276565552,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.04100000858306885
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.018999993801116943,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.023000001907348633,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
|
393 |
+
"tpp_threshold_5_total_metric": 0.018999993801116943,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.018999993801116943,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0,
|
396 |
+
"tpp_threshold_10_total_metric": 0.06325003504753113,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.06700003147125244,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0037499964237213135,
|
399 |
+
"tpp_threshold_20_total_metric": 0.0962500125169754,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.10600000619888306,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.009749993681907654,
|
402 |
+
"tpp_threshold_50_total_metric": 0.25325004756450653,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.2710000276565552,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.017749980092048645,
|
405 |
+
"tpp_threshold_100_total_metric": 0.31349998712539673,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.3330000042915344,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.019500017166137695,
|
408 |
+
"tpp_threshold_500_total_metric": 0.3425000458955765,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.3760000467300415,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.03350000083446503
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "096e7204-f9f3-43f1-9a8f-eeaba02309e8",
|
73 |
+
"datetime_epoch_millis": 1738793209531,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.010400001704692841,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.013500005006790161,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.0031000033020973207,
|
79 |
+
"tpp_threshold_5_total_metric": 0.01885000616312027,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.022200006246566772,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.003350000083446503,
|
82 |
+
"tpp_threshold_10_total_metric": 0.02705000340938568,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.03130000233650208,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.004249998927116394,
|
85 |
+
"tpp_threshold_20_total_metric": 0.05490000545978546,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.06270000338554382,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.007799997925758362,
|
88 |
+
"tpp_threshold_50_total_metric": 0.12465001344680787,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.13350001573562623,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.008850002288818359,
|
91 |
+
"tpp_threshold_100_total_metric": 0.22405002117156983,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.233400022983551,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.009350001811981201,
|
94 |
+
"tpp_threshold_500_total_metric": 0.40177501887083056,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.419100022315979,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.01732500344514847
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.010649994015693665,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.012999999523162841,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0023500055074691774,
|
105 |
+
"tpp_threshold_5_total_metric": 0.01860001087188721,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.02120000123977661,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.002599990367889404,
|
108 |
+
"tpp_threshold_10_total_metric": 0.030500003695487977,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.033399999141693115,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.002899995446205139,
|
111 |
+
"tpp_threshold_20_total_metric": 0.06360001266002654,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.07040001153945923,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.006799998879432678,
|
114 |
+
"tpp_threshold_50_total_metric": 0.13365001678466798,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.1414000153541565,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.007749998569488525,
|
117 |
+
"tpp_threshold_100_total_metric": 0.264050030708313,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.27040002346038816,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.006349992752075195,
|
120 |
+
"tpp_threshold_500_total_metric": 0.4458000212907791,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.45540002584457395,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.009600004553794861
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.010150009393692016,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.014000010490417481,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.003850001096725464,
|
129 |
+
"tpp_threshold_5_total_metric": 0.01910000145435333,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.023200011253356932,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.004100009799003601,
|
132 |
+
"tpp_threshold_10_total_metric": 0.023600003123283385,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.029200005531311034,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.005600002408027649,
|
135 |
+
"tpp_threshold_20_total_metric": 0.04619999825954437,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.05499999523162842,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.008799996972084046,
|
138 |
+
"tpp_threshold_50_total_metric": 0.11565001010894775,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.12560001611709595,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.009950006008148193,
|
141 |
+
"tpp_threshold_100_total_metric": 0.18405001163482665,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.19640002250671387,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.012350010871887206,
|
144 |
+
"tpp_threshold_500_total_metric": 0.35775001645088195,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.382800018787384,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.025050002336502075
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.005750000476837158,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.009000003337860107,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.0032500028610229492,
|
184 |
+
"tpp_threshold_5_total_metric": 0.012000024318695068,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0009999871253967285,
|
187 |
+
"tpp_threshold_10_total_metric": 0.021500051021575928,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.021000027656555176,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": -0.000500023365020752,
|
190 |
+
"tpp_threshold_20_total_metric": 0.02525004744529724,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.029000043869018555,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0037499964237213135,
|
193 |
+
"tpp_threshold_50_total_metric": 0.06125004589557648,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.06700003147125244,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.005749985575675964,
|
196 |
+
"tpp_threshold_100_total_metric": 0.1480000615119934,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.15000003576278687,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.001999974250793457,
|
199 |
+
"tpp_threshold_500_total_metric": 0.4255000650882721,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.43000006675720215,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.004500001668930054
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.006749972701072693,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.004250004887580872,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0017499923706054688,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0040000081062316895,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.005750000476837158,
|
210 |
+
"tpp_threshold_10_total_metric": 0.010749995708465576,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.013999998569488525,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
|
213 |
+
"tpp_threshold_20_total_metric": 0.019000008702278137,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.023000001907348633,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.003999993205070496,
|
216 |
+
"tpp_threshold_50_total_metric": 0.04249997437000275,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.04799997806549072,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.005500003695487976,
|
219 |
+
"tpp_threshold_100_total_metric": 0.20250001549720764,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.2070000171661377,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.004500001668930054,
|
222 |
+
"tpp_threshold_500_total_metric": 0.43525002896785736,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.44700002670288086,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.011749997735023499
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.02275000512599945,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.027000010013580322,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.004250004887580872,
|
230 |
+
"tpp_threshold_5_total_metric": 0.04124997556209564,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.042999982833862305,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017500072717666626,
|
233 |
+
"tpp_threshold_10_total_metric": 0.04849998652935028,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.05199998617172241,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0034999996423721313,
|
236 |
+
"tpp_threshold_20_total_metric": 0.07100000977516174,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.07400000095367432,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0029999911785125732,
|
239 |
+
"tpp_threshold_50_total_metric": 0.1365000307559967,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.14100003242492676,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.004500001668930054,
|
242 |
+
"tpp_threshold_100_total_metric": 0.22975002229213715,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.23400002717971802,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.004250004887580872,
|
245 |
+
"tpp_threshold_500_total_metric": 0.4334999918937683,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.4440000057220459,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.010500013828277588
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.003000035881996155,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0020000338554382324,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": -0.0010000020265579224,
|
253 |
+
"tpp_threshold_5_total_metric": 0.003250017762184143,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.004999995231628418,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": 0.001749977469444275,
|
256 |
+
"tpp_threshold_10_total_metric": 0.007750004529953003,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.0012499988079071045,
|
259 |
+
"tpp_threshold_20_total_metric": 0.10650002956390381,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.12200003862380981,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.015500009059906006,
|
262 |
+
"tpp_threshold_50_total_metric": 0.2407500445842743,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.2560000419616699,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.01524999737739563,
|
265 |
+
"tpp_threshold_100_total_metric": 0.37400004267692566,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.38700002431869507,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.01299998164176941,
|
268 |
+
"tpp_threshold_500_total_metric": 0.4647500365972519,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.4790000319480896,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.014249995350837708
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.014999955892562866,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.015999972820281982,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.0010000169277191162,
|
276 |
+
"tpp_threshold_5_total_metric": 0.03825002908706665,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.04100000858306885,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.0027499794960021973,
|
279 |
+
"tpp_threshold_10_total_metric": 0.06399998068809509,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.07099997997283936,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.006999999284744263,
|
282 |
+
"tpp_threshold_20_total_metric": 0.09624996781349182,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.10399997234344482,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.007750004529953003,
|
285 |
+
"tpp_threshold_50_total_metric": 0.18724998831748962,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.19499999284744263,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.007750004529953003,
|
288 |
+
"tpp_threshold_100_total_metric": 0.3660000115633011,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.37400001287460327,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.008000001311302185,
|
291 |
+
"tpp_threshold_500_total_metric": 0.4699999839067459,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.47699999809265137,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.0070000141859054565
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.006250053644180298,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.010000050067901611,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
301 |
+
"tpp_threshold_5_total_metric": 0.007500022649765015,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.012000024318695068,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.004500001668930054,
|
304 |
+
"tpp_threshold_10_total_metric": 0.010000020265579224,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.017000019550323486,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.006999999284744263,
|
307 |
+
"tpp_threshold_20_total_metric": 0.011750027537345886,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.022000014781951904,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.010249987244606018,
|
310 |
+
"tpp_threshold_50_total_metric": 0.034000054001808167,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.04300004243850708,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.008999988436698914,
|
313 |
+
"tpp_threshold_100_total_metric": 0.07725003361701965,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.08500003814697266,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.007750004529953003,
|
316 |
+
"tpp_threshold_500_total_metric": 0.34775005280971527,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.35700005292892456,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.00925000011920929
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.012499943375587463,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.01699995994567871,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0045000165700912476,
|
324 |
+
"tpp_threshold_5_total_metric": 0.019499972462654114,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.02399998903274536,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.0045000165700912476,
|
327 |
+
"tpp_threshold_10_total_metric": 0.0207500159740448,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.027000010013580322,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.0062499940395355225,
|
330 |
+
"tpp_threshold_20_total_metric": 0.0469999760389328,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.05699998140335083,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.01000000536441803,
|
333 |
+
"tpp_threshold_50_total_metric": 0.11425000429153442,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.12400001287460327,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.009750008583068848,
|
336 |
+
"tpp_threshold_100_total_metric": 0.2214999794960022,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.23600000143051147,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.014500021934509277,
|
339 |
+
"tpp_threshold_500_total_metric": 0.39799998700618744,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.4269999861717224,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.028999999165534973
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": 0.002749994397163391,
|
345 |
+
"tpp_threshold_2_intended_diff_only": 0.0009999871253967285,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": -0.0017500072717666626,
|
347 |
+
"tpp_threshold_5_total_metric": -0.0015000253915786743,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.002500012516975403,
|
350 |
+
"tpp_threshold_10_total_metric": -0.00025004148483276367,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.0029999613761901855,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
|
353 |
+
"tpp_threshold_20_total_metric": 0.014499962329864502,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.0209999680519104,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.0065000057220458984,
|
356 |
+
"tpp_threshold_50_total_metric": 0.04299996793270111,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.05199998617172241,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.009000018239021301,
|
359 |
+
"tpp_threshold_100_total_metric": 0.08699999749660492,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.09600001573562622,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.009000018239021301,
|
362 |
+
"tpp_threshold_500_total_metric": 0.3267500102519989,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.3540000319480896,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.027250021696090698
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.015250042080879211,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.025000035762786865,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.009749993681907654,
|
370 |
+
"tpp_threshold_5_total_metric": 0.028000012040138245,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.0350000262260437,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.0070000141859054565,
|
373 |
+
"tpp_threshold_10_total_metric": 0.04300001263618469,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.04900002479553223,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.006000012159347534,
|
376 |
+
"tpp_threshold_20_total_metric": 0.0780000239610672,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.0910000205039978,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.012999996542930603,
|
379 |
+
"tpp_threshold_50_total_metric": 0.1822500377893448,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.19600003957748413,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.013750001788139343,
|
382 |
+
"tpp_threshold_100_total_metric": 0.23400002717971802,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.25200003385543823,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.018000006675720215,
|
385 |
+
"tpp_threshold_500_total_metric": 0.36400002241134644,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.4020000100135803,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.03799998760223389
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.01400001347064972,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.017000019550323486,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000006079673767,
|
393 |
+
"tpp_threshold_5_total_metric": 0.042000025510787964,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.04400002956390381,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0020000040531158447,
|
396 |
+
"tpp_threshold_10_total_metric": 0.04450000822544098,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.050000011920928955,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.005500003695487976,
|
399 |
+
"tpp_threshold_20_total_metric": 0.07975000143051147,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.08399999141693115,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.004249989986419678,
|
402 |
+
"tpp_threshold_50_total_metric": 0.20474998652935028,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.21299999952316284,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.008250012993812561,
|
405 |
+
"tpp_threshold_100_total_metric": 0.3005000203847885,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.31300002336502075,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.012500002980232239,
|
408 |
+
"tpp_threshold_500_total_metric": 0.35225000977516174,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.37400001287460327,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.02175000309944153
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "b73c894a-3e5b-4138-b75b-ceb8d5a28bdd",
|
73 |
+
"datetime_epoch_millis": 1738793438068,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.010875004529953002,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.014300006628036498,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.003425002098083496,
|
79 |
+
"tpp_threshold_5_total_metric": 0.017374998331069945,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.021299999952316285,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.003925001621246338,
|
82 |
+
"tpp_threshold_10_total_metric": 0.032375001907348634,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.03820000290870666,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.0058250010013580315,
|
85 |
+
"tpp_threshold_20_total_metric": 0.06415000408887864,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.07130000591278077,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.00715000182390213,
|
88 |
+
"tpp_threshold_50_total_metric": 0.1440250039100647,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.1531000077724457,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.00907500386238098,
|
91 |
+
"tpp_threshold_100_total_metric": 0.2260250121355057,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.23930001258850098,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.0132750004529953,
|
94 |
+
"tpp_threshold_500_total_metric": 0.40262500792741773,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.4224000096321106,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.01977500170469284
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.007649996876716613,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.010399997234344482,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0027500003576278686,
|
105 |
+
"tpp_threshold_5_total_metric": 0.018599998950958253,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.022200000286102296,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.003600001335144043,
|
108 |
+
"tpp_threshold_10_total_metric": 0.034799987077713014,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.040999984741210936,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.006199997663497925,
|
111 |
+
"tpp_threshold_20_total_metric": 0.08539999425411224,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.09199999570846558,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.006600001454353332,
|
114 |
+
"tpp_threshold_50_total_metric": 0.16370000541210175,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.17060000896453859,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.006900003552436829,
|
117 |
+
"tpp_threshold_100_total_metric": 0.26725001335144044,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.27660001516342164,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.009350001811981201,
|
120 |
+
"tpp_threshold_500_total_metric": 0.44620001316070557,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.45620001554489137,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.010000002384185792
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.014100012183189393,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.018200016021728514,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.004100003838539123,
|
129 |
+
"tpp_threshold_5_total_metric": 0.01614999771118164,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.020399999618530274,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.004250001907348633,
|
132 |
+
"tpp_threshold_10_total_metric": 0.029950016736984254,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.035400021076202395,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.005450004339218139,
|
135 |
+
"tpp_threshold_20_total_metric": 0.04290001392364502,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.05060001611709595,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.007700002193450928,
|
138 |
+
"tpp_threshold_50_total_metric": 0.12435000240802765,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.1356000065803528,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.011250004172325134,
|
141 |
+
"tpp_threshold_100_total_metric": 0.18480001091957093,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.2020000100135803,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.017199999094009398,
|
144 |
+
"tpp_threshold_500_total_metric": 0.35905000269412995,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.38860000371932985,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.02955000102519989
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.01250004768371582,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.01500004529953003,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002499997615814209,
|
184 |
+
"tpp_threshold_5_total_metric": 0.016500040888786316,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.020000040531158447,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0034999996423721313,
|
187 |
+
"tpp_threshold_10_total_metric": 0.014999985694885254,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.018999993801116943,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0040000081062316895,
|
190 |
+
"tpp_threshold_20_total_metric": 0.030500024557113647,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.0350000262260437,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.004500001668930054,
|
193 |
+
"tpp_threshold_50_total_metric": 0.05800001323223114,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.06300002336502075,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.005000010132789612,
|
196 |
+
"tpp_threshold_100_total_metric": 0.1482500582933426,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.15400004386901855,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.005749985575675964,
|
199 |
+
"tpp_threshold_500_total_metric": 0.43025003373622894,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.43300002813339233,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.002749994397163391
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.006249964237213135,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.006999969482421875,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.0007500052452087402,
|
207 |
+
"tpp_threshold_5_total_metric": 0.003999963402748108,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.006999969482421875,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.003000006079673767,
|
210 |
+
"tpp_threshold_10_total_metric": 0.007999956607818604,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.007999956607818604,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": 0.0,
|
213 |
+
"tpp_threshold_20_total_metric": 0.02549995481967926,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.030999958515167236,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.005500003695487976,
|
216 |
+
"tpp_threshold_50_total_metric": 0.06399998068809509,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.07099997997283936,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.006999999284744263,
|
219 |
+
"tpp_threshold_100_total_metric": 0.19199995696544647,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.20099997520446777,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.009000018239021301,
|
222 |
+
"tpp_threshold_500_total_metric": 0.4364999681711197,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.44599997997283936,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.009500011801719666
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.010749980807304382,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.014999985694885254,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.004250004887580872,
|
230 |
+
"tpp_threshold_5_total_metric": 0.02699999511241913,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.03299999237060547,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.00599999725818634,
|
233 |
+
"tpp_threshold_10_total_metric": 0.05600002408027649,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.05900001525878906,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0029999911785125732,
|
236 |
+
"tpp_threshold_20_total_metric": 0.07649999856948853,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.07899999618530273,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.002499997615814209,
|
239 |
+
"tpp_threshold_50_total_metric": 0.14124999940395355,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.14300000667572021,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.0017500072717666626,
|
242 |
+
"tpp_threshold_100_total_metric": 0.23274999856948853,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.23600000143051147,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0032500028610229492,
|
245 |
+
"tpp_threshold_500_total_metric": 0.4345000237226486,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.44300001859664917,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.00849999487400055
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0020000040531158447,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0020000040531158447,
|
253 |
+
"tpp_threshold_5_total_metric": 0.008250012993812561,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.008000016212463379,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.00024999678134918213,
|
256 |
+
"tpp_threshold_10_total_metric": 0.03300000727176666,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.050999999046325684,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.01799999177455902,
|
259 |
+
"tpp_threshold_20_total_metric": 0.19350002706050873,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.20600003004074097,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.012500002980232239,
|
262 |
+
"tpp_threshold_50_total_metric": 0.3450000137090683,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.36000001430511475,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.015000000596046448,
|
265 |
+
"tpp_threshold_100_total_metric": 0.3972500413656235,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.4140000343322754,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.016749992966651917,
|
268 |
+
"tpp_threshold_500_total_metric": 0.4632500112056732,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.48000001907348633,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.01675000786781311
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.006749987602233887,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.010999977588653564,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.004249989986419678,
|
276 |
+
"tpp_threshold_5_total_metric": 0.037249982357025146,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.042999982833862305,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.005750000476837158,
|
279 |
+
"tpp_threshold_10_total_metric": 0.061999961733818054,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.0679999589920044,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.00599999725818634,
|
282 |
+
"tpp_threshold_20_total_metric": 0.10099996626377106,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.10899996757507324,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.008000001311302185,
|
285 |
+
"tpp_threshold_50_total_metric": 0.21025002002716064,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.2160000205039978,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.005750000476837158,
|
288 |
+
"tpp_threshold_100_total_metric": 0.3660000115633011,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.37800002098083496,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.012000009417533875,
|
291 |
+
"tpp_threshold_500_total_metric": 0.46650002896785736,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.4790000319480896,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.012500002980232239
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.012250036001205444,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.016000032424926758,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
301 |
+
"tpp_threshold_5_total_metric": 0.010250017046928406,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.013000011444091797,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.002749994397163391,
|
304 |
+
"tpp_threshold_10_total_metric": 0.009500056505203247,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.01900005340576172,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.009499996900558472,
|
307 |
+
"tpp_threshold_20_total_metric": 0.02825005352497101,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.03400003910064697,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.005749985575675964,
|
310 |
+
"tpp_threshold_50_total_metric": 0.06874999403953552,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.07400000095367432,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.005250006914138794,
|
313 |
+
"tpp_threshold_100_total_metric": 0.1260000467300415,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.14000004529953003,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.013999998569488525,
|
316 |
+
"tpp_threshold_500_total_metric": 0.38225002586841583,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.3970000147819519,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.014749988913536072
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.018999993801116943,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.023000001907348633,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
|
324 |
+
"tpp_threshold_5_total_metric": 0.014999955892562866,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.02599996328353882,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.011000007390975952,
|
327 |
+
"tpp_threshold_10_total_metric": 0.027500003576278687,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.03200000524520874,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.004500001668930054,
|
330 |
+
"tpp_threshold_20_total_metric": 0.0352499783039093,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.041999995708465576,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750017404556274,
|
333 |
+
"tpp_threshold_50_total_metric": 0.09974999725818634,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.11500000953674316,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.015250012278556824,
|
336 |
+
"tpp_threshold_100_total_metric": 0.16974999010562897,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.1899999976158142,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.02025000751018524,
|
339 |
+
"tpp_threshold_500_total_metric": 0.38850001990795135,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.42000001668930054,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.03149999678134918
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.008750006556510925,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.004999995231628418,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037500113248825073,
|
347 |
+
"tpp_threshold_5_total_metric": -0.0015000104904174805,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.0015000104904174805,
|
350 |
+
"tpp_threshold_10_total_metric": 0.01649998128414154,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.018000006675720215,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0015000253915786743,
|
353 |
+
"tpp_threshold_20_total_metric": 0.00849999487400055,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.018000006675720215,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.009500011801719666,
|
356 |
+
"tpp_threshold_50_total_metric": 0.05700001120567322,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.0690000057220459,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.01199999451637268,
|
359 |
+
"tpp_threshold_100_total_metric": 0.10050001740455627,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.11900001764297485,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.01850000023841858,
|
362 |
+
"tpp_threshold_500_total_metric": 0.3189999610185623,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.3529999852180481,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.03400002419948578
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.02550002932548523,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.030000030994415283,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.004500001668930054,
|
370 |
+
"tpp_threshold_5_total_metric": 0.029000014066696167,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.03600001335144043,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.006999999284744263,
|
373 |
+
"tpp_threshold_10_total_metric": 0.03625001013278961,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.046000003814697266,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.009749993681907654,
|
376 |
+
"tpp_threshold_20_total_metric": 0.04950001835823059,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.06000000238418579,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.0104999840259552,
|
379 |
+
"tpp_threshold_50_total_metric": 0.18125002086162567,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.19300001859664917,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.011749997735023499,
|
382 |
+
"tpp_threshold_100_total_metric": 0.22349999845027924,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.2409999966621399,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.017499998211860657,
|
385 |
+
"tpp_threshold_500_total_metric": 0.35450001060962677,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.39800000190734863,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.04349999129772186
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.02250000834465027,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.027000010013580322,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.004500001668930054,
|
393 |
+
"tpp_threshold_5_total_metric": 0.028000012040138245,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.027000010013580322,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0010000020265579224,
|
396 |
+
"tpp_threshold_10_total_metric": 0.06000003218650818,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.06200003623962402,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
|
399 |
+
"tpp_threshold_20_total_metric": 0.09300002455711365,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.09900003671646118,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.006000012159347534,
|
402 |
+
"tpp_threshold_50_total_metric": 0.2149999886751175,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.22699999809265137,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.012000009417533875,
|
405 |
+
"tpp_threshold_100_total_metric": 0.30425000190734863,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.3199999928474426,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.015749990940093994,
|
408 |
+
"tpp_threshold_500_total_metric": 0.35099999606609344,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.375,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.024000003933906555
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "4c97b22e-3ce7-4f44-9382-ba43c6b1f096",
|
73 |
+
"datetime_epoch_millis": 1738793552532,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.007999995350837707,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.011000001430511476,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.003000006079673767,
|
79 |
+
"tpp_threshold_5_total_metric": 0.01237499564886093,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.016099995374679564,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.003724999725818634,
|
82 |
+
"tpp_threshold_10_total_metric": 0.027025008201599122,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.03130000829696655,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.004275000095367432,
|
85 |
+
"tpp_threshold_20_total_metric": 0.05402499288320541,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.060099995136260985,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.0060750022530555725,
|
88 |
+
"tpp_threshold_50_total_metric": 0.12652500867843627,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.13470001220703126,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.00817500352859497,
|
91 |
+
"tpp_threshold_100_total_metric": 0.2042750060558319,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.2156000018119812,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.011324995756149292,
|
94 |
+
"tpp_threshold_500_total_metric": 0.3991500198841095,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.41680002212524414,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.017650002241134645
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.007299986481666565,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.00979999303817749,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.0025000065565109254,
|
105 |
+
"tpp_threshold_5_total_metric": 0.010649988055229187,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.013799989223480224,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.0031500011682510376,
|
108 |
+
"tpp_threshold_10_total_metric": 0.022449997067451478,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.025199997425079345,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.0027500003576278686,
|
111 |
+
"tpp_threshold_20_total_metric": 0.060399994254112244,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.06639999151229858,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.00599999725818634,
|
114 |
+
"tpp_threshold_50_total_metric": 0.13545000851154326,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.14160001277923584,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.006150004267692566,
|
117 |
+
"tpp_threshold_100_total_metric": 0.23460001051425933,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.24240000247955323,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.007799991965293884,
|
120 |
+
"tpp_threshold_500_total_metric": 0.4378500312566757,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.4468000292778015,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.008949998021125793
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.00870000422000885,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.01220000982284546,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.003500005602836609,
|
129 |
+
"tpp_threshold_5_total_metric": 0.014100003242492675,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.018400001525878906,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.0042999982833862305,
|
132 |
+
"tpp_threshold_10_total_metric": 0.031600019335746764,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.03740001916885376,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.005799999833106995,
|
135 |
+
"tpp_threshold_20_total_metric": 0.04764999151229858,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.053799998760223386,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.006150007247924805,
|
138 |
+
"tpp_threshold_50_total_metric": 0.11760000884532928,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.12780001163482665,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.010200002789497375,
|
141 |
+
"tpp_threshold_100_total_metric": 0.17395000159740448,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.18880000114440917,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.0148499995470047,
|
144 |
+
"tpp_threshold_500_total_metric": 0.3604500085115433,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.3868000149726868,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.026350006461143494
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.011749997735023499,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.013999998569488525,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
|
184 |
+
"tpp_threshold_5_total_metric": 0.014500007033348083,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.018000006675720215,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0034999996423721313,
|
187 |
+
"tpp_threshold_10_total_metric": 0.014999985694885254,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.018999993801116943,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.0040000081062316895,
|
190 |
+
"tpp_threshold_20_total_metric": 0.034000009298324585,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.03700000047683716,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0029999911785125732,
|
193 |
+
"tpp_threshold_50_total_metric": 0.05224999785423279,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.0559999942779541,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0037499964237213135,
|
196 |
+
"tpp_threshold_100_total_metric": 0.1365000456571579,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.14000004529953003,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.0034999996423721313,
|
199 |
+
"tpp_threshold_500_total_metric": 0.42850005626678467,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.4320000410079956,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.0034999847412109375
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.001749977469444275,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.001999974250793457,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": 0.00024999678134918213,
|
207 |
+
"tpp_threshold_5_total_metric": -0.0027500689029693604,
|
208 |
+
"tpp_threshold_5_intended_diff_only": -0.001000046730041504,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017500221729278564,
|
210 |
+
"tpp_threshold_10_total_metric": 0.0027499794960021973,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.001999974250793457,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": -0.0007500052452087402,
|
213 |
+
"tpp_threshold_20_total_metric": 0.0260000079870224,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.03200000524520874,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.00599999725818634,
|
216 |
+
"tpp_threshold_50_total_metric": 0.10000000894069672,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.10600000619888306,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.00599999725818634,
|
219 |
+
"tpp_threshold_100_total_metric": 0.16649998724460602,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.17299997806549072,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.006499990820884705,
|
222 |
+
"tpp_threshold_500_total_metric": 0.3929999768733978,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.3999999761581421,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.006999999284744263
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.00974997878074646,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.013999998569488525,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.004250019788742065,
|
230 |
+
"tpp_threshold_5_total_metric": 0.01899997889995575,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.02399998903274536,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.005000010132789612,
|
233 |
+
"tpp_threshold_10_total_metric": 0.032250016927719116,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.0350000262260437,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.002750009298324585,
|
236 |
+
"tpp_threshold_20_total_metric": 0.05024999380111694,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.05199998617172241,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0017499923706054688,
|
239 |
+
"tpp_threshold_50_total_metric": 0.0884999930858612,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.08799999952316284,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": -0.0004999935626983643,
|
242 |
+
"tpp_threshold_100_total_metric": 0.1704999953508377,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.17299997806549072,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.002499982714653015,
|
245 |
+
"tpp_threshold_500_total_metric": 0.43925003707408905,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.44600003957748413,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.006750002503395081
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.002499997615814209,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0015000104904174805,
|
253 |
+
"tpp_threshold_5_total_metric": 0.0025000572204589844,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.000500023365020752,
|
256 |
+
"tpp_threshold_10_total_metric": 0.006750002503395081,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.009000003337860107,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.002250000834465027,
|
259 |
+
"tpp_threshold_20_total_metric": 0.09875001013278961,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.11100000143051147,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.012249991297721863,
|
262 |
+
"tpp_threshold_50_total_metric": 0.24550004303455353,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.2600000500679016,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.014500007033348083,
|
265 |
+
"tpp_threshold_100_total_metric": 0.3797500282526016,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.3960000276565552,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.016249999403953552,
|
268 |
+
"tpp_threshold_500_total_metric": 0.46150006353855133,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.47700005769729614,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.015499994158744812
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.010749980807304382,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.014999985694885254,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.004250004887580872,
|
276 |
+
"tpp_threshold_5_total_metric": 0.019999966025352478,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.02599996328353882,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.00599999725818634,
|
279 |
+
"tpp_threshold_10_total_metric": 0.05550000071525574,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.06099998950958252,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005499988794326782,
|
282 |
+
"tpp_threshold_20_total_metric": 0.09299995005130768,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.09999996423721313,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.0070000141859054565,
|
285 |
+
"tpp_threshold_50_total_metric": 0.19099999964237213,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.1980000138282776,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.0070000141859054565,
|
288 |
+
"tpp_threshold_100_total_metric": 0.31974999606609344,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.32999998331069946,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.010249987244606018,
|
291 |
+
"tpp_threshold_500_total_metric": 0.4670000225305557,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.4790000319480896,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.012000009417533875
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.010499998927116394,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.013999998569488525,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0034999996423721313,
|
301 |
+
"tpp_threshold_5_total_metric": 0.01075001060962677,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.013999998569488525,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0032499879598617554,
|
304 |
+
"tpp_threshold_10_total_metric": 0.011500045657157898,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.020000040531158447,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.00849999487400055,
|
307 |
+
"tpp_threshold_20_total_metric": 0.022750049829483032,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.029000043869018555,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.0062499940395355225,
|
310 |
+
"tpp_threshold_50_total_metric": 0.0585000216960907,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.06300002336502075,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.004500001668930054,
|
313 |
+
"tpp_threshold_100_total_metric": 0.09350000321865082,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.10600000619888306,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.012500002980232239,
|
316 |
+
"tpp_threshold_500_total_metric": 0.37125004827976227,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.3840000629425049,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.012750014662742615
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.016249999403953552,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.018000006675720215,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0017500072717666626,
|
324 |
+
"tpp_threshold_5_total_metric": 0.01099996268749237,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.0209999680519104,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.01000000536441803,
|
327 |
+
"tpp_threshold_10_total_metric": 0.03600001335144043,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.04100000858306885,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.004999995231628418,
|
330 |
+
"tpp_threshold_20_total_metric": 0.05199997127056122,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.05699998140335083,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.005000010132789612,
|
333 |
+
"tpp_threshold_50_total_metric": 0.10375002026557922,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.11500000953674316,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.01124998927116394,
|
336 |
+
"tpp_threshold_100_total_metric": 0.1720000058412552,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.18900001049041748,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.017000004649162292,
|
339 |
+
"tpp_threshold_500_total_metric": 0.39124996960163116,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.4179999828338623,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.02675001323223114
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.007500022649765015,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0040000081062316895,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.003500014543533325,
|
347 |
+
"tpp_threshold_5_total_metric": -0.0005000084638595581,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.0005000084638595581,
|
350 |
+
"tpp_threshold_10_total_metric": 0.01299998164176941,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.014999985694885254,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0020000040531158447,
|
353 |
+
"tpp_threshold_20_total_metric": 0.005999967455863953,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.010999977588653564,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.005000010132789612,
|
356 |
+
"tpp_threshold_50_total_metric": 0.057499960064888,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.06699997186660767,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.009500011801719666,
|
359 |
+
"tpp_threshold_100_total_metric": 0.10624997317790985,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.11799997091293335,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.011749997735023499,
|
362 |
+
"tpp_threshold_500_total_metric": 0.32750001549720764,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.35500001907348633,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.027500003576278687
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.012000009417533875,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.017000019550323486,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.005000010132789612,
|
370 |
+
"tpp_threshold_5_total_metric": 0.016750037670135498,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.025000035762786865,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.008249998092651367,
|
373 |
+
"tpp_threshold_10_total_metric": 0.03800003230571747,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.0480000376701355,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.01000000536441803,
|
376 |
+
"tpp_threshold_20_total_metric": 0.07374997437000275,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.08399999141693115,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.010250017046928406,
|
379 |
+
"tpp_threshold_50_total_metric": 0.16600003838539124,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.1770000457763672,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.011000007390975952,
|
382 |
+
"tpp_threshold_100_total_metric": 0.2147499918937683,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.23199999332427979,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.017250001430511475,
|
385 |
+
"tpp_threshold_500_total_metric": 0.36249999701976776,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.4020000100135803,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.03950001299381256
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.012250036001205444,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.016000032424926758,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
393 |
+
"tpp_threshold_5_total_metric": 0.0325000137090683,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.03200000524520874,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": -0.0005000084638595581,
|
396 |
+
"tpp_threshold_10_total_metric": 0.05950002372264862,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.06300002336502075,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.0034999996423721313,
|
399 |
+
"tpp_threshold_20_total_metric": 0.08374999463558197,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.08799999952316284,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.004250004887580872,
|
402 |
+
"tpp_threshold_50_total_metric": 0.20225000381469727,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.21700000762939453,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.014750003814697266,
|
405 |
+
"tpp_threshold_100_total_metric": 0.28325003385543823,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.2990000247955322,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.015749990940093994,
|
408 |
+
"tpp_threshold_500_total_metric": 0.34975001215934753,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.375,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.025249987840652466
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
random_seed_eval_results/tpp/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,414 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "tpp",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"LabHC/bias_in_bios_class_set1",
|
7 |
+
"canrager/amazon_reviews_mcauley_1and5"
|
8 |
+
],
|
9 |
+
"perform_scr": false,
|
10 |
+
"early_stopping_patience": 20,
|
11 |
+
"train_set_size": 4000,
|
12 |
+
"test_set_size": 1000,
|
13 |
+
"context_length": 128,
|
14 |
+
"probe_train_batch_size": 16,
|
15 |
+
"probe_test_batch_size": 500,
|
16 |
+
"probe_epochs": 20,
|
17 |
+
"probe_lr": 0.001,
|
18 |
+
"probe_l1_penalty": 0.001,
|
19 |
+
"sae_batch_size": 125,
|
20 |
+
"llm_batch_size": 32,
|
21 |
+
"llm_dtype": "bfloat16",
|
22 |
+
"lower_vram_usage": false,
|
23 |
+
"model_name": "gemma-2-2b",
|
24 |
+
"n_values": [
|
25 |
+
2,
|
26 |
+
5,
|
27 |
+
10,
|
28 |
+
20,
|
29 |
+
50,
|
30 |
+
100,
|
31 |
+
500
|
32 |
+
],
|
33 |
+
"column1_vals_lookup": {
|
34 |
+
"LabHC/bias_in_bios_class_set1": [
|
35 |
+
[
|
36 |
+
"professor",
|
37 |
+
"nurse"
|
38 |
+
],
|
39 |
+
[
|
40 |
+
"architect",
|
41 |
+
"journalist"
|
42 |
+
],
|
43 |
+
[
|
44 |
+
"surgeon",
|
45 |
+
"psychologist"
|
46 |
+
],
|
47 |
+
[
|
48 |
+
"attorney",
|
49 |
+
"teacher"
|
50 |
+
]
|
51 |
+
],
|
52 |
+
"canrager/amazon_reviews_mcauley_1and5": [
|
53 |
+
[
|
54 |
+
"Books",
|
55 |
+
"CDs_and_Vinyl"
|
56 |
+
],
|
57 |
+
[
|
58 |
+
"Software",
|
59 |
+
"Electronics"
|
60 |
+
],
|
61 |
+
[
|
62 |
+
"Pet_Supplies",
|
63 |
+
"Office_Products"
|
64 |
+
],
|
65 |
+
[
|
66 |
+
"Industrial_and_Scientific",
|
67 |
+
"Toys_and_Games"
|
68 |
+
]
|
69 |
+
]
|
70 |
+
}
|
71 |
+
},
|
72 |
+
"eval_id": "4aec626d-a48c-4e98-b34f-fe6bc6f9eb13",
|
73 |
+
"datetime_epoch_millis": 1738793667846,
|
74 |
+
"eval_result_metrics": {
|
75 |
+
"tpp_metrics": {
|
76 |
+
"tpp_threshold_2_total_metric": 0.008275003731250764,
|
77 |
+
"tpp_threshold_2_intended_diff_only": 0.0112000048160553,
|
78 |
+
"tpp_threshold_2_unintended_diff_only": 0.002925001084804535,
|
79 |
+
"tpp_threshold_5_total_metric": 0.013749995827674865,
|
80 |
+
"tpp_threshold_5_intended_diff_only": 0.01729999780654907,
|
81 |
+
"tpp_threshold_5_unintended_diff_only": 0.003550001978874206,
|
82 |
+
"tpp_threshold_10_total_metric": 0.02792499363422394,
|
83 |
+
"tpp_threshold_10_intended_diff_only": 0.0328000009059906,
|
84 |
+
"tpp_threshold_10_unintended_diff_only": 0.004875007271766663,
|
85 |
+
"tpp_threshold_20_total_metric": 0.0562250018119812,
|
86 |
+
"tpp_threshold_20_intended_diff_only": 0.06310000419616699,
|
87 |
+
"tpp_threshold_20_unintended_diff_only": 0.006875002384185791,
|
88 |
+
"tpp_threshold_50_total_metric": 0.13492498844861983,
|
89 |
+
"tpp_threshold_50_intended_diff_only": 0.1437999963760376,
|
90 |
+
"tpp_threshold_50_unintended_diff_only": 0.008875007927417754,
|
91 |
+
"tpp_threshold_100_total_metric": 0.21179999709129332,
|
92 |
+
"tpp_threshold_100_intended_diff_only": 0.22450000047683716,
|
93 |
+
"tpp_threshold_100_unintended_diff_only": 0.012700003385543824,
|
94 |
+
"tpp_threshold_500_total_metric": 0.3962500214576721,
|
95 |
+
"tpp_threshold_500_intended_diff_only": 0.41480002403259275,
|
96 |
+
"tpp_threshold_500_unintended_diff_only": 0.018550002574920656
|
97 |
+
}
|
98 |
+
},
|
99 |
+
"eval_result_details": [
|
100 |
+
{
|
101 |
+
"dataset_name": "LabHC/bias_in_bios_class_set1_tpp_results",
|
102 |
+
"tpp_threshold_2_total_metric": 0.010100004076957703,
|
103 |
+
"tpp_threshold_2_intended_diff_only": 0.012400007247924805,
|
104 |
+
"tpp_threshold_2_unintended_diff_only": 0.002300003170967102,
|
105 |
+
"tpp_threshold_5_total_metric": 0.015050002932548523,
|
106 |
+
"tpp_threshold_5_intended_diff_only": 0.018000006675720215,
|
107 |
+
"tpp_threshold_5_unintended_diff_only": 0.002950003743171692,
|
108 |
+
"tpp_threshold_10_total_metric": 0.02779998779296875,
|
109 |
+
"tpp_threshold_10_intended_diff_only": 0.030799996852874757,
|
110 |
+
"tpp_threshold_10_unintended_diff_only": 0.003000009059906006,
|
111 |
+
"tpp_threshold_20_total_metric": 0.07204999625682831,
|
112 |
+
"tpp_threshold_20_intended_diff_only": 0.0787999987602234,
|
113 |
+
"tpp_threshold_20_unintended_diff_only": 0.006750002503395081,
|
114 |
+
"tpp_threshold_50_total_metric": 0.15629999041557313,
|
115 |
+
"tpp_threshold_50_intended_diff_only": 0.1631999969482422,
|
116 |
+
"tpp_threshold_50_unintended_diff_only": 0.006900006532669067,
|
117 |
+
"tpp_threshold_100_total_metric": 0.2554499953985214,
|
118 |
+
"tpp_threshold_100_intended_diff_only": 0.2641999959945679,
|
119 |
+
"tpp_threshold_100_unintended_diff_only": 0.008750000596046447,
|
120 |
+
"tpp_threshold_500_total_metric": 0.44135003089904784,
|
121 |
+
"tpp_threshold_500_intended_diff_only": 0.45200003385543824,
|
122 |
+
"tpp_threshold_500_unintended_diff_only": 0.01065000295639038
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"dataset_name": "canrager/amazon_reviews_mcauley_1and5_tpp_results",
|
126 |
+
"tpp_threshold_2_total_metric": 0.006450003385543824,
|
127 |
+
"tpp_threshold_2_intended_diff_only": 0.010000002384185792,
|
128 |
+
"tpp_threshold_2_unintended_diff_only": 0.0035499989986419677,
|
129 |
+
"tpp_threshold_5_total_metric": 0.012449988722801208,
|
130 |
+
"tpp_threshold_5_intended_diff_only": 0.01659998893737793,
|
131 |
+
"tpp_threshold_5_unintended_diff_only": 0.004150000214576721,
|
132 |
+
"tpp_threshold_10_total_metric": 0.028049999475479127,
|
133 |
+
"tpp_threshold_10_intended_diff_only": 0.03480000495910644,
|
134 |
+
"tpp_threshold_10_unintended_diff_only": 0.0067500054836273195,
|
135 |
+
"tpp_threshold_20_total_metric": 0.040400007367134096,
|
136 |
+
"tpp_threshold_20_intended_diff_only": 0.04740000963211059,
|
137 |
+
"tpp_threshold_20_unintended_diff_only": 0.007000002264976502,
|
138 |
+
"tpp_threshold_50_total_metric": 0.11354998648166656,
|
139 |
+
"tpp_threshold_50_intended_diff_only": 0.12439999580383301,
|
140 |
+
"tpp_threshold_50_unintended_diff_only": 0.010850009322166444,
|
141 |
+
"tpp_threshold_100_total_metric": 0.16814999878406525,
|
142 |
+
"tpp_threshold_100_intended_diff_only": 0.18480000495910645,
|
143 |
+
"tpp_threshold_100_unintended_diff_only": 0.0166500061750412,
|
144 |
+
"tpp_threshold_500_total_metric": 0.35115001201629636,
|
145 |
+
"tpp_threshold_500_intended_diff_only": 0.3776000142097473,
|
146 |
+
"tpp_threshold_500_unintended_diff_only": 0.02645000219345093
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
150 |
+
"sae_lens_id": "custom_sae",
|
151 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4",
|
152 |
+
"sae_lens_version": "5.4.1",
|
153 |
+
"sae_cfg_dict": {
|
154 |
+
"model_name": "gemma-2-2b",
|
155 |
+
"d_in": 2304,
|
156 |
+
"d_sae": 16384,
|
157 |
+
"hook_layer": 12,
|
158 |
+
"hook_name": "blocks.12.hook_resid_post",
|
159 |
+
"context_size": null,
|
160 |
+
"hook_head_index": null,
|
161 |
+
"architecture": "topk",
|
162 |
+
"apply_b_dec_to_input": null,
|
163 |
+
"finetuning_scaling_factor": null,
|
164 |
+
"activation_fn_str": "",
|
165 |
+
"prepend_bos": true,
|
166 |
+
"normalize_activations": "none",
|
167 |
+
"dtype": "bfloat16",
|
168 |
+
"device": "",
|
169 |
+
"dataset_path": "",
|
170 |
+
"dataset_trust_remote_code": true,
|
171 |
+
"seqpos_slice": [
|
172 |
+
null
|
173 |
+
],
|
174 |
+
"training_tokens": -100000,
|
175 |
+
"sae_lens_training_version": null,
|
176 |
+
"neuronpedia_id": null
|
177 |
+
},
|
178 |
+
"eval_result_unstructured": {
|
179 |
+
"LabHC/bias_in_bios_class_set1": {
|
180 |
+
"0": {
|
181 |
+
"tpp_threshold_2_total_metric": 0.011749997735023499,
|
182 |
+
"tpp_threshold_2_intended_diff_only": 0.013999998569488525,
|
183 |
+
"tpp_threshold_2_unintended_diff_only": 0.002250000834465027,
|
184 |
+
"tpp_threshold_5_total_metric": 0.016250044107437134,
|
185 |
+
"tpp_threshold_5_intended_diff_only": 0.020000040531158447,
|
186 |
+
"tpp_threshold_5_unintended_diff_only": 0.0037499964237213135,
|
187 |
+
"tpp_threshold_10_total_metric": 0.013750001788139343,
|
188 |
+
"tpp_threshold_10_intended_diff_only": 0.017000019550323486,
|
189 |
+
"tpp_threshold_10_unintended_diff_only": 0.003250017762184143,
|
190 |
+
"tpp_threshold_20_total_metric": 0.02750001847743988,
|
191 |
+
"tpp_threshold_20_intended_diff_only": 0.03100001811981201,
|
192 |
+
"tpp_threshold_20_unintended_diff_only": 0.0034999996423721313,
|
193 |
+
"tpp_threshold_50_total_metric": 0.059250012040138245,
|
194 |
+
"tpp_threshold_50_intended_diff_only": 0.06300002336502075,
|
195 |
+
"tpp_threshold_50_unintended_diff_only": 0.0037500113248825073,
|
196 |
+
"tpp_threshold_100_total_metric": 0.12549999356269836,
|
197 |
+
"tpp_threshold_100_intended_diff_only": 0.12999999523162842,
|
198 |
+
"tpp_threshold_100_unintended_diff_only": 0.004500001668930054,
|
199 |
+
"tpp_threshold_500_total_metric": 0.42475004494190216,
|
200 |
+
"tpp_threshold_500_intended_diff_only": 0.4270000457763672,
|
201 |
+
"tpp_threshold_500_unintended_diff_only": 0.002250000834465027
|
202 |
+
},
|
203 |
+
"1": {
|
204 |
+
"tpp_threshold_2_total_metric": 0.004250004887580872,
|
205 |
+
"tpp_threshold_2_intended_diff_only": 0.0040000081062316895,
|
206 |
+
"tpp_threshold_2_unintended_diff_only": -0.00024999678134918213,
|
207 |
+
"tpp_threshold_5_total_metric": 0.001249939203262329,
|
208 |
+
"tpp_threshold_5_intended_diff_only": 0.0029999613761901855,
|
209 |
+
"tpp_threshold_5_unintended_diff_only": 0.0017500221729278564,
|
210 |
+
"tpp_threshold_10_total_metric": 0.009499981999397278,
|
211 |
+
"tpp_threshold_10_intended_diff_only": 0.009999990463256836,
|
212 |
+
"tpp_threshold_10_unintended_diff_only": 0.0005000084638595581,
|
213 |
+
"tpp_threshold_20_total_metric": 0.047999992966651917,
|
214 |
+
"tpp_threshold_20_intended_diff_only": 0.05500000715255737,
|
215 |
+
"tpp_threshold_20_unintended_diff_only": 0.0070000141859054565,
|
216 |
+
"tpp_threshold_50_total_metric": 0.14000000059604645,
|
217 |
+
"tpp_threshold_50_intended_diff_only": 0.14800000190734863,
|
218 |
+
"tpp_threshold_50_unintended_diff_only": 0.008000001311302185,
|
219 |
+
"tpp_threshold_100_total_metric": 0.22099995613098145,
|
220 |
+
"tpp_threshold_100_intended_diff_only": 0.22899997234344482,
|
221 |
+
"tpp_threshold_100_unintended_diff_only": 0.008000016212463379,
|
222 |
+
"tpp_threshold_500_total_metric": 0.435000017285347,
|
223 |
+
"tpp_threshold_500_intended_diff_only": 0.44700002670288086,
|
224 |
+
"tpp_threshold_500_unintended_diff_only": 0.012000009417533875
|
225 |
+
},
|
226 |
+
"2": {
|
227 |
+
"tpp_threshold_2_total_metric": 0.014749988913536072,
|
228 |
+
"tpp_threshold_2_intended_diff_only": 0.018999993801116943,
|
229 |
+
"tpp_threshold_2_unintended_diff_only": 0.004250004887580872,
|
230 |
+
"tpp_threshold_5_total_metric": 0.02250000834465027,
|
231 |
+
"tpp_threshold_5_intended_diff_only": 0.027000010013580322,
|
232 |
+
"tpp_threshold_5_unintended_diff_only": 0.004500001668930054,
|
233 |
+
"tpp_threshold_10_total_metric": 0.043749988079071045,
|
234 |
+
"tpp_threshold_10_intended_diff_only": 0.046999990940093994,
|
235 |
+
"tpp_threshold_10_unintended_diff_only": 0.0032500028610229492,
|
236 |
+
"tpp_threshold_20_total_metric": 0.06425000727176666,
|
237 |
+
"tpp_threshold_20_intended_diff_only": 0.06499999761581421,
|
238 |
+
"tpp_threshold_20_unintended_diff_only": 0.0007499903440475464,
|
239 |
+
"tpp_threshold_50_total_metric": 0.11674997210502625,
|
240 |
+
"tpp_threshold_50_intended_diff_only": 0.11699998378753662,
|
241 |
+
"tpp_threshold_50_unintended_diff_only": 0.000250011682510376,
|
242 |
+
"tpp_threshold_100_total_metric": 0.18924999237060547,
|
243 |
+
"tpp_threshold_100_intended_diff_only": 0.19099998474121094,
|
244 |
+
"tpp_threshold_100_unintended_diff_only": 0.0017499923706054688,
|
245 |
+
"tpp_threshold_500_total_metric": 0.4217500239610672,
|
246 |
+
"tpp_threshold_500_intended_diff_only": 0.42900002002716064,
|
247 |
+
"tpp_threshold_500_unintended_diff_only": 0.007249996066093445
|
248 |
+
},
|
249 |
+
"6": {
|
250 |
+
"tpp_threshold_2_total_metric": 0.0022500157356262207,
|
251 |
+
"tpp_threshold_2_intended_diff_only": 0.003000020980834961,
|
252 |
+
"tpp_threshold_2_unintended_diff_only": 0.0007500052452087402,
|
253 |
+
"tpp_threshold_5_total_metric": 0.0027500540018081665,
|
254 |
+
"tpp_threshold_5_intended_diff_only": 0.0020000338554382324,
|
255 |
+
"tpp_threshold_5_unintended_diff_only": -0.0007500201463699341,
|
256 |
+
"tpp_threshold_10_total_metric": 0.009750023484230042,
|
257 |
+
"tpp_threshold_10_intended_diff_only": 0.012000024318695068,
|
258 |
+
"tpp_threshold_10_unintended_diff_only": 0.002250000834465027,
|
259 |
+
"tpp_threshold_20_total_metric": 0.11100000143051147,
|
260 |
+
"tpp_threshold_20_intended_diff_only": 0.125,
|
261 |
+
"tpp_threshold_20_unintended_diff_only": 0.013999998569488525,
|
262 |
+
"tpp_threshold_50_total_metric": 0.25349998474121094,
|
263 |
+
"tpp_threshold_50_intended_diff_only": 0.26899999380111694,
|
264 |
+
"tpp_threshold_50_unintended_diff_only": 0.015500009059906006,
|
265 |
+
"tpp_threshold_100_total_metric": 0.36350004374980927,
|
266 |
+
"tpp_threshold_100_intended_diff_only": 0.3810000419616699,
|
267 |
+
"tpp_threshold_100_unintended_diff_only": 0.017499998211860657,
|
268 |
+
"tpp_threshold_500_total_metric": 0.4607500433921814,
|
269 |
+
"tpp_threshold_500_intended_diff_only": 0.47800004482269287,
|
270 |
+
"tpp_threshold_500_unintended_diff_only": 0.017250001430511475
|
271 |
+
},
|
272 |
+
"9": {
|
273 |
+
"tpp_threshold_2_total_metric": 0.01750001311302185,
|
274 |
+
"tpp_threshold_2_intended_diff_only": 0.022000014781951904,
|
275 |
+
"tpp_threshold_2_unintended_diff_only": 0.004500001668930054,
|
276 |
+
"tpp_threshold_5_total_metric": 0.03249996900558472,
|
277 |
+
"tpp_threshold_5_intended_diff_only": 0.03799998760223389,
|
278 |
+
"tpp_threshold_5_unintended_diff_only": 0.00550001859664917,
|
279 |
+
"tpp_threshold_10_total_metric": 0.06224994361400604,
|
280 |
+
"tpp_threshold_10_intended_diff_only": 0.0679999589920044,
|
281 |
+
"tpp_threshold_10_unintended_diff_only": 0.005750015377998352,
|
282 |
+
"tpp_threshold_20_total_metric": 0.1094999611377716,
|
283 |
+
"tpp_threshold_20_intended_diff_only": 0.11799997091293335,
|
284 |
+
"tpp_threshold_20_unintended_diff_only": 0.008500009775161743,
|
285 |
+
"tpp_threshold_50_total_metric": 0.21199998259544373,
|
286 |
+
"tpp_threshold_50_intended_diff_only": 0.218999981880188,
|
287 |
+
"tpp_threshold_50_unintended_diff_only": 0.006999999284744263,
|
288 |
+
"tpp_threshold_100_total_metric": 0.3779999911785126,
|
289 |
+
"tpp_threshold_100_intended_diff_only": 0.38999998569488525,
|
290 |
+
"tpp_threshold_100_unintended_diff_only": 0.01199999451637268,
|
291 |
+
"tpp_threshold_500_total_metric": 0.4645000249147415,
|
292 |
+
"tpp_threshold_500_intended_diff_only": 0.4790000319480896,
|
293 |
+
"tpp_threshold_500_unintended_diff_only": 0.014500007033348083
|
294 |
+
}
|
295 |
+
},
|
296 |
+
"canrager/amazon_reviews_mcauley_1and5": {
|
297 |
+
"1": {
|
298 |
+
"tpp_threshold_2_total_metric": 0.008250027894973755,
|
299 |
+
"tpp_threshold_2_intended_diff_only": 0.012000024318695068,
|
300 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
301 |
+
"tpp_threshold_5_total_metric": 0.011000007390975952,
|
302 |
+
"tpp_threshold_5_intended_diff_only": 0.013999998569488525,
|
303 |
+
"tpp_threshold_5_unintended_diff_only": 0.0029999911785125732,
|
304 |
+
"tpp_threshold_10_total_metric": 0.009000018239021301,
|
305 |
+
"tpp_threshold_10_intended_diff_only": 0.017000019550323486,
|
306 |
+
"tpp_threshold_10_unintended_diff_only": 0.008000001311302185,
|
307 |
+
"tpp_threshold_20_total_metric": 0.01750001311302185,
|
308 |
+
"tpp_threshold_20_intended_diff_only": 0.023000001907348633,
|
309 |
+
"tpp_threshold_20_unintended_diff_only": 0.005499988794326782,
|
310 |
+
"tpp_threshold_50_total_metric": 0.05000004172325134,
|
311 |
+
"tpp_threshold_50_intended_diff_only": 0.05200004577636719,
|
312 |
+
"tpp_threshold_50_unintended_diff_only": 0.0020000040531158447,
|
313 |
+
"tpp_threshold_100_total_metric": 0.08300001919269562,
|
314 |
+
"tpp_threshold_100_intended_diff_only": 0.09200000762939453,
|
315 |
+
"tpp_threshold_100_unintended_diff_only": 0.008999988436698914,
|
316 |
+
"tpp_threshold_500_total_metric": 0.35725001990795135,
|
317 |
+
"tpp_threshold_500_intended_diff_only": 0.36900001764297485,
|
318 |
+
"tpp_threshold_500_unintended_diff_only": 0.011749997735023499
|
319 |
+
},
|
320 |
+
"2": {
|
321 |
+
"tpp_threshold_2_total_metric": 0.006999954581260681,
|
322 |
+
"tpp_threshold_2_intended_diff_only": 0.007999956607818604,
|
323 |
+
"tpp_threshold_2_unintended_diff_only": 0.0010000020265579224,
|
324 |
+
"tpp_threshold_5_total_metric": -0.0015000402927398682,
|
325 |
+
"tpp_threshold_5_intended_diff_only": 0.007999956607818604,
|
326 |
+
"tpp_threshold_5_unintended_diff_only": 0.009499996900558472,
|
327 |
+
"tpp_threshold_10_total_metric": 0.02675001323223114,
|
328 |
+
"tpp_threshold_10_intended_diff_only": 0.03600001335144043,
|
329 |
+
"tpp_threshold_10_unintended_diff_only": 0.00925000011920929,
|
330 |
+
"tpp_threshold_20_total_metric": 0.04025000333786011,
|
331 |
+
"tpp_threshold_20_intended_diff_only": 0.046000003814697266,
|
332 |
+
"tpp_threshold_20_unintended_diff_only": 0.005750000476837158,
|
333 |
+
"tpp_threshold_50_total_metric": 0.08999994397163391,
|
334 |
+
"tpp_threshold_50_intended_diff_only": 0.10499995946884155,
|
335 |
+
"tpp_threshold_50_unintended_diff_only": 0.015000015497207642,
|
336 |
+
"tpp_threshold_100_total_metric": 0.15524999797344208,
|
337 |
+
"tpp_threshold_100_intended_diff_only": 0.18000000715255737,
|
338 |
+
"tpp_threshold_100_unintended_diff_only": 0.024750009179115295,
|
339 |
+
"tpp_threshold_500_total_metric": 0.3857499957084656,
|
340 |
+
"tpp_threshold_500_intended_diff_only": 0.41200000047683716,
|
341 |
+
"tpp_threshold_500_unintended_diff_only": 0.026250004768371582
|
342 |
+
},
|
343 |
+
"3": {
|
344 |
+
"tpp_threshold_2_total_metric": -0.009999990463256836,
|
345 |
+
"tpp_threshold_2_intended_diff_only": -0.0059999823570251465,
|
346 |
+
"tpp_threshold_2_unintended_diff_only": 0.0040000081062316895,
|
347 |
+
"tpp_threshold_5_total_metric": 0.0007499754428863525,
|
348 |
+
"tpp_threshold_5_intended_diff_only": 0.0009999871253967285,
|
349 |
+
"tpp_threshold_5_unintended_diff_only": 0.000250011682510376,
|
350 |
+
"tpp_threshold_10_total_metric": 0.015499934554100037,
|
351 |
+
"tpp_threshold_10_intended_diff_only": 0.01699995994567871,
|
352 |
+
"tpp_threshold_10_unintended_diff_only": 0.0015000253915786743,
|
353 |
+
"tpp_threshold_20_total_metric": 0.004749968647956848,
|
354 |
+
"tpp_threshold_20_intended_diff_only": 0.014999985694885254,
|
355 |
+
"tpp_threshold_20_unintended_diff_only": 0.010250017046928406,
|
356 |
+
"tpp_threshold_50_total_metric": 0.0509999543428421,
|
357 |
+
"tpp_threshold_50_intended_diff_only": 0.06299996376037598,
|
358 |
+
"tpp_threshold_50_unintended_diff_only": 0.012000009417533875,
|
359 |
+
"tpp_threshold_100_total_metric": 0.09874998033046722,
|
360 |
+
"tpp_threshold_100_intended_diff_only": 0.1119999885559082,
|
361 |
+
"tpp_threshold_100_unintended_diff_only": 0.013250008225440979,
|
362 |
+
"tpp_threshold_500_total_metric": 0.31724995374679565,
|
363 |
+
"tpp_threshold_500_intended_diff_only": 0.343999981880188,
|
364 |
+
"tpp_threshold_500_unintended_diff_only": 0.026750028133392334
|
365 |
+
},
|
366 |
+
"5": {
|
367 |
+
"tpp_threshold_2_total_metric": 0.012750014662742615,
|
368 |
+
"tpp_threshold_2_intended_diff_only": 0.018000006675720215,
|
369 |
+
"tpp_threshold_2_unintended_diff_only": 0.0052499920129776,
|
370 |
+
"tpp_threshold_5_total_metric": 0.017499983310699463,
|
371 |
+
"tpp_threshold_5_intended_diff_only": 0.02399998903274536,
|
372 |
+
"tpp_threshold_5_unintended_diff_only": 0.0065000057220458984,
|
373 |
+
"tpp_threshold_10_total_metric": 0.02900002896785736,
|
374 |
+
"tpp_threshold_10_intended_diff_only": 0.03900003433227539,
|
375 |
+
"tpp_threshold_10_unintended_diff_only": 0.01000000536441803,
|
376 |
+
"tpp_threshold_20_total_metric": 0.04450002312660217,
|
377 |
+
"tpp_threshold_20_intended_diff_only": 0.053000032901763916,
|
378 |
+
"tpp_threshold_20_unintended_diff_only": 0.008500009775161743,
|
379 |
+
"tpp_threshold_50_total_metric": 0.15950000286102295,
|
380 |
+
"tpp_threshold_50_intended_diff_only": 0.17000001668930054,
|
381 |
+
"tpp_threshold_50_unintended_diff_only": 0.010500013828277588,
|
382 |
+
"tpp_threshold_100_total_metric": 0.21274997293949127,
|
383 |
+
"tpp_threshold_100_intended_diff_only": 0.23199999332427979,
|
384 |
+
"tpp_threshold_100_unintended_diff_only": 0.019250020384788513,
|
385 |
+
"tpp_threshold_500_total_metric": 0.346250057220459,
|
386 |
+
"tpp_threshold_500_intended_diff_only": 0.3890000581741333,
|
387 |
+
"tpp_threshold_500_unintended_diff_only": 0.042750000953674316
|
388 |
+
},
|
389 |
+
"6": {
|
390 |
+
"tpp_threshold_2_total_metric": 0.014250010251998901,
|
391 |
+
"tpp_threshold_2_intended_diff_only": 0.018000006675720215,
|
392 |
+
"tpp_threshold_2_unintended_diff_only": 0.0037499964237213135,
|
393 |
+
"tpp_threshold_5_total_metric": 0.03450001776218414,
|
394 |
+
"tpp_threshold_5_intended_diff_only": 0.03600001335144043,
|
395 |
+
"tpp_threshold_5_unintended_diff_only": 0.0014999955892562866,
|
396 |
+
"tpp_threshold_10_total_metric": 0.06000000238418579,
|
397 |
+
"tpp_threshold_10_intended_diff_only": 0.06499999761581421,
|
398 |
+
"tpp_threshold_10_unintended_diff_only": 0.004999995231628418,
|
399 |
+
"tpp_threshold_20_total_metric": 0.09500002861022949,
|
400 |
+
"tpp_threshold_20_intended_diff_only": 0.10000002384185791,
|
401 |
+
"tpp_threshold_20_unintended_diff_only": 0.004999995231628418,
|
402 |
+
"tpp_threshold_50_total_metric": 0.21724998950958252,
|
403 |
+
"tpp_threshold_50_intended_diff_only": 0.23199999332427979,
|
404 |
+
"tpp_threshold_50_unintended_diff_only": 0.014750003814697266,
|
405 |
+
"tpp_threshold_100_total_metric": 0.29100002348423004,
|
406 |
+
"tpp_threshold_100_intended_diff_only": 0.30800002813339233,
|
407 |
+
"tpp_threshold_100_unintended_diff_only": 0.017000004649162292,
|
408 |
+
"tpp_threshold_500_total_metric": 0.34925003349781036,
|
409 |
+
"tpp_threshold_500_intended_diff_only": 0.37400001287460327,
|
410 |
+
"tpp_threshold_500_unintended_diff_only": 0.024749979376792908
|
411 |
+
}
|
412 |
+
}
|
413 |
+
}
|
414 |
+
}
|
random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "8312f760-cdb0-4a9d-89f0-457797128bd3",
|
37 |
+
"datetime_epoch_millis": 1738798909354,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.24202626943588257
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_0",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "d16c8a20-cfa3-49b9-99d6-f7d41d99f940",
|
37 |
+
"datetime_epoch_millis": 1738798229725,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.056285202503204346
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_1",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "4497566c-7243-4b08-ac53-8e580570ae2c",
|
37 |
+
"datetime_epoch_millis": 1738799580619,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.0675421953201294
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_2",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "397e6fb4-1a07-4c16-9540-afc50a61875d",
|
37 |
+
"datetime_epoch_millis": 1738800255624,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.0731707215309143
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_3",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|
random_seed_eval_results/unlearning/temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4_custom_sae_eval_results.json
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eval_type_id": "unlearning",
|
3 |
+
"eval_config": {
|
4 |
+
"random_seed": 42,
|
5 |
+
"dataset_names": [
|
6 |
+
"wmdp-bio",
|
7 |
+
"high_school_us_history",
|
8 |
+
"college_computer_science",
|
9 |
+
"high_school_geography",
|
10 |
+
"human_aging"
|
11 |
+
],
|
12 |
+
"intervention_method": "clamp_feature_activation",
|
13 |
+
"retain_thresholds": [
|
14 |
+
0.001,
|
15 |
+
0.01
|
16 |
+
],
|
17 |
+
"n_features_list": [
|
18 |
+
10,
|
19 |
+
20
|
20 |
+
],
|
21 |
+
"multipliers": [
|
22 |
+
25,
|
23 |
+
50,
|
24 |
+
100,
|
25 |
+
200
|
26 |
+
],
|
27 |
+
"dataset_size": 1024,
|
28 |
+
"seq_len": 1024,
|
29 |
+
"n_batch_loss_added": 50,
|
30 |
+
"target_metric": "correct",
|
31 |
+
"save_metrics": true,
|
32 |
+
"model_name": "gemma-2-2b-it",
|
33 |
+
"llm_batch_size": 4,
|
34 |
+
"llm_dtype": "bfloat16"
|
35 |
+
},
|
36 |
+
"eval_id": "8ce47154-1e20-4092-b993-e151fed028b0",
|
37 |
+
"datetime_epoch_millis": 1738800936072,
|
38 |
+
"eval_result_metrics": {
|
39 |
+
"unlearning": {
|
40 |
+
"unlearning_score": 0.0731707215309143
|
41 |
+
}
|
42 |
+
},
|
43 |
+
"eval_result_details": [],
|
44 |
+
"sae_bench_commit_hash": "155afbca50a9ffe6cf72c81796997e6daa475658",
|
45 |
+
"sae_lens_id": "custom_sae",
|
46 |
+
"sae_lens_release_id": "temp_random_seeds_google_gemma-2-2b_top_k_resid_post_layer_12_trainer_4",
|
47 |
+
"sae_lens_version": "5.4.1",
|
48 |
+
"sae_cfg_dict": {
|
49 |
+
"model_name": "gemma-2-2b",
|
50 |
+
"d_in": 2304,
|
51 |
+
"d_sae": 16384,
|
52 |
+
"hook_layer": 12,
|
53 |
+
"hook_name": "blocks.12.hook_resid_post",
|
54 |
+
"context_size": null,
|
55 |
+
"hook_head_index": null,
|
56 |
+
"architecture": "topk",
|
57 |
+
"apply_b_dec_to_input": null,
|
58 |
+
"finetuning_scaling_factor": null,
|
59 |
+
"activation_fn_str": "",
|
60 |
+
"prepend_bos": true,
|
61 |
+
"normalize_activations": "none",
|
62 |
+
"dtype": "bfloat16",
|
63 |
+
"device": "",
|
64 |
+
"dataset_path": "",
|
65 |
+
"dataset_trust_remote_code": true,
|
66 |
+
"seqpos_slice": [
|
67 |
+
null
|
68 |
+
],
|
69 |
+
"training_tokens": -100000,
|
70 |
+
"sae_lens_training_version": null,
|
71 |
+
"neuronpedia_id": null
|
72 |
+
},
|
73 |
+
"eval_result_unstructured": null
|
74 |
+
}
|