shambhavi3 committed on
Commit
0c7c487
·
verified ·
1 Parent(s): 93d8271

Upload 25 files

Browse files
cs772_proj/app.py ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ import transformers
4
+ import torch
5
+ #import neptune
6
+ #from knockknock import slack_sender
7
+ from transformers import *
8
+ #import glob
9
+ from transformers import BertTokenizer
10
+ from transformers import BertForSequenceClassification, AdamW, BertConfig
11
+ import random
12
+ import pandas as pd
13
+ from transformers import BertTokenizer
14
+ #from Models.utils import masked_cross_entropy,fix_the_random,format_time,save_normal_model,save_bert_model
15
+ from sklearn.metrics import accuracy_score,f1_score
16
+ from tqdm import tqdm
17
+ '''from TensorDataset.datsetSplitter import createDatasetSplit
18
+ from TensorDataset.dataLoader import combine_features
19
+ from Preprocess.dataCollect import collect_data,set_name'''
20
+ from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,recall_score,precision_score
21
+ import matplotlib.pyplot as plt
22
+ import time
23
+ import os
24
+ from transformers import BertTokenizer
25
+ #import GPUtil
26
+ from sklearn.utils import class_weight
27
+ #import json
28
+ #from Models.bertModels import *
29
+ #from Models.otherModels import *
30
+ import sys
31
+ #import time
32
+ #from waiting import wait
33
+ from sklearn.preprocessing import LabelEncoder
34
+ import numpy as np
35
+ #import threading
36
+ #import argparse
37
+ #import ast
38
+
39
+ #from manual_training_inference import select_model
40
+ #from Models.utils import save_normal_model,save_bert_model,load_model
41
+ #from Models.utils import return_params
42
+ from transformers import DistilBertTokenizer
43
+
44
+
45
+ #from TensorDataset.dataLoader import custom_att_masks
46
+ #from keras.preprocessing.sequence import pad_sequences
47
+
48
+ #import seaborn as sns
49
+ import matplotlib.pyplot as plt
50
+ import numpy as np
51
+ import PIL.Image as Image
52
+ from torch import nn
53
+
54
+ from pyvene import embed_to_distrib, top_vals, format_token
55
+ from pyvene import (
56
+ IntervenableModel,
57
+ VanillaIntervention, Intervention,
58
+ RepresentationConfig,
59
+ IntervenableConfig,
60
+ ConstantSourceIntervention,
61
+ LocalistRepresentationIntervention
62
+ )
63
+ from pyvene import create_gpt2
64
+ #%config InlineBackend.figure_formats = ['svg']
65
+ from plotnine import (
66
+ ggplot,
67
+ geom_tile,
68
+ aes,
69
+ facet_wrap,
70
+ theme,
71
+ element_text,
72
+ geom_bar,
73
+ geom_hline,
74
+ scale_y_log10,
75
+ xlab, ylab, ylim,
76
+ scale_y_discrete, scale_y_continuous, ggsave
77
+ )
78
+ from plotnine.scales import scale_y_reverse, scale_fill_cmap
79
+ from tqdm import tqdm
80
# Module-wide torch device, read by NoiseIntervention when it places its noise
# tensor. CUDA is used whenever torch reports it as available.
# (A `global` statement at module scope has no effect, so none is needed here.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
82
def create_bert(cache_dir=None):
    """Load the fine-tuned BERT sequence classifier, its config and a tokenizer.

    The config and the weights are read from the local ``checkpoint-3848``
    directory; the tokenizer is the stock ``bert-base-uncased`` one.

    Args:
        cache_dir: Forwarded to ``from_pretrained`` when loading the weights.

    Returns:
        A ``(config, tokenizer, model)`` tuple.
    """
    # NOTE(review): the paths below start with ./cs77_proj, while this commit
    # adds the checkpoint under cs772_proj/ -- confirm the prefix matches the
    # directory layout the app is launched from.
    from transformers import BertConfig

    bert_config = BertConfig.from_pretrained(
        "./cs77_proj/bert_base/checkpoint-3848/config.json"
    )
    bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    classifier = AutoModelForSequenceClassification.from_pretrained(
        "./cs77_proj/bert_base/checkpoint-3848",
        config=bert_config,
        cache_dir=cache_dir,
    )
    print("loaded model")
    return bert_config, bert_tokenizer, classifier
91
def interpret(text, label):
    """Causal-trace the classifier on ``text`` and save one heat map per stream.

    The layer-0 block input at the first four token positions is perturbed by
    ``NoiseIntervention``. Then, for every layer and every token position, the
    representation of one stream ("block_output", "mlp_activation" or
    "attention_output") is patched in from the same, unperturbed encoding, and
    the softmax probability of ``label`` is recorded.

    Side effects: for each stream a CSV of the recorded probabilities and a PNG
    heat map are written under ``./cs77_proj/tutorial_data/``, overwriting any
    previous files of the same name.

    Args:
        text: Input sentence to analyse.
        label: Class whose probability is tracked. ``"hate"`` selects class 0,
            ``"normal"`` class 1, and any other value class 2.

    Returns:
        Paths of the saved PNG files, ordered as
        ``(mlp_activation, block_output, attention_output)``.

    Raises:
        ValueError: If ``text`` tokenizes to fewer tokens than the number of
            corrupted positions (four).
    """
    titles = {
        "block_output": "single restored layer in BERT",
        "mlp_activation": "center of interval of 5 patched mlp layer",
        "attention_output": "center of interval of 5 patched attn layer",
    }
    colors = {
        "block_output": "Purples",
        "mlp_activation": "Greens",
        "attention_output": "Reds",
    }
    streams = ("block_output", "mlp_activation", "attention_output")

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    _, tokenizer, gpt = create_bert()
    gpt.to(device)

    base = tokenizer(text, return_tensors="pt").to(device)
    base_token = tokenizer.convert_ids_to_tokens(base["input_ids"][0])

    # The noise tensor built by NoiseIntervention has exactly four positions,
    # so these indices must all exist in the encoded input.
    corrupt_positions = (0, 1, 2, 3)
    if len(base_token) < len(corrupt_positions):
        raise ValueError(
            f"Input must tokenize to at least {len(corrupt_positions)} tokens "
            f"(including special tokens); got {len(base_token)}."
        )

    if label == "hate":
        label_idx = 0
    elif label == "normal":
        label_idx = 1
    else:
        label_idx = 2

    num_layers = gpt.config.num_hidden_layers
    frames = {}

    # Inference only: avoid building an autograd graph for every patched run.
    with torch.no_grad():
        # NOTE(review): the output of this corrupted-only run is never used;
        # it is kept so the sequence of pyvene calls matches the original.
        intervenable = IntervenableModel(corrupted_config(type(gpt)), gpt)
        intervenable(
            base, unit_locations={"base": [[list(corrupt_positions)]]}
        )

        for stream in streams:
            window = 1 if stream == "block_output" else 5
            data = []
            for layer_i in tqdm(range(num_layers)):
                for pos_i in range(len(base_token)):
                    config = restore_corrupted_with_interval_config(
                        layer_i, stream, window=window, num_layers=num_layers
                    )
                    # The first representation is the noise intervention; the
                    # remaining ones are the patched layers.
                    n_restores = len(config.representations) - 1
                    intervenable = IntervenableModel(config, gpt)
                    _, counterfactual_outputs = intervenable(
                        base,
                        [None] + [base] * n_restores,
                        {
                            "sources->base": (
                                [None] + [[[pos_i]]] * n_restores,
                                [[list(corrupt_positions)]]
                                + [[[pos_i]]] * n_restores,
                            )
                        },
                    )
                    logits = counterfactual_outputs[0]
                    probabilities = nn.functional.softmax(logits, dim=-1)
                    data.append({
                        "layer": layer_i,
                        "pos": pos_i,
                        "prob": probabilities[0][label_idx].item(),
                    })
            df = pd.DataFrame(data)
            df.to_csv(f"./cs77_proj/tutorial_data/pyvene_rome_{stream}.csv")
            frames[stream] = df

    prob_type = f"p({label})"
    custom_labels = base_token
    breaks = list(range(len(custom_labels)))
    image_paths = {}

    for stream in streams:
        df = frames[stream]
        df["layer"] = df["layer"].astype(int)
        df["pos"] = df["pos"].astype(int)
        df[prob_type] = df["prob"].astype(float)

        plot = (
            ggplot(df, aes(x="layer", y="pos"))
            + geom_tile(aes(fill=prob_type))
            + scale_fill_cmap(colors[stream]) + xlab(titles[stream])
            + scale_y_reverse(
                limits=(-0.5, len(custom_labels)),
                breaks=breaks, labels=custom_labels)
            + theme(figure_size=(6, 9)) + ylab("")
            + theme(axis_text_y=element_text(angle=90, hjust=1))
        )
        png_path = f"./cs77_proj/tutorial_data/pyvene_rome_{stream}.png"
        ggsave(plot, filename=png_path, dpi=200)
        image_paths[stream] = png_path

    return (
        image_paths["mlp_activation"],
        image_paths["block_output"],
        image_paths["attention_output"],
    )
200
+
201
def restore_corrupted_with_interval_config(
        layer, stream="mlp_activation", window=5, num_layers=12):
    """Build the config that adds noise and patches a window of layers.

    The first representation is the layer-0 "block_input", paired with
    ``NoiseIntervention``. It is followed by one ``stream`` representation per
    layer in a window around ``layer`` (clipped to ``[0, num_layers)``), each
    paired with ``VanillaIntervention``.

    Args:
        layer: Layer index the window is centred on.
        stream: Name of the representation to patch in each window layer.
        window: Number of layers in the window before clipping.
        num_layers: Exclusive upper bound for layer indices.

    Returns:
        The assembled ``IntervenableConfig``.
    """
    # floor(window / 2) layers below the centre, ceil(window / 2) from the
    # centre upwards, so an odd window is symmetric around `layer`.
    first = max(0, layer - window // 2)
    last = min(num_layers, layer - (-window // 2))

    representations = [RepresentationConfig(0, "block_input")]
    for patched_layer in range(first, last):
        representations.append(RepresentationConfig(patched_layer, stream))

    interventions = [NoiseIntervention] + [VanillaIntervention] * (last - first)

    return IntervenableConfig(
        representations=representations,
        intervention_types=interventions,
    )
220
+
221
class NoiseIntervention(ConstantSourceIntervention, LocalistRepresentationIntervention):
    """Intervention that adds a fixed, seeded Gaussian noise tensor to the base.

    The noise is drawn once at construction time from ``RandomState(1)``, so
    every instance built with the same ``embed_dim`` holds the same values.
    The tensor has shape ``(1, 4, embed_dim)``, so it is meant to be applied
    to four positions at a time.
    """

    def __init__(self, embed_dim, **kwargs):
        """Create the noise tensor.

        Args:
            embed_dim: Size of the representation's last dimension that
                receives noise.
            **kwargs: Accepted for interface compatibility; not forwarded to
                the base classes.
        """
        super().__init__()
        self.interchange_dim = embed_dim
        rs = np.random.RandomState(1)
        # `device` is the module-level torch device.
        self.noise = torch.from_numpy(
            rs.randn(1, 4, embed_dim)).to(device)
        self.noise_level = 0.7462981581687927  # previously 0.3462981581687927

    def forward(self, base, source=None, subspaces=None):
        """Add the scaled noise to ``base`` in place and return it.

        ``source`` and ``subspaces`` are ignored.
        """
        base[..., : self.interchange_dim] += self.noise * self.noise_level
        return base

    def __str__(self):
        # The dimension is stored as `interchange_dim`; this class never sets
        # an `embed_dim` attribute of its own.
        return f"NoiseIntervention(embed_dim={self.interchange_dim})"
237
+
238
+
239
def corrupted_config(model_type):
    """Return a config that applies only the noise intervention.

    Args:
        model_type: Model class passed through to ``IntervenableConfig``.

    Returns:
        An ``IntervenableConfig`` with a single layer-0 "block_input"
        representation handled by ``NoiseIntervention``.
    """
    noisy_input = RepresentationConfig(
        0,              # layer
        "block_input",  # intervention type
    )
    return IntervenableConfig(
        model_type=model_type,
        representations=[noisy_input],
        intervention_types=NoiseIntervention,
    )
251
def create_bert(cache_dir=None):
    """Load the fine-tuned BERT sequence classifier, its config and a tokenizer.

    NOTE: this re-definition shadows the identical ``create_bert`` defined
    earlier in this module; from here on it is the one callers get.

    Args:
        cache_dir: Forwarded to ``from_pretrained`` when loading the weights.

    Returns:
        A ``(config, tokenizer, model)`` tuple, where the config and weights
        come from the local ``checkpoint-3848`` directory and the tokenizer is
        ``bert-base-uncased``.
    """
    # NOTE(review): the paths below start with ./cs77_proj, while this commit
    # adds the checkpoint under cs772_proj/ -- confirm the prefix matches the
    # directory layout the app is launched from.
    from transformers import BertConfig

    loaded_config = BertConfig.from_pretrained(
        "./cs77_proj/bert_base/checkpoint-3848/config.json"
    )
    loaded_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    loaded_model = AutoModelForSequenceClassification.from_pretrained(
        "./cs77_proj/bert_base/checkpoint-3848",
        config=loaded_config,
        cache_dir=cache_dir,
    )
    print("loaded model")
    return loaded_config, loaded_tokenizer, loaded_model
260
+
261
+ # params = return_params('best_model_json/distilbert.json', 0.001 )
262
+ #params = return_params('best_model_json/distilbert.json', 1 )
263
+
264
+
265
+ '''embeddings=None
266
+ if(params['bert_tokens']):
267
+ train,val,test=createDatasetSplit(params) #update
268
+ else:
269
+ train,val,test,vocab_own=createDatasetSplit(params)
270
+ params['embed_size']=vocab_own.embeddings.shape[1]
271
+ params['vocab_size']=vocab_own.embeddings.shape[0]
272
+ embeddings=vocab_own.embeddings
273
+ if(params['auto_weights']):
274
+ y_test = [ele[2] for ele in test]
275
+ # print(y_test)
276
+ encoder = LabelEncoder()
277
+ encoder.classes_ = np.load(params['class_names'],allow_pickle=True)
278
+ params['weights']=class_weight.compute_class_weight('balanced',np.unique(y_test),y_test).astype('float32')
279
+ #params['weights']=np.array([len(y_test)/y_test.count(encoder.classes_[0]),len(y_test)/y_test.count(encoder.classes_[1]),len(y_test)/y_test.count(encoder.classes_[2])]).astype('float32')
280
+
281
+ model=select_model(params,embeddings)
282
+ model = model.eval()
283
+ tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
284
+
285
+
286
+ classes_ = np.load('Data/classes.npy')
287
+ '''
288
def main_function(text, label):
    """Gradio callback: run the tracing for ``text`` and yield the heat maps.

    Delegates to ``interpret`` and opens the three PNG files it returns.

    Args:
        text: Input sentence to analyse.
        label: Class label whose probability is traced (passed through to
            ``interpret``).

    Yields:
        A single ``(mlp_image, block_output_image, attention_image)`` tuple of
        PIL images, matching the order of the paths returned by ``interpret``.
    """
    ml_img_path, bo_img_path, atten_img_path = interpret(text, label)
    ml_im = Image.open(ml_img_path)
    bo_im = Image.open(bo_img_path)
    atten_im = Image.open(atten_img_path)

    # Kept as a generator so the callback's interface is unchanged.
    yield ml_im, bo_im, atten_im
325
+
326
+ '''
327
+ sns.set_theme(rc={'figure.figsize':(30,1)})
328
+
329
+ # creating subplot
330
+ fig, ax = plt.subplots()
331
+
332
+ # drawing heatmap on current axes
333
+ ax = sns.heatmap(np.expand_dims(avg_attn_np,0), annot= np.expand_dims(np.array(text_tokens_),0), \
334
+ fmt="", annot_kws={'size': 10}, cmap="magma")
335
+
336
+ fig = ax.get_figure()
337
+ fig.savefig("out.png" ,bbox_inches='tight')
338
+
339
+ im = Image.open("out.png")
340
+
341
+ yield im
342
+
343
+ '''
344
+
345
+ #return list(zip(text_tokens_ , avg_attn_np)), pred_label
346
+ # return list(zip(text_tokens_[1:-1] , avg_attn_np[1:-1]))
347
+
348
+
349
# Two-tab UI: the first tab collects the sentence and the label to trace, the
# second shows the three heat maps produced by main_function.
# The gr.Interface that used to be assigned to `demo` first is dropped: it was
# immediately rebound by this Blocks app, and its single input and output did
# not match main_function (two inputs, three images).
with gr.Blocks() as demo:
    with gr.Tab("Text Input"):
        text_input = gr.Textbox(label="Text")
        # interpret() maps "hate" and "normal" explicitly and treats any other
        # value as the third class.
        label_input = gr.Textbox(label="Label (hate / normal / offense)")
        text_button = gr.Button("Show")

    with gr.Tab("Interpretability"):
        with gr.Row():
            # Order matches the tuple yielded by main_function.
            image_output1 = gr.Image(label="MLP activation")
            image_output2 = gr.Image(label="Block output")
            image_output3 = gr.Image(label="Attention output")

    text_button.click(
        main_function,
        inputs=[text_input, label_input],
        outputs=[image_output1, image_output2, image_output3],
    )


if __name__ == "__main__":
    demo.launch()
cs772_proj/bert_base/checkpoint-3848/config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "bert-base-uncased",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "hate",
14
+ "1": "normal",
15
+ "2": "offense"
16
+ },
17
+ "initializer_range": 0.02,
18
+ "intermediate_size": 3072,
19
+ "label2id": {
20
+ "hate": 0,
21
+ "normal": 1,
22
+ "offense": 2
23
+ },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
27
+ "num_attention_heads": 12,
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 0,
30
+ "position_embedding_type": "absolute",
31
+ "problem_type": "single_label_classification",
32
+ "torch_dtype": "float32",
33
+ "transformers_version": "4.39.3",
34
+ "type_vocab_size": 2,
35
+ "use_cache": true,
36
+ "vocab_size": 30522
37
+ }
cs772_proj/bert_base/checkpoint-3848/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f92169bcbaeee93e4c65d5f6b7af90505d8a754096d1b7d1ea70cf290cc79690
3
+ size 437961724
cs772_proj/bert_base/checkpoint-3848/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:306504e0a7e6e3e27b15a81346a5f70e8941e6ec7085d33d70693b13cbba1e8b
3
+ size 876044538
cs772_proj/bert_base/checkpoint-3848/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f8c99e888295714e206a5143fed689d23c7ae28a194ff83078714c2d99f94ab
3
+ size 14244
cs772_proj/bert_base/checkpoint-3848/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9258b30f99447d96f005979b906a97fe44e711e5ecb53f5be292707492c5ef45
3
+ size 1064
cs772_proj/bert_base/checkpoint-3848/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
cs772_proj/bert_base/checkpoint-3848/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
cs772_proj/bert_base/checkpoint-3848/tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "BertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
cs772_proj/bert_base/checkpoint-3848/trainer_state.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.7247874736785889,
3
+ "best_model_checkpoint": "bert_base/checkpoint-1924",
4
+ "epoch": 4.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3848,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 1.0,
13
+ "grad_norm": 13.14333724975586,
14
+ "learning_rate": 8.000000000000001e-06,
15
+ "loss": 0.8204,
16
+ "step": 962
17
+ },
18
+ {
19
+ "epoch": 1.0,
20
+ "eval_f1": 0.6669116037121897,
21
+ "eval_loss": 0.7362112402915955,
22
+ "eval_runtime": 2.6989,
23
+ "eval_samples_per_second": 712.147,
24
+ "eval_steps_per_second": 44.833,
25
+ "step": 962
26
+ },
27
+ {
28
+ "epoch": 2.0,
29
+ "grad_norm": 15.062124252319336,
30
+ "learning_rate": 6e-06,
31
+ "loss": 0.664,
32
+ "step": 1924
33
+ },
34
+ {
35
+ "epoch": 2.0,
36
+ "eval_f1": 0.6714560068474462,
37
+ "eval_loss": 0.7247874736785889,
38
+ "eval_runtime": 1.9229,
39
+ "eval_samples_per_second": 999.531,
40
+ "eval_steps_per_second": 62.926,
41
+ "step": 1924
42
+ },
43
+ {
44
+ "epoch": 3.0,
45
+ "grad_norm": 30.878219604492188,
46
+ "learning_rate": 4.000000000000001e-06,
47
+ "loss": 0.5662,
48
+ "step": 2886
49
+ },
50
+ {
51
+ "epoch": 3.0,
52
+ "eval_f1": 0.6630607481681304,
53
+ "eval_loss": 0.7806704044342041,
54
+ "eval_runtime": 2.4376,
55
+ "eval_samples_per_second": 788.47,
56
+ "eval_steps_per_second": 49.638,
57
+ "step": 2886
58
+ },
59
+ {
60
+ "epoch": 4.0,
61
+ "grad_norm": 21.18539810180664,
62
+ "learning_rate": 2.0000000000000003e-06,
63
+ "loss": 0.4919,
64
+ "step": 3848
65
+ },
66
+ {
67
+ "epoch": 4.0,
68
+ "eval_f1": 0.6878731135692044,
69
+ "eval_loss": 0.7753087282180786,
70
+ "eval_runtime": 2.3502,
71
+ "eval_samples_per_second": 817.816,
72
+ "eval_steps_per_second": 51.486,
73
+ "step": 3848
74
+ }
75
+ ],
76
+ "logging_steps": 500,
77
+ "max_steps": 4810,
78
+ "num_input_tokens_seen": 0,
79
+ "num_train_epochs": 5,
80
+ "save_steps": 500,
81
+ "total_flos": 1892113337449692.0,
82
+ "train_batch_size": 16,
83
+ "trial_name": null,
84
+ "trial_params": null
85
+ }
cs772_proj/bert_base/checkpoint-3848/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f562e068a01b97d232a8b2fbb9f51b80ec2d1eedd0dd5a99be9c3f9af0bcbeb1
3
+ size 4856
cs772_proj/bert_base/checkpoint-3848/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
cs772_proj/cs772_bert/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
cs772_proj/cs772_bert/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Cs772 Bert
3
+ emoji: 🌖
4
+ colorFrom: indigo
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 4.29.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
cs772_proj/cs772_project.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
cs772_proj/requirements.txt ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate==0.29.2
2
+ aiohttp==3.9.4
3
+ aiosignal==1.3.1
4
+ asttokens
5
+ async-timeout==4.0.3
6
+ attrs==23.2.0
7
+ backcall==0.2.0
8
+ beautifulsoup4==4.12.3
9
+ bleach==6.1.0
10
+ certifi==2024.2.2
11
+ charset-normalizer==3.3.2
12
+ comm
13
+ contourpy==1.2.1
14
+ cycler==0.12.1
15
+ datasets==2.18.0
16
+ debugpy
17
+ decorator
18
+ defusedxml==0.7.1
19
+ dill==0.3.8
20
+ docopt==0.6.2
21
+ exceptiongroup
22
+ executing
23
+ fastjsonschema==2.19.1
24
+ filelock==3.13.4
25
+ fonttools==4.51.0
26
+ frozenlist==1.4.1
27
+ fsspec==2023.6.0
28
+ huggingface-hub==0.20.3
29
+ idna==3.7
30
+ importlib_metadata
31
+ ipykernel
32
+ ipython==8.12.3
33
+ ipywidgets==8.1.2
34
+ jedi
35
+ Jinja2==3.1.3
36
+ jsonschema==4.21.1
37
+ jsonschema-specifications==2023.12.1
38
+ jupyter_client
39
+ jupyter_core
40
+ jupyterlab_pygments==0.3.0
41
+ jupyterlab_widgets==3.0.10
42
+ kiwisolver==1.4.5
43
+ MarkupSafe==2.1.5
44
+ matplotlib==3.8.4
45
+ matplotlib-inline
46
+ mistune==3.0.2
47
+ mizani==0.11.1
48
+ mpmath==1.3.0
49
+ multidict==6.0.5
50
+ multiprocess==0.70.16
51
+ nbclient==0.10.0
52
+ nbconvert==7.16.3
53
+ nbformat==5.10.4
54
+ nest_asyncio
55
+ networkx==3.3
56
+ numpy==1.26.4
57
+ nvidia-cublas-cu12==12.1.3.1
58
+ nvidia-cuda-cupti-cu12==12.1.105
59
+ nvidia-cuda-nvrtc-cu12==12.1.105
60
+ nvidia-cuda-runtime-cu12==12.1.105
61
+ nvidia-cudnn-cu12==8.9.2.26
62
+ nvidia-cufft-cu12==11.0.2.54
63
+ nvidia-curand-cu12==10.3.2.106
64
+ nvidia-cusolver-cu12==11.4.5.107
65
+ nvidia-cusparse-cu12==12.1.0.106
66
+ nvidia-nccl-cu12==2.19.3
67
+ nvidia-nvjitlink-cu12==12.4.127
68
+ nvidia-nvtx-cu12==12.1.105
69
+ packaging
70
+ pandas==2.2.2
71
+ pandocfilters==1.5.1
72
+ parso
73
+ patsy==0.5.6
74
+ pexpect
75
+ pickleshare
76
+ pillow==10.3.0
77
+ pipreqs==0.5.0
78
+ platformdirs
79
+ plotnine==0.13.4
80
+ prompt-toolkit
81
+ protobuf==5.26.1
82
+ psutil
83
+ ptyprocess
84
+ pure-eval
85
+ pyarrow==15.0.2
86
+ pyarrow-hotfix==0.6
87
+ Pygments
88
+ pyparsing==3.1.2
89
+ python-dateutil==2.8.2
90
+ pytz==2023.4
91
+ pyvene==0.1.1
92
+ PyYAML==6.0.1
93
+ pyzmq
94
+ referencing==0.34.0
95
+ regex==2023.12.25
96
+ requests==2.31.0
97
+ rpds-py==0.18.0
98
+ safetensors==0.4.3
99
+ scipy==1.13.0
100
+ sentencepiece==0.2.0
101
+ six
102
+ soupsieve==2.5
103
+ stack-data
104
+ statsmodels==0.14.1
105
+ sympy==1.12
106
+ tinycss2==1.2.1
107
+ tokenizers==0.15.2
108
+ torch==2.2.2
109
+ tornado
110
+ tqdm==4.66.2
111
+ traitlets
112
+ transformers==4.39.3
113
+ triton==2.2.0
114
+ typing_extensions
115
+ tzdata==2024.1
116
+ urllib3==2.0.7
117
+ wcwidth
118
+ webencodings==0.5.1
119
+ widgetsnbextension==4.0.10
120
+ xxhash==3.4.1
121
+ yarg==0.1.9
122
+ yarl==1.9.4
123
+ zipp
cs772_proj/tutorial_data/pyvene_rome_attention_output.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer,pos,prob
2
+ 0,0,0,0.008943566121160984
3
+ 1,0,1,0.010685674846172333
4
+ 2,0,2,0.015678975731134415
5
+ 3,0,3,0.01495782658457756
6
+ 4,0,4,0.01689751259982586
7
+ 5,0,5,0.012341184541583061
8
+ 6,1,0,0.00910158734768629
9
+ 7,1,1,0.011121801100671291
10
+ 8,1,2,0.015446535311639309
11
+ 9,1,3,0.014828759245574474
12
+ 10,1,4,0.01610460691154003
13
+ 11,1,5,0.012241763062775135
14
+ 12,2,0,0.009373819455504417
15
+ 13,2,1,0.011316204443573952
16
+ 14,2,2,0.01544259861111641
17
+ 15,2,3,0.014399203471839428
18
+ 16,2,4,0.015949850901961327
19
+ 17,2,5,0.012191198766231537
20
+ 18,3,0,0.008611239492893219
21
+ 19,3,1,0.01138687040656805
22
+ 20,3,2,0.015247474424540997
23
+ 21,3,3,0.013744203373789787
24
+ 22,3,4,0.014804143458604813
25
+ 23,3,5,0.011855616234242916
26
+ 24,4,0,0.009979105554521084
27
+ 25,4,1,0.011923858895897865
28
+ 26,4,2,0.015469703823328018
29
+ 27,4,3,0.012778976932168007
30
+ 28,4,4,0.015446675941348076
31
+ 29,4,5,0.01213959138840437
32
+ 30,5,0,0.010452548041939735
33
+ 31,5,1,0.011575913988053799
34
+ 32,5,2,0.014227043837308884
35
+ 33,5,3,0.013159635476768017
36
+ 34,5,4,0.016256239265203476
37
+ 35,5,5,0.01196625828742981
38
+ 36,6,0,0.009859082289040089
39
+ 37,6,1,0.011729804798960686
40
+ 38,6,2,0.013667005114257336
41
+ 39,6,3,0.012512612156569958
42
+ 40,6,4,0.015985535457730293
43
+ 41,6,5,0.011508451774716377
44
+ 42,7,0,0.00967455469071865
45
+ 43,7,1,0.012198343873023987
46
+ 44,7,2,0.013812437653541565
47
+ 45,7,3,0.012038654647767544
48
+ 46,7,4,0.014745757915079594
49
+ 47,7,5,0.011055140756070614
50
+ 48,8,0,0.01034906692802906
51
+ 49,8,1,0.011351429857313633
52
+ 50,8,2,0.013925875537097454
53
+ 51,8,3,0.012646789662539959
54
+ 52,8,4,0.01411098800599575
55
+ 53,8,5,0.011073073372244835
56
+ 54,9,0,0.013398675248026848
57
+ 55,9,1,0.011368145234882832
58
+ 56,9,2,0.013541489839553833
59
+ 57,9,3,0.013448523357510567
60
+ 58,9,4,0.013419842347502708
61
+ 59,9,5,0.011098676361143589
62
+ 60,10,0,0.013398675248026848
63
+ 61,10,1,0.012150835245847702
64
+ 62,10,2,0.014172807335853577
65
+ 63,10,3,0.012981802225112915
66
+ 64,10,4,0.013179052621126175
67
+ 65,10,5,0.01129151601344347
68
+ 66,11,0,0.013398675248026848
69
+ 67,11,1,0.01180819422006607
70
+ 68,11,2,0.013985361903905869
71
+ 69,11,3,0.012903643772006035
72
+ 70,11,4,0.012925814837217331
73
+ 71,11,5,0.011390508152544498
cs772_proj/tutorial_data/pyvene_rome_attention_output.pdf ADDED
Binary file (26.3 kB). View file
 
cs772_proj/tutorial_data/pyvene_rome_attention_output.png ADDED
cs772_proj/tutorial_data/pyvene_rome_block_output.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer,pos,prob
2
+ 0,0,0,0.009189224801957607
3
+ 1,0,1,0.011389641091227531
4
+ 2,0,2,0.0162599328905344
5
+ 3,0,3,0.015484759584069252
6
+ 4,0,4,0.015411637723445892
7
+ 5,0,5,0.012490469962358475
8
+ 6,1,0,0.00770866172388196
9
+ 7,1,1,0.011720607057213783
10
+ 8,1,2,0.015047593042254448
11
+ 9,1,3,0.014841136522591114
12
+ 10,1,4,0.017443198710680008
13
+ 11,1,5,0.011815374717116356
14
+ 12,2,0,0.008566385135054588
15
+ 13,2,1,0.01111418567597866
16
+ 14,2,2,0.01541436929255724
17
+ 15,2,3,0.014069304801523685
18
+ 16,2,4,0.016460780054330826
19
+ 17,2,5,0.0121275270357728
20
+ 18,3,0,0.009172435849905014
21
+ 19,3,1,0.011352349072694778
22
+ 20,3,2,0.013832006603479385
23
+ 21,3,3,0.014499133452773094
24
+ 22,3,4,0.01608533412218094
25
+ 23,3,5,0.011975396424531937
26
+ 24,4,0,0.009531590156257153
27
+ 25,4,1,0.011509168893098831
28
+ 26,4,2,0.012929881922900677
29
+ 27,4,3,0.013458534143865108
30
+ 28,4,4,0.015189730562269688
31
+ 29,4,5,0.011921005323529243
32
+ 30,5,0,0.009805092588067055
33
+ 31,5,1,0.011592468246817589
34
+ 32,5,2,0.013322774320840836
35
+ 33,5,3,0.01245818566530943
36
+ 34,5,4,0.013958347029983997
37
+ 35,5,5,0.012003983370959759
38
+ 36,6,0,0.01007422897964716
39
+ 37,6,1,0.010900546796619892
40
+ 38,6,2,0.01368661504238844
41
+ 39,6,3,0.01260523870587349
42
+ 40,6,4,0.013009610585868359
43
+ 41,6,5,0.012099610641598701
44
+ 42,7,0,0.010249304585158825
45
+ 43,7,1,0.010945979505777359
46
+ 44,7,2,0.013585647568106651
47
+ 45,7,3,0.013284442014992237
48
+ 46,7,4,0.012696263380348682
49
+ 47,7,5,0.012064820155501366
50
+ 48,8,0,0.009416966699063778
51
+ 49,8,1,0.011989694088697433
52
+ 50,8,2,0.01403607614338398
53
+ 51,8,3,0.012878036126494408
54
+ 52,8,4,0.012870670296251774
55
+ 53,8,5,0.011852720752358437
56
+ 54,9,0,0.009302603080868721
57
+ 55,9,1,0.011646227911114693
58
+ 56,9,2,0.013754121959209442
59
+ 57,9,3,0.01287330687046051
60
+ 58,9,4,0.012776567600667477
61
+ 59,9,5,0.011404040269553661
62
+ 60,10,0,0.009880894795060158
63
+ 61,10,1,0.011837868951261044
64
+ 62,10,2,0.013910908252000809
65
+ 63,10,3,0.012473315000534058
66
+ 64,10,4,0.012750478461384773
67
+ 65,10,5,0.011884817853569984
68
+ 66,11,0,0.013398675248026848
69
+ 67,11,1,0.012010819278657436
70
+ 68,11,2,0.012010819278657436
71
+ 69,11,3,0.012010819278657436
72
+ 70,11,4,0.012010819278657436
73
+ 71,11,5,0.012010819278657436
cs772_proj/tutorial_data/pyvene_rome_block_output.pdf ADDED
Binary file (26.8 kB). View file
 
cs772_proj/tutorial_data/pyvene_rome_block_output.png ADDED
cs772_proj/tutorial_data/pyvene_rome_mlp_activation.csv ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,layer,pos,prob
2
+ 0,0,0,0.0075546312145888805
3
+ 1,0,1,0.011380046606063843
4
+ 2,0,2,0.01438708696514368
5
+ 3,0,3,0.015439963899552822
6
+ 4,0,4,0.015718040987849236
7
+ 5,0,5,0.012858170084655285
8
+ 6,1,0,0.0077091907151043415
9
+ 7,1,1,0.011459099128842354
10
+ 8,1,2,0.014624425210058689
11
+ 9,1,3,0.015179034322500229
12
+ 10,1,4,0.015754742547869682
13
+ 11,1,5,0.012920349836349487
14
+ 12,2,0,0.007979463785886765
15
+ 13,2,1,0.011575750075280666
16
+ 14,2,2,0.014750510454177856
17
+ 15,2,3,0.014939533546566963
18
+ 16,2,4,0.01672947406768799
19
+ 17,2,5,0.012872524559497833
20
+ 18,3,0,0.008789473213255405
21
+ 19,3,1,0.011063076555728912
22
+ 20,3,2,0.01672506332397461
23
+ 21,3,3,0.012915139086544514
24
+ 22,3,4,0.01752210408449173
25
+ 23,3,5,0.012578015215694904
26
+ 24,4,0,0.009665396064519882
27
+ 25,4,1,0.011315570212900639
28
+ 26,4,2,0.016729004681110382
29
+ 27,4,3,0.012932662852108479
30
+ 28,4,4,0.017836520448327065
31
+ 29,4,5,0.012803135439753532
32
+ 30,5,0,0.010207359679043293
33
+ 31,5,1,0.01099418569356203
34
+ 32,5,2,0.01522758323699236
35
+ 33,5,3,0.012608421966433525
36
+ 34,5,4,0.01690223254263401
37
+ 35,5,5,0.01230985764414072
38
+ 36,6,0,0.009948461316525936
39
+ 37,6,1,0.011443679220974445
40
+ 38,6,2,0.013499817810952663
41
+ 39,6,3,0.012555226683616638
42
+ 40,6,4,0.01549310702830553
43
+ 41,6,5,0.011905322782695293
44
+ 42,7,0,0.009184295311570168
45
+ 43,7,1,0.011352204717695713
46
+ 44,7,2,0.01403868943452835
47
+ 45,7,3,0.012666325084865093
48
+ 46,7,4,0.013838390819728374
49
+ 47,7,5,0.011248479597270489
50
+ 48,8,0,0.010832141153514385
51
+ 49,8,1,0.011385922320187092
52
+ 50,8,2,0.01583883911371231
53
+ 51,8,3,0.01382371224462986
54
+ 52,8,4,0.014275728724896908
55
+ 53,8,5,0.011227857321500778
56
+ 54,9,0,0.013241364620625973
57
+ 55,9,1,0.01146922167390585
58
+ 56,9,2,0.015066420659422874
59
+ 57,9,3,0.013642949052155018
60
+ 58,9,4,0.013898820616304874
61
+ 59,9,5,0.011261279694736004
62
+ 60,10,0,0.013216082938015461
63
+ 61,10,1,0.012054135091602802
64
+ 62,10,2,0.014480901882052422
65
+ 63,10,3,0.012983473017811775
66
+ 64,10,4,0.012887177988886833
67
+ 65,10,5,0.011302494443953037
68
+ 66,11,0,0.013019828125834465
69
+ 67,11,1,0.01216293778270483
70
+ 68,11,2,0.01321493461728096
71
+ 69,11,3,0.012598911300301552
72
+ 70,11,4,0.013332013040781021
73
+ 71,11,5,0.011366385966539383
cs772_proj/tutorial_data/pyvene_rome_mlp_activation.pdf ADDED
Binary file (26.7 kB). View file
 
cs772_proj/tutorial_data/pyvene_rome_mlp_activation.png ADDED