LeeHarrold committed on
Commit
7647796
·
verified ·
1 Parent(s): 258ca44

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. .github/workflows/update_space.yml +13 -13
  2. app.py +34 -17
.github/workflows/update_space.yml CHANGED
@@ -3,26 +3,26 @@ name: Run Python script
3
  on:
4
  push:
5
  branches:
6
- - y
7
 
8
  jobs:
9
  build:
10
  runs-on: ubuntu-latest
11
 
12
  steps:
13
- - name: Checkout
14
- uses: actions/checkout@v2
15
 
16
- - name: Set up Python
17
- uses: actions/setup-python@v2
18
- with:
19
- python-version: '3.9'
20
 
21
- - name: Install Gradio
22
- run: python -m pip install gradio
23
 
24
- - name: Log in to Hugging Face
25
- run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
26
 
27
- - name: Deploy to Spaces
28
- run: gradio deploy
 
3
  on:
4
  push:
5
  branches:
6
+ - main
7
 
8
  jobs:
9
  build:
10
  runs-on: ubuntu-latest
11
 
12
  steps:
13
+ - name: Checkout
14
+ uses: actions/checkout@v2
15
 
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v2
18
+ with:
19
+ python-version: "3.9"
20
 
21
+ - name: Install Gradio
22
+ run: python -m pip install gradio
23
 
24
+ - name: Log in to Hugging Face
25
+ run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'
26
 
27
+ - name: Deploy to Spaces
28
+ run: gradio deploy
app.py CHANGED
@@ -43,12 +43,22 @@ class Inference:
43
  self.sae = sae
44
  self.cfg_dict = cfg_dict
45
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  def _get_sae_out_and_feature_activations(self):
47
- # given the words in steering_vectore_prompt, the SAE predicts that the neurons(aka features) in activateCache will be activated
48
  sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
49
  sv_feature_acts = self.sae.encode(activationCache[self.sae.cfg.hook_name])
50
- # get top_k of 1
51
- # self.sae_out = sae.decode(sv_feature_acts)
52
  return self.sae.decode(sv_feature_acts), sv_feature_acts
53
 
54
  def _hooked_generate(self, prompt_batch, fwd_hooks, seed=None, **kwargs):
@@ -69,9 +79,7 @@ class Inference:
69
  # return torch.topk(sv_feature_acts, 1).indices.tolist()
70
  features = torch.topk(sv_feature_activations, 1).indices
71
  print(f'features that align with the text prompt: {features}')
72
- print("pump the features into the tool that gives you the words associated with each feature")
73
- return features
74
-
75
 
76
  def _get_steering_hook(self, feature, sae_out):
77
  coeff = self.coeff
@@ -93,7 +101,7 @@ class Inference:
93
  # and not use the separate function _get_steering_hook()
94
  sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
95
  features = self._get_features(sv_feature_acts)
96
- steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features[0]]
97
 
98
  return steering_hooks
99
 
@@ -101,12 +109,10 @@ class Inference:
101
  def _run_generate(self, example_prompt, steering_on: bool):
102
 
103
  self.model.reset_hooks()
104
- steer_hooks = self._get_steering_hooks()
105
- editing_hooks = [ (self.sae_id, steer_hook) for steer_hook in steer_hooks]
106
- # editing_hooks = [(self.sae_id, steer_hook)]
107
- # ^^change this to support steer_hooks being a list of steer_hooks
108
- print(f"steering by {len(editing_hooks)} hooks")
109
  if steering_on:
 
 
 
110
  res = self._hooked_generate([example_prompt] * 3, editing_hooks, seed=None, **self.sampling_kwargs)
111
  else:
112
  tokenized = self.model.to_tokens([example_prompt])
@@ -129,12 +135,12 @@ class Inference:
129
 
130
 
131
 
132
- MODEL = "gemma-2b"
133
- PRETRAINED_SAE = "gemma-2b-res-jb"
134
  MODEL = "gpt2-small"
135
  PRETRAINED_SAE = "gpt2-small-res-jb"
136
  LAYER = 10
137
- chatbot_model = Inference(MODEL,PRETRAINED_SAE, LAYER)
138
 
139
 
140
  import time
@@ -153,6 +159,15 @@ def slow_echo_steering(message, history):
153
  time.sleep(0.01)
154
  yield result[: i + 1]
155
 
 
 
 
 
 
 
 
 
 
156
  with gr.Blocks() as demo:
157
  with gr.Row():
158
  gr.Markdown("*STANDARD HEXTER BOT*")
@@ -182,15 +197,17 @@ with gr.Blocks() as demo:
182
  )
183
  with gr.Row():
184
  steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
 
 
 
185
  with gr.Row():
186
  coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
187
  with gr.Row():
188
  temp = gr.Slider(0, 5, 1, label="Temperature", info="Temperature is..", interactive=True)
189
 
190
- # Set up an action when the sliders change
191
  temp.change(chatbot_model.set_temperature, inputs=[temp], outputs=[])
192
  coeff.change(chatbot_model.set_coeff, inputs=[coeff], outputs=[])
193
- chatbot_model.set_steering_vector_prompt(steering_prompt)
194
  steering_prompt.change(chatbot_model.set_steering_vector_prompt, inputs=[steering_prompt], outputs=[])
195
 
196
  demo.queue()
 
43
  self.sae = sae
44
  self.cfg_dict = cfg_dict
45
 
46
+ def get_feature_info(self):
47
+ projection_onto_unembed = self.sae.W_dec @ self.model.W_U
48
+ # get the top ten words associated with the given feature
49
+ WORD_COUNT = 10
50
+ _, inds = torch.topk(projection_onto_unembed, WORD_COUNT, dim=1)
51
+
52
+ _, sv_feature_acts = self._get_sae_out_and_feature_activations()
53
+ features = self._get_features(sv_feature_acts)
54
+ breakpoint();
55
+ associated_words = [self.model.to_str_tokens(inds[f]) for f in features]
56
+ return associated_words
57
+
58
  def _get_sae_out_and_feature_activations(self):
59
+ # given the words in steering_vector_prompt, the SAE predicts that the neurons (aka features) in activationCache will be activated
60
  sv_logits, activationCache = self.model.run_with_cache(self.steering_vector_prompt, prepend_bos=True)
61
  sv_feature_acts = self.sae.encode(activationCache[self.sae.cfg.hook_name])
 
 
62
  return self.sae.decode(sv_feature_acts), sv_feature_acts
63
 
64
  def _hooked_generate(self, prompt_batch, fwd_hooks, seed=None, **kwargs):
 
79
  # return torch.topk(sv_feature_acts, 1).indices.tolist()
80
  features = torch.topk(sv_feature_activations, 1).indices
81
  print(f'features that align with the text prompt: {features}')
82
+ return features[0]
 
 
83
 
84
  def _get_steering_hook(self, feature, sae_out):
85
  coeff = self.coeff
 
101
  # and not use the separate function _get_steering_hook()
102
  sae_out, sv_feature_acts = self._get_sae_out_and_feature_activations()
103
  features = self._get_features(sv_feature_acts)
104
+ steering_hooks = [self._get_steering_hook(feature, sae_out) for feature in features]
105
 
106
  return steering_hooks
107
 
 
109
  def _run_generate(self, example_prompt, steering_on: bool):
110
 
111
  self.model.reset_hooks()
 
 
 
 
 
112
  if steering_on:
113
+ steer_hooks = self._get_steering_hooks()
114
+ editing_hooks = [ (self.sae_id, steer_hook) for steer_hook in steer_hooks]
115
+ print(f"steering by {len(editing_hooks)} hooks")
116
  res = self._hooked_generate([example_prompt] * 3, editing_hooks, seed=None, **self.sampling_kwargs)
117
  else:
118
  tokenized = self.model.to_tokens([example_prompt])
 
135
 
136
 
137
 
138
+ # MODEL = "gemma-2b"
139
+ # PRETRAINED_SAE = "gemma-2b-res-jb"
140
  MODEL = "gpt2-small"
141
  PRETRAINED_SAE = "gpt2-small-res-jb"
142
  LAYER = 10
143
+ chatbot_model = Inference(MODEL, PRETRAINED_SAE, LAYER)
144
 
145
 
146
  import time
 
159
  time.sleep(0.01)
160
  yield result[: i + 1]
161
 
162
+ def populate_related_features():
163
+ features = chatbot_model.get_feature_info()
164
+ print(features)
165
+ return features[0]
166
+ # for feature in features:
167
+ # for i in range(len(feature)):
168
+ # time.sleep(0.01)
169
+ # yield feature[: i + 1]
170
+
171
  with gr.Blocks() as demo:
172
  with gr.Row():
173
  gr.Markdown("*STANDARD HEXTER BOT*")
 
197
  )
198
  with gr.Row():
199
  steering_prompt = gr.Textbox(label="Steering prompt", value="Golden Gate Bridge")
200
+ found_features = gr.Textbox(label="Found Features")
201
+ find_features = gr.Button("Find Related Features")
202
+ find_features.click(fn=populate_related_features,inputs=None, outputs=found_features)
203
  with gr.Row():
204
  coeff = gr.Slider(1, 1000, 300, label="Coefficient", info="Coefficient is..", interactive=True)
205
  with gr.Row():
206
  temp = gr.Slider(0, 5, 1, label="Temperature", info="Temperature is..", interactive=True)
207
 
 
208
  temp.change(chatbot_model.set_temperature, inputs=[temp], outputs=[])
209
  coeff.change(chatbot_model.set_coeff, inputs=[coeff], outputs=[])
210
+ chatbot_model.set_steering_vector_prompt(steering_prompt.value)
211
  steering_prompt.change(chatbot_model.set_steering_vector_prompt, inputs=[steering_prompt], outputs=[])
212
 
213
  demo.queue()