namgoodfire
committed on
Update README.md
README.md CHANGED
@@ -45,237 +45,16 @@ and even steer its behavior. You can explore the SDK documentation at [docs.good
 
 ## How to use
 
-```python
-import torch
-import nnsight
-from nnsight.intervention import InterventionProxy
-from typing import Callable, Optional
-
-
-class SparseAutoEncoder(torch.nn.Module):
-    def __init__(
-        self,
-        d_in: int,
-        d_hidden: int,
-        device: torch.device,
-        dtype: torch.dtype = torch.bfloat16,
-    ):
-        super().__init__()
-        self.d_in = d_in
-        self.d_hidden = d_hidden
-        self.device = device
-        self.encoder_linear = torch.nn.Linear(d_in, d_hidden)
-        self.decoder_linear = torch.nn.Linear(d_hidden, d_in)
-        self.dtype = dtype
-        self.to(self.device, self.dtype)
-
-    def encode(self, x: torch.Tensor) -> torch.Tensor:
-        """Encode a batch of data using a linear, followed by a ReLU."""
-        return torch.nn.functional.relu(self.encoder_linear(x))
-
-    def decode(self, x: torch.Tensor) -> torch.Tensor:
-        """Decode a batch of data using a linear."""
-        return self.decoder_linear(x)
-
-    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        """SAE forward pass. Returns the reconstruction and the encoded features."""
-        f = self.encode(x)
-        return self.decode(f), f
-
-
-def load_sae(
-    path: str,
-    d_model: int,
-    expansion_factor: int,
-    device: torch.device = torch.device("cpu"),
-):
-    sae = SparseAutoEncoder(
-        d_model,
-        d_model * expansion_factor,
-        device,
-    )
-    sae_dict = torch.load(
-        path, weights_only=True, map_location=device
-    )
-    sae.load_state_dict(sae_dict)
-
-    return sae
-
-
-# Language model
-
-
-InterventionInterface = Callable[
-    [InterventionProxy], tuple[InterventionProxy, Optional[torch.Tensor]]
-]
-
-
-class ObservableLanguageModel:
-    def __init__(
-        self,
-        model: str,
-        device: str = "cuda",
-        dtype: torch.dtype = torch.bfloat16,
-    ):
-        self.dtype = dtype
-        self.device = device
-        self._original_model = model
-
-        self._model = nnsight.LanguageModel(
-            self._original_model,
-            device_map=device,
-            torch_dtype=getattr(torch, dtype) if isinstance(dtype, str) else dtype,
-        )
-
-        self.tokenizer = self._model.tokenizer
-
-        self.d_model = self._attempt_to_infer_hidden_layer_dimensions()
-
-        # Optional hook point whose output has the direct effect of an
-        # intervention subtracted back out; None disables the cleanup step.
-        self.cleanup_intervention_layer: Optional[str] = None
-
-        # nnsight validation is disabled by default because it slows down
-        # inference a lot. Turn it on to debug.
-        self.safe_mode = False
-
-    def _attempt_to_infer_hidden_layer_dimensions(self):
-        config = self._model.config
-        if hasattr(config, "hidden_size"):
-            return int(config.hidden_size)
-
-        raise Exception(
-            "Could not infer the hidden layer dimension from the model config"
-        )
-
-    def _find_module(self, hook_point: str):
-        submodules = hook_point.split(".")
-        module = self._model
-        while submodules:
-            module = getattr(module, submodules.pop(0))
-        return module
-
-    def forward(
-        self,
-        inputs: torch.Tensor,
-        cache_activations_at: Optional[list[str]] = None,
-        interventions: Optional[dict[str, InterventionInterface]] = None,
-        use_cache: bool = True,
-        past_key_values: Optional[tuple[torch.Tensor]] = None,
-    ) -> tuple[torch.Tensor, tuple[torch.Tensor], dict[str, torch.Tensor]]:
-        cache: dict[str, torch.Tensor] = {}
-        with self._model.trace(
-            inputs,
-            scan=self.safe_mode,
-            validate=self.safe_mode,
-            use_cache=use_cache,
-            past_key_values=past_key_values,
-        ):
-            # If interventions were passed in, apply each one at its hook site
-            if interventions:
-                for hook_site in interventions.keys():
-                    if interventions[hook_site] is None:
-                        continue
-
-                    module = self._find_module(hook_site)
-
-                    if self.cleanup_intervention_layer:
-                        last_layer = self._find_module(
-                            self.cleanup_intervention_layer
-                        )
-                    else:
-                        last_layer = None
-
-                    intervened_acts, direct_effect_tensor = interventions[
-                        hook_site
-                    ](module.output[0])
-                    # Treat the direct-effect tensor as 0 if it is None
-                    if direct_effect_tensor is None:
-                        direct_effect_tensor = 0
-                    # We only modify module.output[0]
-                    if use_cache:
-                        module.output = (
-                            intervened_acts,
-                            module.output[1],
-                        )
-                        if last_layer:
-                            last_layer.output = (
-                                last_layer.output[0] - direct_effect_tensor,
-                                last_layer.output[1],
-                            )
-                    else:
-                        module.output = (intervened_acts,)
-                        if last_layer:
-                            last_layer.output = (
-                                last_layer.output[0] - direct_effect_tensor,
-                            )
-
-            if cache_activations_at is not None:
-                for hook_point in cache_activations_at:
-                    module = self._find_module(hook_point)
-                    cache[hook_point] = module.output.save()
-
-            if not past_key_values:
-                logits = self._model.output[0][:, -1, :].save()
-            else:
-                logits = self._model.output[0].squeeze(1).save()
-
-            kv_cache = self._model.output.past_key_values.save()
-
-        return (
-            logits.value.detach(),
-            kv_cache.value,
-            {k: v[0].detach() for k, v in cache.items()},
-        )
-
-
-# Reading out features from the model
-
-llama_3_1_8b = ObservableLanguageModel(
-    "meta-llama/Llama-3.1-8B-Instruct",
-)
-
-input_tokens = llama_3_1_8b.tokenizer.apply_chat_template(
-    [
-        {"role": "user", "content": "Hello, how are you?"},
-    ],
-    return_tensors="pt",
-)
-logits, kv_cache, features = llama_3_1_8b.forward(
-    input_tokens,
-    cache_activations_at=["model.layers.19"],
-)
-
-print(features["model.layers.19"].shape)
-
-
-# Intervention example
-
-sae = load_sae(
-    path="./llama-3-8b-d-hidden.pth",
-    d_model=4096,
-    expansion_factor=16,
-)
-
-PIRATE_FEATURE_INDEX = 0
-VALUE_TO_MODIFY = 0.1
-
-def example_intervention(activations: nnsight.InterventionProxy):
-    features = sae.encode(activations).detach()
-    reconstructed_acts = sae.decode(features).detach()
-    error = activations - reconstructed_acts
-
-    # Modify the feature at PIRATE_FEATURE_INDEX across all token positions
-    features[..., PIRATE_FEATURE_INDEX] += VALUE_TO_MODIFY
-
-    # Very important to add the error term back in! The second element is
-    # an optional direct-effect tensor; None skips the cleanup subtraction.
-    return sae.decode(features) + error, None
-
-
-logits, kv_cache, features = llama_3_1_8b.forward(
-    input_tokens,
-    interventions={"model.layers.19": example_intervention},
-)
-
-print(llama_3_1_8b.tokenizer.decode(logits[-1].argmax(-1)))
-```
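
A natural variant of `example_intervention` above is to clamp a feature to a fixed value rather than nudge it. The sketch below is illustrative (the clamp value of 5.0 is arbitrary, and it reuses the `sae` and `PIRATE_FEATURE_INDEX` names from the removed block), but it returns the same `(activations, direct_effect)` pair that `forward` unpacks:

```python
def clamp_feature_intervention(activations: nnsight.InterventionProxy):
    features = sae.encode(activations).detach()
    error = activations - sae.decode(features).detach()

    # Pin the feature to a fixed activation at every token position,
    # overriding whatever the encoder produced.
    features[..., PIRATE_FEATURE_INDEX] = 5.0

    # Re-add the reconstruction error; None means no direct-effect cleanup.
    return sae.decode(features) + error, None
```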
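
To see which features fire most strongly on a prompt, the activations cached by the earlier `forward` call (the one with `cache_activations_at=["model.layers.19"]`) can be pushed through the encoder. A minimal sketch, assuming the `sae` and `features` objects from the removed block:

```python
acts = features["model.layers.19"]  # shape: (batch, seq, d_model)
feature_acts = sae.encode(acts.to(sae.device, sae.dtype))

# Rank the strongest features at the final token position.
values, indices = torch.topk(feature_acts[0, -1], k=10)
for value, index in zip(values.tolist(), indices.tolist()):
    print(f"feature {index}: activation {value:.3f}")
```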
+View the notebook guide below to get started with the SAE.
+
+<a href="https://colab.research.google.com/drive/1IBMQtJqy8JiRk1Q48jDEgTISmtxhlCRL" target="_blank">
+  <img
+    src="https://colab.research.google.com/assets/colab-badge.svg"
+    alt="Open in Colab"
+    width="200px"
+    style="pointer-events: none;"
+  />
+</a>
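
For a quick start outside the notebook, the checkpoint can be pulled from the Hub and loaded with the `load_sae` helper shown in the removed block above. A minimal sketch; the `repo_id` and `filename` below are placeholders, so substitute the values from this repository's file listing:

```python
import torch
from huggingface_hub import hf_hub_download

# Placeholder identifiers; replace with this repo's actual id and filename.
sae_path = hf_hub_download(
    repo_id="<this-repo-id>",
    filename="<sae-checkpoint>.pth",
)

sae = load_sae(
    path=sae_path,
    d_model=4096,         # Llama-3.1-8B hidden size
    expansion_factor=16,  # d_hidden = 16 * 4096 = 65536
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
```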
 
 ## Training
 
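
The `forward` method in the removed block returns the model's `past_key_values`, which supports incremental decoding. A greedy-decoding sketch built on that API, assuming the `llama_3_1_8b` and `input_tokens` objects defined above (the 20-token budget is arbitrary):

```python
logits, kv_cache, _ = llama_3_1_8b.forward(input_tokens)

generated = []
for _ in range(20):
    # Greedily pick the next token and feed it back in with the cache,
    # so each step only processes one new token.
    next_token = logits.argmax(-1, keepdim=True)  # shape: (batch, 1)
    generated.append(next_token)
    logits, kv_cache, _ = llama_3_1_8b.forward(
        next_token, past_key_values=kv_cache
    )

print(llama_3_1_8b.tokenizer.decode(torch.cat(generated, dim=-1)[0]))
```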